You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

380 lines
14 KiB
Rust

//! Prompt construction for the LLM generation pipeline.
//!
//! Builds system and user prompts for:
//! - **Search pass** (Pass 1): web search and initial article discovery
//! - **Per-article classify**: per-article classification and summarization
//!
//! Prompts are provider-agnostic and parameterized by user settings.
use crate::models::settings::UserSettings;
use crate::models::source::Source;
/// Build the system prompt and user prompt for the search pass (Pass 1).
///
/// The search pass instructs the LLM to find recent news articles
/// matching the user's theme and categories, using web search grounding.
///
/// # Arguments
/// * `settings` — User's configured settings (theme, categories, etc.)
/// * `sources` — User's custom sources to prioritize
/// * `current_date` — Formatted date string for the prompt
/// * `recent_domains` — Domains used in recent syntheses to avoid if possible
/// * `category_gaps` — Optional `(category, needed)` pairs. When `Some`, the generic
///   "exactly N items per category" instruction is replaced by one line per pair
///   stating how many articles that category needs.
///
/// # Returns
/// A `(system_prompt, user_prompt)` tuple.
pub fn build_search_prompt(
    settings: &UserSettings,
    sources: &[Source],
    current_date: &str,
    recent_domains: &[String],
    category_gaps: Option<&[(String, i32)]>,
) -> (String, String) {
    let sources_text = if sources.is_empty() {
        // Still emit a line break: `{sources}` sits directly between two sentences in the
        // template below, so an empty string would glue them together ("theme.Tu peux ...").
        "\n".to_string()
    } else {
        let list = sources
            .iter()
            .map(|s| format!("- {} ({})", s.title, s.url))
            .collect::<Vec<_>>()
            .join("\n");
        format!(
            "\nEn plus des sources par defaut, tu DOIS imperativement consulter \
             et integrer les informations provenant de ces sources personnalisees :\n{}\n",
            list
        )
    };

    // Numbered list of categories; the JSON keys requested below follow this order.
    let categories_text = settings
        .categories
        .iter()
        .enumerate()
        .map(|(i, cat)| format!("{}. {}", i + 1, cat))
        .collect::<Vec<_>>()
        .join("\n");

    // Fall back to the default behavior sentence when the user configured none.
    let behavior: &str = if settings.search_agent_behavior.is_empty() {
        "Tu peux egalement utiliser d'autres sources pertinentes trouvees via la recherche Google."
    } else {
        settings.search_agent_behavior.as_str()
    };

    // Decide the "how many items" instruction up front instead of patching the finished
    // prompt with `str::replace`, which would also rewrite a matching sentence that happens
    // to appear inside user-provided text (theme, behavior, source titles).
    let items_instruction = match category_gaps {
        Some(gaps) => {
            let gaps_text = gaps
                .iter()
                .map(|(cat, needed)| format!("- {} : {} articles", cat, needed))
                .collect::<Vec<_>>()
                .join("\n");
            format!(
                "Fournis le nombre d'articles suivant par categorie :\n{}",
                gaps_text
            )
        }
        None => format!(
            "Pour chaque categorie, fournis exactement {} actualites.",
            settings.max_items_per_category
        ),
    };

    let system_prompt = format!(
        "Tu es un assistant IA precis. Tu dois TOUJOURS fournir des URLs completes et exactes. \
         Ne tronque jamais les URLs. Tu dois te concentrer UNIQUEMENT sur les actualites des {} \
         derniers jours.",
        settings.max_age_days
    );

    let mut user_prompt = format!(
        "Aujourd'hui, nous sommes le {date}.\n\
         Tu es un expert en analyse de l'actualite sur le theme : \"{theme}\".\n\
         Ta tache est de rechercher les actualites STRICTEMENT des {days} derniers jours.\n\
         Ne retourne AUCUNE actualite datant de plus de {days} jours.\n\n\
         Tu DOIS imperativement t'appuyer sur le contenu des sites web pertinents pour ce theme.\
         {sources}\
         {behavior}\n\n\
         La synthese doit etre divisee en {count} grandes sections :\n\
         {categories}\n\n\
         {items}\n\
         Pour chaque actualite, fournis un titre provisoire, l'URL source exacte et complete, \
         et un resume provisoire.\n\
         Ne retourne JAMAIS des URLs de pages d'accueil (homepage). Fournis toujours des liens \
         directs vers des articles specifiques avec un chemin complet (pas juste le nom de domaine).\n\
         Ne change jamais les URLs retournees, et ne les tronque jamais. \
         Retourne le resultat au format JSON en utilisant les cles category_0, category_1, etc. \
         correspondant a l'ordre des sections ci-dessus.",
        date = current_date,
        theme = settings.theme,
        days = settings.max_age_days,
        sources = sources_text,
        behavior = behavior,
        count = settings.categories.len(),
        categories = categories_text,
        items = items_instruction,
    );

    // Optional trailing hint asking the model to diversify away from recently used domains.
    if !recent_domains.is_empty() {
        user_prompt.push_str(
            "\n\nEvite si possible les sources deja utilisees dans les syntheses precedentes : ",
        );
        user_prompt.push_str(&recent_domains.join(", "));
        user_prompt.push('.');
    }

    (system_prompt, user_prompt)
}
/// Assemble the `(system_prompt, user_prompt)` pair used to classify and summarize one article.
///
/// The user prompt embeds the article title (or the placeholder `(pas de titre)` when
/// `title` is empty), the body excerpt, and the quoted list of `categories`. It then asks
/// for a category, a title, a summary, a publication date and an `is_article` flag.
///
/// `summary_length` selects the requested summary size: `1` asks for 3 to 4 lines,
/// `2` for 6 to 8 lines, and any other value for a detailed 12 to 15 lines.
pub fn build_article_classify_prompt(
    title: &str,
    body_snippet: &str,
    categories: &[String],
    summary_length: i32,
) -> (String, String) {
    const SYSTEM_PROMPT: &str = "Tu es un assistant qui analyse des articles d'actualite. \
        Tu dois classer l'article dans une categorie et generer un titre et un resume. \
        Reponds uniquement au format JSON demande.";

    // Title line, with a placeholder when the caller has no title.
    let shown_title = if title.is_empty() { "(pas de titre)" } else { title };
    let title_line = format!("Titre : {}", shown_title);

    // One quoted bullet per category, separated (not terminated) by newlines.
    let mut bullets = String::new();
    for (idx, name) in categories.iter().enumerate() {
        if idx > 0 {
            bullets.push('\n');
        }
        bullets.push_str("- \"");
        bullets.push_str(name);
        bullets.push('"');
    }

    let length_directive = if summary_length == 1 {
        "Genere un titre clair et un resume de 3 a 4 lignes."
    } else if summary_length == 2 {
        "Genere un titre clair et un resume de 6 a 8 lignes."
    } else {
        "Genere un titre clair et un resume detaille de 12 a 15 lignes."
    };

    // Each entry is one line of the final prompt; empty entries produce blank lines.
    let lines: &[&str] = &[
        "Voici un article d'actualite.",
        "",
        title_line.as_str(),
        "",
        "Contenu (extrait) :",
        body_snippet,
        "",
        "Categories disponibles :",
        bullets.as_str(),
        "",
        "Classe cet article dans la categorie la plus appropriee.",
        "Si aucune categorie ne correspond, utilise \"Autre\".",
        length_directive,
        "Si le titre fourni est vide, genere un titre a partir du contenu.",
        "Extrais la date de publication de l'article au format YYYY-MM-DD. \
         Si la date n'est pas disponible, retourne une chaine vide.",
        "Determine si ce contenu est un veritable article d'actualite. \
         Retourne is_article=true pour un article, false pour une page de contact, \
         mentions legales, page de navigation, FAQ, etc.",
    ];

    (SYSTEM_PROMPT.to_string(), lines.join("\n"))
}
// Unit tests for the prompt builders: each test renders a prompt and checks that the
// expected French fragments are present (or absent).
#[cfg(test)]
mod tests {
    use super::*;
    use chrono::Utc;
    use uuid::Uuid;

    /// Date string passed by most search-prompt tests.
    const DATE_MARCH_21: &str = "lundi 21 mars 2026";
    /// Date string passed by the recent-domain and category-gap tests.
    const DATE_MARCH_17: &str = "lundi 17 mars 2026";

    /// Baseline settings fixture: two categories, 7-day window, 4 items per category,
    /// and no custom search behavior.
    fn test_settings() -> UserSettings {
        let categories = ["Annonces majeures", "Recherche et innovation"]
            .iter()
            .map(|name| name.to_string())
            .collect();
        UserSettings {
            user_id: Uuid::nil(),
            theme: "Intelligence Artificielle".to_string(),
            max_age_days: 7,
            categories,
            max_items_per_category: 4,
            max_articles_per_source: 3,
            max_links_per_source: 8,
            use_brave_search: false,
            article_history_days: 90,
            batch_size: 5,
            summary_length: 3,
            source_extraction_window: 3,
            search_agent_behavior: String::new(),
            ai_provider: String::new(),
            ai_model: String::new(),
            ai_model_websearch: String::new(),
            rate_limit_max_requests: None,
            rate_limit_time_window_seconds: None,
            updated_at: Utc::now(),
        }
    }

    /// Build a `Source` fixture with nil ids and the given title and URL.
    fn make_source(title: &str, url: &str) -> Source {
        Source {
            id: Uuid::nil(),
            user_id: Uuid::nil(),
            title: title.into(),
            url: url.into(),
            created_at: Utc::now(),
        }
    }

    /// Render both prompts for the March 21 date with no recent domains and no gaps.
    fn render(settings: &UserSettings, sources: &[Source]) -> (String, String) {
        build_search_prompt(settings, sources, DATE_MARCH_21, &[], None)
    }

    /// Render the user prompt for the baseline settings with no custom sources.
    fn default_user_prompt() -> String {
        render(&test_settings(), &[]).1
    }

    /// Render the user prompt for the March 17 date with the baseline settings and no
    /// custom sources, varying only the recent domains and category gaps.
    fn user_prompt_march_17(
        recent_domains: &[String],
        gaps: Option<&[(String, i32)]>,
    ) -> String {
        let no_sources: Vec<Source> = Vec::new();
        build_search_prompt(
            &test_settings(),
            &no_sources,
            DATE_MARCH_17,
            recent_domains,
            gaps,
        )
        .1
    }

    /// Owned category list for the classify-prompt tests.
    fn cats(names: &[&str]) -> Vec<String> {
        names.iter().map(|name| name.to_string()).collect()
    }

    #[test]
    fn search_prompt_includes_theme() {
        assert!(default_user_prompt().contains("Intelligence Artificielle"));
    }

    #[test]
    fn search_prompt_includes_date() {
        assert!(default_user_prompt().contains("lundi 21 mars 2026"));
    }

    #[test]
    fn search_prompt_includes_max_age() {
        let (system, user_prompt) = render(&test_settings(), &[]);
        assert!(user_prompt.contains("7 derniers jours"));
        assert!(system.contains("7"));
    }

    #[test]
    fn search_prompt_includes_categories() {
        let user_prompt = default_user_prompt();
        for fragment in [
            "1. Annonces majeures",
            "2. Recherche et innovation",
            "2 grandes sections",
        ] {
            assert!(user_prompt.contains(fragment));
        }
    }

    #[test]
    fn search_prompt_includes_max_items() {
        assert!(default_user_prompt().contains("4 actualites"));
    }

    #[test]
    fn search_prompt_includes_custom_sources() {
        let sources = vec![
            make_source("TechCrunch", "https://techcrunch.com"),
            make_source("The Verge", "https://theverge.com"),
        ];
        let (_, user_prompt) = render(&test_settings(), &sources);
        for fragment in [
            "TechCrunch (https://techcrunch.com)",
            "The Verge (https://theverge.com)",
            "sources personnalisees",
        ] {
            assert!(user_prompt.contains(fragment));
        }
    }

    #[test]
    fn search_prompt_no_sources_no_section() {
        assert!(!default_user_prompt().contains("sources personnalisees"));
    }

    #[test]
    fn search_prompt_custom_behavior() {
        let settings = UserSettings {
            search_agent_behavior: "Concentre-toi sur les sources europeennes.".to_string(),
            ..test_settings()
        };
        let (_, user_prompt) = render(&settings, &[]);
        assert!(user_prompt.contains("Concentre-toi sur les sources europeennes."));
        assert!(!user_prompt.contains("recherche Google"));
    }

    #[test]
    fn search_prompt_default_behavior_when_empty() {
        assert!(default_user_prompt().contains("recherche Google"));
    }

    #[test]
    fn search_prompt_warns_against_homepage_urls() {
        let user_prompt = default_user_prompt();
        assert!(user_prompt.contains("pages d'accueil"));
        assert!(user_prompt.contains("articles specifiques"));
    }

    #[test]
    fn search_prompt_includes_recent_domains_avoidance() {
        let domains = vec!["techcrunch.com".to_string(), "theverge.com".to_string()];
        let user_prompt = user_prompt_march_17(&domains, None);
        for fragment in ["Evite si possible", "techcrunch.com", "theverge.com"] {
            assert!(user_prompt.contains(fragment));
        }
    }

    #[test]
    fn search_prompt_no_avoidance_when_domains_empty() {
        let user_prompt = user_prompt_march_17(&[], None);
        assert!(!user_prompt.contains("Evite si possible"));
    }

    #[test]
    fn search_prompt_with_category_gaps() {
        let gaps = vec![
            ("AI News".to_string(), 2),
            ("Cybersecurity".to_string(), 4),
        ];
        let user_prompt = user_prompt_march_17(&[], Some(gaps.as_slice()));
        assert!(user_prompt.contains("AI News : 2 articles"));
        assert!(user_prompt.contains("Cybersecurity : 4 articles"));
        assert!(!user_prompt.contains("exactement"));
    }

    #[test]
    fn search_prompt_without_gaps_uses_default() {
        let user_prompt = user_prompt_march_17(&[], None);
        assert!(user_prompt.contains("exactement"));
    }

    #[test]
    fn article_classify_prompt_includes_content() {
        let categories = cats(&["AI News", "Autre"]);
        let (sys, user) = build_article_classify_prompt(
            "GPT-5 Released",
            "OpenAI released GPT-5 today",
            &categories,
            3,
        );
        for fragment in ["GPT-5 Released", "AI News", "Autre"] {
            assert!(user.contains(fragment));
        }
        assert!(sys.contains("classer"));
    }

    #[test]
    fn article_classify_prompt_handles_empty_title() {
        let categories = cats(&["Tech", "Autre"]);
        let (_, user) = build_article_classify_prompt("", "Some content", &categories, 3);
        assert!(user.contains("(pas de titre)"));
    }

    #[test]
    fn article_classify_prompt_short_summary() {
        let categories = cats(&["AI"]);
        let (_, user) = build_article_classify_prompt("Title", "Content", &categories, 1);
        assert!(user.contains("3 a 4 lignes"));
    }

    #[test]
    fn article_classify_prompt_detailed_summary() {
        let categories = cats(&["AI"]);
        let (_, user) = build_article_classify_prompt("Title", "Content", &categories, 3);
        assert!(user.contains("12 a 15 lignes"));
    }
}