//! Prompt construction for the LLM generation pipeline. //! //! Builds system and user prompts for: //! - **Search pass** (Pass 1): web search and initial article discovery //! - **Per-article classify**: per-article classification and summarization //! //! Prompts are provider-agnostic and parameterized by user settings. use crate::models::settings::UserSettings; use crate::models::source::Source; /// Build the system prompt and user prompt for the search pass (Pass 1). /// /// The search pass instructs the LLM to find recent news articles /// matching the user's theme and categories, using web search grounding. /// /// # Arguments /// * `settings` — User's configured settings (theme, categories, etc.) /// * `sources` — User's custom sources to prioritize /// * `current_date` — Formatted date string for the prompt /// * `recent_domains` — Domains used in recent syntheses to avoid if possible pub fn build_search_prompt( settings: &UserSettings, sources: &[Source], current_date: &str, recent_domains: &[String], category_gaps: Option<&[(String, i32)]>, ) -> (String, String) { let sources_text = if sources.is_empty() { String::new() } else { let list = sources .iter() .map(|s| format!("- {} ({})", s.title, s.url)) .collect::>() .join("\n"); format!( "\nEn plus des sources par defaut, tu DOIS imperativement consulter \ et integrer les informations provenant de ces sources personnalisees :\n{}\n", list ) }; let categories_text = settings .categories .iter() .enumerate() .map(|(i, cat)| format!("{}. {}", i + 1, cat)) .collect::>() .join("\n"); let behavior = if settings.search_agent_behavior.is_empty() { "Tu peux egalement utiliser d'autres sources pertinentes trouvees via la recherche Google." .to_string() } else { settings.search_agent_behavior.clone() }; let system_prompt = format!( "Tu es un assistant IA precis. Tu dois TOUJOURS fournir des URLs completes et exactes. \ Ne tronque jamais les URLs. Tu dois te concentrer UNIQUEMENT sur les actualites des {} \ derniers jours.", settings.max_age_days ); let user_prompt = format!( "Aujourd'hui, nous sommes le {date}.\n\ Tu es un expert en analyse de l'actualite sur le theme : \"{theme}\".\n\ Ta tache est de rechercher les actualites STRICTEMENT des {days} derniers jours.\n\ Ne retourne AUCUNE actualite datant de plus de {days} jours.\n\n\ Tu DOIS imperativement t'appuyer sur le contenu des sites web pertinents pour ce theme.\ {sources}\ {behavior}\n\n\ La synthese doit etre divisee en {count} grandes sections :\n\ {categories}\n\n\ Pour chaque categorie, fournis exactement {max_items} actualites.\n\ Pour chaque actualite, fournis un titre provisoire, l'URL source exacte et complete, \ et un resume provisoire.\n\ Ne retourne JAMAIS des URLs de pages d'accueil (homepage). Fournis toujours des liens \ directs vers des articles specifiques avec un chemin complet (pas juste le nom de domaine).\n\ Ne change jamais les URLs retournees, et ne les tronque jamais. \ Retourne le resultat au format JSON en utilisant les cles category_0, category_1, etc. \ correspondant a l'ordre des sections ci-dessus.", date = current_date, theme = settings.theme, days = settings.max_age_days, sources = sources_text, behavior = behavior, count = settings.categories.len(), categories = categories_text, max_items = settings.max_items_per_category, ); let user_prompt = if recent_domains.is_empty() { user_prompt } else { let domains_list = recent_domains.join(", "); format!( "{}\n\nEvite si possible les sources deja utilisees dans les syntheses precedentes : {}.", user_prompt, domains_list ) }; // If we have specific category gaps (Phase 2), replace the generic "N per category" line let user_prompt = if let Some(gaps) = category_gaps { let gaps_text = gaps .iter() .map(|(cat, needed)| format!("- {} : {} articles", cat, needed)) .collect::>() .join("\n"); user_prompt.replace( &format!("Pour chaque categorie, fournis exactement {} actualites.", settings.max_items_per_category), &format!("Fournis le nombre d'articles suivant par categorie :\n{}", gaps_text), ) } else { user_prompt }; (system_prompt, user_prompt) } /// Build a prompt for per-article classification and summarization. /// /// The LLM classifies the article into a category and generates a title + summary. pub fn build_article_classify_prompt( title: &str, body_snippet: &str, categories: &[String], summary_length: i32, ) -> (String, String) { let system_prompt = "Tu es un assistant qui analyse des articles d'actualite. \ Tu dois classer l'article dans une categorie et generer un titre et un resume. \ Reponds uniquement au format JSON demande." .to_string(); let categories_list = categories .iter() .map(|c| format!("- \"{}\"", c)) .collect::>() .join("\n"); let summary_instruction = match summary_length { 1 => "Genere un titre clair et un resume de 3 a 4 lignes.", 2 => "Genere un titre clair et un resume de 6 a 8 lignes.", _ => "Genere un titre clair et un resume detaille de 12 a 15 lignes.", }; let user_prompt = format!( "Voici un article d'actualite.\n\n\ Titre : {title}\n\n\ Contenu (extrait) :\n{body}\n\n\ Categories disponibles :\n{categories}\n\n\ Classe cet article dans la categorie la plus appropriee.\n\ Si aucune categorie ne correspond, utilise \"Autre\".\n\ {summary_instruction}\n\ Si le titre fourni est vide, genere un titre a partir du contenu.\n\ Extrais la date de publication de l'article au format YYYY-MM-DD. \ Si la date n'est pas disponible, retourne une chaine vide.\n\ Determine si ce contenu est un veritable article d'actualite. \ Retourne is_article=true pour un article, false pour une page de contact, \ mentions legales, page de navigation, FAQ, etc.", title = if title.is_empty() { "(pas de titre)" } else { title }, body = body_snippet, categories = categories_list, summary_instruction = summary_instruction, ); (system_prompt, user_prompt) } #[cfg(test)] mod tests { use super::*; use chrono::Utc; use uuid::Uuid; fn test_settings() -> UserSettings { UserSettings { user_id: Uuid::nil(), theme: "Intelligence Artificielle".to_string(), max_age_days: 7, categories: vec![ "Annonces majeures".to_string(), "Recherche et innovation".to_string(), ], max_items_per_category: 4, max_articles_per_source: 3, max_links_per_source: 8, use_brave_search: false, article_history_days: 90, batch_size: 5, summary_length: 3, source_extraction_window: 3, search_agent_behavior: String::new(), ai_provider: String::new(), ai_model: String::new(), ai_model_websearch: String::new(), rate_limit_max_requests: None, rate_limit_time_window_seconds: None, updated_at: Utc::now(), } } #[test] fn search_prompt_includes_theme() { let settings = test_settings(); let (_, user_prompt) = build_search_prompt(&settings, &[], "lundi 21 mars 2026", &[], None); assert!(user_prompt.contains("Intelligence Artificielle")); } #[test] fn search_prompt_includes_date() { let settings = test_settings(); let (_, user_prompt) = build_search_prompt(&settings, &[], "lundi 21 mars 2026", &[], None); assert!(user_prompt.contains("lundi 21 mars 2026")); } #[test] fn search_prompt_includes_max_age() { let settings = test_settings(); let (system, user_prompt) = build_search_prompt(&settings, &[], "lundi 21 mars 2026", &[], None); assert!(user_prompt.contains("7 derniers jours")); assert!(system.contains("7")); } #[test] fn search_prompt_includes_categories() { let settings = test_settings(); let (_, user_prompt) = build_search_prompt(&settings, &[], "lundi 21 mars 2026", &[], None); assert!(user_prompt.contains("1. Annonces majeures")); assert!(user_prompt.contains("2. Recherche et innovation")); assert!(user_prompt.contains("2 grandes sections")); } #[test] fn search_prompt_includes_max_items() { let settings = test_settings(); let (_, user_prompt) = build_search_prompt(&settings, &[], "lundi 21 mars 2026", &[], None); assert!(user_prompt.contains("4 actualites")); } #[test] fn search_prompt_includes_custom_sources() { let settings = test_settings(); let sources = vec![ Source { id: Uuid::nil(), user_id: Uuid::nil(), title: "TechCrunch".into(), url: "https://techcrunch.com".into(), created_at: Utc::now(), }, Source { id: Uuid::nil(), user_id: Uuid::nil(), title: "The Verge".into(), url: "https://theverge.com".into(), created_at: Utc::now(), }, ]; let (_, user_prompt) = build_search_prompt(&settings, &sources, "lundi 21 mars 2026", &[], None); assert!(user_prompt.contains("TechCrunch (https://techcrunch.com)")); assert!(user_prompt.contains("The Verge (https://theverge.com)")); assert!(user_prompt.contains("sources personnalisees")); } #[test] fn search_prompt_no_sources_no_section() { let settings = test_settings(); let (_, user_prompt) = build_search_prompt(&settings, &[], "lundi 21 mars 2026", &[], None); assert!(!user_prompt.contains("sources personnalisees")); } #[test] fn search_prompt_custom_behavior() { let mut settings = test_settings(); settings.search_agent_behavior = "Concentre-toi sur les sources europeennes.".to_string(); let (_, user_prompt) = build_search_prompt(&settings, &[], "lundi 21 mars 2026", &[], None); assert!(user_prompt.contains("Concentre-toi sur les sources europeennes.")); assert!(!user_prompt.contains("recherche Google")); } #[test] fn search_prompt_default_behavior_when_empty() { let settings = test_settings(); let (_, user_prompt) = build_search_prompt(&settings, &[], "lundi 21 mars 2026", &[], None); assert!(user_prompt.contains("recherche Google")); } #[test] fn search_prompt_warns_against_homepage_urls() { let settings = test_settings(); let (_, user_prompt) = build_search_prompt(&settings, &[], "lundi 21 mars 2026", &[], None); assert!(user_prompt.contains("pages d'accueil")); assert!(user_prompt.contains("articles specifiques")); } #[test] fn search_prompt_includes_recent_domains_avoidance() { let settings = test_settings(); let sources = vec![]; let date = "lundi 17 mars 2026"; let domains = vec!["techcrunch.com".to_string(), "theverge.com".to_string()]; let (_, user_prompt) = build_search_prompt(&settings, &sources, date, &domains, None); assert!(user_prompt.contains("Evite si possible")); assert!(user_prompt.contains("techcrunch.com")); assert!(user_prompt.contains("theverge.com")); } #[test] fn search_prompt_no_avoidance_when_domains_empty() { let settings = test_settings(); let sources = vec![]; let date = "lundi 17 mars 2026"; let (_, user_prompt) = build_search_prompt(&settings, &sources, date, &[], None); assert!(!user_prompt.contains("Evite si possible")); } #[test] fn search_prompt_with_category_gaps() { let settings = test_settings(); let sources = vec![]; let date = "lundi 17 mars 2026"; let gaps = vec![ ("AI News".to_string(), 2), ("Cybersecurity".to_string(), 4), ]; let (_, user_prompt) = build_search_prompt(&settings, &sources, date, &[], Some(&gaps)); assert!(user_prompt.contains("AI News : 2 articles")); assert!(user_prompt.contains("Cybersecurity : 4 articles")); assert!(!user_prompt.contains("exactement")); } #[test] fn search_prompt_without_gaps_uses_default() { let settings = test_settings(); let sources = vec![]; let date = "lundi 17 mars 2026"; let (_, user_prompt) = build_search_prompt(&settings, &sources, date, &[], None); assert!(user_prompt.contains("exactement")); } #[test] fn article_classify_prompt_includes_content() { let (sys, user) = build_article_classify_prompt( "GPT-5 Released", "OpenAI released GPT-5 today", &["AI News".into(), "Divers".into()], 3, ); assert!(user.contains("GPT-5 Released")); assert!(user.contains("AI News")); assert!(user.contains("Divers")); assert!(sys.contains("classer")); } #[test] fn article_classify_prompt_handles_empty_title() { let (_, user) = build_article_classify_prompt("", "Some content", &["Tech".into(), "Divers".into()], 3); assert!(user.contains("(pas de titre)")); } #[test] fn article_classify_prompt_short_summary() { let (_, user) = build_article_classify_prompt("Title", "Content", &["AI".into()], 1); assert!(user.contains("3 a 4 lignes")); } #[test] fn article_classify_prompt_detailed_summary() { let (_, user) = build_article_classify_prompt("Title", "Content", &["AI".into()], 3); assert!(user.contains("12 a 15 lignes")); } }