//! Prompt construction for the two-pass LLM generation pipeline.
//!
//! Builds the system and user prompts for:
//! - **Search pass** (Pass 1): web search and initial article discovery
//! - **Rewrite pass** (Pass 2): rewriting titles and summaries from scraped content
//!
//! Prompt text is written in French (without diacritics) and is parameterized by
//! user settings. The builders do not call any provider API themselves, although
//! the default search-behavior sentence mentions Google search.
use crate::models::settings::UserSettings;
use crate::models::source::Source;
use crate::models::synthesis::ScrapedNewsItem;
/// Build the system prompt and user prompt for the search pass (Pass 1).
///
/// The search pass instructs the LLM to find recent news articles
/// matching the user's theme and categories, using web search grounding.
///
/// The user prompt lists the categories numbered from 1, while the JSON keys it
/// requests are zero-based (`category_0`, `category_1`, ...) in the same order.
///
/// # Arguments
/// * `settings` — User's configured settings (theme, categories, etc.)
/// * `sources` — User's custom sources to prioritize; when empty, no custom-source
///   paragraph is emitted
/// * `current_date` — Formatted date string, embedded verbatim in the prompt
///
/// # Returns
/// A `(system_prompt, user_prompt)` tuple.
pub fn build_search_prompt(
    settings: &UserSettings,
    sources: &[Source],
    current_date: &str,
) -> (String, String) {
    // Guidance used when the user has not configured any search behavior.
    const DEFAULT_BEHAVIOR: &str =
        "Tu peux egalement utiliser d'autres sources pertinentes trouvees via la recherche Google.";

    // Optional paragraph listing the custom sources, one `- title (url)` per line.
    // It ends with a newline; the newline that precedes it lives in the template
    // below so the behavior sentence starts on its own line even without sources.
    let sources_text = if sources.is_empty() {
        String::new()
    } else {
        let list = sources
            .iter()
            .map(|s| format!("- {} ({})", s.title, s.url))
            .collect::<Vec<_>>()
            .join("\n");
        format!(
            "En plus des sources par defaut, tu DOIS imperativement consulter \
             et integrer les informations provenant de ces sources personnalisees :\n{}\n",
            list
        )
    };

    // Numbered list of sections ("1. ...", "2. ..."), one per line.
    let categories_text = settings
        .categories
        .iter()
        .enumerate()
        .map(|(i, cat)| format!("{}. {}", i + 1, cat))
        .collect::<Vec<_>>()
        .join("\n");

    // A blank (empty or whitespace-only) custom behavior falls back to the default;
    // a non-blank one is inserted exactly as configured.
    let custom_behavior = settings.search_agent_behavior.as_str();
    let behavior = if custom_behavior.trim().is_empty() {
        DEFAULT_BEHAVIOR
    } else {
        custom_behavior
    };

    let system_prompt = format!(
        "Tu es un assistant IA precis. Tu dois TOUJOURS fournir des URLs completes et exactes. \
         Ne tronque jamais les URLs. Tu dois te concentrer UNIQUEMENT sur les actualites des {} \
         derniers jours.",
        settings.max_age_days
    );

    let user_prompt = format!(
        "Aujourd'hui, nous sommes le {date}.\n\
         Tu es un expert en analyse de l'actualite sur le theme : \"{theme}\".\n\
         Ta tache est de rechercher les actualites STRICTEMENT des {days} derniers jours.\n\
         Ne retourne AUCUNE actualite datant de plus de {days} jours.\n\n\
         Tu DOIS imperativement t'appuyer sur le contenu des sites web pertinents pour ce theme.\n\
         {sources}\
         {behavior}\n\n\
         La synthese doit etre divisee en {count} grandes sections :\n\
         {categories}\n\n\
         Pour chaque categorie, fournis au maximum {max_items} actualites.\n\
         Pour chaque actualite, fournis un titre provisoire, l'URL source exacte et complete, \
         et un resume provisoire.\n\
         Retourne le resultat au format JSON en utilisant les cles category_0, category_1, etc. \
         correspondant a l'ordre des sections ci-dessus.",
        date = current_date,
        theme = settings.theme,
        days = settings.max_age_days,
        sources = sources_text,
        behavior = behavior,
        count = settings.categories.len(),
        categories = categories_text,
        max_items = settings.max_items_per_category,
    );

    (system_prompt, user_prompt)
}
/// Assemble the `(system_prompt, user_prompt)` pair for the rewrite pass (Pass 2).
///
/// The user prompt is a fixed block of French instructions followed by
/// `scraped_data` rendered as pretty-printed JSON. The instructions ask the model
/// to rewrite each article's `title` and `summary` from its scraped text, to keep
/// URLs unchanged, and not to drop any article.
///
/// If JSON serialization fails, the data section is left empty; no error is
/// reported to the caller.
///
/// NOTE(review): the instructions refer to a `scrapedContent` field; this presumably
/// matches how `ScrapedNewsItem` is serialized — confirm against the model definition.
///
/// # Arguments
/// * `scraped_data` — Map of category key to scraped news items with content
pub fn build_rewrite_prompt(
    scraped_data: &std::collections::HashMap<String, Vec<ScrapedNewsItem>>,
) -> (String, String) {
    const SYSTEM_PROMPT: &str = "Tu es un assistant IA precis. Tu dois generer des titres et \
                                 resumes fideles au contenu fourni.";

    // Static instructions; the serialized articles are appended right after the
    // trailing "Donnees des articles :" line.
    const INSTRUCTIONS: &str = "Tu es un expert en analyse de l'actualite.\n\
        Voici une liste d'articles d'actualite classes par categorie, avec leur contenu textuel \
        brut extrait des sites web ('scrapedContent').\n\
        Ta tache est de reecrire le 'title' et le 'summary' (4 ou 5 lignes) pour chaque article \
        afin qu'ils refletent EXACTEMENT et FIDELEMENT le contenu textuel fourni.\n\
        Si le 'scrapedContent' est vide ou insuffisant, utilise le titre et le resume originaux \
        pour faire au mieux.\n\
        Conserve EXACTEMENT les memes URLs. Ne supprime aucun article de cette liste.\n\n\
        Donnees des articles :\n";

    // Best effort: a serialization failure yields an empty data section.
    let articles_json = match serde_json::to_string_pretty(scraped_data) {
        Ok(json) => json,
        Err(_) => String::new(),
    };

    let mut user_prompt = String::with_capacity(INSTRUCTIONS.len() + articles_json.len());
    user_prompt.push_str(INSTRUCTIONS);
    user_prompt.push_str(&articles_json);

    (String::from(SYSTEM_PROMPT), user_prompt)
}
#[cfg(test)]
mod tests {
    use super::*;
    use chrono::Utc;
    use uuid::Uuid;

    /// Date string shared by the search-prompt tests. The builder embeds it
    /// verbatim and does not validate it.
    const TEST_DATE: &str = "lundi 21 mars 2026";

    /// Baseline settings: two categories, a 7-day window, 4 items per category,
    /// and no custom search behavior.
    fn test_settings() -> UserSettings {
        UserSettings {
            user_id: Uuid::nil(),
            theme: "Intelligence Artificielle".to_string(),
            max_age_days: 7,
            categories: vec![
                "Annonces majeures".to_string(),
                "Recherche et innovation".to_string(),
            ],
            max_items_per_category: 4,
            search_agent_behavior: String::new(),
            ai_provider: String::new(),
            ai_model: String::new(),
            ai_model_writing: String::new(),
            rate_limit_max_requests: None,
            rate_limit_time_window_seconds: None,
            updated_at: Utc::now(),
        }
    }

    /// Run the search-prompt builder with the shared test date.
    fn search_prompts(settings: &UserSettings, sources: &[Source]) -> (String, String) {
        build_search_prompt(settings, sources, TEST_DATE)
    }

    #[test]
    fn search_prompt_includes_theme() {
        let (_, user_prompt) = search_prompts(&test_settings(), &[]);
        assert!(user_prompt.contains("Intelligence Artificielle"));
    }

    #[test]
    fn search_prompt_includes_date() {
        let (_, user_prompt) = search_prompts(&test_settings(), &[]);
        assert!(user_prompt.contains(TEST_DATE));
    }

    #[test]
    fn search_prompt_includes_max_age() {
        let (system, user_prompt) = search_prompts(&test_settings(), &[]);
        assert!(user_prompt.contains("7 derniers jours"));
        // Match the whole phrase: a bare "7" would pass on any stray digit.
        assert!(system.contains("7 derniers jours"));
    }

    #[test]
    fn search_prompt_includes_categories() {
        let (_, user_prompt) = search_prompts(&test_settings(), &[]);
        assert!(user_prompt.contains("1. Annonces majeures"));
        assert!(user_prompt.contains("2. Recherche et innovation"));
        assert!(user_prompt.contains("2 grandes sections"));
    }

    #[test]
    fn search_prompt_includes_max_items() {
        let (_, user_prompt) = search_prompts(&test_settings(), &[]);
        assert!(user_prompt.contains("4 actualites"));
    }

    #[test]
    fn search_prompt_includes_custom_sources() {
        let sources = vec![
            Source {
                id: Uuid::nil(),
                user_id: Uuid::nil(),
                title: "TechCrunch".into(),
                url: "https://techcrunch.com".into(),
                created_at: Utc::now(),
            },
            Source {
                id: Uuid::nil(),
                user_id: Uuid::nil(),
                title: "The Verge".into(),
                url: "https://theverge.com".into(),
                created_at: Utc::now(),
            },
        ];
        let (_, user_prompt) = search_prompts(&test_settings(), &sources);
        assert!(user_prompt.contains("TechCrunch (https://techcrunch.com)"));
        assert!(user_prompt.contains("The Verge (https://theverge.com)"));
        assert!(user_prompt.contains("sources personnalisees"));
    }

    #[test]
    fn search_prompt_no_sources_no_section() {
        let (_, user_prompt) = search_prompts(&test_settings(), &[]);
        assert!(!user_prompt.contains("sources personnalisees"));
    }

    #[test]
    fn search_prompt_custom_behavior() {
        let mut settings = test_settings();
        settings.search_agent_behavior =
            "Concentre-toi sur les sources europeennes.".to_string();
        let (_, user_prompt) = search_prompts(&settings, &[]);
        assert!(user_prompt.contains("Concentre-toi sur les sources europeennes."));
        // The default sentence must be replaced, not appended to.
        assert!(!user_prompt.contains("recherche Google"));
    }

    #[test]
    fn search_prompt_default_behavior_when_empty() {
        let (_, user_prompt) = search_prompts(&test_settings(), &[]);
        assert!(user_prompt.contains("recherche Google"));
    }

    #[test]
    fn rewrite_prompt_includes_instructions() {
        let mut data = std::collections::HashMap::new();
        data.insert(
            "category_0".to_string(),
            vec![ScrapedNewsItem {
                title: "Test Article".into(),
                url: "https://example.com".into(),
                summary: "A summary".into(),
                scraped_content: "Full article text here...".into(),
            }],
        );
        let (system, user_prompt) = build_rewrite_prompt(&data);
        assert!(system.contains("fideles"));
        assert!(user_prompt.contains("scrapedContent"));
        assert!(user_prompt.contains("Test Article"));
        assert!(user_prompt.contains("https://example.com"));
        assert!(user_prompt.contains("Ne supprime aucun article"));
    }

    #[test]
    fn rewrite_prompt_with_empty_data() {
        let data = std::collections::HashMap::new();
        let (_, user_prompt) = build_rewrite_prompt(&data);
        // Should still produce a valid prompt with empty data
        assert!(user_prompt.contains("Donnees des articles"));
    }
}