You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
391 lines
15 KiB
Rust
391 lines
15 KiB
Rust
//! Prompt construction for the LLM generation pipeline.
//!
//! Builds system and user prompts for:
//! - **Search pass** (Pass 1): web search and initial article discovery
//! - **Link extraction**: identifying article links in a source page's HTML body
//! - **Per-article classify**: per-article classification and summarization
//!
//! Prompts are provider-agnostic and parameterized by user settings.
|
|
|
|
use crate::models::settings::UserSettings;
|
|
use crate::models::source::Source;
|
|
|
|
/// Build the system prompt and user prompt for the search pass (Pass 1).
|
|
///
|
|
/// The search pass instructs the LLM to find recent news articles
|
|
/// matching the user's theme and categories, using web search grounding.
|
|
///
|
|
/// # Arguments
|
|
/// * `settings` — User's configured settings (theme, categories, etc.)
|
|
/// * `sources` — User's custom sources to prioritize
|
|
/// * `current_date` — Formatted date string for the prompt
|
|
/// * `recent_domains` — Domains used in recent syntheses to avoid if possible
|
|
pub fn build_search_prompt(
|
|
settings: &UserSettings,
|
|
sources: &[Source],
|
|
current_date: &str,
|
|
recent_domains: &[String],
|
|
category_gaps: Option<&[(String, i32)]>,
|
|
) -> (String, String) {
|
|
let sources_text = if sources.is_empty() {
|
|
String::new()
|
|
} else {
|
|
let list = sources
|
|
.iter()
|
|
.map(|s| format!("- {} ({})", s.title, s.url))
|
|
.collect::<Vec<_>>()
|
|
.join("\n");
|
|
format!(
|
|
"\nEn plus des sources par defaut, tu DOIS imperativement consulter \
|
|
et integrer les informations provenant de ces sources personnalisees :\n{}\n",
|
|
list
|
|
)
|
|
};
|
|
|
|
let categories_text = settings
|
|
.categories
|
|
.iter()
|
|
.enumerate()
|
|
.map(|(i, cat)| format!("{}. {}", i + 1, cat))
|
|
.collect::<Vec<_>>()
|
|
.join("\n");
|
|
|
|
let behavior = if settings.search_agent_behavior.is_empty() {
|
|
"Tu peux egalement utiliser d'autres sources pertinentes trouvees via la recherche Google."
|
|
.to_string()
|
|
} else {
|
|
settings.search_agent_behavior.clone()
|
|
};
|
|
|
|
let system_prompt = format!(
|
|
"Tu es un assistant IA precis. Tu dois TOUJOURS fournir des URLs completes et exactes. \
|
|
Ne tronque jamais les URLs. Tu dois te concentrer UNIQUEMENT sur les actualites des {} \
|
|
derniers jours.",
|
|
settings.max_age_days
|
|
);
|
|
|
|
let user_prompt = format!(
|
|
"Aujourd'hui, nous sommes le {date}.\n\
|
|
Tu es un expert en analyse de l'actualite sur le theme : \"{theme}\".\n\
|
|
Ta tache est de rechercher les actualites STRICTEMENT des {days} derniers jours.\n\
|
|
Ne retourne AUCUNE actualite datant de plus de {days} jours.\n\n\
|
|
Tu DOIS imperativement t'appuyer sur le contenu des sites web pertinents pour ce theme.\
|
|
{sources}\
|
|
{behavior}\n\n\
|
|
La synthese doit etre divisee en {count} grandes sections :\n\
|
|
{categories}\n\n\
|
|
Pour chaque categorie, fournis exactement {max_items} actualites.\n\
|
|
Pour chaque actualite, fournis un titre provisoire, l'URL source exacte et complete, \
|
|
et un resume provisoire.\n\
|
|
Ne retourne JAMAIS des URLs de pages d'accueil (homepage). Fournis toujours des liens \
|
|
directs vers des articles specifiques avec un chemin complet (pas juste le nom de domaine).\n\
|
|
Ne change jamais les URLs retournees, et ne les tronque jamais. \
|
|
Retourne le resultat au format JSON en utilisant les cles category_0, category_1, etc. \
|
|
correspondant a l'ordre des sections ci-dessus.",
|
|
date = current_date,
|
|
theme = settings.theme,
|
|
days = settings.max_age_days,
|
|
sources = sources_text,
|
|
behavior = behavior,
|
|
count = settings.categories.len(),
|
|
categories = categories_text,
|
|
max_items = settings.max_items_per_category,
|
|
);
|
|
|
|
let user_prompt = if recent_domains.is_empty() {
|
|
user_prompt
|
|
} else {
|
|
let domains_list = recent_domains.join(", ");
|
|
format!(
|
|
"{}\n\nEvite si possible les sources deja utilisees dans les syntheses precedentes : {}.",
|
|
user_prompt, domains_list
|
|
)
|
|
};
|
|
|
|
// If we have specific category gaps (Phase 2), replace the generic "N per category" line
|
|
let user_prompt = if let Some(gaps) = category_gaps {
|
|
let gaps_text = gaps
|
|
.iter()
|
|
.map(|(cat, needed)| format!("- {} : {} articles", cat, needed))
|
|
.collect::<Vec<_>>()
|
|
.join("\n");
|
|
user_prompt.replace(
|
|
&format!("Pour chaque categorie, fournis exactement {} actualites.", settings.max_items_per_category),
|
|
&format!("Fournis le nombre d'articles suivant par categorie :\n{}", gaps_text),
|
|
)
|
|
} else {
|
|
user_prompt
|
|
};
|
|
|
|
(system_prompt, user_prompt)
|
|
}
|
|
|
|
/// Build the `(system_prompt, user_prompt)` pair for LLM-assisted link extraction
/// from a source page.
///
/// `body_html` is embedded in the user prompt after being cut to its first 12000
/// characters. The limit is counted in `char`s, so the cut never lands inside a
/// multi-byte UTF-8 sequence.
pub fn build_link_extraction_prompt(body_html: &str) -> (String, String) {
    // Upper bound, in characters, on the HTML excerpt embedded in the prompt.
    const MAX_BODY_CHARS: usize = 12000;

    // Byte offset of the first character past the limit, or the whole input when
    // it is short enough.
    let cutoff = body_html
        .char_indices()
        .nth(MAX_BODY_CHARS)
        .map_or(body_html.len(), |(offset, _)| offset);
    let excerpt = &body_html[..cutoff];

    let system_prompt = String::from(
        "Tu es un assistant qui analyse des pages web. \
         Tu dois identifier les liens vers des articles d'actualite. \
         Reponds uniquement au format JSON demande.",
    );

    // Assemble the user prompt as: introduction, HTML excerpt, extraction rules.
    let mut user_prompt = String::from(
        "Voici le contenu HTML du body d'une page de blog ou de site d'actualites.\n\n",
    );
    user_prompt.push_str(excerpt);
    user_prompt.push_str(
        "\n\n\
         Extrais UNIQUEMENT les URLs qui pointent vers des articles \
         (pas les liens de navigation, tags, categories, login, pages statiques, topics, \
         archive, companies, events, company, event, collections, etc.).\n\
         Retourne les URLs completes, sans les modifier, dans le format JSON demande. \
         Ne change jamais les URLs retournees, et ne les tronque jamais.",
    );

    (system_prompt, user_prompt)
}
|
|
|
|
/// Build the `(system_prompt, user_prompt)` pair for per-article classification
/// and summarization.
///
/// The user prompt presents the article title, the body excerpt and the list of
/// available categories (one quoted bullet per category), then asks the LLM to
/// pick a category and produce a title and a summary. An empty `title` is shown
/// as the placeholder `(pas de titre)`.
pub fn build_article_classify_prompt(
    title: &str,
    body_snippet: &str,
    categories: &[String],
) -> (String, String) {
    // Placeholder so the "Titre :" line is never left blank.
    let shown_title = match title {
        "" => "(pas de titre)",
        provided => provided,
    };

    // One `- "name"` bullet per category, in the order given.
    let mut bullets = Vec::with_capacity(categories.len());
    for category in categories {
        bullets.push(format!("- \"{}\"", category));
    }
    let categories_block = bullets.join("\n");

    let system_prompt = String::from(
        "Tu es un assistant qui analyse des articles d'actualite. \
         Tu dois classer l'article dans une categorie et generer un titre et un resume. \
         Reponds uniquement au format JSON demande.",
    );

    let user_prompt = format!(
        "Voici un article d'actualite.\n\n\
         Titre : {}\n\n\
         Contenu (extrait) :\n{}\n\n\
         Categories disponibles :\n{}\n\n\
         Classe cet article dans la categorie la plus appropriee.\n\
         Si aucune categorie ne correspond, utilise \"Autre\".\n\
         Genere un titre clair et un resume de 4 a 5 lignes.\n\
         Si le titre fourni est vide, genere un titre a partir du contenu.",
        shown_title, body_snippet, categories_block,
    );

    (system_prompt, user_prompt)
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;
    use chrono::Utc;
    use uuid::Uuid;

    /// Date string passed to every search-prompt test; the builder only embeds it.
    const TEST_DATE: &str = "lundi 16 mars 2026";

    /// Baseline settings: two categories, 7-day window, 4 items per category,
    /// and no custom search behavior.
    fn test_settings() -> UserSettings {
        UserSettings {
            user_id: Uuid::nil(),
            theme: "Intelligence Artificielle".to_string(),
            max_age_days: 7,
            categories: vec![
                "Annonces majeures".to_string(),
                "Recherche et innovation".to_string(),
            ],
            max_items_per_category: 4,
            max_articles_per_source: 3,
            use_llm_for_source_links: false,
            article_history_days: 90,
            batch_size: 5,
            search_agent_behavior: String::new(),
            ai_provider: String::new(),
            ai_model: String::new(),
            ai_model_websearch: String::new(),
            rate_limit_max_requests: None,
            rate_limit_time_window_seconds: None,
            updated_at: Utc::now(),
        }
    }

    /// Custom source fixture; only `title` and `url` are read by the prompt builder.
    fn test_source(title: &str, url: &str) -> Source {
        Source {
            id: Uuid::nil(),
            user_id: Uuid::nil(),
            title: title.into(),
            url: url.into(),
            created_at: Utc::now(),
        }
    }

    /// Search prompts with no custom sources, no recent domains and no category gaps.
    fn plain_search_prompts(settings: &UserSettings) -> (String, String) {
        build_search_prompt(settings, &[], TEST_DATE, &[], None)
    }

    #[test]
    fn search_prompt_includes_theme() {
        let (_, user_prompt) = plain_search_prompts(&test_settings());
        assert!(user_prompt.contains("Intelligence Artificielle"));
    }

    #[test]
    fn search_prompt_includes_date() {
        let (_, user_prompt) = plain_search_prompts(&test_settings());
        assert!(user_prompt.contains(TEST_DATE));
    }

    #[test]
    fn search_prompt_includes_max_age() {
        let (system, user_prompt) = plain_search_prompts(&test_settings());
        assert!(user_prompt.contains("7 derniers jours"));
        assert!(user_prompt.contains("plus de 7 jours"));
        // Match the whole phrase: a bare "7" would also match unrelated digits.
        assert!(system.contains("7 derniers jours"));
    }

    #[test]
    fn search_prompt_includes_categories() {
        let (_, user_prompt) = plain_search_prompts(&test_settings());
        assert!(user_prompt.contains("1. Annonces majeures"));
        assert!(user_prompt.contains("2. Recherche et innovation"));
        assert!(user_prompt.contains("2 grandes sections"));
    }

    #[test]
    fn search_prompt_includes_max_items() {
        let (_, user_prompt) = plain_search_prompts(&test_settings());
        assert!(user_prompt.contains("exactement 4 actualites"));
    }

    #[test]
    fn search_prompt_includes_custom_sources() {
        let settings = test_settings();
        let sources = vec![
            test_source("TechCrunch", "https://techcrunch.com"),
            test_source("The Verge", "https://theverge.com"),
        ];

        let (_, user_prompt) = build_search_prompt(&settings, &sources, TEST_DATE, &[], None);
        assert!(user_prompt.contains("- TechCrunch (https://techcrunch.com)"));
        assert!(user_prompt.contains("- The Verge (https://theverge.com)"));
        assert!(user_prompt.contains("sources personnalisees"));
    }

    #[test]
    fn search_prompt_no_sources_no_section() {
        let (_, user_prompt) = plain_search_prompts(&test_settings());
        assert!(!user_prompt.contains("sources personnalisees"));
    }

    #[test]
    fn search_prompt_custom_behavior() {
        let mut settings = test_settings();
        settings.search_agent_behavior = "Concentre-toi sur les sources europeennes.".to_string();

        let (_, user_prompt) = plain_search_prompts(&settings);
        assert!(user_prompt.contains("Concentre-toi sur les sources europeennes."));
        assert!(!user_prompt.contains("recherche Google"));
    }

    #[test]
    fn search_prompt_default_behavior_when_empty() {
        let (_, user_prompt) = plain_search_prompts(&test_settings());
        assert!(user_prompt.contains("recherche Google"));
    }

    #[test]
    fn search_prompt_warns_against_homepage_urls() {
        let (_, user_prompt) = plain_search_prompts(&test_settings());
        assert!(user_prompt.contains("pages d'accueil"));
        assert!(user_prompt.contains("articles specifiques"));
    }

    #[test]
    fn search_prompt_includes_recent_domains_avoidance() {
        let settings = test_settings();
        let domains = vec!["techcrunch.com".to_string(), "theverge.com".to_string()];

        let (_, user_prompt) = build_search_prompt(&settings, &[], TEST_DATE, &domains, None);
        assert!(user_prompt.contains("Evite si possible"));
        assert!(user_prompt.contains("techcrunch.com, theverge.com"));
    }

    #[test]
    fn search_prompt_no_avoidance_when_domains_empty() {
        let (_, user_prompt) = plain_search_prompts(&test_settings());
        assert!(!user_prompt.contains("Evite si possible"));
    }

    #[test]
    fn search_prompt_with_category_gaps() {
        let settings = test_settings();
        let gaps = vec![
            ("AI News".to_string(), 2),
            ("Cybersecurity".to_string(), 4),
        ];

        let (_, user_prompt) =
            build_search_prompt(&settings, &[], TEST_DATE, &[], Some(gaps.as_slice()));
        assert!(user_prompt.contains("Fournis le nombre d'articles suivant par categorie"));
        assert!(user_prompt.contains("- AI News : 2 articles"));
        assert!(user_prompt.contains("- Cybersecurity : 4 articles"));
        // The generic per-category instruction must be gone once gaps are given.
        assert!(!user_prompt.contains("exactement"));
    }

    #[test]
    fn search_prompt_without_gaps_uses_default() {
        let (_, user_prompt) = plain_search_prompts(&test_settings());
        assert!(user_prompt.contains("exactement"));
        assert!(!user_prompt.contains("Fournis le nombre d'articles suivant"));
    }

    #[test]
    fn link_extraction_prompt_includes_body() {
        let (sys, user) = build_link_extraction_prompt("<a href='/post'>P</a>");
        assert!(user.contains("<a href='/post'>P</a>"));
        assert!(user.contains("articles"));
        assert!(sys.contains("liens"));
    }

    #[test]
    fn link_extraction_prompt_truncates_body() {
        // '#' does not occur in the prompt template, so counting it measures
        // exactly how much of the body was embedded.
        let long_body = "#".repeat(20000);
        let (_, user) = build_link_extraction_prompt(&long_body);
        assert_eq!(user.matches('#').count(), 12000);

        // The limit is in characters, not bytes.
        let accented_body = "\u{e9}".repeat(12500);
        let (_, user) = build_link_extraction_prompt(&accented_body);
        assert_eq!(user.matches('\u{e9}').count(), 12000);
    }

    #[test]
    fn article_classify_prompt_includes_content() {
        let (sys, user) = build_article_classify_prompt(
            "GPT-5 Released",
            "OpenAI released GPT-5 today",
            &["AI News".into(), "Autre".into()],
        );
        assert!(user.contains("Titre : GPT-5 Released"));
        assert!(user.contains("OpenAI released GPT-5 today"));
        assert!(user.contains("- \"AI News\""));
        assert!(user.contains("- \"Autre\""));
        assert!(sys.contains("classer"));
    }

    #[test]
    fn article_classify_prompt_handles_empty_title() {
        let (_, user) =
            build_article_classify_prompt("", "Some content", &["Tech".into(), "Autre".into()]);
        assert!(user.contains("Titre : (pas de titre)"));
    }
}
|