|
|
|
|
@ -1,14 +1,13 @@
|
|
|
|
|
//! Prompt construction for the two-pass LLM generation pipeline.
|
|
|
|
|
//! Prompt construction for the LLM generation pipeline.
|
|
|
|
|
//!
|
|
|
|
|
//! Builds system and user prompts for:
|
|
|
|
|
//! - **Search pass** (Pass 1): web search and initial article discovery
|
|
|
|
|
//! - **Rewrite pass** (Pass 2): rewrite summaries using scraped content
|
|
|
|
|
//! - **Per-article classify**: per-article classification and summarization
|
|
|
|
|
//!
|
|
|
|
|
//! Prompts are provider-agnostic and parameterized by user settings.
|
|
|
|
|
|
|
|
|
|
use crate::models::settings::UserSettings;
|
|
|
|
|
use crate::models::source::Source;
|
|
|
|
|
use crate::models::synthesis::ScrapedNewsItem;
|
|
|
|
|
|
|
|
|
|
/// Build the system prompt and user prompt for the search pass (Pass 1).
|
|
|
|
|
///
|
|
|
|
|
@ -119,43 +118,6 @@ pub fn build_search_prompt(
|
|
|
|
|
(system_prompt, user_prompt)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Build the system prompt and user prompt for the rewrite pass (Pass 2).
|
|
|
|
|
///
|
|
|
|
|
/// The rewrite pass takes scraped article content and asks the LLM to
|
|
|
|
|
/// rewrite titles and summaries to faithfully reflect the actual content.
|
|
|
|
|
///
|
|
|
|
|
/// # Arguments
|
|
|
|
|
/// * `scraped_data` — Map of category key to scraped news items with content
|
|
|
|
|
pub fn build_rewrite_prompt(
|
|
|
|
|
scraped_data: &std::collections::HashMap<String, Vec<ScrapedNewsItem>>,
|
|
|
|
|
) -> (String, String) {
|
|
|
|
|
let system_prompt =
|
|
|
|
|
"Tu es un assistant IA precis. Tu dois generer des titres et resumes fideles \
|
|
|
|
|
au contenu fourni."
|
|
|
|
|
.to_string();
|
|
|
|
|
|
|
|
|
|
let data_json = serde_json::to_string_pretty(scraped_data).unwrap_or_default();
|
|
|
|
|
|
|
|
|
|
let user_prompt = format!(
|
|
|
|
|
"Tu es un expert en analyse de l'actualite.\n\
|
|
|
|
|
Voici une liste d'articles d'actualite classes par categorie, avec leur contenu textuel \
|
|
|
|
|
brut extrait des sites web ('scrapedContent').\n\
|
|
|
|
|
Ta tache est de reecrire le 'title' et le 'summary' (4 ou 5 lignes) pour chaque article \
|
|
|
|
|
afin qu'ils refletent EXACTEMENT et FIDELEMENT le contenu textuel fourni.\n\
|
|
|
|
|
Pour chaque article, un 'originalTitle' extrait de la page web est fourni. Utilise ce \
|
|
|
|
|
titre original comme base pour le titre final. Regles linguistiques: les titres en anglais \
|
|
|
|
|
restent en anglais, les titres en francais restent en francais, les autres langues sont \
|
|
|
|
|
traduites en francais.\n\
|
|
|
|
|
Si le 'scrapedContent' est vide ou insuffisant, utilise le titre et le resume originaux \
|
|
|
|
|
pour faire au mieux.\n\
|
|
|
|
|
Conserve EXACTEMENT les memes URLs. Ne supprime aucun article de cette liste.\n\n\
|
|
|
|
|
Donnees des articles :\n{data}",
|
|
|
|
|
data = data_json,
|
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
(system_prompt, user_prompt)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Build a prompt for LLM-assisted link extraction from a source page.
|
|
|
|
|
pub fn build_link_extraction_prompt(head_html: &str, body_html: &str) -> (String, String) {
|
|
|
|
|
let system_prompt =
|
|
|
|
|
@ -180,31 +142,6 @@ pub fn build_link_extraction_prompt(head_html: &str, body_html: &str) -> (String
|
|
|
|
|
(system_prompt, user_prompt)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Build the `(system_prompt, user_prompt)` pair for LLM-assisted article
/// content extraction.
///
/// The user prompt embeds `head_html` between literal `<head>` / `</head>`
/// markers, then `body_text`, then a list of the four fields the model is
/// asked to extract: `title`, `published_date`, `body_text` and
/// `is_error_page`. Both inputs are inserted verbatim, without escaping.
///
/// # Arguments
/// * `head_html` — text placed inside the `<head>` section of the prompt
/// * `body_text` — text placed under the page-content heading of the prompt
///
/// # Returns
/// A tuple `(system_prompt, user_prompt)`.
pub fn build_article_extraction_prompt(head_html: &str, body_text: &str) -> (String, String) {
    const SYSTEM_PROMPT: &str = "Tu es un assistant qui analyse des articles web. \
        Tu dois extraire les informations structurees de l'article. \
        Reponds uniquement au format JSON demande.";

    // One entry per requested field; joined with newlines at the end of the prompt.
    const REQUESTED_FIELDS: [&str; 4] = [
        "- title : le titre de l'article",
        "- published_date : la date de publication au format ISO 8601 (YYYY-MM-DDTHH:MM:SSZ), \
         ou une chaine vide si introuvable",
        "- body_text : le contenu principal de l'article (pas la navigation, pas les pubs)",
        "- is_error_page : true si c'est une page d'erreur/404, false sinon",
    ];

    let user_prompt = format!(
        "Voici le contenu d'une page web.\n\n\
         <head>\n{}\n</head>\n\n\
         Contenu textuel de la page :\n{}\n\n\
         Extrais les informations suivantes :\n{}",
        head_html,
        body_text,
        REQUESTED_FIELDS.join("\n"),
    );

    (SYSTEM_PROMPT.to_owned(), user_prompt)
}
|
|
|
|
|
|
|
|
|
|
/// Build a prompt for per-article classification and summarization.
|
|
|
|
|
///
|
|
|
|
|
/// The LLM classifies the article into a category and generates a title + summary.
|
|
|
|
|
@ -242,64 +179,6 @@ pub fn build_article_classify_prompt(
|
|
|
|
|
(system_prompt, user_prompt)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Build a prompt for classifying scraped articles into categories.
|
|
|
|
|
///
|
|
|
|
|
/// # Arguments
|
|
|
|
|
/// * `articles` — scraped articles to classify (title + body snippet used)
|
|
|
|
|
/// * `categories` — user categories + "Autre"
|
|
|
|
|
/// * `max_per_category` — max items allowed per category
|
|
|
|
|
/// * `filled_counts` — how many items already fill each category (for Phase 2)
|
|
|
|
|
pub fn build_classification_prompt(
|
|
|
|
|
articles: &[ScrapedNewsItem],
|
|
|
|
|
categories: &[String],
|
|
|
|
|
max_per_category: i32,
|
|
|
|
|
filled_counts: &std::collections::HashMap<String, usize>,
|
|
|
|
|
) -> (String, String) {
|
|
|
|
|
let system_prompt =
|
|
|
|
|
"Tu es un assistant qui classe des articles dans des categories. \
|
|
|
|
|
Reponds uniquement au format JSON demande."
|
|
|
|
|
.to_string();
|
|
|
|
|
|
|
|
|
|
let articles_json: Vec<serde_json::Value> = articles
|
|
|
|
|
.iter()
|
|
|
|
|
.enumerate()
|
|
|
|
|
.map(|(i, a)| {
|
|
|
|
|
let snippet: String = a.scraped_content.chars().take(500).collect();
|
|
|
|
|
serde_json::json!({
|
|
|
|
|
"index": i,
|
|
|
|
|
"title": a.title,
|
|
|
|
|
"url": a.url,
|
|
|
|
|
"snippet": snippet
|
|
|
|
|
})
|
|
|
|
|
})
|
|
|
|
|
.collect();
|
|
|
|
|
|
|
|
|
|
let categories_info: Vec<String> = categories
|
|
|
|
|
.iter()
|
|
|
|
|
.map(|cat| {
|
|
|
|
|
let filled = filled_counts.get(cat).copied().unwrap_or(0);
|
|
|
|
|
let remaining = (max_per_category as usize).saturating_sub(filled);
|
|
|
|
|
if remaining == 1 {
|
|
|
|
|
format!("- \"{}\" (encore 1 place)", cat)
|
|
|
|
|
} else {
|
|
|
|
|
format!("- \"{}\" (encore {} places)", cat, remaining)
|
|
|
|
|
}
|
|
|
|
|
})
|
|
|
|
|
.collect();
|
|
|
|
|
|
|
|
|
|
let user_prompt = format!(
|
|
|
|
|
"Voici une liste d'articles :\n{articles}\n\n\
|
|
|
|
|
Categories disponibles :\n{categories}\n\n\
|
|
|
|
|
Classe chaque article dans la categorie la plus appropriee. \
|
|
|
|
|
Si un article ne correspond a aucune categorie, classe-le dans \"Autre\".\n\
|
|
|
|
|
Respecte le nombre de places restantes par categorie.",
|
|
|
|
|
articles = serde_json::to_string_pretty(&articles_json).unwrap_or_default(),
|
|
|
|
|
categories = categories_info.join("\n"),
|
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
(system_prompt, user_prompt)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#[cfg(test)]
|
|
|
|
|
mod tests {
|
|
|
|
|
use super::*;
|
|
|
|
|
@ -426,39 +305,6 @@ mod tests {
|
|
|
|
|
assert!(user_prompt.contains("articles specifiques"));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#[test]
fn rewrite_prompt_includes_instructions() {
    // A single category holding one fully populated article.
    let article = ScrapedNewsItem {
        title: "Test Article".into(),
        url: "https://example.com".into(),
        summary: "A summary".into(),
        original_title: "Original Test Article".into(),
        scraped_content: "Full article text here...".into(),
        source_url: None,
    };
    let data: std::collections::HashMap<String, Vec<ScrapedNewsItem>> =
        std::iter::once(("category_0".to_string(), vec![article])).collect();

    let (system, user_prompt) = build_rewrite_prompt(&data);

    assert!(system.contains("fideles"));

    // Instruction text and article data that must both appear in the user prompt.
    let expected_fragments = [
        "scrapedContent",
        "Test Article",
        "https://example.com",
        "Ne supprime aucun article",
        "originalTitle",
        "titre original comme base",
    ];
    for fragment in expected_fragments.iter() {
        assert!(
            user_prompt.contains(*fragment),
            "user prompt is missing {:?}",
            fragment
        );
    }
}
|
|
|
|
|
|
|
|
|
|
#[test]
fn rewrite_prompt_with_empty_data() {
    // With no categories at all, the instruction text must still be emitted.
    let no_articles: std::collections::HashMap<String, Vec<ScrapedNewsItem>> =
        std::collections::HashMap::new();

    let (_system, user_prompt) = build_rewrite_prompt(&no_articles);

    assert!(user_prompt.contains("Donnees des articles"));
}
|
|
|
|
|
|
|
|
|
|
#[test]
|
|
|
|
|
fn search_prompt_includes_recent_domains_avoidance() {
|
|
|
|
|
let settings = test_settings();
|
|
|
|
|
@ -480,52 +326,6 @@ mod tests {
|
|
|
|
|
assert!(!user_prompt.contains("Evite si possible"));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#[test]
fn classification_prompt_includes_categories_and_articles() {
    let article = ScrapedNewsItem {
        title: "GPT-5 Released".into(),
        url: "https://openai.com/blog/gpt5".into(),
        summary: "s".into(),
        original_title: "t".into(),
        scraped_content: "OpenAI released GPT-5 today with major improvements".into(),
        source_url: None,
    };
    let articles = vec![article];
    let categories = vec![String::from("AI News"), String::from("Autre")];
    // Nothing is filled yet, so every category keeps its full capacity of 4.
    let filled = std::collections::HashMap::new();

    let (_system, user_prompt) =
        build_classification_prompt(&articles, &categories, 4, &filled);

    let expected_fragments = ["GPT-5 Released", "AI News", "Autre", "encore 4 places"];
    for fragment in expected_fragments.iter() {
        assert!(
            user_prompt.contains(*fragment),
            "user prompt is missing {:?}",
            fragment
        );
    }
}
|
|
|
|
|
|
|
|
|
|
#[test]
fn classification_prompt_shows_reduced_capacity() {
    let articles = vec![ScrapedNewsItem {
        title: "T".into(),
        url: "https://a.com/1".into(),
        summary: "s".into(),
        original_title: "t".into(),
        scraped_content: "Content".into(),
        source_url: None,
    }];
    let categories = vec!["AI News".to_string(), "Autre".to_string()];

    // 3 of the 4 places in "AI News" are taken; "Autre" is untouched.
    let mut filled = std::collections::HashMap::new();
    filled.insert("AI News".to_string(), 3);

    let (_, user_prompt) = build_classification_prompt(&articles, &categories, 4, &filled);

    // Assert the whole category line, closing parenthesis included: the bare
    // substring "encore 1 place" would also match a wrong "encore 1 places".
    assert!(user_prompt.contains("- \"AI News\" (encore 1 place)"));
    // A category absent from `filled` keeps its full capacity.
    assert!(user_prompt.contains("- \"Autre\" (encore 4 places)"));
}
|
|
|
|
|
|
|
|
|
|
#[test]
fn classification_prompt_system_is_french() {
    // The system prompt does not depend on the articles, so none are supplied.
    let no_articles: Vec<ScrapedNewsItem> = Vec::new();
    let only_fallback = vec![String::from("Autre")];
    let nothing_filled = std::collections::HashMap::new();

    let (system, _user_prompt) =
        build_classification_prompt(&no_articles, &only_fallback, 4, &nothing_filled);

    assert!(system.contains("classe"));
}
|
|
|
|
|
|
|
|
|
|
#[test]
|
|
|
|
|
fn search_prompt_with_category_gaps() {
|
|
|
|
|
let settings = test_settings();
|
|
|
|
|
@ -584,11 +384,4 @@ mod tests {
|
|
|
|
|
assert!(user.contains("(pas de titre)"));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#[test]
fn article_extraction_prompt_includes_content() {
    let head = "<meta name='date'>";
    let body = "Article body here";

    let (_system, user) = build_article_extraction_prompt(head, body);

    // The page body plus two of the requested field names must be present.
    let expected_fragments = [body, "published_date", "is_error_page"];
    for fragment in expected_fragments.iter() {
        assert!(
            user.contains(*fragment),
            "user prompt is missing {:?}",
            fragment
        );
    }
}
|
|
|
|
|
}
|
|
|
|
|
|