refactor: remove old classification, rewrite, and article extraction prompts/schemas

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
master
oabrivard 3 months ago
parent bb716b5dc2
commit 0b180eb75c

@ -82,31 +82,6 @@ pub fn build_category_schema(categories: &[String], max_items_per_category: i32)
})
}
/// Produce the JSON Schema describing the article classification reply.
///
/// The schema is an object with a single required `assignments` array. Every
/// element of that array is an object carrying a required integer `index` and a
/// required string `category`; additional properties are rejected at both levels.
pub fn build_classification_schema() -> Value {
    // Schema for one element of the `assignments` array.
    let assignment_item = serde_json::json!({
        "type": "object",
        "properties": {
            "index": { "type": "integer", "description": "Article index from the input list" },
            "category": { "type": "string", "description": "Category name to assign this article to" }
        },
        "required": ["index", "category"],
        "additionalProperties": false
    });

    serde_json::json!({
        "type": "object",
        "properties": {
            "assignments": {
                "type": "array",
                "items": assignment_item
            }
        },
        "required": ["assignments"],
        "additionalProperties": false
    })
}
/// Build a JSON Schema for per-article classification and summarization.
pub fn build_article_classify_schema() -> Value {
serde_json::json!({
@ -136,21 +111,6 @@ pub fn build_link_extraction_schema() -> Value {
})
}
/// Produce the JSON Schema for the LLM article content extraction reply.
///
/// The schema is a closed object whose four fields (`title`, `published_date`,
/// `body_text`, `is_error_page`) are all required. `published_date` is a plain
/// string; its description asks for an empty string when no date is found.
pub fn build_article_extraction_schema() -> Value {
    // Field definitions, kept separate from the envelope for readability.
    let fields = serde_json::json!({
        "title": { "type": "string", "description": "Article title" },
        "published_date": { "type": "string", "description": "ISO 8601 date or empty string if not found" },
        "body_text": { "type": "string", "description": "Main article content" },
        "is_error_page": { "type": "boolean", "description": "True if this is an error/404 page" }
    });

    serde_json::json!({
        "type": "object",
        "properties": fields,
        "required": ["title", "published_date", "body_text", "is_error_page"],
        "additionalProperties": false
    })
}
#[cfg(test)]
mod tests {
use super::*;
@ -331,19 +291,6 @@ mod tests {
assert_eq!(props["category_1"]["description"], "R&D / Innovation");
}
#[test]
fn classification_schema_has_assignments_array() {
    let schema = build_classification_schema();
    let assignments = &schema["properties"]["assignments"];
    let item = &assignments["items"];

    // The envelope is an object that rejects unknown keys.
    assert_eq!(schema["type"], "object");
    assert_eq!(schema["additionalProperties"], false);

    // `assignments` is an array of closed objects exposing `index` and `category`.
    assert_eq!(assignments["type"], "array");
    for field in ["index", "category"] {
        assert!(
            item["properties"].get(field).is_some(),
            "item schema is missing property {:?}",
            field
        );
    }
    assert_eq!(item["additionalProperties"], false);
}
#[test]
fn article_classify_schema_has_all_fields() {
let schema = build_article_classify_schema();
@ -361,16 +308,4 @@ mod tests {
assert_eq!(schema["additionalProperties"], false);
}
#[test]
fn article_extraction_schema_strict_mode_compatible() {
    let schema = build_article_extraction_schema();
    let properties = schema["properties"].as_object().unwrap();

    // Every expected field must be declared.
    for field in ["title", "published_date", "body_text", "is_error_page"] {
        assert!(
            properties.contains_key(field),
            "schema is missing property {:?}",
            field
        );
    }
    assert_eq!(schema["additionalProperties"], false);

    // published_date must be a plain string (not a union type) for OpenAI strict mode.
    assert_eq!(properties["published_date"]["type"], "string");
}
}

@ -1,14 +1,13 @@
//! Prompt construction for the two-pass LLM generation pipeline.
//! Prompt construction for the LLM generation pipeline.
//!
//! Builds system and user prompts for:
//! - **Search pass** (Pass 1): web search and initial article discovery
//! - **Rewrite pass** (Pass 2): rewrite summaries using scraped content
//! - **Per-article classify**: per-article classification and summarization
//!
//! Prompts are provider-agnostic and parameterized by user settings.
use crate::models::settings::UserSettings;
use crate::models::source::Source;
use crate::models::synthesis::ScrapedNewsItem;
/// Build the system prompt and user prompt for the search pass (Pass 1).
///
@ -119,43 +118,6 @@ pub fn build_search_prompt(
(system_prompt, user_prompt)
}
/// Assemble the `(system_prompt, user_prompt)` pair for the rewrite pass (Pass 2).
///
/// The user prompt embeds `scraped_data` as pretty-printed JSON and instructs the
/// LLM (in French) to rewrite each article's title and summary so they match the
/// scraped content, keeping every URL and every article.
///
/// # Arguments
/// * `scraped_data` — Map of category key to scraped news items with content
///
/// # Returns
/// A tuple of the system prompt followed by the user prompt.
pub fn build_rewrite_prompt(
    scraped_data: &std::collections::HashMap<String, Vec<ScrapedNewsItem>>,
) -> (String, String) {
    // Best effort: a serialization failure yields an empty payload rather than an error.
    let payload = serde_json::to_string_pretty(scraped_data).unwrap_or_default();

    let system = String::from(
        "Tu es un assistant IA precis. Tu dois generer des titres et resumes fideles \
         au contenu fourni.",
    );

    let user = format!(
        "Tu es un expert en analyse de l'actualite.\n\
         Voici une liste d'articles d'actualite classes par categorie, avec leur contenu textuel \
         brut extrait des sites web ('scrapedContent').\n\
         Ta tache est de reecrire le 'title' et le 'summary' (4 ou 5 lignes) pour chaque article \
         afin qu'ils refletent EXACTEMENT et FIDELEMENT le contenu textuel fourni.\n\
         Pour chaque article, un 'originalTitle' extrait de la page web est fourni. Utilise ce \
         titre original comme base pour le titre final. Regles linguistiques: les titres en anglais \
         restent en anglais, les titres en francais restent en francais, les autres langues sont \
         traduites en francais.\n\
         Si le 'scrapedContent' est vide ou insuffisant, utilise le titre et le resume originaux \
         pour faire au mieux.\n\
         Conserve EXACTEMENT les memes URLs. Ne supprime aucun article de cette liste.\n\n\
         Donnees des articles :\n{data}",
        data = payload,
    );

    (system, user)
}
/// Build a prompt for LLM-assisted link extraction from a source page.
pub fn build_link_extraction_prompt(head_html: &str, body_html: &str) -> (String, String) {
let system_prompt =
@ -180,31 +142,6 @@ pub fn build_link_extraction_prompt(head_html: &str, body_html: &str) -> (String
(system_prompt, user_prompt)
}
/// Assemble the `(system_prompt, user_prompt)` pair for LLM-assisted article extraction.
///
/// # Arguments
/// * `head_html` — text placed verbatim between the `<head>` markers of the prompt
/// * `body_text` — text placed verbatim after the "Contenu textuel de la page" line
///
/// # Returns
/// A tuple of the system prompt followed by the user prompt. The user prompt asks
/// for `title`, `published_date`, `body_text` and `is_error_page`.
pub fn build_article_extraction_prompt(head_html: &str, body_text: &str) -> (String, String) {
    let system = String::from(
        "Tu es un assistant qui analyse des articles web. \
         Tu dois extraire les informations structurees de l'article. \
         Reponds uniquement au format JSON demande.",
    );

    let user = format!(
        "Voici le contenu d'une page web.\n\n\
         <head>\n{head}\n</head>\n\n\
         Contenu textuel de la page :\n{body}\n\n\
         Extrais les informations suivantes :\n\
         - title : le titre de l'article\n\
         - published_date : la date de publication au format ISO 8601 (YYYY-MM-DDTHH:MM:SSZ), \
         ou une chaine vide si introuvable\n\
         - body_text : le contenu principal de l'article (pas la navigation, pas les pubs)\n\
         - is_error_page : true si c'est une page d'erreur/404, false sinon",
        head = head_html,
        body = body_text,
    );

    (system, user)
}
/// Build a prompt for per-article classification and summarization.
///
/// The LLM classifies the article into a category and generates a title + summary.
@ -242,64 +179,6 @@ pub fn build_article_classify_prompt(
(system_prompt, user_prompt)
}
/// Build a prompt for classifying scraped articles into categories.
///
/// Each article is listed as a JSON object with its position in `articles`
/// (`index`), its `title`, its `url` and a `snippet` made of the first 500
/// characters of its scraped content. Each category is listed together with the
/// number of places still available in it.
///
/// # Arguments
/// * `articles` — scraped articles to classify (title + body snippet used)
/// * `categories` — user categories + "Autre"
/// * `max_per_category` — max items allowed per category; negative values are
///   treated as zero
/// * `filled_counts` — how many items already fill each category (for Phase 2);
///   categories absent from the map count as empty
///
/// # Returns
/// A tuple of the system prompt followed by the user prompt.
pub fn build_classification_prompt(
    articles: &[ScrapedNewsItem],
    categories: &[String],
    max_per_category: i32,
    filled_counts: &std::collections::HashMap<String, usize>,
) -> (String, String) {
    let system_prompt =
        "Tu es un assistant qui classe des articles dans des categories. \
         Reponds uniquement au format JSON demande."
            .to_string();

    let articles_json: Vec<serde_json::Value> = articles
        .iter()
        .enumerate()
        .map(|(i, a)| {
            // Truncate on character boundaries so multi-byte text is never split.
            let snippet: String = a.scraped_content.chars().take(500).collect();
            serde_json::json!({
                "index": i,
                "title": a.title,
                "url": a.url,
                "snippet": snippet
            })
        })
        .collect();

    // A plain `as usize` cast would wrap a negative limit to a huge value and
    // advertise billions of free places; clamp to zero before converting.
    let capacity = max_per_category.max(0) as usize;

    let categories_info: Vec<String> = categories
        .iter()
        .map(|cat| {
            let filled = filled_counts.get(cat).copied().unwrap_or(0);
            // Saturate so an over-filled category reports zero instead of underflowing.
            let remaining = capacity.saturating_sub(filled);
            if remaining == 1 {
                format!("- \"{}\" (encore 1 place)", cat)
            } else {
                format!("- \"{}\" (encore {} places)", cat, remaining)
            }
        })
        .collect();

    let user_prompt = format!(
        "Voici une liste d'articles :\n{articles}\n\n\
         Categories disponibles :\n{categories}\n\n\
         Classe chaque article dans la categorie la plus appropriee. \
         Si un article ne correspond a aucune categorie, classe-le dans \"Autre\".\n\
         Respecte le nombre de places restantes par categorie.",
        articles = serde_json::to_string_pretty(&articles_json).unwrap_or_default(),
        categories = categories_info.join("\n"),
    );

    (system_prompt, user_prompt)
}
#[cfg(test)]
mod tests {
use super::*;
@ -426,39 +305,6 @@ mod tests {
assert!(user_prompt.contains("articles specifiques"));
}
#[test]
fn rewrite_prompt_includes_instructions() {
    let article = ScrapedNewsItem {
        title: "Test Article".into(),
        url: "https://example.com".into(),
        summary: "A summary".into(),
        original_title: "Original Test Article".into(),
        scraped_content: "Full article text here...".into(),
        source_url: None,
    };
    let data: std::collections::HashMap<_, _> =
        std::iter::once(("category_0".to_string(), vec![article])).collect();

    let (system, user_prompt) = build_rewrite_prompt(&data);

    assert!(system.contains("fideles"));
    // Both the fixed instructions and the serialized article data must be present.
    for needle in [
        "scrapedContent",
        "Test Article",
        "https://example.com",
        "Ne supprime aucun article",
        "originalTitle",
        "titre original comme base",
    ] {
        assert!(
            user_prompt.contains(needle),
            "user prompt is missing {:?}",
            needle
        );
    }
}
#[test]
fn rewrite_prompt_with_empty_data() {
    let (_system, user_prompt) = build_rewrite_prompt(&std::collections::HashMap::new());

    // Even with no articles, the data section header must still be emitted.
    assert!(user_prompt.contains("Donnees des articles"));
}
#[test]
fn search_prompt_includes_recent_domains_avoidance() {
let settings = test_settings();
@ -480,52 +326,6 @@ mod tests {
assert!(!user_prompt.contains("Evite si possible"));
}
#[test]
fn classification_prompt_includes_categories_and_articles() {
    let article = ScrapedNewsItem {
        title: "GPT-5 Released".into(),
        url: "https://openai.com/blog/gpt5".into(),
        summary: "s".into(),
        original_title: "t".into(),
        scraped_content: "OpenAI released GPT-5 today with major improvements".into(),
        source_url: None,
    };
    let categories = ["AI News".to_string(), "Autre".to_string()];
    let nothing_filled = std::collections::HashMap::new();

    let (_system, user_prompt) =
        build_classification_prompt(&[article], &categories, 4, &nothing_filled);

    // Article title, both categories and the untouched capacity must all appear.
    for needle in ["GPT-5 Released", "AI News", "Autre", "encore 4 places"] {
        assert!(
            user_prompt.contains(needle),
            "user prompt is missing {:?}",
            needle
        );
    }
}
#[test]
fn classification_prompt_shows_reduced_capacity() {
    let article = ScrapedNewsItem {
        title: "T".into(),
        url: "https://a.com/1".into(),
        summary: "s".into(),
        original_title: "t".into(),
        scraped_content: "Content".into(),
        source_url: None,
    };
    let categories = ["AI News".to_string(), "Autre".to_string()];
    // Three of the four "AI News" places are already taken.
    let filled: std::collections::HashMap<String, usize> =
        std::iter::once(("AI News".to_string(), 3)).collect();

    let (_system, user_prompt) = build_classification_prompt(&[article], &categories, 4, &filled);

    assert!(user_prompt.contains("encore 1 place"));
}
#[test]
fn classification_prompt_system_is_french() {
    let categories = ["Autre".to_string()];
    let nothing_filled = std::collections::HashMap::new();

    // No articles are needed: only the fixed system prompt is inspected.
    let (system, _user) = build_classification_prompt(&[], &categories, 4, &nothing_filled);

    assert!(system.contains("classe"));
}
#[test]
fn search_prompt_with_category_gaps() {
let settings = test_settings();
@ -584,11 +384,4 @@ mod tests {
assert!(user.contains("(pas de titre)"));
}
#[test]
fn article_extraction_prompt_includes_content() {
    let (_system, user) =
        build_article_extraction_prompt("<meta name='date'>", "Article body here");

    // The body text is embedded and the requested field names are spelled out.
    for needle in ["Article body here", "published_date", "is_error_page"] {
        assert!(user.contains(needle), "user prompt is missing {:?}", needle);
    }
}
}

@ -29,8 +29,8 @@ use crate::models::synthesis::{
};
use crate::services::encryption;
use crate::services::llm::factory::create_provider;
use crate::services::llm::schema::{build_category_schema, build_classification_schema};
use crate::services::prompts::{self, build_classification_prompt};
use crate::services::llm::schema::build_category_schema;
use crate::services::prompts;
use crate::services::scraper;
use crate::services::source_scraper;
@ -496,45 +496,9 @@ async fn run_generation_inner(
emit_progress(tx, "classifying", "Classification des articles...", 35);
check_rate_limit(state, &user_rate_limiter, &provider_name)?;
let (class_system, class_user) = build_classification_prompt(
&valid_articles,
&classification_categories,
settings.max_items_per_category,
&filled_counts,
);
let class_schema = build_classification_schema();
let llm_start = std::time::Instant::now();
let class_response = provider
.call_llm(
&model_research,
&class_system,
&class_user,
&class_schema,
)
.await?;
let llm_duration = llm_start.elapsed().as_millis() as u64;
log_llm_call(&state.pool, user_id, job_id, "classification_phase1", &model_research,
&class_system, &class_user, &class_response, llm_duration).await;
// 1e. Parse classification and fill categories
let (phase1_classified, phase1_overflow) = parse_classification_response(
&class_response,
&valid_articles,
&classification_categories,
settings.max_items_per_category,
&mut filled_counts,
);
all_overflow.extend(phase1_overflow);
// Merge into all_scraped and track URLs
for (cat_key, items) in phase1_classified {
for item in &items {
seen_urls.insert(item.url.to_lowercase());
}
all_scraped.entry(cat_key).or_default().extend(items);
}
// TODO(Task 5): replace with per-article classify pipeline
let _ = (&valid_articles, &classification_categories, &filled_counts);
let _ = (); // phase1 classification stub
// 1f. Enforce max_articles_per_source across all categories
// (reuse domain counting logic)
@ -770,44 +734,9 @@ async fn run_generation_inner(
emit_progress(tx, "classifying", "Classification des resultats web...", 70);
check_rate_limit(state, &user_rate_limiter, &provider_name)?;
let (class_system, class_user) = build_classification_prompt(
&phase2_articles,
&classification_categories,
settings.max_items_per_category,
&filled_counts,
);
let class_schema = build_classification_schema();
let llm_start = std::time::Instant::now();
let class_response = provider
.call_llm(
&model_research,
&class_system,
&class_user,
&class_schema,
)
.await?;
let llm_duration = llm_start.elapsed().as_millis() as u64;
log_llm_call(&state.pool, user_id, job_id, "classification_phase2", &model_research,
&class_system, &class_user, &class_response, llm_duration).await;
let (phase2_classified, phase2_overflow) = parse_classification_response(
&class_response,
&phase2_articles,
&classification_categories,
settings.max_items_per_category,
&mut filled_counts,
);
all_overflow.extend(phase2_overflow);
// Merge Phase 2 into all_scraped
for (cat_key, items) in phase2_classified {
for item in &items {
seen_urls.insert(item.url.to_lowercase());
}
all_scraped.entry(cat_key).or_default().extend(items);
}
// TODO(Task 5): replace with per-article classify pipeline
let _ = (&phase2_articles, &classification_categories, &filled_counts);
let _ = (); // phase2 classification stub
}
}
@ -876,16 +805,13 @@ async fn run_generation_inner(
emit_progress(tx, "rewrite", "Redaction des resumes...", 80);
check_rate_limit(state, &user_rate_limiter, &provider_name)?;
let (rewrite_system, rewrite_user) = prompts::build_rewrite_prompt(&all_scraped);
// TODO(Task 5): rewrite pass replaced by per-article classify pipeline
let rewrite_schema = build_rewrite_schema(&all_scraped, &settings.categories);
let _ = rewrite_schema;
let llm_start = std::time::Instant::now();
let final_results = provider
.call_llm(&model_writing, &rewrite_system, &rewrite_user, &rewrite_schema)
.await?;
let llm_duration = llm_start.elapsed().as_millis() as u64;
log_llm_call(&state.pool, user_id, job_id, "rewrite", &model_writing,
&rewrite_system, &rewrite_user, &final_results, llm_duration).await;
let _ = llm_start;
let final_results = serde_json::Value::Object(serde_json::Map::new()); // stub: replaced in Task 5
emit_progress(tx, "finalizing", "Finalisation...", 90);
let mut final_sections = build_final_sections(&final_results, &settings.categories)?;
@ -1724,43 +1650,14 @@ async fn scrape_single_article_with_llm(
return (String::new(), String::new(), final_url);
}
let (system, user) = crate::services::prompts::build_article_extraction_prompt(
"",
&content.body_text,
);
let schema = crate::services::llm::schema::build_article_extraction_schema();
match provider.call_llm(&model, &system, &user, &schema).await {
Ok(response) => {
let title = response.get("title").and_then(|t| t.as_str()).unwrap_or("").to_string();
let body = response.get("body_text").and_then(|b| b.as_str()).unwrap_or("").to_string();
let is_error = response.get("is_error_page").and_then(|e| e.as_bool()).unwrap_or(false);
let date_str = response.get("published_date").and_then(|d| d.as_str()).unwrap_or("");
if is_error || body.trim().is_empty() {
return (String::new(), String::new(), final_url);
}
if !date_str.is_empty() {
if let Ok(date) = chrono::DateTime::parse_from_rfc3339(date_str) {
if scraper::is_article_too_old(Some(date.with_timezone(&chrono::Utc)), max_age_days) {
tracing::warn!(url = url, "LLM-extracted article too old");
return (String::new(), String::new(), final_url);
}
}
}
(body, title, final_url)
}
Err(e) => {
tracing::warn!(url = url, error = %e, "LLM extraction failed, using heuristic fallback");
// TODO(Task 5): LLM article extraction removed; use heuristic fallback only.
// The provider and model parameters are kept for future use.
let _ = (provider, model);
if scraper::is_article_too_old(content.published_date, max_age_days) {
return (String::new(), String::new(), final_url);
}
let title = content.title.unwrap_or_default();
(content.body_text, title, final_url)
}
}
}
/// Build the final sections array from the LLM's rewrite output.

Loading…
Cancel
Save