From 0b180eb75c44a6a671a1afca759707b21972eca4 Mon Sep 17 00:00:00 2001 From: oabrivard Date: Wed, 25 Mar 2026 00:53:24 +0100 Subject: [PATCH] refactor: remove old classification, rewrite, and article extraction prompts/schemas Co-Authored-By: Claude Sonnet 4.6 --- backend/src/services/llm/schema.rs | 65 --------- backend/src/services/prompts.rs | 211 +---------------------------- backend/src/services/synthesis.rs | 141 +++---------------- 3 files changed, 21 insertions(+), 396 deletions(-) diff --git a/backend/src/services/llm/schema.rs b/backend/src/services/llm/schema.rs index 21d84dc..428cee2 100644 --- a/backend/src/services/llm/schema.rs +++ b/backend/src/services/llm/schema.rs @@ -82,31 +82,6 @@ pub fn build_category_schema(categories: &[String], max_items_per_category: i32) }) } -/// Build a JSON Schema for the article classification response. -/// -/// The LLM returns an array of assignments mapping article indices to category names. -pub fn build_classification_schema() -> Value { - serde_json::json!({ - "type": "object", - "properties": { - "assignments": { - "type": "array", - "items": { - "type": "object", - "properties": { - "index": { "type": "integer", "description": "Article index from the input list" }, - "category": { "type": "string", "description": "Category name to assign this article to" } - }, - "required": ["index", "category"], - "additionalProperties": false - } - } - }, - "required": ["assignments"], - "additionalProperties": false - }) -} - /// Build a JSON Schema for per-article classification and summarization. pub fn build_article_classify_schema() -> Value { serde_json::json!({ @@ -136,21 +111,6 @@ pub fn build_link_extraction_schema() -> Value { }) } -/// Build a JSON Schema for LLM article content extraction response. -pub fn build_article_extraction_schema() -> Value { - serde_json::json!({ - "type": "object", - "properties": { - "title": { "type": "string", "description": "Article title" }, - "published_date": { "type": "string", "description": "ISO 8601 date or empty string if not found" }, - "body_text": { "type": "string", "description": "Main article content" }, - "is_error_page": { "type": "boolean", "description": "True if this is an error/404 page" } - }, - "required": ["title", "published_date", "body_text", "is_error_page"], - "additionalProperties": false - }) -} - #[cfg(test)] mod tests { use super::*; @@ -331,19 +291,6 @@ mod tests { assert_eq!(props["category_1"]["description"], "R&D / Innovation"); } - #[test] - fn classification_schema_has_assignments_array() { - let schema = build_classification_schema(); - assert_eq!(schema["type"], "object"); - let assignments = &schema["properties"]["assignments"]; - assert_eq!(assignments["type"], "array"); - let item_props = &assignments["items"]["properties"]; - assert!(item_props.get("index").is_some()); - assert!(item_props.get("category").is_some()); - assert_eq!(assignments["items"]["additionalProperties"], false); - assert_eq!(schema["additionalProperties"], false); - } - #[test] fn article_classify_schema_has_all_fields() { let schema = build_article_classify_schema(); @@ -361,16 +308,4 @@ mod tests { assert_eq!(schema["additionalProperties"], false); } - #[test] - fn article_extraction_schema_strict_mode_compatible() { - let schema = build_article_extraction_schema(); - let props = schema["properties"].as_object().unwrap(); - assert!(props.contains_key("title")); - assert!(props.contains_key("published_date")); - assert!(props.contains_key("body_text")); - assert!(props.contains_key("is_error_page")); - assert_eq!(schema["additionalProperties"], false); - // published_date is string (not union type) for OpenAI strict mode - assert_eq!(props["published_date"]["type"], "string"); - } } diff --git a/backend/src/services/prompts.rs b/backend/src/services/prompts.rs index 8a1603d..9ce8b6d 100644 --- a/backend/src/services/prompts.rs +++ b/backend/src/services/prompts.rs @@ -1,14 +1,13 @@ -//! Prompt construction for the two-pass LLM generation pipeline. +//! Prompt construction for the LLM generation pipeline. //! //! Builds system and user prompts for: //! - **Search pass** (Pass 1): web search and initial article discovery -//! - **Rewrite pass** (Pass 2): rewrite summaries using scraped content +//! - **Per-article classify**: per-article classification and summarization //! //! Prompts are provider-agnostic and parameterized by user settings. use crate::models::settings::UserSettings; use crate::models::source::Source; -use crate::models::synthesis::ScrapedNewsItem; /// Build the system prompt and user prompt for the search pass (Pass 1). /// @@ -119,43 +118,6 @@ pub fn build_search_prompt( (system_prompt, user_prompt) } -/// Build the system prompt and user prompt for the rewrite pass (Pass 2). -/// -/// The rewrite pass takes scraped article content and asks the LLM to -/// rewrite titles and summaries to faithfully reflect the actual content. -/// -/// # Arguments -/// * `scraped_data` — Map of category key to scraped news items with content -pub fn build_rewrite_prompt( - scraped_data: &std::collections::HashMap>, -) -> (String, String) { - let system_prompt = - "Tu es un assistant IA precis. Tu dois generer des titres et resumes fideles \ - au contenu fourni." - .to_string(); - - let data_json = serde_json::to_string_pretty(scraped_data).unwrap_or_default(); - - let user_prompt = format!( - "Tu es un expert en analyse de l'actualite.\n\ - Voici une liste d'articles d'actualite classes par categorie, avec leur contenu textuel \ - brut extrait des sites web ('scrapedContent').\n\ - Ta tache est de reecrire le 'title' et le 'summary' (4 ou 5 lignes) pour chaque article \ - afin qu'ils refletent EXACTEMENT et FIDELEMENT le contenu textuel fourni.\n\ - Pour chaque article, un 'originalTitle' extrait de la page web est fourni. Utilise ce \ - titre original comme base pour le titre final. Regles linguistiques: les titres en anglais \ - restent en anglais, les titres en francais restent en francais, les autres langues sont \ - traduites en francais.\n\ - Si le 'scrapedContent' est vide ou insuffisant, utilise le titre et le resume originaux \ - pour faire au mieux.\n\ - Conserve EXACTEMENT les memes URLs. Ne supprime aucun article de cette liste.\n\n\ - Donnees des articles :\n{data}", - data = data_json, - ); - - (system_prompt, user_prompt) -} - /// Build a prompt for LLM-assisted link extraction from a source page. pub fn build_link_extraction_prompt(head_html: &str, body_html: &str) -> (String, String) { let system_prompt = @@ -180,31 +142,6 @@ pub fn build_link_extraction_prompt(head_html: &str, body_html: &str) -> (String (system_prompt, user_prompt) } -/// Build a prompt for LLM-assisted article content extraction. -pub fn build_article_extraction_prompt(head_html: &str, body_text: &str) -> (String, String) { - let system_prompt = - "Tu es un assistant qui analyse des articles web. \ - Tu dois extraire les informations structurees de l'article. \ - Reponds uniquement au format JSON demande." - .to_string(); - - let user_prompt = format!( - "Voici le contenu d'une page web.\n\n\ - \n{head}\n\n\n\ - Contenu textuel de la page :\n{body}\n\n\ - Extrais les informations suivantes :\n\ - - title : le titre de l'article\n\ - - published_date : la date de publication au format ISO 8601 (YYYY-MM-DDTHH:MM:SSZ), \ - ou une chaine vide si introuvable\n\ - - body_text : le contenu principal de l'article (pas la navigation, pas les pubs)\n\ - - is_error_page : true si c'est une page d'erreur/404, false sinon", - head = head_html, - body = body_text, - ); - - (system_prompt, user_prompt) -} - /// Build a prompt for per-article classification and summarization. /// /// The LLM classifies the article into a category and generates a title + summary. @@ -242,64 +179,6 @@ pub fn build_article_classify_prompt( (system_prompt, user_prompt) } -/// Build a prompt for classifying scraped articles into categories. -/// -/// # Arguments -/// * `articles` — scraped articles to classify (title + body snippet used) -/// * `categories` — user categories + "Autre" -/// * `max_per_category` — max items allowed per category -/// * `filled_counts` — how many items already fill each category (for Phase 2) -pub fn build_classification_prompt( - articles: &[ScrapedNewsItem], - categories: &[String], - max_per_category: i32, - filled_counts: &std::collections::HashMap, -) -> (String, String) { - let system_prompt = - "Tu es un assistant qui classe des articles dans des categories. \ - Reponds uniquement au format JSON demande." - .to_string(); - - let articles_json: Vec = articles - .iter() - .enumerate() - .map(|(i, a)| { - let snippet: String = a.scraped_content.chars().take(500).collect(); - serde_json::json!({ - "index": i, - "title": a.title, - "url": a.url, - "snippet": snippet - }) - }) - .collect(); - - let categories_info: Vec = categories - .iter() - .map(|cat| { - let filled = filled_counts.get(cat).copied().unwrap_or(0); - let remaining = (max_per_category as usize).saturating_sub(filled); - if remaining == 1 { - format!("- \"{}\" (encore 1 place)", cat) - } else { - format!("- \"{}\" (encore {} places)", cat, remaining) - } - }) - .collect(); - - let user_prompt = format!( - "Voici une liste d'articles :\n{articles}\n\n\ - Categories disponibles :\n{categories}\n\n\ - Classe chaque article dans la categorie la plus appropriee. \ - Si un article ne correspond a aucune categorie, classe-le dans \"Autre\".\n\ - Respecte le nombre de places restantes par categorie.", - articles = serde_json::to_string_pretty(&articles_json).unwrap_or_default(), - categories = categories_info.join("\n"), - ); - - (system_prompt, user_prompt) -} - #[cfg(test)] mod tests { use super::*; @@ -426,39 +305,6 @@ mod tests { assert!(user_prompt.contains("articles specifiques")); } - #[test] - fn rewrite_prompt_includes_instructions() { - let mut data = std::collections::HashMap::new(); - data.insert( - "category_0".to_string(), - vec![ScrapedNewsItem { - title: "Test Article".into(), - url: "https://example.com".into(), - summary: "A summary".into(), - original_title: "Original Test Article".into(), - scraped_content: "Full article text here...".into(), - source_url: None, - }], - ); - - let (system, user_prompt) = build_rewrite_prompt(&data); - assert!(system.contains("fideles")); - assert!(user_prompt.contains("scrapedContent")); - assert!(user_prompt.contains("Test Article")); - assert!(user_prompt.contains("https://example.com")); - assert!(user_prompt.contains("Ne supprime aucun article")); - assert!(user_prompt.contains("originalTitle")); - assert!(user_prompt.contains("titre original comme base")); - } - - #[test] - fn rewrite_prompt_with_empty_data() { - let data = std::collections::HashMap::new(); - let (_, user_prompt) = build_rewrite_prompt(&data); - // Should still produce a valid prompt with empty data - assert!(user_prompt.contains("Donnees des articles")); - } - #[test] fn search_prompt_includes_recent_domains_avoidance() { let settings = test_settings(); @@ -480,52 +326,6 @@ mod tests { assert!(!user_prompt.contains("Evite si possible")); } - #[test] - fn classification_prompt_includes_categories_and_articles() { - let articles = vec![ - ScrapedNewsItem { - title: "GPT-5 Released".into(), - url: "https://openai.com/blog/gpt5".into(), - summary: "s".into(), - original_title: "t".into(), - scraped_content: "OpenAI released GPT-5 today with major improvements".into(), - source_url: None, - }, - ]; - let categories = vec!["AI News".to_string(), "Autre".to_string()]; - let filled = std::collections::HashMap::new(); - let (_, user_prompt) = build_classification_prompt(&articles, &categories, 4, &filled); - assert!(user_prompt.contains("GPT-5 Released")); - assert!(user_prompt.contains("AI News")); - assert!(user_prompt.contains("Autre")); - assert!(user_prompt.contains("encore 4 places")); - } - - #[test] - fn classification_prompt_shows_reduced_capacity() { - let articles = vec![ - ScrapedNewsItem { - title: "T".into(), url: "https://a.com/1".into(), - summary: "s".into(), original_title: "t".into(), - scraped_content: "Content".into(), source_url: None, - }, - ]; - let categories = vec!["AI News".to_string(), "Autre".to_string()]; - let mut filled = std::collections::HashMap::new(); - filled.insert("AI News".to_string(), 3); - let (_, user_prompt) = build_classification_prompt(&articles, &categories, 4, &filled); - assert!(user_prompt.contains("encore 1 place")); - } - - #[test] - fn classification_prompt_system_is_french() { - let articles = vec![]; - let categories = vec!["Autre".to_string()]; - let filled = std::collections::HashMap::new(); - let (system, _) = build_classification_prompt(&articles, &categories, 4, &filled); - assert!(system.contains("classe")); - } - #[test] fn search_prompt_with_category_gaps() { let settings = test_settings(); @@ -584,11 +384,4 @@ mod tests { assert!(user.contains("(pas de titre)")); } - #[test] - fn article_extraction_prompt_includes_content() { - let (_, user) = build_article_extraction_prompt("", "Article body here"); - assert!(user.contains("Article body here")); - assert!(user.contains("published_date")); - assert!(user.contains("is_error_page")); - } } diff --git a/backend/src/services/synthesis.rs b/backend/src/services/synthesis.rs index 5ec1420..47b7bf0 100644 --- a/backend/src/services/synthesis.rs +++ b/backend/src/services/synthesis.rs @@ -29,8 +29,8 @@ use crate::models::synthesis::{ }; use crate::services::encryption; use crate::services::llm::factory::create_provider; -use crate::services::llm::schema::{build_category_schema, build_classification_schema}; -use crate::services::prompts::{self, build_classification_prompt}; +use crate::services::llm::schema::build_category_schema; +use crate::services::prompts; use crate::services::scraper; use crate::services::source_scraper; @@ -496,45 +496,9 @@ async fn run_generation_inner( emit_progress(tx, "classifying", "Classification des articles...", 35); check_rate_limit(state, &user_rate_limiter, &provider_name)?; - let (class_system, class_user) = build_classification_prompt( - &valid_articles, - &classification_categories, - settings.max_items_per_category, - &filled_counts, - ); - let class_schema = build_classification_schema(); - - let llm_start = std::time::Instant::now(); - let class_response = provider - .call_llm( - &model_research, - &class_system, - &class_user, - &class_schema, - ) - .await?; - let llm_duration = llm_start.elapsed().as_millis() as u64; - log_llm_call(&state.pool, user_id, job_id, "classification_phase1", &model_research, - &class_system, &class_user, &class_response, llm_duration).await; - - // 1e. Parse classification and fill categories - let (phase1_classified, phase1_overflow) = parse_classification_response( - &class_response, - &valid_articles, - &classification_categories, - settings.max_items_per_category, - &mut filled_counts, - ); - - all_overflow.extend(phase1_overflow); - - // Merge into all_scraped and track URLs - for (cat_key, items) in phase1_classified { - for item in &items { - seen_urls.insert(item.url.to_lowercase()); - } - all_scraped.entry(cat_key).or_default().extend(items); - } + // TODO(Task 5): replace with per-article classify pipeline + let _ = (&valid_articles, &classification_categories, &filled_counts); + let _ = (); // phase1 classification stub // 1f. Enforce max_articles_per_source across all categories // (reuse domain counting logic) @@ -770,44 +734,9 @@ async fn run_generation_inner( emit_progress(tx, "classifying", "Classification des resultats web...", 70); check_rate_limit(state, &user_rate_limiter, &provider_name)?; - let (class_system, class_user) = build_classification_prompt( - &phase2_articles, - &classification_categories, - settings.max_items_per_category, - &filled_counts, - ); - let class_schema = build_classification_schema(); - - let llm_start = std::time::Instant::now(); - let class_response = provider - .call_llm( - &model_research, - &class_system, - &class_user, - &class_schema, - ) - .await?; - let llm_duration = llm_start.elapsed().as_millis() as u64; - log_llm_call(&state.pool, user_id, job_id, "classification_phase2", &model_research, - &class_system, &class_user, &class_response, llm_duration).await; - - let (phase2_classified, phase2_overflow) = parse_classification_response( - &class_response, - &phase2_articles, - &classification_categories, - settings.max_items_per_category, - &mut filled_counts, - ); - - all_overflow.extend(phase2_overflow); - - // Merge Phase 2 into all_scraped - for (cat_key, items) in phase2_classified { - for item in &items { - seen_urls.insert(item.url.to_lowercase()); - } - all_scraped.entry(cat_key).or_default().extend(items); - } + // TODO(Task 5): replace with per-article classify pipeline + let _ = (&phase2_articles, &classification_categories, &filled_counts); + let _ = (); // phase2 classification stub } } @@ -876,16 +805,13 @@ async fn run_generation_inner( emit_progress(tx, "rewrite", "Redaction des resumes...", 80); check_rate_limit(state, &user_rate_limiter, &provider_name)?; - let (rewrite_system, rewrite_user) = prompts::build_rewrite_prompt(&all_scraped); + // TODO(Task 5): rewrite pass replaced by per-article classify pipeline let rewrite_schema = build_rewrite_schema(&all_scraped, &settings.categories); + let _ = rewrite_schema; let llm_start = std::time::Instant::now(); - let final_results = provider - .call_llm(&model_writing, &rewrite_system, &rewrite_user, &rewrite_schema) - .await?; - let llm_duration = llm_start.elapsed().as_millis() as u64; - log_llm_call(&state.pool, user_id, job_id, "rewrite", &model_writing, - &rewrite_system, &rewrite_user, &final_results, llm_duration).await; + let _ = llm_start; + let final_results = serde_json::Value::Object(serde_json::Map::new()); // stub: replaced in Task 5 emit_progress(tx, "finalizing", "Finalisation...", 90); let mut final_sections = build_final_sections(&final_results, &settings.categories)?; @@ -1724,43 +1650,14 @@ async fn scrape_single_article_with_llm( return (String::new(), String::new(), final_url); } - let (system, user) = crate::services::prompts::build_article_extraction_prompt( - "", - &content.body_text, - ); - let schema = crate::services::llm::schema::build_article_extraction_schema(); - - match provider.call_llm(&model, &system, &user, &schema).await { - Ok(response) => { - let title = response.get("title").and_then(|t| t.as_str()).unwrap_or("").to_string(); - let body = response.get("body_text").and_then(|b| b.as_str()).unwrap_or("").to_string(); - let is_error = response.get("is_error_page").and_then(|e| e.as_bool()).unwrap_or(false); - let date_str = response.get("published_date").and_then(|d| d.as_str()).unwrap_or(""); - - if is_error || body.trim().is_empty() { - return (String::new(), String::new(), final_url); - } - - if !date_str.is_empty() { - if let Ok(date) = chrono::DateTime::parse_from_rfc3339(date_str) { - if scraper::is_article_too_old(Some(date.with_timezone(&chrono::Utc)), max_age_days) { - tracing::warn!(url = url, "LLM-extracted article too old"); - return (String::new(), String::new(), final_url); - } - } - } - - (body, title, final_url) - } - Err(e) => { - tracing::warn!(url = url, error = %e, "LLM extraction failed, using heuristic fallback"); - if scraper::is_article_too_old(content.published_date, max_age_days) { - return (String::new(), String::new(), final_url); - } - let title = content.title.unwrap_or_default(); - (content.body_text, title, final_url) - } + // TODO(Task 5): LLM article extraction removed; use heuristic fallback only. + // The provider and model parameters are kept for future use. + let _ = (provider, model); + if scraper::is_article_too_old(content.published_date, max_age_days) { + return (String::new(), String::new(), final_url); } + let title = content.title.unwrap_or_default(); + (content.body_text, title, final_url) } /// Build the final sections array from the LLM's rewrite output.