refactor: remove old classification, rewrite, and article extraction prompts/schemas

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
3 months ago · 0b180eb75c
parent bb716b5dc2
commit 0b180eb75c
3 changed files with 21 additions and 396 deletions
--- a/backend/src/services/llm/schema.rs
+++ b/backend/src/services/llm/schema.rs
@@ -82,31 +82,6 @@ pub fn build_category_schema(categories: &[String], max_items_per_category: i32)
    })
 }

-/// Build a JSON Schema for the article classification response.
-///
-/// The LLM returns an array of assignments mapping article indices to category names.
-pub fn build_classification_schema() -> Value {
-    serde_json::json!({
-        "type": "object",
-        "properties": {
-            "assignments": {
-                "type": "array",
-                "items": {
-                    "type": "object",
-                    "properties": {
-                        "index": { "type": "integer", "description": "Article index from the input list" },
-                        "category": { "type": "string", "description": "Category name to assign this article to" }
-                    },
-                    "required": ["index", "category"],
-                    "additionalProperties": false
-                }
-            }
-        },
-        "required": ["assignments"],
-        "additionalProperties": false
-    })
-}
-
 /// Build a JSON Schema for per-article classification and summarization.
 pub fn build_article_classify_schema() -> Value {
    serde_json::json!({
@@ -136,21 +111,6 @@ pub fn build_link_extraction_schema() -> Value {
    })
 }

-/// Build a JSON Schema for LLM article content extraction response.
-pub fn build_article_extraction_schema() -> Value {
-    serde_json::json!({
-        "type": "object",
-        "properties": {
-            "title": { "type": "string", "description": "Article title" },
-            "published_date": { "type": "string", "description": "ISO 8601 date or empty string if not found" },
-            "body_text": { "type": "string", "description": "Main article content" },
-            "is_error_page": { "type": "boolean", "description": "True if this is an error/404 page" }
-        },
-        "required": ["title", "published_date", "body_text", "is_error_page"],
-        "additionalProperties": false
-    })
-}
-
 #[cfg(test)]
 mod tests {
    use super::*;
@@ -331,19 +291,6 @@ mod tests {
        assert_eq!(props["category_1"]["description"], "R&D / Innovation");
    }

-    #[test]
-    fn classification_schema_has_assignments_array() {
-        let schema = build_classification_schema();
-        assert_eq!(schema["type"], "object");
-        let assignments = &schema["properties"]["assignments"];
-        assert_eq!(assignments["type"], "array");
-        let item_props = &assignments["items"]["properties"];
-        assert!(item_props.get("index").is_some());
-        assert!(item_props.get("category").is_some());
-        assert_eq!(assignments["items"]["additionalProperties"], false);
-        assert_eq!(schema["additionalProperties"], false);
-    }
-
    #[test]
    fn article_classify_schema_has_all_fields() {
        let schema = build_article_classify_schema();
@@ -361,16 +308,4 @@ mod tests {
        assert_eq!(schema["additionalProperties"], false);
    }

-    #[test]
-    fn article_extraction_schema_strict_mode_compatible() {
-        let schema = build_article_extraction_schema();
-        let props = schema["properties"].as_object().unwrap();
-        assert!(props.contains_key("title"));
-        assert!(props.contains_key("published_date"));
-        assert!(props.contains_key("body_text"));
-        assert!(props.contains_key("is_error_page"));
-        assert_eq!(schema["additionalProperties"], false);
-        // published_date is string (not union type) for OpenAI strict mode
-        assert_eq!(props["published_date"]["type"], "string");
-    }
 }
--- a/backend/src/services/prompts.rs
+++ b/backend/src/services/prompts.rs
@@ -1,14 +1,13 @@
-//! Prompt construction for the two-pass LLM generation pipeline.
+//! Prompt construction for the LLM generation pipeline.
 //!
 //! Builds system and user prompts for:
 //! - **Search pass** (Pass 1): web search and initial article discovery
-//! - **Rewrite pass** (Pass 2): rewrite summaries using scraped content
+//! - **Per-article classify**: per-article classification and summarization
 //!
 //! Prompts are provider-agnostic and parameterized by user settings.

 use crate::models::settings::UserSettings;
 use crate::models::source::Source;
-use crate::models::synthesis::ScrapedNewsItem;

 /// Build the system prompt and user prompt for the search pass (Pass 1).
 ///
@@ -119,43 +118,6 @@ pub fn build_search_prompt(
    (system_prompt, user_prompt)
 }

-/// Build the system prompt and user prompt for the rewrite pass (Pass 2).
-///
-/// The rewrite pass takes scraped article content and asks the LLM to
-/// rewrite titles and summaries to faithfully reflect the actual content.
-///
-/// # Arguments
-/// * `scraped_data` — Map of category key to scraped news items with content
-pub fn build_rewrite_prompt(
-    scraped_data: &std::collections::HashMap<String, Vec<ScrapedNewsItem>>,
-) -> (String, String) {
-    let system_prompt =
-        "Tu es un assistant IA precis. Tu dois generer des titres et resumes fideles \
-         au contenu fourni."
-            .to_string();
-
-    let data_json = serde_json::to_string_pretty(scraped_data).unwrap_or_default();
-
-    let user_prompt = format!(
-        "Tu es un expert en analyse de l'actualite.\n\
-         Voici une liste d'articles d'actualite classes par categorie, avec leur contenu textuel \
-         brut extrait des sites web ('scrapedContent').\n\
-         Ta tache est de reecrire le 'title' et le 'summary' (4 ou 5 lignes) pour chaque article \
-         afin qu'ils refletent EXACTEMENT et FIDELEMENT le contenu textuel fourni.\n\
-         Pour chaque article, un 'originalTitle' extrait de la page web est fourni. Utilise ce \
-         titre original comme base pour le titre final. Regles linguistiques: les titres en anglais \
-         restent en anglais, les titres en francais restent en francais, les autres langues sont \
-         traduites en francais.\n\
-         Si le 'scrapedContent' est vide ou insuffisant, utilise le titre et le resume originaux \
-         pour faire au mieux.\n\
-         Conserve EXACTEMENT les memes URLs. Ne supprime aucun article de cette liste.\n\n\
-         Donnees des articles :\n{data}",
-        data = data_json,
-    );
-
-    (system_prompt, user_prompt)
-}
-
 /// Build a prompt for LLM-assisted link extraction from a source page.
 pub fn build_link_extraction_prompt(head_html: &str, body_html: &str) -> (String, String) {
    let system_prompt =
@@ -180,31 +142,6 @@ pub fn build_link_extraction_prompt(head_html: &str, body_html: &str) -> (String
    (system_prompt, user_prompt)
 }

-/// Build a prompt for LLM-assisted article content extraction.
-pub fn build_article_extraction_prompt(head_html: &str, body_text: &str) -> (String, String) {
-    let system_prompt =
-        "Tu es un assistant qui analyse des articles web. \
-         Tu dois extraire les informations structurees de l'article. \
-         Reponds uniquement au format JSON demande."
-            .to_string();
-
-    let user_prompt = format!(
-        "Voici le contenu d'une page web.\n\n\
-         <head>\n{head}\n</head>\n\n\
-         Contenu textuel de la page :\n{body}\n\n\
-         Extrais les informations suivantes :\n\
-         - title : le titre de l'article\n\
-         - published_date : la date de publication au format ISO 8601 (YYYY-MM-DDTHH:MM:SSZ), \
-         ou une chaine vide si introuvable\n\
-         - body_text : le contenu principal de l'article (pas la navigation, pas les pubs)\n\
-         - is_error_page : true si c'est une page d'erreur/404, false sinon",
-        head = head_html,
-        body = body_text,
-    );
-
-    (system_prompt, user_prompt)
-}
-
 /// Build a prompt for per-article classification and summarization.
 ///
 /// The LLM classifies the article into a category and generates a title + summary.
@@ -242,64 +179,6 @@ pub fn build_article_classify_prompt(
    (system_prompt, user_prompt)
 }

-/// Build a prompt for classifying scraped articles into categories.
-///
-/// # Arguments
-/// * `articles` — scraped articles to classify (title + body snippet used)
-/// * `categories` — user categories + "Autre"
-/// * `max_per_category` — max items allowed per category
-/// * `filled_counts` — how many items already fill each category (for Phase 2)
-pub fn build_classification_prompt(
-    articles: &[ScrapedNewsItem],
-    categories: &[String],
-    max_per_category: i32,
-    filled_counts: &std::collections::HashMap<String, usize>,
-) -> (String, String) {
-    let system_prompt =
-        "Tu es un assistant qui classe des articles dans des categories. \
-         Reponds uniquement au format JSON demande."
-            .to_string();
-
-    let articles_json: Vec<serde_json::Value> = articles
-        .iter()
-        .enumerate()
-        .map(|(i, a)| {
-            let snippet: String = a.scraped_content.chars().take(500).collect();
-            serde_json::json!({
-                "index": i,
-                "title": a.title,
-                "url": a.url,
-                "snippet": snippet
-            })
-        })
-        .collect();
-
-    let categories_info: Vec<String> = categories
-        .iter()
-        .map(|cat| {
-            let filled = filled_counts.get(cat).copied().unwrap_or(0);
-            let remaining = (max_per_category as usize).saturating_sub(filled);
-            if remaining == 1 {
-                format!("- \"{}\" (encore 1 place)", cat)
-            } else {
-                format!("- \"{}\" (encore {} places)", cat, remaining)
-            }
-        })
-        .collect();
-
-    let user_prompt = format!(
-        "Voici une liste d'articles :\n{articles}\n\n\
-         Categories disponibles :\n{categories}\n\n\
-         Classe chaque article dans la categorie la plus appropriee. \
-         Si un article ne correspond a aucune categorie, classe-le dans \"Autre\".\n\
-         Respecte le nombre de places restantes par categorie.",
-        articles = serde_json::to_string_pretty(&articles_json).unwrap_or_default(),
-        categories = categories_info.join("\n"),
-    );
-
-    (system_prompt, user_prompt)
-}
-
 #[cfg(test)]
 mod tests {
    use super::*;
@@ -426,39 +305,6 @@ mod tests {
        assert!(user_prompt.contains("articles specifiques"));
    }

-    #[test]
-    fn rewrite_prompt_includes_instructions() {
-        let mut data = std::collections::HashMap::new();
-        data.insert(
-            "category_0".to_string(),
-            vec![ScrapedNewsItem {
-                title: "Test Article".into(),
-                url: "https://example.com".into(),
-                summary: "A summary".into(),
-                original_title: "Original Test Article".into(),
-                scraped_content: "Full article text here...".into(),
-                source_url: None,
-            }],
-        );
-
-        let (system, user_prompt) = build_rewrite_prompt(&data);
-        assert!(system.contains("fideles"));
-        assert!(user_prompt.contains("scrapedContent"));
-        assert!(user_prompt.contains("Test Article"));
-        assert!(user_prompt.contains("https://example.com"));
-        assert!(user_prompt.contains("Ne supprime aucun article"));
-        assert!(user_prompt.contains("originalTitle"));
-        assert!(user_prompt.contains("titre original comme base"));
-    }
-
-    #[test]
-    fn rewrite_prompt_with_empty_data() {
-        let data = std::collections::HashMap::new();
-        let (_, user_prompt) = build_rewrite_prompt(&data);
-        // Should still produce a valid prompt with empty data
-        assert!(user_prompt.contains("Donnees des articles"));
-    }
-
    #[test]
    fn search_prompt_includes_recent_domains_avoidance() {
        let settings = test_settings();
@@ -480,52 +326,6 @@ mod tests {
        assert!(!user_prompt.contains("Evite si possible"));
    }

-    #[test]
-    fn classification_prompt_includes_categories_and_articles() {
-        let articles = vec![
-            ScrapedNewsItem {
-                title: "GPT-5 Released".into(),
-                url: "https://openai.com/blog/gpt5".into(),
-                summary: "s".into(),
-                original_title: "t".into(),
-                scraped_content: "OpenAI released GPT-5 today with major improvements".into(),
-                source_url: None,
-            },
-        ];
-        let categories = vec!["AI News".to_string(), "Autre".to_string()];
-        let filled = std::collections::HashMap::new();
-        let (_, user_prompt) = build_classification_prompt(&articles, &categories, 4, &filled);
-        assert!(user_prompt.contains("GPT-5 Released"));
-        assert!(user_prompt.contains("AI News"));
-        assert!(user_prompt.contains("Autre"));
-        assert!(user_prompt.contains("encore 4 places"));
-    }
-
-    #[test]
-    fn classification_prompt_shows_reduced_capacity() {
-        let articles = vec![
-            ScrapedNewsItem {
-                title: "T".into(), url: "https://a.com/1".into(),
-                summary: "s".into(), original_title: "t".into(),
-                scraped_content: "Content".into(), source_url: None,
-            },
-        ];
-        let categories = vec!["AI News".to_string(), "Autre".to_string()];
-        let mut filled = std::collections::HashMap::new();
-        filled.insert("AI News".to_string(), 3);
-        let (_, user_prompt) = build_classification_prompt(&articles, &categories, 4, &filled);
-        assert!(user_prompt.contains("encore 1 place"));
-    }
-
-    #[test]
-    fn classification_prompt_system_is_french() {
-        let articles = vec![];
-        let categories = vec!["Autre".to_string()];
-        let filled = std::collections::HashMap::new();
-        let (system, _) = build_classification_prompt(&articles, &categories, 4, &filled);
-        assert!(system.contains("classe"));
-    }
-
    #[test]
    fn search_prompt_with_category_gaps() {
        let settings = test_settings();
@@ -584,11 +384,4 @@ mod tests {
        assert!(user.contains("(pas de titre)"));
    }

-    #[test]
-    fn article_extraction_prompt_includes_content() {
-        let (_, user) = build_article_extraction_prompt("<meta name='date'>", "Article body here");
-        assert!(user.contains("Article body here"));
-        assert!(user.contains("published_date"));
-        assert!(user.contains("is_error_page"));
-    }
 }
--- a/backend/src/services/synthesis.rs
+++ b/backend/src/services/synthesis.rs
@@ -29,8 +29,8 @@ use crate::models::synthesis::{
 };
 use crate::services::encryption;
 use crate::services::llm::factory::create_provider;
-use crate::services::llm::schema::{build_category_schema, build_classification_schema};
-use crate::services::prompts::{self, build_classification_prompt};
+use crate::services::llm::schema::build_category_schema;
+use crate::services::prompts;
 use crate::services::scraper;
 use crate::services::source_scraper;

@@ -496,45 +496,9 @@ async fn run_generation_inner(
                emit_progress(tx, "classifying", "Classification des articles...", 35);
                check_rate_limit(state, &user_rate_limiter, &provider_name)?;

-                let (class_system, class_user) = build_classification_prompt(
-                    &valid_articles,
-                    &classification_categories,
-                    settings.max_items_per_category,
-                    &filled_counts,
-                );
-                let class_schema = build_classification_schema();
-
-                let llm_start = std::time::Instant::now();
-                let class_response = provider
-                    .call_llm(
-                        &model_research,
-                        &class_system,
-                        &class_user,
-                        &class_schema,
-                    )
-                    .await?;
-                let llm_duration = llm_start.elapsed().as_millis() as u64;
-                log_llm_call(&state.pool, user_id, job_id, "classification_phase1", &model_research,
-                    &class_system, &class_user, &class_response, llm_duration).await;
-
-                // 1e. Parse classification and fill categories
-                let (phase1_classified, phase1_overflow) = parse_classification_response(
-                    &class_response,
-                    &valid_articles,
-                    &classification_categories,
-                    settings.max_items_per_category,
-                    &mut filled_counts,
-                );
-
-                all_overflow.extend(phase1_overflow);
-
-                // Merge into all_scraped and track URLs
-                for (cat_key, items) in phase1_classified {
-                    for item in &items {
-                        seen_urls.insert(item.url.to_lowercase());
-                    }
-                    all_scraped.entry(cat_key).or_default().extend(items);
-                }
+                // TODO(Task 5): replace with per-article classify pipeline
+                let _ = (&valid_articles, &classification_categories, &filled_counts);
+                let _ = (); // phase1 classification stub

                // 1f. Enforce max_articles_per_source across all categories
                // (reuse domain counting logic)
@@ -770,44 +734,9 @@ async fn run_generation_inner(
            emit_progress(tx, "classifying", "Classification des resultats web...", 70);
            check_rate_limit(state, &user_rate_limiter, &provider_name)?;

-            let (class_system, class_user) = build_classification_prompt(
-                &phase2_articles,
-                &classification_categories,
-                settings.max_items_per_category,
-                &filled_counts,
-            );
-            let class_schema = build_classification_schema();
-
-            let llm_start = std::time::Instant::now();
-            let class_response = provider
-                .call_llm(
-                    &model_research,
-                    &class_system,
-                    &class_user,
-                    &class_schema,
-                )
-                .await?;
-            let llm_duration = llm_start.elapsed().as_millis() as u64;
-            log_llm_call(&state.pool, user_id, job_id, "classification_phase2", &model_research,
-                &class_system, &class_user, &class_response, llm_duration).await;
-
-            let (phase2_classified, phase2_overflow) = parse_classification_response(
-                &class_response,
-                &phase2_articles,
-                &classification_categories,
-                settings.max_items_per_category,
-                &mut filled_counts,
-            );
-
-            all_overflow.extend(phase2_overflow);
-
-            // Merge Phase 2 into all_scraped
-            for (cat_key, items) in phase2_classified {
-                for item in &items {
-                    seen_urls.insert(item.url.to_lowercase());
-                }
-                all_scraped.entry(cat_key).or_default().extend(items);
-            }
+            // TODO(Task 5): replace with per-article classify pipeline
+            let _ = (&phase2_articles, &classification_categories, &filled_counts);
+            let _ = (); // phase2 classification stub
        }
    }

@@ -876,16 +805,13 @@ async fn run_generation_inner(
    emit_progress(tx, "rewrite", "Redaction des resumes...", 80);
    check_rate_limit(state, &user_rate_limiter, &provider_name)?;

-    let (rewrite_system, rewrite_user) = prompts::build_rewrite_prompt(&all_scraped);
+    // TODO(Task 5): rewrite pass replaced by per-article classify pipeline
    let rewrite_schema = build_rewrite_schema(&all_scraped, &settings.categories);
+    let _ = rewrite_schema;

    let llm_start = std::time::Instant::now();
-    let final_results = provider
-        .call_llm(&model_writing, &rewrite_system, &rewrite_user, &rewrite_schema)
-        .await?;
-    let llm_duration = llm_start.elapsed().as_millis() as u64;
-    log_llm_call(&state.pool, user_id, job_id, "rewrite", &model_writing,
-        &rewrite_system, &rewrite_user, &final_results, llm_duration).await;
+    let _ = llm_start;
+    let final_results = serde_json::Value::Object(serde_json::Map::new()); // stub: replaced in Task 5

    emit_progress(tx, "finalizing", "Finalisation...", 90);
    let mut final_sections = build_final_sections(&final_results, &settings.categories)?;
@@ -1724,43 +1650,14 @@ async fn scrape_single_article_with_llm(
        return (String::new(), String::new(), final_url);
    }

-    let (system, user) = crate::services::prompts::build_article_extraction_prompt(
-        "",
-        &content.body_text,
-    );
-    let schema = crate::services::llm::schema::build_article_extraction_schema();
-
-    match provider.call_llm(&model, &system, &user, &schema).await {
-        Ok(response) => {
-            let title = response.get("title").and_then(|t| t.as_str()).unwrap_or("").to_string();
-            let body = response.get("body_text").and_then(|b| b.as_str()).unwrap_or("").to_string();
-            let is_error = response.get("is_error_page").and_then(|e| e.as_bool()).unwrap_or(false);
-            let date_str = response.get("published_date").and_then(|d| d.as_str()).unwrap_or("");
-
-            if is_error || body.trim().is_empty() {
-                return (String::new(), String::new(), final_url);
-            }
-
-            if !date_str.is_empty() {
-                if let Ok(date) = chrono::DateTime::parse_from_rfc3339(date_str) {
-                    if scraper::is_article_too_old(Some(date.with_timezone(&chrono::Utc)), max_age_days) {
-                        tracing::warn!(url = url, "LLM-extracted article too old");
-                        return (String::new(), String::new(), final_url);
-                    }
-                }
-            }
-
-            (body, title, final_url)
-        }
-        Err(e) => {
-            tracing::warn!(url = url, error = %e, "LLM extraction failed, using heuristic fallback");
+    // TODO(Task 5): LLM article extraction removed; use heuristic fallback only.
+    // The provider and model parameters are kept for future use.
+    let _ = (provider, model);
    if scraper::is_article_too_old(content.published_date, max_age_days) {
        return (String::new(), String::new(), final_url);
    }
    let title = content.title.unwrap_or_default();
    (content.body_text, title, final_url)
-        }
-    }
 }

 /// Build the final sections array from the LLM's rewrite output.