From 51ea032838cbfce718f8c8d50e74cbe8b7c99f40 Mon Sep 17 00:00:00 2001 From: oabrivard Date: Tue, 24 Mar 2026 01:52:18 +0100 Subject: [PATCH] feat: add scrape_flat_urls helper and gap-aware search prompt Co-Authored-By: Claude Sonnet 4.6 --- backend/src/services/prompts.rs | 64 ++++++++++++++++++++++------ backend/src/services/synthesis.rs | 71 ++++++++++++++++++++++++++++++- 2 files changed, 122 insertions(+), 13 deletions(-) diff --git a/backend/src/services/prompts.rs b/backend/src/services/prompts.rs index 10436ca..39dd158 100644 --- a/backend/src/services/prompts.rs +++ b/backend/src/services/prompts.rs @@ -25,6 +25,7 @@ pub fn build_search_prompt( sources: &[Source], current_date: &str, recent_domains: &[String], + category_gaps: Option<&[(String, i32)]>, ) -> (String, String) { let sources_text = if sources.is_empty() { String::new() @@ -100,6 +101,21 @@ pub fn build_search_prompt( ) }; + // If we have specific category gaps (Phase 2), replace the generic "N per category" line + let user_prompt = if let Some(gaps) = category_gaps { + let gaps_text = gaps + .iter() + .map(|(cat, needed)| format!("- {} : {} articles", cat, needed)) + .collect::<Vec<_>>() + .join("\n"); + user_prompt.replace( + &format!("Pour chaque categorie, fournis exactement {} actualites.", settings.max_items_per_category), + &format!("Fournis le nombre d'articles suivant par categorie :\n{}", gaps_text), + ) + } else { + user_prompt + }; + (system_prompt, user_prompt) } @@ -229,21 +245,21 @@ mod tests { #[test] fn search_prompt_includes_theme() { let settings = test_settings(); - let (_, user_prompt) = build_search_prompt(&settings, &[], "lundi 21 mars 2026", &[]); + let (_, user_prompt) = build_search_prompt(&settings, &[], "lundi 21 mars 2026", &[], None); assert!(user_prompt.contains("Intelligence Artificielle")); } #[test] fn search_prompt_includes_date() { let settings = test_settings(); - let (_, user_prompt) = build_search_prompt(&settings, &[], "lundi 21 mars 2026", &[]); + let (_,
user_prompt) = build_search_prompt(&settings, &[], "lundi 21 mars 2026", &[], None); assert!(user_prompt.contains("lundi 21 mars 2026")); } #[test] fn search_prompt_includes_max_age() { let settings = test_settings(); - let (system, user_prompt) = build_search_prompt(&settings, &[], "lundi 21 mars 2026", &[]); + let (system, user_prompt) = build_search_prompt(&settings, &[], "lundi 21 mars 2026", &[], None); assert!(user_prompt.contains("7 derniers jours")); assert!(system.contains("7")); } @@ -251,7 +267,7 @@ mod tests { #[test] fn search_prompt_includes_categories() { let settings = test_settings(); - let (_, user_prompt) = build_search_prompt(&settings, &[], "lundi 21 mars 2026", &[]); + let (_, user_prompt) = build_search_prompt(&settings, &[], "lundi 21 mars 2026", &[], None); assert!(user_prompt.contains("1. Annonces majeures")); assert!(user_prompt.contains("2. Recherche et innovation")); assert!(user_prompt.contains("2 grandes sections")); @@ -260,7 +276,7 @@ mod tests { #[test] fn search_prompt_includes_max_items() { let settings = test_settings(); - let (_, user_prompt) = build_search_prompt(&settings, &[], "lundi 21 mars 2026", &[]); + let (_, user_prompt) = build_search_prompt(&settings, &[], "lundi 21 mars 2026", &[], None); assert!(user_prompt.contains("4 actualites")); } @@ -284,7 +300,7 @@ mod tests { }, ]; - let (_, user_prompt) = build_search_prompt(&settings, &sources, "lundi 21 mars 2026", &[]); + let (_, user_prompt) = build_search_prompt(&settings, &sources, "lundi 21 mars 2026", &[], None); assert!(user_prompt.contains("TechCrunch (https://techcrunch.com)")); assert!(user_prompt.contains("The Verge (https://theverge.com)")); assert!(user_prompt.contains("sources personnalisees")); @@ -293,7 +309,7 @@ mod tests { #[test] fn search_prompt_no_sources_no_section() { let settings = test_settings(); - let (_, user_prompt) = build_search_prompt(&settings, &[], "lundi 21 mars 2026", &[]); + let (_, user_prompt) = build_search_prompt(&settings, &[], 
"lundi 21 mars 2026", &[], None); assert!(!user_prompt.contains("sources personnalisees")); } @@ -303,7 +319,7 @@ mod tests { settings.search_agent_behavior = "Concentre-toi sur les sources europeennes.".to_string(); - let (_, user_prompt) = build_search_prompt(&settings, &[], "lundi 21 mars 2026", &[]); + let (_, user_prompt) = build_search_prompt(&settings, &[], "lundi 21 mars 2026", &[], None); assert!(user_prompt.contains("Concentre-toi sur les sources europeennes.")); assert!(!user_prompt.contains("recherche Google")); } @@ -311,14 +327,14 @@ mod tests { #[test] fn search_prompt_default_behavior_when_empty() { let settings = test_settings(); - let (_, user_prompt) = build_search_prompt(&settings, &[], "lundi 21 mars 2026", &[]); + let (_, user_prompt) = build_search_prompt(&settings, &[], "lundi 21 mars 2026", &[], None); assert!(user_prompt.contains("recherche Google")); } #[test] fn search_prompt_warns_against_homepage_urls() { let settings = test_settings(); - let (_, user_prompt) = build_search_prompt(&settings, &[], "lundi 21 mars 2026", &[]); + let (_, user_prompt) = build_search_prompt(&settings, &[], "lundi 21 mars 2026", &[], None); assert!(user_prompt.contains("pages d'accueil")); assert!(user_prompt.contains("articles specifiques")); } @@ -361,7 +377,7 @@ mod tests { let sources = vec![]; let date = "lundi 17 mars 2026"; let domains = vec!["techcrunch.com".to_string(), "theverge.com".to_string()]; - let (_, user_prompt) = build_search_prompt(&settings, &sources, date, &domains); + let (_, user_prompt) = build_search_prompt(&settings, &sources, date, &domains, None); assert!(user_prompt.contains("Evite si possible")); assert!(user_prompt.contains("techcrunch.com")); assert!(user_prompt.contains("theverge.com")); @@ -372,7 +388,7 @@ mod tests { let settings = test_settings(); let sources = vec![]; let date = "lundi 17 mars 2026"; - let (_, user_prompt) = build_search_prompt(&settings, &sources, date, &[]); + let (_, user_prompt) = 
build_search_prompt(&settings, &sources, date, &[], None); assert!(!user_prompt.contains("Evite si possible")); } @@ -420,4 +436,28 @@ mod tests { let (system, _) = build_classification_prompt(&articles, &categories, 4, &filled); assert!(system.contains("classe")); } + + #[test] + fn search_prompt_with_category_gaps() { + let settings = test_settings(); + let sources = vec![]; + let date = "lundi 17 mars 2026"; + let gaps = vec![ + ("AI News".to_string(), 2), + ("Cybersecurity".to_string(), 4), + ]; + let (_, user_prompt) = build_search_prompt(&settings, &sources, date, &[], Some(&gaps)); + assert!(user_prompt.contains("AI News : 2 articles")); + assert!(user_prompt.contains("Cybersecurity : 4 articles")); + assert!(!user_prompt.contains("exactement")); + } + + #[test] + fn search_prompt_without_gaps_uses_default() { + let settings = test_settings(); + let sources = vec![]; + let date = "lundi 17 mars 2026"; + let (_, user_prompt) = build_search_prompt(&settings, &sources, date, &[], None); + assert!(user_prompt.contains("exactement")); + } } diff --git a/backend/src/services/synthesis.rs b/backend/src/services/synthesis.rs index 76c3311..21dcdba 100644 --- a/backend/src/services/synthesis.rs +++ b/backend/src/services/synthesis.rs @@ -335,7 +335,7 @@ async fn run_generation_inner( }; let (system_prompt, user_prompt) = - prompts::build_search_prompt(&settings, &sources, &current_date, &recent_domains); + prompts::build_search_prompt(&settings, &sources, &current_date, &recent_domains, None); let raw_results = provider .generate_search_pass(&model_research, &system_prompt, &user_prompt, &schema) @@ -926,6 +926,75 @@ async fn scrape_articles( result } +/// Scrape a flat list of URLs and return ScrapedNewsItems. +/// +/// Used in Phase 1 where articles haven't been classified yet. +/// Reuses the same scraper infrastructure as `scrape_articles`.
+async fn scrape_flat_urls( + state: &AppState, + urls: &[String], + max_age_days: i64, + tx: &watch::Sender, +) -> Vec { + let total = urls.len(); + if total == 0 { + return Vec::new(); + } + + let mut join_set = tokio::task::JoinSet::new(); + let mut pending = urls.iter().enumerate().peekable(); + let mut completed = 0usize; + let mut results = Vec::new(); + + let max_concurrent = 10; + + // Seed initial tasks + for _ in 0..max_concurrent { + if let Some((_, url)) = pending.next() { + let client = state.http_client.clone(); + let url = url.clone(); + let mad = max_age_days; + join_set.spawn(async move { + let (scraped_content, page_title) = scrape_single_article(&client, &url, mad).await; + (url, scraped_content, page_title) + }); + } + } + + while let Some(join_result) = join_set.join_next().await { + completed += 1; + let pct = 15 + ((completed as u32 * 15) / total as u32).min(15); + emit_progress( + tx, + "scraping_sources", + &format!("Analyse des sources ({}/{})...", completed, total), + pct as u8, + ); + + if let Ok((url, scraped_content, page_title)) = join_result { + results.push(ScrapedNewsItem { + title: page_title.clone(), + url, + summary: String::new(), // No LLM summary yet + original_title: page_title, + scraped_content, + }); + } + + if let Some((_, url)) = pending.next() { + let client = state.http_client.clone(); + let url = url.clone(); + let mad = max_age_days; + join_set.spawn(async move { + let (scraped_content, page_title) = scrape_single_article(&client, &url, mad).await; + (url, scraped_content, page_title) + }); + } + } + + results +} + /// Scrape a single article URL, returning (body_text, page_title) or empty strings on failure. /// /// Handles all failure modes gracefully: