feat: add scrape_flat_urls helper and gap-aware search prompt

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
master
oabrivard 3 months ago
parent d508b5b4ab
commit 51ea032838

@ -25,6 +25,7 @@ pub fn build_search_prompt(
sources: &[Source], sources: &[Source],
current_date: &str, current_date: &str,
recent_domains: &[String], recent_domains: &[String],
category_gaps: Option<&[(String, i32)]>,
) -> (String, String) { ) -> (String, String) {
let sources_text = if sources.is_empty() { let sources_text = if sources.is_empty() {
String::new() String::new()
@ -100,6 +101,21 @@ pub fn build_search_prompt(
) )
}; };
// If we have specific category gaps (Phase 2), replace the generic "N per category" line
let user_prompt = if let Some(gaps) = category_gaps {
let gaps_text = gaps
.iter()
.map(|(cat, needed)| format!("- {} : {} articles", cat, needed))
.collect::<Vec<_>>()
.join("\n");
user_prompt.replace(
&format!("Pour chaque categorie, fournis exactement {} actualites.", settings.max_items_per_category),
&format!("Fournis le nombre d'articles suivant par categorie :\n{}", gaps_text),
)
} else {
user_prompt
};
(system_prompt, user_prompt) (system_prompt, user_prompt)
} }
@ -229,21 +245,21 @@ mod tests {
#[test] #[test]
fn search_prompt_includes_theme() { fn search_prompt_includes_theme() {
let settings = test_settings(); let settings = test_settings();
let (_, user_prompt) = build_search_prompt(&settings, &[], "lundi 21 mars 2026", &[]); let (_, user_prompt) = build_search_prompt(&settings, &[], "lundi 21 mars 2026", &[], None);
assert!(user_prompt.contains("Intelligence Artificielle")); assert!(user_prompt.contains("Intelligence Artificielle"));
} }
#[test] #[test]
fn search_prompt_includes_date() { fn search_prompt_includes_date() {
let settings = test_settings(); let settings = test_settings();
let (_, user_prompt) = build_search_prompt(&settings, &[], "lundi 21 mars 2026", &[]); let (_, user_prompt) = build_search_prompt(&settings, &[], "lundi 21 mars 2026", &[], None);
assert!(user_prompt.contains("lundi 21 mars 2026")); assert!(user_prompt.contains("lundi 21 mars 2026"));
} }
#[test] #[test]
fn search_prompt_includes_max_age() { fn search_prompt_includes_max_age() {
let settings = test_settings(); let settings = test_settings();
let (system, user_prompt) = build_search_prompt(&settings, &[], "lundi 21 mars 2026", &[]); let (system, user_prompt) = build_search_prompt(&settings, &[], "lundi 21 mars 2026", &[], None);
assert!(user_prompt.contains("7 derniers jours")); assert!(user_prompt.contains("7 derniers jours"));
assert!(system.contains("7")); assert!(system.contains("7"));
} }
@ -251,7 +267,7 @@ mod tests {
#[test] #[test]
fn search_prompt_includes_categories() { fn search_prompt_includes_categories() {
let settings = test_settings(); let settings = test_settings();
let (_, user_prompt) = build_search_prompt(&settings, &[], "lundi 21 mars 2026", &[]); let (_, user_prompt) = build_search_prompt(&settings, &[], "lundi 21 mars 2026", &[], None);
assert!(user_prompt.contains("1. Annonces majeures")); assert!(user_prompt.contains("1. Annonces majeures"));
assert!(user_prompt.contains("2. Recherche et innovation")); assert!(user_prompt.contains("2. Recherche et innovation"));
assert!(user_prompt.contains("2 grandes sections")); assert!(user_prompt.contains("2 grandes sections"));
@ -260,7 +276,7 @@ mod tests {
#[test] #[test]
fn search_prompt_includes_max_items() { fn search_prompt_includes_max_items() {
let settings = test_settings(); let settings = test_settings();
let (_, user_prompt) = build_search_prompt(&settings, &[], "lundi 21 mars 2026", &[]); let (_, user_prompt) = build_search_prompt(&settings, &[], "lundi 21 mars 2026", &[], None);
assert!(user_prompt.contains("4 actualites")); assert!(user_prompt.contains("4 actualites"));
} }
@ -284,7 +300,7 @@ mod tests {
}, },
]; ];
let (_, user_prompt) = build_search_prompt(&settings, &sources, "lundi 21 mars 2026", &[]); let (_, user_prompt) = build_search_prompt(&settings, &sources, "lundi 21 mars 2026", &[], None);
assert!(user_prompt.contains("TechCrunch (https://techcrunch.com)")); assert!(user_prompt.contains("TechCrunch (https://techcrunch.com)"));
assert!(user_prompt.contains("The Verge (https://theverge.com)")); assert!(user_prompt.contains("The Verge (https://theverge.com)"));
assert!(user_prompt.contains("sources personnalisees")); assert!(user_prompt.contains("sources personnalisees"));
@ -293,7 +309,7 @@ mod tests {
#[test] #[test]
fn search_prompt_no_sources_no_section() { fn search_prompt_no_sources_no_section() {
let settings = test_settings(); let settings = test_settings();
let (_, user_prompt) = build_search_prompt(&settings, &[], "lundi 21 mars 2026", &[]); let (_, user_prompt) = build_search_prompt(&settings, &[], "lundi 21 mars 2026", &[], None);
assert!(!user_prompt.contains("sources personnalisees")); assert!(!user_prompt.contains("sources personnalisees"));
} }
@ -303,7 +319,7 @@ mod tests {
settings.search_agent_behavior = settings.search_agent_behavior =
"Concentre-toi sur les sources europeennes.".to_string(); "Concentre-toi sur les sources europeennes.".to_string();
let (_, user_prompt) = build_search_prompt(&settings, &[], "lundi 21 mars 2026", &[]); let (_, user_prompt) = build_search_prompt(&settings, &[], "lundi 21 mars 2026", &[], None);
assert!(user_prompt.contains("Concentre-toi sur les sources europeennes.")); assert!(user_prompt.contains("Concentre-toi sur les sources europeennes."));
assert!(!user_prompt.contains("recherche Google")); assert!(!user_prompt.contains("recherche Google"));
} }
@ -311,14 +327,14 @@ mod tests {
#[test] #[test]
fn search_prompt_default_behavior_when_empty() { fn search_prompt_default_behavior_when_empty() {
let settings = test_settings(); let settings = test_settings();
let (_, user_prompt) = build_search_prompt(&settings, &[], "lundi 21 mars 2026", &[]); let (_, user_prompt) = build_search_prompt(&settings, &[], "lundi 21 mars 2026", &[], None);
assert!(user_prompt.contains("recherche Google")); assert!(user_prompt.contains("recherche Google"));
} }
#[test] #[test]
fn search_prompt_warns_against_homepage_urls() { fn search_prompt_warns_against_homepage_urls() {
let settings = test_settings(); let settings = test_settings();
let (_, user_prompt) = build_search_prompt(&settings, &[], "lundi 21 mars 2026", &[]); let (_, user_prompt) = build_search_prompt(&settings, &[], "lundi 21 mars 2026", &[], None);
assert!(user_prompt.contains("pages d'accueil")); assert!(user_prompt.contains("pages d'accueil"));
assert!(user_prompt.contains("articles specifiques")); assert!(user_prompt.contains("articles specifiques"));
} }
@ -361,7 +377,7 @@ mod tests {
let sources = vec![]; let sources = vec![];
let date = "lundi 17 mars 2026"; let date = "lundi 17 mars 2026";
let domains = vec!["techcrunch.com".to_string(), "theverge.com".to_string()]; let domains = vec!["techcrunch.com".to_string(), "theverge.com".to_string()];
let (_, user_prompt) = build_search_prompt(&settings, &sources, date, &domains); let (_, user_prompt) = build_search_prompt(&settings, &sources, date, &domains, None);
assert!(user_prompt.contains("Evite si possible")); assert!(user_prompt.contains("Evite si possible"));
assert!(user_prompt.contains("techcrunch.com")); assert!(user_prompt.contains("techcrunch.com"));
assert!(user_prompt.contains("theverge.com")); assert!(user_prompt.contains("theverge.com"));
@ -372,7 +388,7 @@ mod tests {
let settings = test_settings(); let settings = test_settings();
let sources = vec![]; let sources = vec![];
let date = "lundi 17 mars 2026"; let date = "lundi 17 mars 2026";
let (_, user_prompt) = build_search_prompt(&settings, &sources, date, &[]); let (_, user_prompt) = build_search_prompt(&settings, &sources, date, &[], None);
assert!(!user_prompt.contains("Evite si possible")); assert!(!user_prompt.contains("Evite si possible"));
} }
@ -420,4 +436,28 @@ mod tests {
let (system, _) = build_classification_prompt(&articles, &categories, 4, &filled); let (system, _) = build_classification_prompt(&articles, &categories, 4, &filled);
assert!(system.contains("classe")); assert!(system.contains("classe"));
} }
/// With explicit per-category gaps, the user prompt must list each gap as
/// "<category> : <n> articles" and must no longer contain the generic
/// "exactement" instruction.
#[test]
fn search_prompt_with_category_gaps() {
    let gaps = vec![
        ("AI News".to_string(), 2),
        ("Cybersecurity".to_string(), 4),
    ];

    let (_, user_prompt) = build_search_prompt(
        &test_settings(),
        &[],
        "lundi 17 mars 2026",
        &[],
        Some(gaps.as_slice()),
    );

    // Every requested gap shows up verbatim in the prompt.
    for expected in ["AI News : 2 articles", "Cybersecurity : 4 articles"] {
        assert!(user_prompt.contains(expected));
    }
    // The generic per-category wording has been replaced.
    assert!(!user_prompt.contains("exactement"));
}
/// Without category gaps (`None`), the user prompt keeps the generic
/// "exactement" per-category instruction.
#[test]
fn search_prompt_without_gaps_uses_default() {
    let (_, user_prompt) =
        build_search_prompt(&test_settings(), &[], "lundi 17 mars 2026", &[], None);

    assert!(user_prompt.contains("exactement"));
}
} }

@ -335,7 +335,7 @@ async fn run_generation_inner(
}; };
let (system_prompt, user_prompt) = let (system_prompt, user_prompt) =
prompts::build_search_prompt(&settings, &sources, &current_date, &recent_domains); prompts::build_search_prompt(&settings, &sources, &current_date, &recent_domains, None);
let raw_results = provider let raw_results = provider
.generate_search_pass(&model_research, &system_prompt, &user_prompt, &schema) .generate_search_pass(&model_research, &system_prompt, &user_prompt, &schema)
@ -926,6 +926,75 @@ async fn scrape_articles(
result result
} }
/// Scrape a flat list of URLs and return one `ScrapedNewsItem` per completed scrape.
///
/// Used in Phase 1 where articles haven't been classified yet.
/// Reuses the same scraper infrastructure as `scrape_articles`
/// (each URL goes through `scrape_single_article`).
///
/// # Arguments
/// * `state` - application state; only its `http_client` is used (cloned per task).
/// * `urls` - URLs to scrape. An empty slice returns an empty `Vec` without emitting progress.
/// * `max_age_days` - forwarded unchanged to `scrape_single_article`.
/// * `tx` - progress channel; a `"scraping_sources"` event is emitted after every
///   finished task, with a percentage moving from 15 up to 30.
///
/// # Returns
/// Items in task *completion* order, which is not necessarily the order of `urls`.
/// `title` and `original_title` are both set to the scraped page title and
/// `summary` is left empty. A task that fails to join (e.g. it panicked) still
/// counts towards progress but contributes no item.
async fn scrape_flat_urls(
    state: &AppState,
    urls: &[String],
    max_age_days: i64,
    tx: &watch::Sender<ProgressEvent>,
) -> Vec<ScrapedNewsItem> {
    /// Upper bound on scrape tasks running at the same time.
    const MAX_CONCURRENT: usize = 10;

    let total = urls.len();
    if total == 0 {
        return Vec::new();
    }

    let mut join_set = tokio::task::JoinSet::new();
    let mut pending = urls.iter();
    let mut completed = 0usize;
    let mut results = Vec::with_capacity(total);

    // Single place that builds the scrape future for a URL. The future owns
    // its client handle and URL so it can outlive this stack frame once spawned.
    let make_task = |url: &String| {
        let client = state.http_client.clone();
        let url = url.clone();
        let max_age = max_age_days;
        async move {
            let (scraped_content, page_title) =
                scrape_single_article(&client, &url, max_age).await;
            (url, scraped_content, page_title)
        }
    };

    // Seed the pool with the first batch of tasks.
    for url in pending.by_ref().take(MAX_CONCURRENT) {
        join_set.spawn(make_task(url));
    }

    while let Some(join_result) = join_set.join_next().await {
        completed += 1;

        // Map completion onto the 15..=30 progress window. `completed <= total`,
        // so the value fits comfortably in a u8.
        let pct = 15 + (completed * 15 / total).min(15);
        emit_progress(
            tx,
            "scraping_sources",
            &format!("Analyse des sources ({}/{})...", completed, total),
            pct as u8,
        );

        // A join error means the task did not complete normally; skip that URL.
        if let Ok((url, scraped_content, page_title)) = join_result {
            results.push(ScrapedNewsItem {
                title: page_title.clone(),
                url,
                summary: String::new(), // No LLM summary yet
                original_title: page_title,
                scraped_content,
            });
        }

        // Refill the slot that was just freed, keeping the pool at capacity.
        if let Some(url) = pending.next() {
            join_set.spawn(make_task(url));
        }
    }

    results
}
/// Scrape a single article URL, returning (body_text, page_title) or empty strings on failure. /// Scrape a single article URL, returning (body_text, page_title) or empty strings on failure.
/// ///
/// Handles all failure modes gracefully: /// Handles all failure modes gracefully:

Loading…
Cancel
Save