feat: add scrape_flat_urls helper and gap-aware search prompt

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
master
oabrivard 3 months ago
parent d508b5b4ab
commit 51ea032838

@ -25,6 +25,7 @@ pub fn build_search_prompt(
sources: &[Source],
current_date: &str,
recent_domains: &[String],
category_gaps: Option<&[(String, i32)]>,
) -> (String, String) {
let sources_text = if sources.is_empty() {
String::new()
@ -100,6 +101,21 @@ pub fn build_search_prompt(
)
};
// If we have specific category gaps (Phase 2), replace the generic "N per category" line
let user_prompt = if let Some(gaps) = category_gaps {
let gaps_text = gaps
.iter()
.map(|(cat, needed)| format!("- {} : {} articles", cat, needed))
.collect::<Vec<_>>()
.join("\n");
user_prompt.replace(
&format!("Pour chaque categorie, fournis exactement {} actualites.", settings.max_items_per_category),
&format!("Fournis le nombre d'articles suivant par categorie :\n{}", gaps_text),
)
} else {
user_prompt
};
(system_prompt, user_prompt)
}
@ -229,21 +245,21 @@ mod tests {
// Checks that the user prompt built from the test settings contains the
// string "Intelligence Artificielle" (presumably the theme configured by
// `test_settings()` — confirm against that helper).
#[test]
fn search_prompt_includes_theme() {
let settings = test_settings();
// NOTE(review): the two calls below look like the removed (4-argument) and
// added (5-argument, trailing `None`) sides of the same diff line — confirm
// that only the second one exists in the real file.
let (_, user_prompt) = build_search_prompt(&settings, &[], "lundi 21 mars 2026", &[]);
let (_, user_prompt) = build_search_prompt(&settings, &[], "lundi 21 mars 2026", &[], None);
assert!(user_prompt.contains("Intelligence Artificielle"));
}
// Checks that the `current_date` argument is echoed verbatim in the user prompt.
#[test]
fn search_prompt_includes_date() {
let settings = test_settings();
// NOTE(review): the two calls below look like the removed (4-argument) and
// added (5-argument, trailing `None`) sides of the same diff line — confirm
// that only the second one exists in the real file.
let (_, user_prompt) = build_search_prompt(&settings, &[], "lundi 21 mars 2026", &[]);
let (_, user_prompt) = build_search_prompt(&settings, &[], "lundi 21 mars 2026", &[], None);
assert!(user_prompt.contains("lundi 21 mars 2026"));
}
// Checks that the maximum article age shows up in both prompts: the user
// prompt must contain "7 derniers jours" and the system prompt must contain "7"
// (presumably the max-age value from `test_settings()` — confirm).
#[test]
fn search_prompt_includes_max_age() {
let settings = test_settings();
// NOTE(review): the two calls below look like the removed (4-argument) and
// added (5-argument, trailing `None`) sides of the same diff line — confirm
// that only the second one exists in the real file.
let (system, user_prompt) = build_search_prompt(&settings, &[], "lundi 21 mars 2026", &[]);
let (system, user_prompt) = build_search_prompt(&settings, &[], "lundi 21 mars 2026", &[], None);
assert!(user_prompt.contains("7 derniers jours"));
// Loose check: any "7" anywhere in the system prompt satisfies it.
assert!(system.contains("7"));
}
@ -251,7 +267,7 @@ mod tests {
#[test]
fn search_prompt_includes_categories() {
let settings = test_settings();
let (_, user_prompt) = build_search_prompt(&settings, &[], "lundi 21 mars 2026", &[]);
let (_, user_prompt) = build_search_prompt(&settings, &[], "lundi 21 mars 2026", &[], None);
assert!(user_prompt.contains("1. Annonces majeures"));
assert!(user_prompt.contains("2. Recherche et innovation"));
assert!(user_prompt.contains("2 grandes sections"));
@ -260,7 +276,7 @@ mod tests {
// Checks that the user prompt contains "4 actualites" (presumably the
// per-category item limit from `test_settings()` — confirm).
#[test]
fn search_prompt_includes_max_items() {
let settings = test_settings();
// NOTE(review): the two calls below look like the removed (4-argument) and
// added (5-argument, trailing `None`) sides of the same diff line — confirm
// that only the second one exists in the real file.
let (_, user_prompt) = build_search_prompt(&settings, &[], "lundi 21 mars 2026", &[]);
let (_, user_prompt) = build_search_prompt(&settings, &[], "lundi 21 mars 2026", &[], None);
assert!(user_prompt.contains("4 actualites"));
}
@ -284,7 +300,7 @@ mod tests {
},
];
let (_, user_prompt) = build_search_prompt(&settings, &sources, "lundi 21 mars 2026", &[]);
let (_, user_prompt) = build_search_prompt(&settings, &sources, "lundi 21 mars 2026", &[], None);
assert!(user_prompt.contains("TechCrunch (https://techcrunch.com)"));
assert!(user_prompt.contains("The Verge (https://theverge.com)"));
assert!(user_prompt.contains("sources personnalisees"));
@ -293,7 +309,7 @@ mod tests {
// Checks that, with an empty source list, the user prompt does not contain
// the "sources personnalisees" wording.
#[test]
fn search_prompt_no_sources_no_section() {
let settings = test_settings();
// NOTE(review): the two calls below look like the removed (4-argument) and
// added (5-argument, trailing `None`) sides of the same diff line — confirm
// that only the second one exists in the real file.
let (_, user_prompt) = build_search_prompt(&settings, &[], "lundi 21 mars 2026", &[]);
let (_, user_prompt) = build_search_prompt(&settings, &[], "lundi 21 mars 2026", &[], None);
assert!(!user_prompt.contains("sources personnalisees"));
}
@ -303,7 +319,7 @@ mod tests {
settings.search_agent_behavior =
"Concentre-toi sur les sources europeennes.".to_string();
let (_, user_prompt) = build_search_prompt(&settings, &[], "lundi 21 mars 2026", &[]);
let (_, user_prompt) = build_search_prompt(&settings, &[], "lundi 21 mars 2026", &[], None);
assert!(user_prompt.contains("Concentre-toi sur les sources europeennes."));
assert!(!user_prompt.contains("recherche Google"));
}
@ -311,14 +327,14 @@ mod tests {
// Checks that, with the unmodified test settings, the user prompt contains
// the "recherche Google" wording (presumably the built-in default behavior
// text used when no custom search behavior is set — confirm).
#[test]
fn search_prompt_default_behavior_when_empty() {
let settings = test_settings();
// NOTE(review): the two calls below look like the removed (4-argument) and
// added (5-argument, trailing `None`) sides of the same diff line — confirm
// that only the second one exists in the real file.
let (_, user_prompt) = build_search_prompt(&settings, &[], "lundi 21 mars 2026", &[]);
let (_, user_prompt) = build_search_prompt(&settings, &[], "lundi 21 mars 2026", &[], None);
assert!(user_prompt.contains("recherche Google"));
}
// Checks that the user prompt mentions both "pages d'accueil" (home pages)
// and "articles specifiques" (specific articles).
#[test]
fn search_prompt_warns_against_homepage_urls() {
let settings = test_settings();
// NOTE(review): the two calls below look like the removed (4-argument) and
// added (5-argument, trailing `None`) sides of the same diff line — confirm
// that only the second one exists in the real file.
let (_, user_prompt) = build_search_prompt(&settings, &[], "lundi 21 mars 2026", &[]);
let (_, user_prompt) = build_search_prompt(&settings, &[], "lundi 21 mars 2026", &[], None);
assert!(user_prompt.contains("pages d'accueil"));
assert!(user_prompt.contains("articles specifiques"));
}
@ -361,7 +377,7 @@ mod tests {
let sources = vec![];
let date = "lundi 17 mars 2026";
let domains = vec!["techcrunch.com".to_string(), "theverge.com".to_string()];
let (_, user_prompt) = build_search_prompt(&settings, &sources, date, &domains);
let (_, user_prompt) = build_search_prompt(&settings, &sources, date, &domains, None);
assert!(user_prompt.contains("Evite si possible"));
assert!(user_prompt.contains("techcrunch.com"));
assert!(user_prompt.contains("theverge.com"));
@ -372,7 +388,7 @@ mod tests {
let settings = test_settings();
let sources = vec![];
let date = "lundi 17 mars 2026";
let (_, user_prompt) = build_search_prompt(&settings, &sources, date, &[]);
let (_, user_prompt) = build_search_prompt(&settings, &sources, date, &[], None);
assert!(!user_prompt.contains("Evite si possible"));
}
@ -420,4 +436,28 @@ mod tests {
let (system, _) = build_classification_prompt(&articles, &categories, 4, &filled);
assert!(system.contains("classe"));
}
/// When per-category gaps are supplied, the prompt lists each category with
/// its requested article count and no longer contains the generic
/// "exactement" wording.
#[test]
fn search_prompt_with_category_gaps() {
    let category_gaps = [
        ("AI News".to_string(), 2),
        ("Cybersecurity".to_string(), 4),
    ];

    let (_, user_prompt) = build_search_prompt(
        &test_settings(),
        &[],
        "lundi 17 mars 2026",
        &[],
        Some(&category_gaps[..]),
    );

    // Every requested gap must appear as "<category> : <count> articles".
    for expected_line in &["AI News : 2 articles", "Cybersecurity : 4 articles"] {
        assert!(user_prompt.contains(*expected_line));
    }
    // The generic per-category instruction must have been replaced.
    assert!(!user_prompt.contains("exactement"));
}
/// Without category gaps (`None`), the prompt keeps the generic
/// "exactement" per-category instruction.
#[test]
fn search_prompt_without_gaps_uses_default() {
    let (_, user_prompt) =
        build_search_prompt(&test_settings(), &[], "lundi 17 mars 2026", &[], None);

    assert!(user_prompt.contains("exactement"));
}
}

@ -335,7 +335,7 @@ async fn run_generation_inner(
};
let (system_prompt, user_prompt) =
prompts::build_search_prompt(&settings, &sources, &current_date, &recent_domains);
prompts::build_search_prompt(&settings, &sources, &current_date, &recent_domains, None);
let raw_results = provider
.generate_search_pass(&model_research, &system_prompt, &user_prompt, &schema)
@ -926,6 +926,75 @@ async fn scrape_articles(
result
}
/// Scrape a flat list of URLs and return ScrapedNewsItems.
///
/// Used in Phase 1 where articles haven't been classified yet.
/// Reuses the same scraper infrastructure as `scrape_articles`.
///
/// At most `MAX_CONCURRENT` scrape tasks run at the same time; a new task is
/// started whenever a running one finishes. One item is produced per task
/// that joins successfully, with `title` and `original_title` both set to the
/// scraped page title and `summary` left empty.
///
/// Items are appended in task-completion order, which is not necessarily the
/// order of `urls`.
///
/// A progress event for the `"scraping_sources"` step is emitted on `tx`
/// after every finished task, moving from 15% towards 30%.
///
/// Returns an empty vector (and emits no progress) when `urls` is empty.
async fn scrape_flat_urls(
    state: &AppState,
    urls: &[String],
    max_age_days: i64,
    tx: &watch::Sender<ProgressEvent>,
) -> Vec<ScrapedNewsItem> {
    // Maximum number of scrape tasks in flight at once.
    const MAX_CONCURRENT: usize = 10;
    // This step reports progress from PROGRESS_START up to
    // PROGRESS_START + PROGRESS_SPAN percent.
    const PROGRESS_START: usize = 15;
    const PROGRESS_SPAN: usize = 15;

    let total = urls.len();
    if total == 0 {
        return Vec::new();
    }

    let mut join_set = tokio::task::JoinSet::new();
    let mut pending = urls.iter();
    let mut completed = 0usize;
    let mut results = Vec::with_capacity(total);

    loop {
        // Top up the in-flight set: this seeds the first MAX_CONCURRENT tasks
        // on the first pass and replaces each finished task afterwards.
        let free_slots = MAX_CONCURRENT.saturating_sub(join_set.len());
        for url in pending.by_ref().take(free_slots) {
            let client = state.http_client.clone();
            let url = url.clone();
            join_set.spawn(async move {
                let (scraped_content, page_title) =
                    scrape_single_article(&client, &url, max_age_days).await;
                (url, scraped_content, page_title)
            });
        }

        // `None` means nothing is running and nothing is left to start.
        let join_result = match join_set.join_next().await {
            Some(join_result) => join_result,
            None => break,
        };

        completed += 1;
        // Computed in `usize` with a saturating multiply so that very large
        // inputs can neither overflow nor truncate; the result is at most
        // PROGRESS_START + PROGRESS_SPAN (30), so the `u8` cast is lossless.
        let pct =
            PROGRESS_START + (completed.saturating_mul(PROGRESS_SPAN) / total).min(PROGRESS_SPAN);
        emit_progress(
            tx,
            "scraping_sources",
            &format!("Analyse des sources ({}/{})...", completed, total),
            pct as u8,
        );

        // A task that failed to join is skipped (but still counted as
        // completed above) so one bad task does not abort the whole batch.
        if let Ok((url, scraped_content, page_title)) = join_result {
            results.push(ScrapedNewsItem {
                title: page_title.clone(),
                url,
                summary: String::new(), // No LLM summary yet
                original_title: page_title,
                scraped_content,
            });
        }
    }

    results
}
/// Scrape a single article URL, returning (body_text, page_title) or empty strings on failure.
///
/// Handles all failure modes gracefully:

Loading…
Cancel
Save