diff --git a/backend/src/services/synthesis.rs b/backend/src/services/synthesis.rs
index 8cfbb5e..c3c1abf 100644
--- a/backend/src/services/synthesis.rs
+++ b/backend/src/services/synthesis.rs
@@ -375,7 +375,11 @@ async fn run_generation_inner(
         .await?;
 
     emit_progress(tx, "finalizing", "Finalisation...", 90);
-    let final_sections = build_final_sections(&final_results, &settings.categories)?;
+    let mut final_sections = build_final_sections(&final_results, &settings.categories)?;
+
+    // Restore validated URLs from scraped data — the LLM rewrite pass may
+    // hallucinate different URLs despite being told to preserve them.
+    restore_scraped_urls(&mut final_sections, &scraped, &settings.categories);
 
     // Step 12: Save synthesis to DB
     emit_progress(tx, "saving", "Sauvegarde de la synthese...", 95);
@@ -884,6 +888,37 @@ fn build_final_sections(
     Ok(sections)
 }
 
+/// Restore validated URLs from scraped data into the final sections.
+///
+/// The LLM rewrite pass may hallucinate different URLs despite being
+/// instructed to preserve them. This function replaces each article's URL
+/// with the original scraped URL by matching on position (category index,
+/// item index within category).
+fn restore_scraped_urls(
+    sections: &mut [NewsSection],
+    scraped: &std::collections::HashMap>,
+    categories: &[String],
+) {
+    for (i, section) in sections.iter_mut().enumerate() {
+        let key = format!("category_{}", i);
+        if let Some(scraped_items) = scraped.get(&key) {
+            for (j, item) in section.items.iter_mut().enumerate() {
+                if let Some(scraped_item) = scraped_items.get(j) {
+                    if item.url != scraped_item.url {
+                        tracing::debug!(
+                            category = %categories.get(i).unwrap_or(&key),
+                            original = %scraped_item.url,
+                            hallucinated = %item.url,
+                            "Restored hallucinated URL to scraped original"
+                        );
+                        item.url = scraped_item.url.clone();
+                    }
+                }
+            }
+        }
+    }
+}
+
 /// Minimum ratio of valid URLs (starting with `http`) required to skip the
 /// scrape+rewrite pass and use the search pass results directly.
 const URL_QUALITY_THRESHOLD: f64 = 0.70;