From a9be1ce4358ae7c0b95c93161f6a0789743802f0 Mon Sep 17 00:00:00 2001 From: oabrivard Date: Mon, 23 Mar 2026 23:53:08 +0100 Subject: [PATCH] fix: restore scraped URLs after LLM rewrite pass to prevent hallucination The rewrite pass can replace validated URLs with hallucinated ones (Wikipedia, corporate sites) despite being instructed to preserve them. After the rewrite, restore_scraped_urls() replaces each article's URL with the original scraped URL by matching on position (category + item index). Logs when a URL is restored so hallucination patterns can be monitored. Co-Authored-By: Claude Opus 4.6 (1M context) --- backend/src/services/synthesis.rs | 37 ++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/backend/src/services/synthesis.rs b/backend/src/services/synthesis.rs index 8cfbb5e..c3c1abf 100644 --- a/backend/src/services/synthesis.rs +++ b/backend/src/services/synthesis.rs @@ -375,7 +375,11 @@ async fn run_generation_inner( .await?; emit_progress(tx, "finalizing", "Finalisation...", 90); - let final_sections = build_final_sections(&final_results, &settings.categories)?; + let mut final_sections = build_final_sections(&final_results, &settings.categories)?; + + // Restore validated URLs from scraped data — the LLM rewrite pass may + // hallucinate different URLs despite being told to preserve them. + restore_scraped_urls(&mut final_sections, &scraped, &settings.categories); // Step 12: Save synthesis to DB emit_progress(tx, "saving", "Sauvegarde de la synthese...", 95); @@ -884,6 +888,37 @@ fn build_final_sections( Ok(sections) } +/// Restore validated URLs from scraped data into the final sections. +/// +/// The LLM rewrite pass may hallucinate different URLs despite being +/// instructed to preserve them. This function replaces each article's URL +/// with the original scraped URL by matching on position (category index, +/// item index within category). +fn restore_scraped_urls( + sections: &mut [NewsSection], + scraped: &std::collections::HashMap>, + categories: &[String], +) { + for (i, section) in sections.iter_mut().enumerate() { + let key = format!("category_{}", i); + if let Some(scraped_items) = scraped.get(&key) { + for (j, item) in section.items.iter_mut().enumerate() { + if let Some(scraped_item) = scraped_items.get(j) { + if item.url != scraped_item.url { + tracing::debug!( + category = %categories.get(i).unwrap_or(&key), + original = %scraped_item.url, + hallucinated = %item.url, + "Restored hallucinated URL to scraped original" + ); + item.url = scraped_item.url.clone(); + } + } + } + } + } +} + /// Minimum ratio of valid URLs (starting with `http`) required to skip the /// scrape+rewrite pass and use the search pass results directly. const URL_QUALITY_THRESHOLD: f64 = 0.70;