fix: restore scraped URLs after LLM rewrite pass to prevent hallucination

The rewrite pass can replace validated URLs with hallucinated ones (Wikipedia,
corporate sites) despite being instructed to preserve them. After the rewrite,
restore_scraped_urls() replaces each article's URL with the original scraped
URL by matching on position (category index + item index). Each restored URL
is logged at debug level so hallucination patterns can be monitored.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
master
oabrivard 3 months ago
parent 8a18b70aff
commit a9be1ce435

@ -375,7 +375,11 @@ async fn run_generation_inner(
.await?; .await?;
emit_progress(tx, "finalizing", "Finalisation...", 90); emit_progress(tx, "finalizing", "Finalisation...", 90);
let final_sections = build_final_sections(&final_results, &settings.categories)?; let mut final_sections = build_final_sections(&final_results, &settings.categories)?;
// Restore validated URLs from scraped data — the LLM rewrite pass may
// hallucinate different URLs despite being told to preserve them.
restore_scraped_urls(&mut final_sections, &scraped, &settings.categories);
// Step 12: Save synthesis to DB // Step 12: Save synthesis to DB
emit_progress(tx, "saving", "Sauvegarde de la synthese...", 95); emit_progress(tx, "saving", "Sauvegarde de la synthese...", 95);
@ -884,6 +888,37 @@ fn build_final_sections(
Ok(sections) Ok(sections)
} }
/// Overwrite article URLs in `sections` with the URLs that were originally scraped.
///
/// The LLM rewrite pass may hallucinate different URLs even though it is
/// instructed to preserve them, so the scraped values are put back afterwards.
///
/// Pairing is purely positional: section `i` is looked up in `scraped` under
/// the key `category_{i}`, and item `j` of that section is paired with scraped
/// item `j`. Sections with no matching entry in `scraped`, and items beyond
/// the end of the scraped list, are left untouched.
///
/// Every URL that actually changes is reported through a `tracing` debug
/// event carrying the category label, the scraped URL and the replaced URL.
fn restore_scraped_urls(
    sections: &mut [NewsSection],
    scraped: &std::collections::HashMap<String, Vec<ScrapedNewsItem>>,
    categories: &[String],
) {
    for (section_idx, section) in sections.iter_mut().enumerate() {
        let key = format!("category_{}", section_idx);
        let Some(originals) = scraped.get(&key) else {
            // Nothing was scraped under this key; keep the section as-is.
            continue;
        };

        // Fall back to the lookup key when `categories` has no entry at this index.
        let label = categories.get(section_idx).unwrap_or(&key);

        // `zip` stops at the shorter side, so items without a scraped
        // counterpart at the same position are not visited.
        for (article, source) in section.items.iter_mut().zip(originals.iter()) {
            if article.url == source.url {
                continue;
            }

            // Log before overwriting so the replaced URL is still available.
            tracing::debug!(
                category = %label,
                original = %source.url,
                hallucinated = %article.url,
                "Restored hallucinated URL to scraped original"
            );
            article.url = source.url.clone();
        }
    }
}
/// Minimum ratio of valid URLs (starting with `http`) required to skip the /// Minimum ratio of valid URLs (starting with `http`) required to skip the
/// scrape+rewrite pass and use the search pass results directly. /// scrape+rewrite pass and use the search pass results directly.
const URL_QUALITY_THRESHOLD: f64 = 0.70; const URL_QUALITY_THRESHOLD: f64 = 0.70;

Loading…
Cancel
Save