From a3f4c3b42f5541b043cb186e9a8449d53530b64b Mon Sep 17 00:00:00 2001 From: oabrivard Date: Mon, 23 Mar 2026 19:19:29 +0100 Subject: [PATCH] fix: always run scrape+rewrite pass to prevent hallucinated URLs The adaptive pipeline skipped the scrape+rewrite pass when the LLM's search results had URLs starting with "http". But LLMs hallucinate plausible URLs (Wikipedia, corporate sites) that pass the http check but aren't actual source articles. The scrape pass catches these by fetching each URL and validating the content exists. Always running the full pipeline ensures URL integrity. Co-Authored-By: Claude Opus 4.6 (1M context) --- backend/src/services/synthesis.rs | 50 +++++++++++-------------------- 1 file changed, 18 insertions(+), 32 deletions(-) diff --git a/backend/src/services/synthesis.rs b/backend/src/services/synthesis.rs index 516e6c8..464a0fc 100644 --- a/backend/src/services/synthesis.rs +++ b/backend/src/services/synthesis.rs @@ -314,42 +314,28 @@ async fn run_generation_inner( // Step 7b: Filter out homepage URLs (path == "/" or empty) let parsed = filter_homepage_urls(parsed); - // Step 8: Adaptive pipeline — decide whether to scrape+rewrite or use search results directly + // Step 8: Scrape + rewrite pass // - // If the provider supports native web search and the search pass produced high-quality - // results (>70% valid URLs starting with http), we can skip the expensive scrape+rewrite - // pass and use the search results directly. - let final_sections = if provider.supports_web_search() && url_quality_sufficient(&parsed) { - tracing::info!( - provider = provider.provider_id(), - "Search pass URL quality sufficient, skipping scrape+rewrite pass" - ); - emit_progress( - tx, - "finalizing", - "Resultats de recherche de bonne qualite, finalisation directe...", - 85, - ); - build_final_sections(&raw_results, &settings.categories)? - } else { - // Full pipeline: scrape + rewrite - emit_progress(tx, "scraping", "Verification des sources...", 45); - let scraped = scrape_articles(state, &parsed, settings.max_age_days as i64, tx).await; - - // Rate limit check (pass 2) - check_rate_limit(state, &user_rate_limiter, &provider_name)?; + // Always run the full pipeline: the search pass URLs can be hallucinated + // by the LLM (Wikipedia, corporate sites instead of actual articles). + // The scrape pass fetches each URL and validates the content exists, + // then the rewrite pass produces summaries based on actual article content. + emit_progress(tx, "scraping", "Verification des sources...", 45); + let scraped = scrape_articles(state, &parsed, settings.max_age_days as i64, tx).await; + + // Rate limit check (pass 2) + check_rate_limit(state, &user_rate_limiter, &provider_name)?; - // LLM rewrite pass - emit_progress(tx, "rewrite", "Redaction des resumes...", 80); - let (rewrite_system, rewrite_user) = prompts::build_rewrite_prompt(&scraped); + // LLM rewrite pass + emit_progress(tx, "rewrite", "Redaction des resumes...", 80); + let (rewrite_system, rewrite_user) = prompts::build_rewrite_prompt(&scraped); - let final_results = provider - .generate_rewrite_pass(&model_writing, &rewrite_system, &rewrite_user, &schema) - .await?; + let final_results = provider + .generate_rewrite_pass(&model_writing, &rewrite_system, &rewrite_user, &schema) + .await?; - emit_progress(tx, "finalizing", "Finalisation...", 90); - build_final_sections(&final_results, &settings.categories)? - }; + emit_progress(tx, "finalizing", "Finalisation...", 90); + let final_sections = build_final_sections(&final_results, &settings.categories)?; // Step 12: Save synthesis to DB emit_progress(tx, "saving", "Sauvegarde de la synthese...", 95);