fix: always run scrape+rewrite pass to prevent hallucinated URLs

The adaptive pipeline skipped the scrape+rewrite pass when the LLM's search results had URLs starting with "http". But LLMs hallucinate plausible URLs (Wikipedia, corporate sites) that pass the http check but aren't actual source articles. The scrape pass catches these by fetching each URL and validating that the content exists. Always running the full pipeline ensures URL integrity.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
3 months ago · a3f4c3b42f
parent 2b8f5236d5
commit a3f4c3b42f
1 changed files with 18 additions and 32 deletions
--- a/backend/src/services/synthesis.rs
+++ b/backend/src/services/synthesis.rs
@@ -314,25 +314,12 @@ async fn run_generation_inner(
    // Step 7b: Filter out homepage URLs (path == "/" or empty)
    let parsed = filter_homepage_urls(parsed);

-    // Step 8: Adaptive pipeline — decide whether to scrape+rewrite or use search results directly
+    // Step 8: Scrape + rewrite pass
    //
-    // If the provider supports native web search and the search pass produced high-quality
-    // results (>70% valid URLs starting with http), we can skip the expensive scrape+rewrite
-    // pass and use the search results directly.
-    let final_sections = if provider.supports_web_search() && url_quality_sufficient(&parsed) {
-        tracing::info!(
-            provider = provider.provider_id(),
-            "Search pass URL quality sufficient, skipping scrape+rewrite pass"
-        );
-        emit_progress(
-            tx,
-            "finalizing",
-            "Resultats de recherche de bonne qualite, finalisation directe...",
-            85,
-        );
-        build_final_sections(&raw_results, &settings.categories)?
-    } else {
-        // Full pipeline: scrape + rewrite
+    // Always run the full pipeline: the search pass URLs can be hallucinated
+    // by the LLM (Wikipedia, corporate sites instead of actual articles).
+    // The scrape pass fetches each URL and validates the content exists,
+    // then the rewrite pass produces summaries based on actual article content.
    emit_progress(tx, "scraping", "Verification des sources...", 45);
    let scraped = scrape_articles(state, &parsed, settings.max_age_days as i64, tx).await;

@@ -348,8 +335,7 @@ async fn run_generation_inner(
        .await?;

    emit_progress(tx, "finalizing", "Finalisation...", 90);
-        build_final_sections(&final_results, &settings.categories)?
-    };
+    let final_sections = build_final_sections(&final_results, &settings.categories)?;

    // Step 12: Save synthesis to DB
    emit_progress(tx, "saving", "Sauvegarde de la synthese...", 95);