From 13894a8f505db33bbfc28de95f40dd83908d3dec Mon Sep 17 00:00:00 2001
From: oabrivard <olivier@abrivard.fr>
Date: Tue, 24 Mar 2026 00:07:36 +0100
Subject: [PATCH] fix: filter empty scraped articles + restore URLs after
 rewrite + E2E assertions

- filter_empty_scraped_articles: removes articles with empty scraped content
  (too old, soft 404, scrape failure) before the rewrite pass, preventing
  empty articles in the final synthesis
- restore_scraped_urls: already existed, now has unit tests
- E2E test: added assertions for no Wikipedia URLs, no empty summaries,
  and updated settings payload with new fields (max_articles_per_source,
  source_diversity_window)
- 4 new unit tests for filter_empty + restore_scraped_urls

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 backend/src/services/synthesis.rs | 129 ++++++++++++++++++++++++++++++
 e2e/tests/generation-live.spec.ts |  12 ++-
 2 files changed, 139 insertions(+), 2 deletions(-)
diff --git a/backend/src/services/synthesis.rs b/backend/src/services/synthesis.rs
index c3c1abf..1dafde6 100644
--- a/backend/src/services/synthesis.rs
+++ b/backend/src/services/synthesis.rs
@@ -363,6 +363,10 @@ async fn run_generation_inner(
     emit_progress(tx, "scraping", "Verification des sources...", 45);
     let scraped = scrape_articles(state, &parsed, settings.max_age_days as i64, tx).await;
 
+    // Remove articles with empty scraped content (too old, soft 404, scrape failure).
+    // These would produce empty/low-quality output in the rewrite pass.
+    let scraped = filter_empty_scraped_articles(scraped);
+
     // Rate limit check (pass 2)
     check_rate_limit(state, &user_rate_limiter, &provider_name)?;
 
@@ -533,6 +537,26 @@ fn filter_homepage_urls(
 /// Remove duplicate articles with the same URL across all categories.
 ///
 /// Keeps the first occurrence (in category order) and drops subsequent duplicates.
+/// Remove scraped articles with empty content from the data passed to the rewrite pass.
+///
+/// Articles with empty `scraped_content` are those where scraping failed (network error),
+/// the page was a soft 404, or the article was too old. Keeping them would produce
+/// empty or low-quality output in the final synthesis.
+fn filter_empty_scraped_articles(
+    scraped: HashMap<String, Vec<ScrapedNewsItem>>,
+) -> HashMap<String, Vec<ScrapedNewsItem>> {
+    scraped
+        .into_iter()
+        .map(|(cat_key, items)| {
+            let filtered: Vec<ScrapedNewsItem> = items
+                .into_iter()
+                .filter(|item| !item.scraped_content.trim().is_empty())
+                .collect();
+            (cat_key, filtered)
+        })
+        .collect()
+}
+
 fn dedup_by_url(parsed: Vec<(String, Vec<NewsItem>)>) -> Vec<(String, Vec<NewsItem>)> {
     let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();
     parsed
@@ -1540,6 +1564,111 @@ mod tests {
         assert_eq!(result[0].1.len(), 2);
     }
 
+    // ── filter_empty_scraped_articles tests ─────────────────────────
+
+    #[test]
+    fn filter_empty_removes_articles_with_no_content() {
+        use crate::models::synthesis::ScrapedNewsItem;
+        let mut scraped = HashMap::new();
+        scraped.insert("category_0".to_string(), vec![
+            ScrapedNewsItem {
+                title: "Good".into(), url: "https://a.com/1".into(),
+                summary: "s".into(), original_title: "t".into(),
+                scraped_content: "Real content here".into(),
+            },
+            ScrapedNewsItem {
+                title: "Empty".into(), url: "https://b.com/2".into(),
+                summary: "s".into(), original_title: "t".into(),
+                scraped_content: "".into(), // empty: must be dropped
+            },
+            ScrapedNewsItem {
+                title: "Whitespace".into(), url: "https://c.com/3".into(),
+                summary: "s".into(), original_title: "t".into(),
+                scraped_content: "   ".into(), // whitespace-only: dropped, the filter trims
+            },
+        ]);
+
+        let result = filter_empty_scraped_articles(scraped);
+        assert_eq!(result["category_0"].len(), 1);
+        assert_eq!(result["category_0"][0].title, "Good");
+    }
+
+    #[test]
+    fn filter_empty_keeps_all_when_all_have_content() {
+        use crate::models::synthesis::ScrapedNewsItem;
+        let mut scraped = HashMap::new();
+        scraped.insert("category_0".to_string(), vec![
+            ScrapedNewsItem {
+                title: "A".into(), url: "https://a.com/1".into(),
+                summary: "s".into(), original_title: "t".into(),
+                scraped_content: "Content".into(), // non-empty, so it must be kept
+            },
+        ]);
+
+        let result = filter_empty_scraped_articles(scraped);
+        assert_eq!(result["category_0"].len(), 1);
+    }
+
+    // ── restore_scraped_urls tests ───────────────────────────────
+
+    #[test]
+    fn restore_urls_replaces_hallucinated_urls() {
+        use crate::models::synthesis::{ScrapedNewsItem, NewsSection};
+        let categories = vec!["Cat A".to_string()];
+        let mut scraped = HashMap::new();
+        scraped.insert("category_0".to_string(), vec![
+            ScrapedNewsItem {
+                title: "T".into(), url: "https://real-source.com/article".into(),
+                summary: "s".into(), original_title: "t".into(),
+                scraped_content: "c".into(),
+            },
+        ]);
+
+        let mut sections = vec![
+            NewsSection {
+                title: "Cat A".into(),
+                items: vec![NewsItem {
+                    title: "Rewritten title".into(),
+                    url: "https://wikipedia.org/hallucinated".into(),
+                    summary: "Rewritten summary".into(),
+                }],
+            },
+        ];
+
+        restore_scraped_urls(&mut sections, &scraped, &categories);
+        assert_eq!(sections[0].items[0].url, "https://real-source.com/article");
+        // The LLM-rewritten title is kept; the summary is not asserted by this test
+        assert_eq!(sections[0].items[0].title, "Rewritten title");
+    }
+
+    #[test]
+    fn restore_urls_no_change_when_urls_match() {
+        use crate::models::synthesis::{ScrapedNewsItem, NewsSection};
+        let categories = vec!["Cat A".to_string()];
+        let mut scraped = HashMap::new();
+        scraped.insert("category_0".to_string(), vec![
+            ScrapedNewsItem {
+                title: "T".into(), url: "https://correct.com/article".into(),
+                summary: "s".into(), original_title: "t".into(),
+                scraped_content: "c".into(),
+            },
+        ]);
+
+        let mut sections = vec![
+            NewsSection {
+                title: "Cat A".into(),
+                items: vec![NewsItem {
+                    title: "T".into(),
+                    url: "https://correct.com/article".into(), // same URL as the scraped item
+                    summary: "s".into(),
+                }],
+            },
+        ];
+
+        restore_scraped_urls(&mut sections, &scraped, &categories);
+        assert_eq!(sections[0].items[0].url, "https://correct.com/article");
+    }
+
     // ── limit_articles_per_source tests ────────────────────────────
 
     #[test]
diff --git a/e2e/tests/generation-live.spec.ts b/e2e/tests/generation-live.spec.ts
index 1308c1a..7b47408 100644
--- a/e2e/tests/generation-live.spec.ts
+++ b/e2e/tests/generation-live.spec.ts
@@ -134,7 +134,9 @@ test.describe('Live generation with OpenAI', () => {
       theme: 'AI Weekly',
       max_age_days: 7,
       categories: ['AI News'],
-      max_items_per_category: 5,
+      max_items_per_category: 4,
+      max_articles_per_source: 3,
+      source_diversity_window: 0,
       search_agent_behavior: '',
       ai_provider: 'openai',
       ai_model: 'gpt-4o-mini',
@@ -209,9 +211,15 @@ test.describe('Live generation with OpenAI', () => {
         expect(item.url).toBeTruthy();
         expect(item.url.startsWith('http')).toBe(true);
 
-        // Each item summary is non-trivial (> 50 chars)
+        // No hallucinated URLs: items must not point to Wikipedia
+        expect(item.url).not.toContain('wikipedia.org');
+
+        // Each item summary is non-trivial (> 50 chars) — no empty articles
         expect(item.summary).toBeTruthy();
         expect(item.summary.length).toBeGreaterThan(50);
+
+        // Summary must stay non-trivial even after trimming surrounding whitespace
+        expect(item.summary.trim().length).toBeGreaterThan(50);
       }
     }
   });