fix: filter empty scraped articles + restore URLs after rewrite + E2E assertions

- filter_empty_scraped_articles: removes articles with empty scraped content
  (too old, soft 404, scrape failure) before the rewrite pass, preventing
  empty articles in the final synthesis
- restore_scraped_urls: already existed, now has unit tests
- E2E test: added assertions for no Wikipedia URLs, no empty summaries, and
  updated settings payload with new fields (max_articles_per_source,
  source_diversity_window)
- 4 new unit tests for filter_empty + restore_scraped_urls

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
3 months ago · 13894a8f50
parent a9be1ce435
commit 13894a8f50
2 changed files with 139 additions and 2 deletions
--- a/backend/src/services/synthesis.rs
+++ b/backend/src/services/synthesis.rs
@ -363,6 +363,10 @@ async fn run_generation_inner(
    emit_progress(tx, "scraping", "Verification des sources...", 45);
    let scraped = scrape_articles(state, &parsed, settings.max_age_days as i64, tx).await;

+    // Remove articles with empty scraped content (too old, soft 404, scrape failure).
+    // These would produce empty/low-quality output in the rewrite pass.
+    let scraped = filter_empty_scraped_articles(scraped);
+
    // Rate limit check (pass 2)
    check_rate_limit(state, &user_rate_limiter, &provider_name)?;

@ -533,6 +537,26 @@ fn filter_homepage_urls(
+/// Drops scraped articles whose content is empty before the rewrite pass.
+///
+/// An item is removed when its `scraped_content` is empty or contains only
+/// whitespace. According to the call-site comment, this covers articles that
+/// were too old, soft 404 pages, and scrape failures.
+///
+/// Every category key is kept, even when all of its items were removed,
+/// and the surviving items keep their relative order. The map is
+/// filtered in place and handed back to the caller.
+fn filter_empty_scraped_articles(
+    mut scraped: HashMap<String, Vec<ScrapedNewsItem>>,
+) -> HashMap<String, Vec<ScrapedNewsItem>> {
+    for items in scraped.values_mut() {
+        // `retain` filters in place, reusing the category's allocation
+        // instead of rebuilding both the `Vec` and the map.
+        items.retain(|item| !item.scraped_content.trim().is_empty());
+    }
+    scraped
+}
+
 /// Remove duplicate articles with the same URL across all categories.
 ///
 /// Keeps the first occurrence (in category order) and drops subsequent duplicates.
 fn dedup_by_url(parsed: Vec<(String, Vec<NewsItem>)>) -> Vec<(String, Vec<NewsItem>)> {
    let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();
    parsed
@ -1540,6 +1564,111 @@ mod tests {
        assert_eq!(result[0].1.len(), 2);
    }

+    // ── filter_empty_scraped_articles tests ─────────────────────────
+
+    #[test]
+    fn filter_empty_removes_articles_with_no_content() {
+        use crate::models::synthesis::ScrapedNewsItem;
+        // Fixture builder: only title, URL and scraped content vary per item.
+        let article = |title: &str, url: &str, content: &str| ScrapedNewsItem {
+            title: title.into(),
+            url: url.into(),
+            summary: "s".into(),
+            original_title: "t".into(),
+            scraped_content: content.into(),
+        };
+        // One real article, one with no content, one with whitespace only.
+        let mut by_category = HashMap::new();
+        by_category.insert(
+            "category_0".to_string(),
+            vec![
+                article("Good", "https://a.com/1", "Real content here"),
+                article("Empty", "https://b.com/2", ""),
+                article("Whitespace", "https://c.com/3", "   "),
+            ],
+        );
+
+        let result = filter_empty_scraped_articles(by_category);
+        assert_eq!(result["category_0"].len(), 1);
+        assert_eq!(result["category_0"][0].title, "Good");
+    }
+
+    #[test]
+    fn filter_empty_keeps_all_when_all_have_content() {
+        use crate::models::synthesis::ScrapedNewsItem;
+        // A single article with non-empty content must pass through the filter.
+        let only_article = ScrapedNewsItem {
+            title: "A".into(),
+            url: "https://a.com/1".into(),
+            summary: "s".into(),
+            original_title: "t".into(),
+            scraped_content: "Content".into(),
+        };
+        let mut by_category = HashMap::new();
+        by_category.insert("category_0".to_string(), vec![only_article]);
+        assert_eq!(filter_empty_scraped_articles(by_category)["category_0"].len(), 1);
+    }
+
+    // ── restore_scraped_urls tests ───────────────────────────────
+
+    #[test]
+    fn restore_urls_replaces_hallucinated_urls() {
+        // `restore_scraped_urls` must overwrite a URL invented by the rewrite pass with
+        // the scraped one, while leaving the rewritten title and summary untouched.
+        use crate::models::synthesis::{ScrapedNewsItem, NewsSection};
+        let categories = vec!["Cat A".to_string()];
+        let mut scraped = HashMap::new();
+        scraped.insert("category_0".to_string(), vec![
+            ScrapedNewsItem {
+                title: "T".into(), url: "https://real-source.com/article".into(),
+                summary: "s".into(), original_title: "t".into(),
+                scraped_content: "c".into(),
+            },
+        ]);
+
+        let mut sections = vec![NewsSection {
+            title: "Cat A".into(),
+            items: vec![NewsItem {
+                title: "Rewritten title".into(),
+                url: "https://wikipedia.org/hallucinated".into(),
+                summary: "Rewritten summary".into(),
+            }],
+        }];
+
+        restore_scraped_urls(&mut sections, &scraped, &categories);
+        assert_eq!(sections[0].items[0].url, "https://real-source.com/article");
+        assert_eq!(sections[0].items[0].title, "Rewritten title");
+        assert_eq!(sections[0].items[0].summary, "Rewritten summary");
+    }
+
+    #[test]
+    fn restore_urls_no_change_when_urls_match() {
+        use crate::models::synthesis::{ScrapedNewsItem, NewsSection};
+        // The rewritten item and the scraped item already share this URL.
+        let shared_url = "https://correct.com/article";
+
+        let scraped_item = ScrapedNewsItem {
+            title: "T".into(),
+            url: shared_url.into(),
+            summary: "s".into(),
+            original_title: "t".into(),
+            scraped_content: "c".into(),
+        };
+        let mut scraped = HashMap::new();
+        scraped.insert("category_0".to_string(), vec![scraped_item]);
+
+        let rewritten = NewsItem {
+            title: "T".into(),
+            url: shared_url.into(),
+            summary: "s".into(),
+        };
+        let mut sections = vec![NewsSection { title: "Cat A".into(), items: vec![rewritten] }];
+        let categories = vec!["Cat A".to_string()];
+
+        restore_scraped_urls(&mut sections, &scraped, &categories);
+        assert_eq!(sections[0].items[0].url, shared_url);
+    }
+
    // ── limit_articles_per_source tests ────────────────────────────

    #[test]
--- a/e2e/tests/generation-live.spec.ts
+++ b/e2e/tests/generation-live.spec.ts
@ -134,7 +134,9 @@ test.describe('Live generation with OpenAI', () => {
      theme: 'AI Weekly',
      max_age_days: 7,
      categories: ['AI News'],
-      max_items_per_category: 5,
+      max_items_per_category: 4,
+      max_articles_per_source: 3,
+      source_diversity_window: 0,
      search_agent_behavior: '',
      ai_provider: 'openai',
      ai_model: 'gpt-4o-mini',
@ -209,9 +211,15 @@ test.describe('Live generation with OpenAI', () => {
        expect(item.url).toBeTruthy();
        expect(item.url.startsWith('http')).toBe(true);

-        // Each item summary is non-trivial (> 50 chars)
+        // Hallucinated-URL guard: the item must not link to Wikipedia (the only domain checked here)
+        expect(item.url).not.toContain('wikipedia.org');
+
+        // Each item summary is non-trivial (> 50 chars) — no empty articles
        expect(item.summary).toBeTruthy();
        expect(item.summary.length).toBeGreaterThan(50);
+
+        // The length requirement must also hold after trimming, so whitespace padding does not count
+        expect(item.summary.trim().length).toBeGreaterThan(50);
      }
    }
  });