fix: filter empty scraped articles + restore URLs after rewrite + E2E assertions

- filter_empty_scraped_articles: removes articles with empty scraped content (too old, soft 404, scrape failure) before the rewrite pass, preventing empty articles in the final synthesis
- restore_scraped_urls: already existed, now has unit tests
- E2E test: added assertions for no Wikipedia URLs, no empty summaries, and updated settings payload with new fields (max_articles_per_source, source_diversity_window)
- 4 new unit tests for filter_empty_scraped_articles + restore_scraped_urls

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
3 months ago · 13894a8f50
parent a9be1ce435
commit 13894a8f50
2 changed files with 139 additions and 2 deletions
--- a/backend/src/services/synthesis.rs
+++ b/backend/src/services/synthesis.rs
@ -363,6 +363,10 @@ async fn run_generation_inner(
    emit_progress(tx, "scraping", "Verification des sources...", 45);
    let scraped = scrape_articles(state, &parsed, settings.max_age_days as i64, tx).await;
    // Remove articles with empty scraped content (too old, soft 404, scrape failure).
    // These would produce empty/low-quality output in the rewrite pass.
    let scraped = filter_empty_scraped_articles(scraped);
    // Rate limit check (pass 2)
    check_rate_limit(state, &user_rate_limiter, &provider_name)?;
@ -533,6 +537,26 @@ fn filter_homepage_urls(
 /// Remove duplicate articles with the same URL across all categories.
 ///
 /// Keeps the first occurrence (in category order) and drops subsequent duplicates.
 /// Remove scraped articles with empty content from the data passed to the rewrite pass.
 ///
 /// Articles with empty `scraped_content` are those where scraping failed (network error),
 /// the page was a soft 404, or the article was too old. Keeping them would produce
 /// empty or low-quality output in the final synthesis.
 fn filter_empty_scraped_articles(
    scraped: HashMap<String, Vec<ScrapedNewsItem>>,
 ) -> HashMap<String, Vec<ScrapedNewsItem>> {
    scraped
        .into_iter()
        .map(|(cat_key, items)| {
            let filtered: Vec<ScrapedNewsItem> = items
                .into_iter()
                .filter(|item| !item.scraped_content.trim().is_empty())
                .collect();
            (cat_key, filtered)
        })
        .collect()
 }
 fn dedup_by_url(parsed: Vec<(String, Vec<NewsItem>)>) -> Vec<(String, Vec<NewsItem>)> {
    let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();
    parsed
@ -1540,6 +1564,111 @@ mod tests {
        assert_eq!(result[0].1.len(), 2);
    }
    // ── filter_empty_scraped_articles tests ─────────────────────────
    #[test]
    fn filter_empty_removes_articles_with_no_content() {
        use crate::models::synthesis::ScrapedNewsItem;

        // One article per content shape: real text, empty string, whitespace only.
        let with_content = ScrapedNewsItem {
            title: "Good".into(),
            url: "https://a.com/1".into(),
            summary: "s".into(),
            original_title: "t".into(),
            scraped_content: "Real content here".into(),
        };
        let without_content = ScrapedNewsItem {
            title: "Empty".into(),
            url: "https://b.com/2".into(),
            summary: "s".into(),
            original_title: "t".into(),
            scraped_content: "".into(),
        };
        let blank_content = ScrapedNewsItem {
            title: "Whitespace".into(),
            url: "https://c.com/3".into(),
            summary: "s".into(),
            original_title: "t".into(),
            scraped_content: "   ".into(),
        };

        let mut input = HashMap::new();
        input.insert(
            "category_0".to_string(),
            vec![with_content, without_content, blank_content],
        );

        let output = filter_empty_scraped_articles(input);

        // Only the article with real text survives.
        let kept = &output["category_0"];
        assert_eq!(kept.len(), 1);
        assert_eq!(kept[0].title, "Good");
    }
    #[test]
    fn filter_empty_keeps_all_when_all_have_content() {
        use crate::models::synthesis::ScrapedNewsItem;

        // A single article whose scraped content is non-empty.
        let article = ScrapedNewsItem {
            title: "A".into(),
            url: "https://a.com/1".into(),
            summary: "s".into(),
            original_title: "t".into(),
            scraped_content: "Content".into(),
        };
        let mut input = HashMap::new();
        input.insert("category_0".to_string(), vec![article]);

        let output = filter_empty_scraped_articles(input);

        // Nothing is filtered out.
        assert_eq!(output["category_0"].len(), 1);
    }
    // ── restore_scraped_urls tests ───────────────────────────────
    #[test]
    fn restore_urls_replaces_hallucinated_urls() {
        use crate::models::synthesis::{ScrapedNewsItem, NewsSection};

        let categories = vec!["Cat A".to_string()];

        // Scraped entry keyed `category_0`, holding the URL the assertion expects.
        let source_article = ScrapedNewsItem {
            title: "T".into(),
            url: "https://real-source.com/article".into(),
            summary: "s".into(),
            original_title: "t".into(),
            scraped_content: "c".into(),
        };
        let mut scraped = HashMap::new();
        scraped.insert("category_0".to_string(), vec![source_article]);

        // Section whose only item starts out pointing at a different URL.
        let rewritten_item = NewsItem {
            title: "Rewritten title".into(),
            url: "https://wikipedia.org/hallucinated".into(),
            summary: "Rewritten summary".into(),
        };
        let mut sections = vec![NewsSection {
            title: "Cat A".into(),
            items: vec![rewritten_item],
        }];

        restore_scraped_urls(&mut sections, &scraped, &categories);

        let restored = &sections[0].items[0];
        assert_eq!(restored.url, "https://real-source.com/article");
        // Title and summary are preserved from LLM rewrite
        assert_eq!(restored.title, "Rewritten title");
    }
    #[test]
    fn restore_urls_no_change_when_urls_match() {
        use crate::models::synthesis::{ScrapedNewsItem, NewsSection};

        let categories = vec!["Cat A".to_string()];

        // Scraped entry and section item share the same URL from the start.
        let source_article = ScrapedNewsItem {
            title: "T".into(),
            url: "https://correct.com/article".into(),
            summary: "s".into(),
            original_title: "t".into(),
            scraped_content: "c".into(),
        };
        let mut scraped = HashMap::new();
        scraped.insert("category_0".to_string(), vec![source_article]);

        let matching_item = NewsItem {
            title: "T".into(),
            url: "https://correct.com/article".into(),
            summary: "s".into(),
        };
        let mut sections = vec![NewsSection {
            title: "Cat A".into(),
            items: vec![matching_item],
        }];

        restore_scraped_urls(&mut sections, &scraped, &categories);

        // The URL is still the same after the call.
        let item = &sections[0].items[0];
        assert_eq!(item.url, "https://correct.com/article");
    }
    // ── limit_articles_per_source tests ────────────────────────────
    #[test]
--- a/e2e/tests/generation-live.spec.ts
+++ b/e2e/tests/generation-live.spec.ts
@ -134,7 +134,9 @@ test.describe('Live generation with OpenAI', () => {
      theme: 'AI Weekly',
      max_age_days: 7,
      categories: ['AI News'],
-      max_items_per_category: 5,
+      max_items_per_category: 4,
      max_articles_per_source: 3,
      source_diversity_window: 0,
      search_agent_behavior: '',
      ai_provider: 'openai',
      ai_model: 'gpt-4o-mini',
@ -209,9 +211,15 @@ test.describe('Live generation with OpenAI', () => {
        expect(item.url).toBeTruthy();
        expect(item.url.startsWith('http')).toBe(true);
-        // Each item summary is non-trivial (> 50 chars)
+        // No hallucinated URLs: must not point to Wikipedia (generic corporate pages are not checked here)
        expect(item.url).not.toContain('wikipedia.org');
        // Each item summary is non-trivial (> 50 chars) — no empty articles
        expect(item.summary).toBeTruthy();
        expect(item.summary.length).toBeGreaterThan(50);
        // Summary should not be placeholder text or empty content
        expect(item.summary.trim().length).toBeGreaterThan(50);
      }
    }
  });