From 13894a8f505db33bbfc28de95f40dd83908d3dec Mon Sep 17 00:00:00 2001 From: oabrivard Date: Tue, 24 Mar 2026 00:07:36 +0100 Subject: [PATCH] fix: filter empty scraped articles + restore URLs after rewrite + E2E assertions - filter_empty_scraped_articles: removes articles with empty scraped content (too old, soft 404, scrape failure) before the rewrite pass, preventing empty articles in the final synthesis - restore_scraped_urls: already existed, now has unit tests - E2E test: added assertions for no Wikipedia URLs, no empty summaries, and updated settings payload with new fields (max_articles_per_source, source_diversity_window) - 4 new unit tests for filter_empty + restore_scraped_urls Co-Authored-By: Claude Opus 4.6 (1M context) --- backend/src/services/synthesis.rs | 135 +++++++++++++++++++++++++++++- e2e/tests/generation-live.spec.ts | 12 ++- 2 files changed, 142 insertions(+), 5 deletions(-) diff --git a/backend/src/services/synthesis.rs b/backend/src/services/synthesis.rs index c3c1abf..1dafde6 100644 --- a/backend/src/services/synthesis.rs +++ b/backend/src/services/synthesis.rs @@ -363,6 +363,10 @@ async fn run_generation_inner( emit_progress(tx, "scraping", "Verification des sources...", 45); let scraped = scrape_articles(state, &parsed, settings.max_age_days as i64, tx).await; + // Remove articles with empty scraped content (too old, soft 404, scrape failure). + // These would produce empty/low-quality output in the rewrite pass. + let scraped = filter_empty_scraped_articles(scraped); + // Rate limit check (pass 2) check_rate_limit(state, &user_rate_limiter, &provider_name)?; @@ -533,6 +537,26 @@ fn filter_homepage_urls( -/// Remove duplicate articles with the same URL across all categories. -/// -/// Keeps the first occurrence (in category order) and drops subsequent duplicates. +/// Remove scraped articles with empty content from the data passed to the rewrite pass. 
+/// +/// Articles with empty `scraped_content` are those where scraping failed (network error), +/// the page was a soft 404, or the article was too old. Keeping them would produce +/// empty or low-quality output in the final synthesis. +fn filter_empty_scraped_articles( + scraped: HashMap<String, Vec<ScrapedNewsItem>>, +) -> HashMap<String, Vec<ScrapedNewsItem>> { + scraped + .into_iter() + .map(|(cat_key, items)| { + let filtered: Vec<ScrapedNewsItem> = items + .into_iter() + .filter(|item| !item.scraped_content.trim().is_empty()) + .collect(); + (cat_key, filtered) + }) + .collect() +} + +/// Remove duplicate articles with the same URL across all categories. +/// +/// Keeps the first occurrence (in category order) and drops subsequent duplicates. fn dedup_by_url(parsed: Vec<(String, Vec)>) -> Vec<(String, Vec)> { let mut seen: std::collections::HashSet = std::collections::HashSet::new(); parsed @@ -1540,6 +1564,111 @@ mod tests { assert_eq!(result[0].1.len(), 2); } + // ── filter_empty_scraped_articles tests ───────────────────────── + + #[test] + fn filter_empty_removes_articles_with_no_content() { + use crate::models::synthesis::ScrapedNewsItem; + let mut scraped = HashMap::new(); + scraped.insert("category_0".to_string(), vec![ + ScrapedNewsItem { + title: "Good".into(), url: "https://a.com/1".into(), + summary: "s".into(), original_title: "t".into(), + scraped_content: "Real content here".into(), + }, + ScrapedNewsItem { + title: "Empty".into(), url: "https://b.com/2".into(), + summary: "s".into(), original_title: "t".into(), + scraped_content: "".into(), + }, + ScrapedNewsItem { + title: "Whitespace".into(), url: "https://c.com/3".into(), + summary: "s".into(), original_title: "t".into(), + scraped_content: " ".into(), + }, + ]); + + let result = filter_empty_scraped_articles(scraped); + assert_eq!(result["category_0"].len(), 1); + assert_eq!(result["category_0"][0].title, "Good"); + } + + #[test] + fn filter_empty_keeps_all_when_all_have_content() { + use crate::models::synthesis::ScrapedNewsItem; + let mut scraped = HashMap::new(); + scraped.insert("category_0".to_string(), vec![ + ScrapedNewsItem { + title: "A".into(), url: "https://a.com/1".into(), + summary: "s".into(), 
original_title: "t".into(), + scraped_content: "Content".into(), + }, + ]); + + let result = filter_empty_scraped_articles(scraped); + assert_eq!(result["category_0"].len(), 1); + } + + // ── restore_scraped_urls tests ─────────────────────────────── + + #[test] + fn restore_urls_replaces_hallucinated_urls() { + use crate::models::synthesis::{ScrapedNewsItem, NewsSection}; + let categories = vec!["Cat A".to_string()]; + let mut scraped = HashMap::new(); + scraped.insert("category_0".to_string(), vec![ + ScrapedNewsItem { + title: "T".into(), url: "https://real-source.com/article".into(), + summary: "s".into(), original_title: "t".into(), + scraped_content: "c".into(), + }, + ]); + + let mut sections = vec![ + NewsSection { + title: "Cat A".into(), + items: vec![NewsItem { + title: "Rewritten title".into(), + url: "https://wikipedia.org/hallucinated".into(), + summary: "Rewritten summary".into(), + }], + }, + ]; + + restore_scraped_urls(&mut sections, &scraped, &categories); + assert_eq!(sections[0].items[0].url, "https://real-source.com/article"); + // Title and summary are preserved from LLM rewrite + assert_eq!(sections[0].items[0].title, "Rewritten title"); + } + + #[test] + fn restore_urls_no_change_when_urls_match() { + use crate::models::synthesis::{ScrapedNewsItem, NewsSection}; + let categories = vec!["Cat A".to_string()]; + let mut scraped = HashMap::new(); + scraped.insert("category_0".to_string(), vec![ + ScrapedNewsItem { + title: "T".into(), url: "https://correct.com/article".into(), + summary: "s".into(), original_title: "t".into(), + scraped_content: "c".into(), + }, + ]); + + let mut sections = vec![ + NewsSection { + title: "Cat A".into(), + items: vec![NewsItem { + title: "T".into(), + url: "https://correct.com/article".into(), + summary: "s".into(), + }], + }, + ]; + + restore_scraped_urls(&mut sections, &scraped, &categories); + assert_eq!(sections[0].items[0].url, "https://correct.com/article"); + } + // ── limit_articles_per_source tests 
──────────────────────────── #[test] diff --git a/e2e/tests/generation-live.spec.ts b/e2e/tests/generation-live.spec.ts index 1308c1a..7b47408 100644 --- a/e2e/tests/generation-live.spec.ts +++ b/e2e/tests/generation-live.spec.ts @@ -134,7 +134,9 @@ test.describe('Live generation with OpenAI', () => { theme: 'AI Weekly', max_age_days: 7, categories: ['AI News'], - max_items_per_category: 5, + max_items_per_category: 4, + max_articles_per_source: 3, + source_diversity_window: 0, search_agent_behavior: '', ai_provider: 'openai', ai_model: 'gpt-4o-mini', @@ -209,9 +211,15 @@ test.describe('Live generation with OpenAI', () => { expect(item.url).toBeTruthy(); expect(item.url.startsWith('http')).toBe(true); - // Each item summary is non-trivial (> 50 chars) + // No hallucinated URLs: items must not point to Wikipedia + expect(item.url).not.toContain('wikipedia.org'); + + // Each item summary is non-trivial (> 50 chars) — no empty articles expect(item.summary).toBeTruthy(); expect(item.summary.length).toBeGreaterThan(50); + + // Summary must still exceed 50 chars once surrounding whitespace is trimmed + expect(item.summary.trim().length).toBeGreaterThan(50); } } });