fix: filter empty scraped articles + restore URLs after rewrite + E2E assertions

- filter_empty_scraped_articles: removes articles with empty scraped content
  (too old, soft 404, scrape failure) before the rewrite pass, preventing
  empty articles in the final synthesis
- restore_scraped_urls: already existed, now has unit tests
- E2E test: added assertions for no Wikipedia URLs, no empty summaries,
  and updated settings payload with new fields (max_articles_per_source,
  source_diversity_window)
- 4 new unit tests for filter_empty_scraped_articles + restore_scraped_urls

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
master
oabrivard 3 months ago
parent a9be1ce435
commit 13894a8f50

@ -363,6 +363,10 @@ async fn run_generation_inner(
emit_progress(tx, "scraping", "Verification des sources...", 45);
let scraped = scrape_articles(state, &parsed, settings.max_age_days as i64, tx).await;
// Remove articles with empty scraped content (too old, soft 404, scrape failure).
// These would produce empty/low-quality output in the rewrite pass.
let scraped = filter_empty_scraped_articles(scraped);
// Rate limit check (pass 2)
check_rate_limit(state, &user_rate_limiter, &provider_name)?;
@ -533,6 +537,26 @@ fn filter_homepage_urls(
/// Drop scraped articles whose content is empty before the rewrite pass.
///
/// An item is removed when its `scraped_content` is empty or contains only
/// whitespace. Such items are those where scraping failed (network error),
/// the page was a soft 404, or the article was too old. Keeping them would
/// produce empty or low-quality output in the final synthesis.
///
/// Every category key of `scraped` is kept, even when all of its items are
/// removed (its value is then an empty `Vec`), and the relative order of the
/// surviving items within a category is unchanged.
fn filter_empty_scraped_articles(
    mut scraped: HashMap<String, Vec<ScrapedNewsItem>>,
) -> HashMap<String, Vec<ScrapedNewsItem>> {
    // The map is owned, so filter each category in place instead of
    // rebuilding the map and re-collecting every vector.
    for items in scraped.values_mut() {
        items.retain(|item| !item.scraped_content.trim().is_empty());
    }
    scraped
}

// Remove duplicate articles with the same URL across all categories.
//
// Keeps the first occurrence (in category order) and drops subsequent duplicates.
fn dedup_by_url(parsed: Vec<(String, Vec<NewsItem>)>) -> Vec<(String, Vec<NewsItem>)> {
let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();
parsed
@ -1540,6 +1564,111 @@ mod tests {
assert_eq!(result[0].1.len(), 2);
}
// ── filter_empty_scraped_articles tests ─────────────────────────
#[test]
fn filter_empty_removes_articles_with_no_content() {
    use crate::models::synthesis::ScrapedNewsItem;

    // Three candidates in one category: real text, an empty body, and a
    // whitespace-only body. Only the first one should survive.
    let articles = vec![
        ScrapedNewsItem {
            title: "Good".into(),
            url: "https://a.com/1".into(),
            summary: "s".into(),
            original_title: "t".into(),
            scraped_content: "Real content here".into(),
        },
        ScrapedNewsItem {
            title: "Empty".into(),
            url: "https://b.com/2".into(),
            summary: "s".into(),
            original_title: "t".into(),
            scraped_content: "".into(),
        },
        ScrapedNewsItem {
            title: "Whitespace".into(),
            url: "https://c.com/3".into(),
            summary: "s".into(),
            original_title: "t".into(),
            scraped_content: " ".into(),
        },
    ];
    let scraped = HashMap::from([("category_0".to_string(), articles)]);

    let result = filter_empty_scraped_articles(scraped);

    let kept = &result["category_0"];
    assert_eq!(kept.len(), 1);
    assert_eq!(kept[0].title, "Good");
}
#[test]
fn filter_empty_keeps_all_when_all_have_content() {
    use crate::models::synthesis::ScrapedNewsItem;

    // A single article with non-empty content: nothing should be dropped.
    let only_article = ScrapedNewsItem {
        title: "A".into(),
        url: "https://a.com/1".into(),
        summary: "s".into(),
        original_title: "t".into(),
        scraped_content: "Content".into(),
    };
    let scraped = HashMap::from([("category_0".to_string(), vec![only_article])]);

    let result = filter_empty_scraped_articles(scraped);

    let kept = &result["category_0"];
    assert_eq!(kept.len(), 1);
}
// ── restore_scraped_urls tests ───────────────────────────────
#[test]
fn restore_urls_replaces_hallucinated_urls() {
    use crate::models::synthesis::{NewsSection, ScrapedNewsItem};

    // Scraped data carries the real source URL for the single article.
    let categories = vec!["Cat A".to_string()];
    let mut scraped = HashMap::new();
    scraped.insert(
        "category_0".to_string(),
        vec![ScrapedNewsItem {
            title: "T".into(),
            url: "https://real-source.com/article".into(),
            summary: "s".into(),
            original_title: "t".into(),
            scraped_content: "c".into(),
        }],
    );

    // The rewritten section holds a URL that differs from the scraped one.
    let mut sections = vec![NewsSection {
        title: "Cat A".into(),
        items: vec![NewsItem {
            title: "Rewritten title".into(),
            url: "https://wikipedia.org/hallucinated".into(),
            summary: "Rewritten summary".into(),
        }],
    }];

    restore_scraped_urls(&mut sections, &scraped, &categories);

    // The URL is replaced by the scraped one.
    assert_eq!(sections[0].items[0].url, "https://real-source.com/article");
    // Title and summary are preserved from LLM rewrite
    assert_eq!(sections[0].items[0].title, "Rewritten title");
    assert_eq!(sections[0].items[0].summary, "Rewritten summary");
}
#[test]
fn restore_urls_no_change_when_urls_match() {
    use crate::models::synthesis::{NewsSection, ScrapedNewsItem};

    // Scraped item and rewritten item already share the same URL.
    let scraped_item = ScrapedNewsItem {
        title: "T".into(),
        url: "https://correct.com/article".into(),
        summary: "s".into(),
        original_title: "t".into(),
        scraped_content: "c".into(),
    };
    let rewritten_item = NewsItem {
        title: "T".into(),
        url: "https://correct.com/article".into(),
        summary: "s".into(),
    };

    let categories = vec!["Cat A".to_string()];
    let scraped = HashMap::from([("category_0".to_string(), vec![scraped_item])]);
    let mut sections = vec![NewsSection {
        title: "Cat A".into(),
        items: vec![rewritten_item],
    }];

    restore_scraped_urls(&mut sections, &scraped, &categories);

    // The URL must come out unchanged.
    assert_eq!(sections[0].items[0].url, "https://correct.com/article");
}
// ── limit_articles_per_source tests ────────────────────────────
#[test]

@ -134,7 +134,9 @@ test.describe('Live generation with OpenAI', () => {
theme: 'AI Weekly',
max_age_days: 7,
categories: ['AI News'],
max_items_per_category: 5,
max_items_per_category: 4,
max_articles_per_source: 3,
source_diversity_window: 0,
search_agent_behavior: '',
ai_provider: 'openai',
ai_model: 'gpt-4o-mini',
@ -209,9 +211,15 @@ test.describe('Live generation with OpenAI', () => {
expect(item.url).toBeTruthy();
expect(item.url.startsWith('http')).toBe(true);
// Each item summary is non-trivial (> 50 chars)
// No hallucinated URLs: should not point to Wikipedia (the only domain asserted here)
expect(item.url).not.toContain('wikipedia.org');
// Each item summary is non-trivial (> 50 chars) — no empty articles
expect(item.summary).toBeTruthy();
expect(item.summary.length).toBeGreaterThan(50);
// Summary should not be placeholder text or empty content
expect(item.summary.trim().length).toBeGreaterThan(50);
}
}
});

Loading…
Cancel
Save