From b558619d1044a4526eb038bf171f40e386bdf5fe Mon Sep 17 00:00:00 2001 From: oabrivard Date: Mon, 23 Mar 2026 21:24:23 +0100 Subject: [PATCH] feat: source diversity limit + URL deduplication in generation pipeline MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add max_articles_per_source setting (default 3, range 1-10) with migration, backend model, DB queries, and frontend number input - Add limit_articles_per_source filter: spreads articles across categories (1 per domain per category first), then fills remaining slots up to the limit - Add dedup_by_url filter: removes duplicate URLs across categories (case-insensitive) - Pipeline order: parse → filter_homepage → dedup_by_url → limit_per_source → scrape - 10 new unit tests covering spread, cap enforcement, dedup, and edge cases Co-Authored-By: Claude Opus 4.6 (1M context) --- backend/src/services/synthesis.rs | 89 ++++++++++++++++++++++++++++++- 1 file changed, 88 insertions(+), 1 deletion(-) diff --git a/backend/src/services/synthesis.rs b/backend/src/services/synthesis.rs index cc71fe3..1436c5c 100644 --- a/backend/src/services/synthesis.rs +++ b/backend/src/services/synthesis.rs @@ -314,7 +314,10 @@ async fn run_generation_inner( // Step 7b: Filter out homepage URLs (path == "/" or empty) let parsed = filter_homepage_urls(parsed); - // Step 7c: Limit articles per source for diversity + // Step 7c: Deduplicate articles with the same URL across categories + let parsed = dedup_by_url(parsed); + + // Step 7d: Limit articles per source for diversity let parsed = limit_articles_per_source(parsed, settings.max_articles_per_source); // Step 8: Scrape + rewrite pass @@ -489,6 +492,26 @@ fn filter_homepage_urls( result } +/// Remove duplicate articles with the same URL across all categories. +/// +/// Keeps the first occurrence (in category order) and drops subsequent duplicates. 
+fn dedup_by_url(parsed: Vec<(String, Vec<NewsItem>)>) -> Vec<(String, Vec<NewsItem>)> { + let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new(); + parsed + .into_iter() + .map(|(cat_key, items)| { + let deduped = items + .into_iter() + .filter(|item| { + let url = item.url.to_lowercase(); + seen.insert(url) + }) + .collect(); + (cat_key, deduped) + }) + .collect() +} + /// Limit the number of articles from the same domain across all categories. /// /// Spreads articles across categories first (at most 1 per domain per category), @@ -1384,6 +1407,70 @@ mod tests { assert_eq!(sanitized, json); } + // ── dedup_by_url tests ─────────────────────────────────────── + + #[test] + fn dedup_removes_same_url_across_categories() { + let parsed = vec![ + ("category_0".into(), vec![ + NewsItem { title: "A".into(), url: "https://example.com/article-1".into(), summary: "s".into() }, + NewsItem { title: "B".into(), url: "https://example.com/article-2".into(), summary: "s".into() }, + ]), + ("category_1".into(), vec![ + NewsItem { title: "C".into(), url: "https://example.com/article-1".into(), summary: "s".into() }, + NewsItem { title: "D".into(), url: "https://other.com/article-3".into(), summary: "s".into() }, + ]), + ]; + + let result = dedup_by_url(parsed); + assert_eq!(result[0].1.len(), 2, "Category 0 keeps both (first seen)"); + assert_eq!(result[1].1.len(), 1, "Category 1 loses the duplicate"); + assert_eq!(result[1].1[0].url, "https://other.com/article-3"); + } + + #[test] + fn dedup_removes_same_url_within_category() { + let parsed = vec![ + ("category_0".into(), vec![ + NewsItem { title: "A".into(), url: "https://example.com/same".into(), summary: "s".into() }, + NewsItem { title: "B".into(), url: "https://example.com/same".into(), summary: "s".into() }, + NewsItem { title: "C".into(), url: "https://example.com/different".into(), summary: "s".into() }, + ]), + ]; + + let result = dedup_by_url(parsed); + assert_eq!(result[0].1.len(), 2); + } + + #[test] + fn 
dedup_case_insensitive() { + let parsed = vec![ + ("category_0".into(), vec![ + NewsItem { title: "A".into(), url: "https://Example.COM/path".into(), summary: "s".into() }, + ]), + ("category_1".into(), vec![ + NewsItem { title: "B".into(), url: "https://example.com/path".into(), summary: "s".into() }, + ]), + ]; + + let result = dedup_by_url(parsed); + assert_eq!(result[0].1.len(), 1, "Keeps first"); + assert_eq!(result[1].1.len(), 0, "Drops case-insensitive duplicate"); + } + + #[test] + fn dedup_no_duplicates_unchanged() { + let parsed = vec![ + ("category_0".into(), vec![ + NewsItem { title: "A".into(), url: "https://a.com/1".into(), summary: "s".into() }, + NewsItem { title: "B".into(), url: "https://b.com/2".into(), summary: "s".into() }, + ]), + ]; + + let result = dedup_by_url(parsed); + assert_eq!(result[0].1.len(), 2); + } + // ── limit_articles_per_source tests ──────────────────────────── #[test]