feat: source diversity limit + URL deduplication in generation pipeline

- Add max_articles_per_source setting (default 3, range 1-10) with migration,
  backend model, DB queries, and frontend number input
- Add limit_articles_per_source filter: spreads articles across categories
  (1 per domain per category first), then fills remaining slots up to the limit
- Add dedup_by_url filter: removes duplicate URLs within and across categories
  (case-insensitive; the first occurrence is kept)
- Pipeline order: parse → filter_homepage → dedup_by_url → limit_per_source → scrape
- 10 new unit tests covering spread, cap enforcement, dedup, and edge cases

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
master
oabrivard 3 months ago
parent da05965dde
commit b558619d10

@ -314,7 +314,10 @@ async fn run_generation_inner(
// Step 7b: Filter out homepage URLs (path == "/" or empty)
let parsed = filter_homepage_urls(parsed);
// Step 7c: Limit articles per source for diversity
// Step 7c: Deduplicate articles with the same URL across categories
let parsed = dedup_by_url(parsed);
// Step 7d: Limit articles per source for diversity
let parsed = limit_articles_per_source(parsed, settings.max_articles_per_source);
// Step 8: Scrape + rewrite pass
@ -489,6 +492,26 @@ fn filter_homepage_urls(
result
}
/// Drop every article whose URL has already been seen, scanning categories in order.
///
/// URLs are compared case-insensitively (by their lowercased form). The first
/// article carrying a given URL is kept, whether the repeat occurs in the same
/// category or in a later one. Category keys and the relative order of the
/// surviving articles are left untouched; a category may end up with no items.
fn dedup_by_url(parsed: Vec<(String, Vec<NewsItem>)>) -> Vec<(String, Vec<NewsItem>)> {
    let mut known_urls = std::collections::HashSet::<String>::new();
    let mut output = Vec::with_capacity(parsed.len());
    for (cat_key, mut articles) in parsed {
        // `insert` returns `false` when the lowercased URL is already present,
        // which is exactly the signal to discard the article.
        articles.retain(|article| known_urls.insert(article.url.to_lowercase()));
        output.push((cat_key, articles));
    }
    output
}
/// Limit the number of articles from the same domain across all categories.
///
/// Spreads articles across categories first (at most 1 per domain per category),
@ -1384,6 +1407,70 @@ mod tests {
assert_eq!(sanitized, json);
}
// ── dedup_by_url tests ───────────────────────────────────────
#[test]
fn dedup_removes_same_url_across_categories() {
    // Local builder: only the title and URL vary between fixtures.
    let article = |title: &str, url: &str| NewsItem {
        title: title.into(),
        url: url.into(),
        summary: "s".into(),
    };
    let first_category = vec![
        article("A", "https://example.com/article-1"),
        article("B", "https://example.com/article-2"),
    ];
    // "C" repeats the URL already used by "A" in the first category.
    let second_category = vec![
        article("C", "https://example.com/article-1"),
        article("D", "https://other.com/article-3"),
    ];
    let parsed = vec![
        ("category_0".into(), first_category),
        ("category_1".into(), second_category),
    ];

    let result = dedup_by_url(parsed);

    assert_eq!(result[0].1.len(), 2, "Category 0 keeps both (first seen)");
    assert_eq!(result[1].1.len(), 1, "Category 1 loses the duplicate");
    assert_eq!(result[1].1[0].url, "https://other.com/article-3");
}
/// A URL repeated inside a single category is reduced to its first occurrence.
#[test]
fn dedup_removes_same_url_within_category() {
    let parsed = vec![
        ("category_0".into(), vec![
            NewsItem { title: "A".into(), url: "https://example.com/same".into(), summary: "s".into() },
            NewsItem { title: "B".into(), url: "https://example.com/same".into(), summary: "s".into() },
            NewsItem { title: "C".into(), url: "https://example.com/different".into(), summary: "s".into() },
        ]),
    ];
    let result = dedup_by_url(parsed);
    assert_eq!(result[0].1.len(), 2);
    // A bare count cannot tell "kept A, dropped B" from "dropped A, kept B";
    // pin the survivors so the keep-first behaviour is actually verified.
    assert_eq!(result[0].1[0].title, "A", "First occurrence of the duplicated URL is kept");
    assert_eq!(result[0].1[1].title, "C", "Unrelated article keeps its relative position");
    assert_eq!(result[0].1[1].url, "https://example.com/different");
}
#[test]
fn dedup_case_insensitive() {
    // Local builder: the two fixtures differ only in title and URL casing.
    let article = |title: &str, url: &str| NewsItem {
        title: title.into(),
        url: url.into(),
        summary: "s".into(),
    };
    let parsed = vec![
        ("category_0".into(), vec![article("A", "https://Example.COM/path")]),
        ("category_1".into(), vec![article("B", "https://example.com/path")]),
    ];

    let result = dedup_by_url(parsed);

    let kept_in_first = result[0].1.len();
    let kept_in_second = result[1].1.len();
    assert_eq!(kept_in_first, 1, "Keeps first");
    assert_eq!(kept_in_second, 0, "Drops case-insensitive duplicate");
}
/// Input without any repeated URL passes through with the same category,
/// the same items, and the same order.
#[test]
fn dedup_no_duplicates_unchanged() {
    let parsed = vec![
        ("category_0".into(), vec![
            NewsItem { title: "A".into(), url: "https://a.com/1".into(), summary: "s".into() },
            NewsItem { title: "B".into(), url: "https://b.com/2".into(), summary: "s".into() },
        ]),
    ];
    let result = dedup_by_url(parsed);
    // "Unchanged" means more than the item count: check the category entry
    // itself and that each URL is still present in its original position.
    assert_eq!(result.len(), 1);
    assert_eq!(result[0].0, "category_0");
    assert_eq!(result[0].1.len(), 2);
    assert_eq!(result[0].1[0].url, "https://a.com/1");
    assert_eq!(result[0].1[1].url, "https://b.com/2");
}
// ── limit_articles_per_source tests ────────────────────────────
#[test]

Loading…
Cancel
Save