@ -314,7 +314,10 @@ async fn run_generation_inner(
// Step 7b: Filter out homepage URLs (path == "/" or empty)
let parsed = filter_homepage_urls ( parsed ) ;
// Step 7c: Limit articles per source for diversity
// Step 7c: Deduplicate articles with the same URL across categories
let parsed = dedup_by_url ( parsed ) ;
// Step 7d: Limit articles per source for diversity
let parsed = limit_articles_per_source ( parsed , settings . max_articles_per_source ) ;
// Step 8: Scrape + rewrite pass
@ -489,6 +492,26 @@ fn filter_homepage_urls(
result
}
/// Drop every article whose URL has already appeared earlier in the input.
///
/// Categories are walked in the order given and items in their stored order.
/// The first article carrying a given URL is kept; every later article with the
/// same URL is removed, whether it sits in the same category or another one.
/// URLs are compared after lowercasing the whole string, so URLs that differ
/// only by letter case count as duplicates.
///
/// Every category is returned in its original position, even if all of its
/// items were removed.
fn dedup_by_url(parsed: Vec<(String, Vec<NewsItem>)>) -> Vec<(String, Vec<NewsItem>)> {
    // Lowercased URLs encountered so far, shared across all categories.
    let mut visited: std::collections::HashSet<String> = std::collections::HashSet::new();
    let mut output = Vec::with_capacity(parsed.len());

    for (cat_key, mut items) in parsed {
        // `insert` returns `true` only the first time a key is added, which is
        // exactly the keep/drop decision `retain` needs.
        items.retain(|item| visited.insert(item.url.to_lowercase()));
        output.push((cat_key, items));
    }

    output
}
/// Limit the number of articles from the same domain across all categories.
///
/// Spreads articles across categories first (at most 1 per domain per category),
@ -1384,6 +1407,70 @@ mod tests {
assert_eq! ( sanitized , json ) ;
}
// ── dedup_by_url tests ───────────────────────────────────────
/// A URL already seen in an earlier category is removed from later categories.
#[test]
fn dedup_removes_same_url_across_categories() {
    // Small builder so each fixture row shows only the fields that matter here.
    let article = |title: &str, url: &str| NewsItem {
        title: title.into(),
        url: url.into(),
        summary: "s".into(),
    };

    let first_category = vec![
        article("A", "https://example.com/article-1"),
        article("B", "https://example.com/article-2"),
    ];
    // "C" repeats the URL of "A" and must be the one that gets dropped.
    let second_category = vec![
        article("C", "https://example.com/article-1"),
        article("D", "https://other.com/article-3"),
    ];

    let deduped = dedup_by_url(vec![
        ("category_0".into(), first_category),
        ("category_1".into(), second_category),
    ]);

    let (_, kept_first) = &deduped[0];
    let (_, kept_second) = &deduped[1];
    assert_eq!(kept_first.len(), 2, "Category 0 keeps both (first seen)");
    assert_eq!(kept_second.len(), 1, "Category 1 loses the duplicate");
    assert_eq!(kept_second[0].url, "https://other.com/article-3");
}
/// Repeated URLs inside a single category collapse to the first occurrence,
/// and the surviving items keep their relative order.
#[test]
fn dedup_removes_same_url_within_category() {
    let parsed = vec![(
        "category_0".into(),
        vec![
            NewsItem { title: "A".into(), url: "https://example.com/same".into(), summary: "s".into() },
            NewsItem { title: "B".into(), url: "https://example.com/same".into(), summary: "s".into() },
            NewsItem { title: "C".into(), url: "https://example.com/different".into(), summary: "s".into() },
        ],
    )];
    let result = dedup_by_url(parsed);
    assert_eq!(result[0].1.len(), 2);
    // A bare count would also pass if the later duplicate ("B") were kept or
    // the order were shuffled, so pin which items survive and where.
    assert_eq!(result[0].1[0].title, "A", "First occurrence of the URL is kept");
    assert_eq!(result[0].1[1].title, "C", "Unique item follows in original order");
}
/// URLs that differ only by letter case are treated as the same article, and
/// the item that is kept still carries its URL exactly as it was supplied.
#[test]
fn dedup_case_insensitive() {
    let parsed = vec![
        (
            "category_0".into(),
            vec![
                NewsItem { title: "A".into(), url: "https://Example.COM/path".into(), summary: "s".into() },
            ],
        ),
        (
            "category_1".into(),
            vec![
                NewsItem { title: "B".into(), url: "https://example.com/path".into(), summary: "s".into() },
            ],
        ),
    ];
    let result = dedup_by_url(parsed);
    assert_eq!(result[0].1.len(), 1, "Keeps first");
    assert_eq!(result[1].1.len(), 0, "Drops case-insensitive duplicate");
    // Lowercasing is only the comparison key; the stored URL must not be rewritten.
    assert_eq!(
        result[0].1[0].url,
        "https://Example.COM/path",
        "Kept item retains its original casing"
    );
}
/// Input without any repeated URL comes back with the same category key and
/// the same items in the same order.
#[test]
fn dedup_no_duplicates_unchanged() {
    let parsed = vec![(
        "category_0".into(),
        vec![
            NewsItem { title: "A".into(), url: "https://a.com/1".into(), summary: "s".into() },
            NewsItem { title: "B".into(), url: "https://b.com/2".into(), summary: "s".into() },
        ],
    )];
    let result = dedup_by_url(parsed);
    // "Unchanged" covers more than the item count: check the category key and
    // the order/content of the items as well.
    assert_eq!(result.len(), 1);
    assert_eq!(result[0].0, "category_0");
    assert_eq!(result[0].1.len(), 2);
    assert_eq!(result[0].1[0].url, "https://a.com/1");
    assert_eq!(result[0].1[1].url, "https://b.com/2");
}
// ── limit_articles_per_source tests ────────────────────────────
#[ test ]