@ -363,6 +363,10 @@ async fn run_generation_inner(
emit_progress ( tx , "scraping" , "Verification des sources..." , 45 ) ;
emit_progress ( tx , "scraping" , "Verification des sources..." , 45 ) ;
let scraped = scrape_articles ( state , & parsed , settings . max_age_days as i64 , tx ) . await ;
let scraped = scrape_articles ( state , & parsed , settings . max_age_days as i64 , tx ) . await ;
// Remove articles with empty scraped content (too old, soft 404, scrape failure).
// These would produce empty/low-quality output in the rewrite pass.
let scraped = filter_empty_scraped_articles ( scraped ) ;
// Rate limit check (pass 2)
// Rate limit check (pass 2)
check_rate_limit ( state , & user_rate_limiter , & provider_name ) ? ;
check_rate_limit ( state , & user_rate_limiter , & provider_name ) ? ;
@ -533,6 +537,26 @@ fn filter_homepage_urls(
/// Remove duplicate articles with the same URL across all categories.
/// Remove duplicate articles with the same URL across all categories.
///
///
/// Keeps the first occurrence (in category order) and drops subsequent duplicates.
/// Keeps the first occurrence (in category order) and drops subsequent duplicates.
/// Remove scraped articles with empty content from the data passed to the rewrite pass.
///
/// Articles with empty `scraped_content` are those where scraping failed (network error),
/// the page was a soft 404, or the article was too old. Keeping them would produce
/// empty or low-quality output in the final synthesis.
fn filter_empty_scraped_articles (
scraped : HashMap < String , Vec < ScrapedNewsItem > > ,
) -> HashMap < String , Vec < ScrapedNewsItem > > {
scraped
. into_iter ( )
. map ( | ( cat_key , items ) | {
let filtered : Vec < ScrapedNewsItem > = items
. into_iter ( )
. filter ( | item | ! item . scraped_content . trim ( ) . is_empty ( ) )
. collect ( ) ;
( cat_key , filtered )
} )
. collect ( )
}
fn dedup_by_url ( parsed : Vec < ( String , Vec < NewsItem > ) > ) -> Vec < ( String , Vec < NewsItem > ) > {
fn dedup_by_url ( parsed : Vec < ( String , Vec < NewsItem > ) > ) -> Vec < ( String , Vec < NewsItem > ) > {
let mut seen : std ::collections ::HashSet < String > = std ::collections ::HashSet ::new ( ) ;
let mut seen : std ::collections ::HashSet < String > = std ::collections ::HashSet ::new ( ) ;
parsed
parsed
@ -1540,6 +1564,111 @@ mod tests {
assert_eq! ( result [ 0 ] . 1. len ( ) , 2 ) ;
assert_eq! ( result [ 0 ] . 1. len ( ) , 2 ) ;
}
}
// ── filter_empty_scraped_articles tests ─────────────────────────
/// Of three articles (real content, empty content, whitespace-only content),
/// only the one with real content must survive the filter.
#[test]
fn filter_empty_removes_articles_with_no_content() {
    use crate::models::synthesis::ScrapedNewsItem;

    let articles = vec![
        ScrapedNewsItem {
            title: "Good".into(),
            url: "https://a.com/1".into(),
            summary: "s".into(),
            original_title: "t".into(),
            scraped_content: "Real content here".into(),
        },
        ScrapedNewsItem {
            title: "Empty".into(),
            url: "https://b.com/2".into(),
            summary: "s".into(),
            original_title: "t".into(),
            scraped_content: "".into(),
        },
        ScrapedNewsItem {
            title: "Whitespace".into(),
            url: "https://c.com/3".into(),
            summary: "s".into(),
            original_title: "t".into(),
            scraped_content: " ".into(),
        },
    ];
    let input = HashMap::from([("category_0".to_string(), articles)]);

    let output = filter_empty_scraped_articles(input);
    let kept = &output["category_0"];

    assert_eq!(kept.len(), 1);
    assert_eq!(kept[0].title, "Good");
}
/// A category whose single article has non-blank content passes through
/// the filter with that article still present.
#[test]
fn filter_empty_keeps_all_when_all_have_content() {
    use crate::models::synthesis::ScrapedNewsItem;

    let only_article = ScrapedNewsItem {
        title: "A".into(),
        url: "https://a.com/1".into(),
        summary: "s".into(),
        original_title: "t".into(),
        scraped_content: "Content".into(),
    };
    let input = HashMap::from([("category_0".to_string(), vec![only_article])]);

    let output = filter_empty_scraped_articles(input);

    assert_eq!(output["category_0"].len(), 1);
}
// ── restore_scraped_urls tests ───────────────────────────────
/// When the rewritten item carries a URL that differs from the scraped one,
/// `restore_scraped_urls` must put the scraped URL back while leaving the
/// rewritten title untouched.
#[test]
fn restore_urls_replaces_hallucinated_urls() {
    use crate::models::synthesis::{NewsSection, ScrapedNewsItem};

    let categories = vec!["Cat A".to_string()];

    // Scraped side: the URL the article was actually fetched from.
    let source_item = ScrapedNewsItem {
        title: "T".into(),
        url: "https://real-source.com/article".into(),
        summary: "s".into(),
        original_title: "t".into(),
        scraped_content: "c".into(),
    };
    let scraped = HashMap::from([("category_0".to_string(), vec![source_item])]);

    // Rewritten side: same slot, but with a different (hallucinated) URL.
    let rewritten_item = NewsItem {
        title: "Rewritten title".into(),
        url: "https://wikipedia.org/hallucinated".into(),
        summary: "Rewritten summary".into(),
    };
    let mut sections = vec![NewsSection {
        title: "Cat A".into(),
        items: vec![rewritten_item],
    }];

    restore_scraped_urls(&mut sections, &scraped, &categories);

    let restored = &sections[0].items[0];
    assert_eq!(restored.url, "https://real-source.com/article");
    // Title and summary are preserved from LLM rewrite
    assert_eq!(restored.title, "Rewritten title");
}
/// When the rewritten item already carries the scraped URL,
/// `restore_scraped_urls` must leave that URL as it is.
#[test]
fn restore_urls_no_change_when_urls_match() {
    use crate::models::synthesis::{NewsSection, ScrapedNewsItem};

    let categories = vec!["Cat A".to_string()];

    let source_item = ScrapedNewsItem {
        title: "T".into(),
        url: "https://correct.com/article".into(),
        summary: "s".into(),
        original_title: "t".into(),
        scraped_content: "c".into(),
    };
    let scraped = HashMap::from([("category_0".to_string(), vec![source_item])]);

    let matching_item = NewsItem {
        title: "T".into(),
        url: "https://correct.com/article".into(),
        summary: "s".into(),
    };
    let mut sections = vec![NewsSection {
        title: "Cat A".into(),
        items: vec![matching_item],
    }];

    restore_scraped_urls(&mut sections, &scraped, &categories);

    assert_eq!(sections[0].items[0].url, "https://correct.com/article");
}
// ── limit_articles_per_source tests ────────────────────────────
// ── limit_articles_per_source tests ────────────────────────────
#[ test ]
#[ test ]