@ -370,12 +370,15 @@ async fn run_generation_inner(
// Rate limit check (pass 2)
check_rate_limit ( state , & user_rate_limiter , & provider_name ) ? ;
// LLM rewrite pass
// LLM rewrite pass — use a schema that matches the actual scraped item counts
// (which may be less than max_items_per_category after filtering empty content)
emit_progress ( tx , "rewrite" , "Redaction des resumes..." , 80 ) ;
let ( rewrite_system , rewrite_user ) = prompts ::build_rewrite_prompt ( & scraped ) ;
let rewrite_schema = build_rewrite_schema ( & scraped , & settings . categories ) ;
let final_results = provider
. generate_rewrite_pass ( & model_writing , & rewrite_system , & rewrite_user , & schema )
. generate_rewrite_pass ( & model_writing , & rewrite_system , & rewrite_user , & rewrite_ schema)
. await ? ;
emit_progress ( tx , "finalizing" , "Finalisation..." , 90 ) ;
@ -542,6 +545,54 @@ fn filter_homepage_urls(
/// Articles with empty `scraped_content` are those where scraping failed (network error),
/// the page was a soft 404, or the article was too old. Keeping them would produce
/// empty or low-quality output in the final synthesis.
/// Build a JSON schema for the rewrite pass that matches the actual scraped item counts.
///
/// Unlike the search pass schema (which uses `minItems`/`maxItems` from user settings),
/// the rewrite schema pins both bounds of every category array to the number of items
/// present for that category in `scraped`. This prevents the LLM from duplicating
/// content to fill a quota.
///
/// # Arguments
///
/// * `scraped` - scraped items, looked up under the key `category_{i}` where `i` is the
///   position of the category in `categories`.
/// * `categories` - category names, in order; each name becomes the `description` of
///   the corresponding array property.
///
/// # Returns
///
/// A JSON object schema with one required array property per category and
/// `additionalProperties` set to `false`.
///
/// Note that a category that is missing from `scraped`, or whose list is empty, is still
/// given a count of 1 (the count is floored at 1), so the schema always asks for at
/// least one item per category.
fn build_rewrite_schema(
    scraped: &HashMap<String, Vec<ScrapedNewsItem>>,
    categories: &[String],
) -> serde_json::Value {
    // Shape of a single rewritten news item; shared by every category array.
    let news_item_schema = serde_json::json!({
        "type": "object",
        "properties": {
            "title": { "type": "string", "description": "The title of the news article" },
            "url": { "type": "string", "description": "The URL of the source article" },
            "summary": { "type": "string", "description": "A concise summary of the article" }
        },
        "required": ["title", "url", "summary"],
        "additionalProperties": false
    });

    let mut properties = serde_json::Map::new();
    let mut required = Vec::with_capacity(categories.len());

    for (i, cat_name) in categories.iter().enumerate() {
        let key = format!("category_{}", i);

        // Keep the length as `usize`: a narrowing `as i32` cast could wrap to a negative
        // number for very large inputs. The floor of 1 is preserved from the original.
        let count = scraped.get(&key).map_or(0, Vec::len).max(1);

        properties.insert(
            key.clone(),
            serde_json::json!({
                "type": "array",
                "description": cat_name,
                "items": news_item_schema,
                // Identical bounds force exactly `count` items for this category.
                "minItems": count,
                "maxItems": count
            }),
        );
        required.push(serde_json::Value::String(key));
    }

    serde_json::json!({
        "type": "object",
        "properties": properties,
        "required": required,
        "additionalProperties": false
    })
}
fn filter_empty_scraped_articles (
scraped : HashMap < String , Vec < ScrapedNewsItem > > ,
) -> HashMap < String , Vec < ScrapedNewsItem > > {
@ -587,8 +638,6 @@ fn limit_articles_per_source(
// Pass 1: keep at most 1 article per domain per category
let mut kept : Vec < ( String , Vec < NewsItem > ) > = Vec ::new ( ) ;
let mut dropped : Vec < ( usize , NewsItem ) > = Vec ::new ( ) ; // (category_index, item)
let mut domain_counts : std ::collections ::HashMap < String , usize > =
std ::collections ::HashMap ::new ( ) ;
for ( cat_idx , ( cat_key , items ) ) in parsed . into_iter ( ) . enumerate ( ) {
let mut cat_kept = Vec ::new ( ) ;