From 45e5ee8a7dcfa705558a44886e22ce9ebf285fca Mon Sep 17 00:00:00 2001 From: oabrivard Date: Tue, 24 Mar 2026 00:28:43 +0100 Subject: [PATCH] fix: rewrite pass schema uses actual scraped item counts, not max setting The rewrite pass shared the search pass schema which enforced minItems/maxItems equal to max_items_per_category. After filter_empty_scraped_articles removed old/failed articles, the scraped data had fewer items than the schema required, causing the LLM to duplicate content to fill the quota. Now build_rewrite_schema counts actual items per category from the scraped data and sets minItems/maxItems accordingly. Also removed dead domain_counts variable. Co-Authored-By: Claude Opus 4.6 (1M context) --- backend/src/services/synthesis.rs | 57 ++++++++++++++++++++++++++++--- 1 file changed, 53 insertions(+), 4 deletions(-) diff --git a/backend/src/services/synthesis.rs b/backend/src/services/synthesis.rs index 1dafde6..be01c52 100644 --- a/backend/src/services/synthesis.rs +++ b/backend/src/services/synthesis.rs @@ -370,12 +370,15 @@ async fn run_generation_inner( // Rate limit check (pass 2) check_rate_limit(state, &user_rate_limiter, &provider_name)?; - // LLM rewrite pass + // LLM rewrite pass — use a schema that matches the actual scraped item counts + // (which may be less than max_items_per_category after filtering empty content) emit_progress(tx, "rewrite", "Redaction des resumes...", 80); let (rewrite_system, rewrite_user) = prompts::build_rewrite_prompt(&scraped); + let rewrite_schema = build_rewrite_schema(&scraped, &settings.categories); + let final_results = provider - .generate_rewrite_pass(&model_writing, &rewrite_system, &rewrite_user, &schema) + .generate_rewrite_pass(&model_writing, &rewrite_system, &rewrite_user, &rewrite_schema) .await?; emit_progress(tx, "finalizing", "Finalisation...", 90); @@ -542,6 +545,54 @@ fn filter_homepage_urls( /// Articles with empty `scraped_content` are those where scraping failed (network error), /// the page 
was a soft 404, or the article was too old. Keeping them would produce
 /// empty or low-quality output in the final synthesis.
+/// Builds the rewrite-pass JSON schema from the items that actually survived scraping.
+///
+/// For each `categories[i]` the schema requires an array property `category_{i}` whose
+/// `minItems` and `maxItems` both equal the number of items stored under that key in
+/// `scraped`, floored at 1 (also used when the key is absent), not the user's max setting.
+fn build_rewrite_schema(
+    scraped: &HashMap>,
+    categories: &[String],
+) -> serde_json::Value {
+    // Shape of one rewritten article; every field is required, no extra keys allowed.
+    let news_item_schema = serde_json::json!({
+        "type": "object",
+        "properties": {
+            "title": { "type": "string", "description": "The title of the news article" },
+            "url": { "type": "string", "description": "The URL of the source article" },
+            "summary": { "type": "string", "description": "A concise summary of the article" }
+        },
+        "required": ["title", "url", "summary"],
+        "additionalProperties": false
+    });
+
+    let mut properties = serde_json::Map::new();
+    let mut required = Vec::new();
+    // NOTE(review): the floor of 1 still demands an item for a category emptied by filtering.
+    for (i, cat_name) in categories.iter().enumerate() {
+        let key = format!("category_{}", i);
+        let count = scraped.get(&key).map_or(0, |items| items.len()).max(1);
+        properties.insert(
+            key.clone(),
+            serde_json::json!({
+                "type": "array",
+                "description": cat_name,
+                "items": news_item_schema,
+                "minItems": count,
+                "maxItems": count
+            }),
+        );
+        required.push(serde_json::Value::String(key));
+    }
+
+    serde_json::json!({
+        "type": "object",
+        "properties": properties,
+        "required": required,
+        "additionalProperties": false
+    })
+}
+
 fn filter_empty_scraped_articles(
     scraped: HashMap>,
 ) -> HashMap> {
@@ -587,8 +638,6 @@ fn limit_articles_per_source(
     // Pass 1: keep at most 1 article per domain per category
     let mut kept: Vec<(String, Vec)> = Vec::new();
     let mut dropped: Vec<(usize, NewsItem)> = Vec::new(); // 
(category_index, item) - let mut domain_counts: std::collections::HashMap = - std::collections::HashMap::new(); for (cat_idx, (cat_key, items)) in parsed.into_iter().enumerate() { let mut cat_kept = Vec::new();