fix: rewrite pass schema uses actual scraped item counts, not max setting

The rewrite pass shared the search-pass schema, which enforced minItems/maxItems equal to max_items_per_category. After filter_empty_scraped_articles removed old or failed articles, the scraped data could contain fewer items than the schema required, causing the LLM to duplicate content to fill the quota. build_rewrite_schema now counts the actual items per category in the scraped data and sets minItems/maxItems to that count (with a floor of 1). Also removed the dead domain_counts variable.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
3 months ago · 45e5ee8a7d
parent 13894a8f50
commit 45e5ee8a7d
1 changed files with 53 additions and 4 deletions
--- a/backend/src/services/synthesis.rs
+++ b/backend/src/services/synthesis.rs
@ -370,12 +370,15 @@ async fn run_generation_inner(
    // Rate limit check (pass 2)
    check_rate_limit(state, &user_rate_limiter, &provider_name)?;

-    // LLM rewrite pass
+    // LLM rewrite pass — use a schema that matches the actual scraped item counts
+    // (which may be less than max_items_per_category after filtering empty content)
    emit_progress(tx, "rewrite", "Redaction des resumes...", 80);
    let (rewrite_system, rewrite_user) = prompts::build_rewrite_prompt(&scraped);

+    let rewrite_schema = build_rewrite_schema(&scraped, &settings.categories);
+
    let final_results = provider
-        .generate_rewrite_pass(&model_writing, &rewrite_system, &rewrite_user, &schema)
+        .generate_rewrite_pass(&model_writing, &rewrite_system, &rewrite_user, &rewrite_schema)
        .await?;

    emit_progress(tx, "finalizing", "Finalisation...", 90);
@ -542,6 +545,54 @@ fn filter_homepage_urls(
 /// Articles with empty `scraped_content` are those where scraping failed (network error),
 /// the page was a soft 404, or the article was too old. Keeping them would produce
 /// empty or low-quality output in the final synthesis.
+// NOTE(review): placed between `filter_empty_scraped_articles`' doc comment and its `fn` — move.
+/// Builds the rewrite-pass JSON schema, sized to the number of items actually scraped.
+///
+/// Emits one required `category_{i}` array of `{title, url, summary}` objects per entry of
+/// `categories`. `minItems`/`maxItems` both equal the category's item count in `scraped`
+/// (floored at 1), not the user's maximum, so the LLM need not duplicate items to fill a quota.
+fn build_rewrite_schema(
+    scraped: &HashMap<String, Vec<ScrapedNewsItem>>,
+    categories: &[String],
+) -> serde_json::Value {
+    let news_item_schema = serde_json::json!({
+        "type": "object",
+        "properties": {
+            "title": { "type": "string", "description": "The title of the news article" },
+            "url": { "type": "string", "description": "The URL of the source article" },
+            "summary": { "type": "string", "description": "A concise summary of the article" }
+        },
+        "required": ["title", "url", "summary"],
+        "additionalProperties": false
+    });
+
+    let mut properties = serde_json::Map::new();
+    let mut required = Vec::with_capacity(categories.len());
+    for (i, cat_name) in categories.iter().enumerate() {
+        let key = format!("category_{}", i);
+        // NOTE(review): `.max(1)` demands one item even if nothing was scraped — intended?
+        let count = scraped.get(&key).map_or(0, Vec::len).max(1);
+        properties.insert(
+            key.clone(),
+            serde_json::json!({
+                "type": "array",
+                "description": cat_name,
+                "items": news_item_schema,
+                "minItems": count,
+                "maxItems": count
+            }),
+        );
+        required.push(serde_json::Value::String(key));
+    }
+
+    serde_json::json!({
+        "type": "object",
+        "properties": properties,
+        "required": required,
+        "additionalProperties": false
+    })
+}
+
 fn filter_empty_scraped_articles(
    scraped: HashMap<String, Vec<ScrapedNewsItem>>,
 ) -> HashMap<String, Vec<ScrapedNewsItem>> {
@ -587,8 +638,6 @@ fn limit_articles_per_source(
    // Pass 1: keep at most 1 article per domain per category
    let mut kept: Vec<(String, Vec<NewsItem>)> = Vec::new();
    let mut dropped: Vec<(usize, NewsItem)> = Vec::new(); // (category_index, item)
-    let mut domain_counts: std::collections::HashMap<String, usize> =
-        std::collections::HashMap::new();

    for (cat_idx, (cat_key, items)) in parsed.into_iter().enumerate() {
        let mut cat_kept = Vec::new();