fix: rewrite pass schema uses actual scraped item counts, not max setting

The rewrite pass shared the search pass schema, which enforced minItems/maxItems
equal to max_items_per_category. After filter_empty_scraped_articles removed
old/failed articles, the scraped data had fewer items than the schema required,
causing the LLM to duplicate content to fill the quota.

Now build_rewrite_schema counts the actual items per category in the scraped data
and sets minItems/maxItems to that count (with a minimum of 1). Also removed the
dead domain_counts variable.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
master
oabrivard 3 months ago
parent 13894a8f50
commit 45e5ee8a7d

@ -370,12 +370,15 @@ async fn run_generation_inner(
// Rate limit check (pass 2)
check_rate_limit(state, &user_rate_limiter, &provider_name)?;
// LLM rewrite pass
// LLM rewrite pass — use a schema that matches the actual scraped item counts
// (which may be less than max_items_per_category after filtering empty content)
emit_progress(tx, "rewrite", "Redaction des resumes...", 80);
let (rewrite_system, rewrite_user) = prompts::build_rewrite_prompt(&scraped);
let rewrite_schema = build_rewrite_schema(&scraped, &settings.categories);
let final_results = provider
.generate_rewrite_pass(&model_writing, &rewrite_system, &rewrite_user, &schema)
.generate_rewrite_pass(&model_writing, &rewrite_system, &rewrite_user, &rewrite_schema)
.await?;
emit_progress(tx, "finalizing", "Finalisation...", 90);
@ -542,6 +545,54 @@ fn filter_homepage_urls(
/// Articles with empty `scraped_content` are those where scraping failed (network error),
/// the page was a soft 404, or the article was too old. Keeping them would produce
/// empty or low-quality output in the final synthesis.
/// Build a JSON schema for the rewrite pass that matches the actual scraped item counts.
///
/// Unlike the search pass schema (which uses `minItems`/`maxItems` from user settings),
/// the rewrite schema uses the actual number of items per category after scraping and
/// filtering. This prevents the LLM from duplicating content to fill a quota.
///
/// # Arguments
///
/// * `scraped` - Scraped items, looked up by the positional key `category_{i}`.
/// * `categories` - Category names; the index of each name determines its
///   `category_{i}` key, and the name itself becomes the property's `description`.
///
/// # Returns
///
/// A JSON object schema with one required array property per entry of `categories`.
/// Each array must hold exactly as many items as `scraped` has under that key, with a
/// floor of 1. Extra properties are rejected at both the top level and the item level.
fn build_rewrite_schema(
    scraped: &HashMap<String, Vec<ScrapedNewsItem>>,
    categories: &[String],
) -> serde_json::Value {
    // Shape of a single rewritten news item; shared by every category array.
    let news_item_schema = serde_json::json!({
        "type": "object",
        "properties": {
            "title": { "type": "string", "description": "The title of the news article" },
            "url": { "type": "string", "description": "The URL of the source article" },
            "summary": { "type": "string", "description": "A concise summary of the article" }
        },
        "required": ["title", "url", "summary"],
        "additionalProperties": false
    });

    let mut properties = serde_json::Map::new();
    let mut required = Vec::with_capacity(categories.len());

    for (i, cat_name) in categories.iter().enumerate() {
        let key = format!("category_{}", i);

        // Count in `usize` rather than casting to `i32`, so an oversized length can
        // never wrap to a negative number before the floor is applied.
        //
        // NOTE(review): a category that is missing from `scraped` (or is empty) still
        // gets minItems = maxItems = 1, so the model must emit one item for it.
        // Confirm this floor is intentional rather than allowing an empty array.
        let count = scraped.get(&key).map_or(0, Vec::len).max(1);

        properties.insert(
            key.clone(),
            serde_json::json!({
                "type": "array",
                "description": cat_name,
                "items": news_item_schema,
                "minItems": count,
                "maxItems": count
            }),
        );
        required.push(serde_json::Value::String(key));
    }

    serde_json::json!({
        "type": "object",
        "properties": properties,
        "required": required,
        "additionalProperties": false
    })
}
fn filter_empty_scraped_articles(
scraped: HashMap<String, Vec<ScrapedNewsItem>>,
) -> HashMap<String, Vec<ScrapedNewsItem>> {
@ -587,8 +638,6 @@ fn limit_articles_per_source(
// Pass 1: keep at most 1 article per domain per category
let mut kept: Vec<(String, Vec<NewsItem>)> = Vec::new();
let mut dropped: Vec<(usize, NewsItem)> = Vec::new(); // (category_index, item)
let mut domain_counts: std::collections::HashMap<String, usize> =
std::collections::HashMap::new();
for (cat_idx, (cat_key, items)) in parsed.into_iter().enumerate() {
let mut cat_kept = Vec::new();

Loading…
Cancel
Save