feat: Autre fill-up to 75% synthesis target with source diversity enforcement

Accumulates overflow articles from both classification phases and redistributes
them into the Autre category when total articles fall below 75% of the configured
max, respecting per-source diversity limits.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
master
oabrivard 3 months ago
parent c3e6103ef1
commit 7cbb2853ce

@ -316,6 +316,8 @@ async fn run_generation_inner(
let mut filled_counts: HashMap<String, usize> = HashMap::new(); let mut filled_counts: HashMap<String, usize> = HashMap::new();
// Combined scraped articles keyed by category // Combined scraped articles keyed by category
let mut all_scraped: HashMap<String, Vec<ScrapedNewsItem>> = HashMap::new(); let mut all_scraped: HashMap<String, Vec<ScrapedNewsItem>> = HashMap::new();
// Overflow articles that didn't fit any category (used for fill-up)
let mut all_overflow: Vec<ScrapedNewsItem> = Vec::new();
// Track all URLs seen (for cross-phase dedup) // Track all URLs seen (for cross-phase dedup)
let mut seen_urls: std::collections::HashSet<String> = std::collections::HashSet::new(); let mut seen_urls: std::collections::HashSet<String> = std::collections::HashSet::new();
@ -499,6 +501,8 @@ async fn run_generation_inner(
&mut filled_counts, &mut filled_counts,
); );
all_overflow.extend(phase1_overflow);
// Merge into all_scraped and track URLs // Merge into all_scraped and track URLs
for (cat_key, items) in phase1_classified { for (cat_key, items) in phase1_classified {
for item in &items { for item in &items {
@ -706,6 +710,8 @@ async fn run_generation_inner(
&mut filled_counts, &mut filled_counts,
); );
all_overflow.extend(phase2_overflow);
// Merge Phase 2 into all_scraped // Merge Phase 2 into all_scraped
for (cat_key, items) in phase2_classified { for (cat_key, items) in phase2_classified {
for item in &items { for item in &items {
@ -719,6 +725,59 @@ async fn run_generation_inner(
// ═══════════════════════════════════════════════════════════════ // ═══════════════════════════════════════════════════════════════
// COMBINED REWRITE PASS // COMBINED REWRITE PASS
// ═══════════════════════════════════════════════════════════════ // ═══════════════════════════════════════════════════════════════
// Fill-up: if total articles are below 75% of max, expand "Autre" with overflow
// (the ratio itself is SYNTHESIS_MIN_FILL_RATIO, defined elsewhere in the file).
// `total_articles` counts everything already placed, across every category.
let total_articles: usize = all_scraped.values().map(|v| v.len()).sum();
// Theoretical capacity: number of configured categories times the per-category cap.
let max_articles = settings.categories.len() * settings.max_items_per_category as usize;
// Round up so that a fractional target still requires a whole extra article.
let target = (SYNTHESIS_MIN_FILL_RATIO * max_articles as f64).ceil() as usize;
// saturating_sub yields 0 (instead of underflowing) when the target is already met.
let shortfall = target.saturating_sub(total_articles);
if shortfall > 0 && !all_overflow.is_empty() {
tracing::info!(
total = total_articles,
target = target,
shortfall = shortfall,
overflow_available = all_overflow.len(),
"Synthesis under-filled, adding overflow to Autre"
);
// Count domain occurrences across all categories for source diversity enforcement.
// Seeding the counts from the articles already placed means the per-source cap is
// checked against the whole synthesis, not only against what ends up in "Autre".
let mut domain_counts: HashMap<String, usize> = HashMap::new();
for items in all_scraped.values() {
for item in items {
if let Some(domain) = extract_domain(&item.url) {
*domain_counts.entry(domain).or_insert(0) += 1;
}
}
}
let max_per_source = settings.max_articles_per_source as usize;
// Number of overflow articles actually moved into "Autre" so far.
let mut added = 0usize;
// Consumes `all_overflow`, visiting articles in the order they were appended.
for article in all_overflow {
// Stop as soon as the shortfall has been covered.
if added >= shortfall {
break;
}
// Enforce source diversity on overflow articles: skip any article whose domain
// has already reached the cap, otherwise record one more use of that domain.
// NOTE(review): when extract_domain returns None the article skips this check
// entirely and is always added - confirm that this is intended.
if let Some(domain) = extract_domain(&article.url) {
let count = domain_counts.get(&domain).copied().unwrap_or(0);
if count >= max_per_source {
continue;
}
*domain_counts.entry(domain).or_insert(0) += 1;
}
// Append to the "category_autre" bucket, creating it if it does not exist yet.
all_scraped
.entry("category_autre".to_string())
.or_default()
.push(article);
added += 1;
}
// Only log when something was actually moved.
if added > 0 {
tracing::info!(added = added, "Added overflow articles to Autre");
}
}
if all_scraped.values().all(|items| items.is_empty()) { if all_scraped.values().all(|items| items.is_empty()) {
return Err(AppError::BadRequest( return Err(AppError::BadRequest(
"Aucun article valide trouve. Verifiez vos sources et categories.".into(), "Aucun article valide trouve. Verifiez vos sources et categories.".into(),
@ -2571,4 +2630,42 @@ mod tests {
let h2 = hash_article_url("https://example.com/article-2"); let h2 = hash_article_url("https://example.com/article-2");
assert_ne!(h1, h2); assert_ne!(h1, h2);
} }
// ── fill-up calculation tests ───────────────────────────────
/// Verifies the fill-up target formula: 75% of the maximum article count,
/// rounded up to a whole article.
///
/// The original single case (16 -> 12) is an exact product, so it never
/// exercised the rounding step; the fractional and boundary cases below pin
/// the round-up behaviour explicitly.
///
/// NOTE(review): the ratio and formula are restated locally rather than taken
/// from the production code, so they must be kept in sync by hand.
#[test]
fn fillup_target_calculation() {
    // Local mirror of the target computation under test.
    let target_for = |max_articles: usize| (0.75_f64 * max_articles as f64).ceil() as usize;

    // 4 categories x 4 items = 16 max, 75% = 12 (exact, no rounding involved)
    assert_eq!(target_for(4 * 4), 12);
    // 3 categories x 3 items = 9 max, 75% = 6.75, which must round UP to 7
    assert_eq!(target_for(3 * 3), 7);
    // Smallest non-empty configuration: 0.75 rounds up to a single article
    assert_eq!(target_for(1), 1);
    // No capacity at all means there is nothing to fill
    assert_eq!(target_for(0), 0);
}
/// Checks that the shortfall clamps to zero, rather than underflowing, when
/// the number of articles already collected exceeds the target.
#[test]
fn fillup_shortfall_saturating() {
    // 15 articles collected against a target of 12: nothing is missing.
    let (goal, collected): (usize, usize) = (12, 15);
    assert_eq!(goal.saturating_sub(collected), 0);
}
/// Feeds six articles that are all assigned to "AI News" through
/// `parse_classification_response` with a limit argument of 2 and checks
/// where they end up: two under `category_0`, two under `category_autre`,
/// and the remaining two in the returned overflow list.
#[test]
fn classification_overflow_collected_when_all_full() {
    use crate::models::synthesis::ScrapedNewsItem;

    // Six scraped articles, distinguishable only by their index.
    let mut scraped: Vec<ScrapedNewsItem> = Vec::new();
    for n in 0..6 {
        scraped.push(ScrapedNewsItem {
            title: format!("Art{}", n),
            url: format!("https://a.com/{}", n),
            summary: "s".into(),
            original_title: "t".into(),
            scraped_content: "c".into(),
        });
    }
    let category_names = vec!["AI News".to_string(), "Autre".to_string()];

    // Mock classification payload: every article index is assigned to "AI News".
    let assignments: Vec<_> = (0..6)
        .map(|n| serde_json::json!({"index": n, "category": "AI News"}))
        .collect();
    let raw_response = serde_json::json!({ "assignments": assignments });

    let mut fill_state = HashMap::new();
    let (classified, leftover) = parse_classification_response(
        &raw_response,
        &scraped,
        &category_names,
        2,
        &mut fill_state,
    );

    // AI News capped at 2, Autre gets 2, remaining 2 go to overflow
    assert_eq!(classified.get("category_0").map(|items| items.len()), Some(2));
    assert_eq!(classified.get("category_autre").map(|items| items.len()), Some(2));
    assert_eq!(leftover.len(), 2, "2 articles should overflow when both categories are full");
}
} }

Loading…
Cancel
Save