From 7cbb2853ced0b47e5957a4e7ec1607c03738fce0 Mon Sep 17 00:00:00 2001 From: oabrivard Date: Tue, 24 Mar 2026 18:07:03 +0100 Subject: [PATCH] feat: Autre fill-up to 75% synthesis target with source diversity enforcement Accumulates overflow articles from both classification phases and redistributes them into the Autre category when total articles fall below 75% of the configured max, respecting per-source diversity limits. Co-Authored-By: Claude Sonnet 4.6 --- backend/src/services/synthesis.rs | 97 +++++++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) diff --git a/backend/src/services/synthesis.rs b/backend/src/services/synthesis.rs index 21290ca..b3db2f2 100644 --- a/backend/src/services/synthesis.rs +++ b/backend/src/services/synthesis.rs @@ -316,6 +316,8 @@ async fn run_generation_inner( let mut filled_counts: HashMap = HashMap::new(); // Combined scraped articles keyed by category let mut all_scraped: HashMap> = HashMap::new(); + // Overflow articles that didn't fit any category (used for fill-up) + let mut all_overflow: Vec = Vec::new(); // Track all URLs seen (for cross-phase dedup) let mut seen_urls: std::collections::HashSet = std::collections::HashSet::new(); @@ -499,6 +501,8 @@ async fn run_generation_inner( &mut filled_counts, ); + all_overflow.extend(phase1_overflow); + // Merge into all_scraped and track URLs for (cat_key, items) in phase1_classified { for item in &items { @@ -706,6 +710,8 @@ async fn run_generation_inner( &mut filled_counts, ); + all_overflow.extend(phase2_overflow); + // Merge Phase 2 into all_scraped for (cat_key, items) in phase2_classified { for item in &items { @@ -719,6 +725,59 @@ async fn run_generation_inner( // ═══════════════════════════════════════════════════════════════ // COMBINED REWRITE PASS // ═══════════════════════════════════════════════════════════════ + + // Fill-up: if total articles are below 75% of max, expand "Autre" with overflow + let total_articles: usize = all_scraped.values().map(|v| 
v.len()).sum(); + let max_articles = settings.categories.len() * settings.max_items_per_category as usize; + let target = (SYNTHESIS_MIN_FILL_RATIO * max_articles as f64).ceil() as usize; + let shortfall = target.saturating_sub(total_articles); + + if shortfall > 0 && !all_overflow.is_empty() { + tracing::info!( + total = total_articles, + target = target, + shortfall = shortfall, + overflow_available = all_overflow.len(), + "Synthesis under-filled, adding overflow to Autre" + ); + + // Count domain occurrences across all categories for source diversity enforcement + let mut domain_counts: HashMap = HashMap::new(); + for items in all_scraped.values() { + for item in items { + if let Some(domain) = extract_domain(&item.url) { + *domain_counts.entry(domain).or_insert(0) += 1; + } + } + } + + let max_per_source = settings.max_articles_per_source as usize; + let mut added = 0usize; + + for article in all_overflow { + if added >= shortfall { + break; + } + // Enforce source diversity on overflow articles + if let Some(domain) = extract_domain(&article.url) { + let count = domain_counts.get(&domain).copied().unwrap_or(0); + if count >= max_per_source { + continue; + } + *domain_counts.entry(domain).or_insert(0) += 1; + } + all_scraped + .entry("category_autre".to_string()) + .or_default() + .push(article); + added += 1; + } + + if added > 0 { + tracing::info!(added = added, "Added overflow articles to Autre"); + } + } + if all_scraped.values().all(|items| items.is_empty()) { return Err(AppError::BadRequest( "Aucun article valide trouve. 
Verifiez vos sources et categories.".into(),
@@ -2571,4 +2630,63 @@ mod tests {
         let h2 = hash_article_url("https://example.com/article-2");
         assert_ne!(h1, h2);
     }
+
+    // ── fill-up calculation tests ───────────────────────────────
+
+    /// The fill-up target is 75% of the maximum article count, rounded up:
+    /// 4 categories x 4 items = 16 max, so the target is 12.
+    #[test]
+    fn fillup_target_calculation() {
+        // NOTE(review): literal 0.75 presumably mirrors SYNTHESIS_MIN_FILL_RATIO — confirm.
+        let max = 4 * 4;
+        let target = (0.75_f64 * max as f64).ceil() as usize;
+        assert_eq!(target, 12);
+    }
+
+    /// When the total already exceeds the target, `saturating_sub` clamps the
+    /// shortfall to 0 instead of underflowing `usize`.
+    #[test]
+    fn fillup_shortfall_saturating() {
+        let target: usize = 12;
+        let total: usize = 15;
+        let shortfall = target.saturating_sub(total);
+        assert_eq!(shortfall, 0);
+    }
+
+    /// Six articles are all assigned to "AI News" while `2` is passed as the
+    /// limit argument: the test expects 2 items under `category_0`, 2 under
+    /// `category_autre`, and the remaining 2 returned as overflow.
+    #[test]
+    fn classification_overflow_collected_when_all_full() {
+        use crate::models::synthesis::ScrapedNewsItem;
+
+        let articles: Vec<ScrapedNewsItem> = (0..6)
+            .map(|i| ScrapedNewsItem {
+                title: format!("Art{}", i),
+                url: format!("https://a.com/{}", i),
+                summary: "s".into(),
+                original_title: "t".into(),
+                scraped_content: "c".into(),
+            })
+            .collect();
+        let categories = vec!["AI News".to_string(), "Autre".to_string()];
+        // One assignment per article, every one targeting "AI News".
+        let response = serde_json::json!({
+            "assignments": (0..6)
+                .map(|i| serde_json::json!({"index": i, "category": "AI News"}))
+                .collect::<Vec<_>>()
+        });
+        let mut filled = HashMap::new();
+        let (result, overflow) =
+            parse_classification_response(&response, &articles, &categories, 2, &mut filled);
+
+        // AI News capped at 2, Autre gets 2, remaining 2 go to overflow
+        assert_eq!(result.get("category_0").map(|v| v.len()), Some(2));
+        assert_eq!(result.get("category_autre").map(|v| v.len()), Some(2));
+        assert_eq!(
+            overflow.len(),
+            2,
+            "2 articles should overflow when both categories are full"
+        );
+    }
 }