diff --git a/backend/src/services/synthesis.rs b/backend/src/services/synthesis.rs index 3c99714..21290ca 100644 --- a/backend/src/services/synthesis.rs +++ b/backend/src/services/synthesis.rs @@ -491,7 +491,7 @@ async fn run_generation_inner( .await?; // 1e. Parse classification and fill categories - let phase1_classified = parse_classification_response( + let (phase1_classified, phase1_overflow) = parse_classification_response( &class_response, &valid_articles, &classification_categories, @@ -698,7 +698,7 @@ async fn run_generation_inner( ) .await?; - let phase2_classified = parse_classification_response( + let (phase2_classified, phase2_overflow) = parse_classification_response( &class_response, &phase2_articles, &classification_categories, @@ -770,6 +770,10 @@ async fn run_generation_inner( // Helper Functions // ─────────────────────────────────────────────────────────────────── +/// Minimum fill ratio for synthesis. If total articles are below this percentage +/// of the maximum capacity, overflow articles are added to "Autre" to compensate. +const SYNTHESIS_MIN_FILL_RATIO: f64 = 0.75; + /// Recursively strip `\u0000` null bytes from JSON values. /// /// PostgreSQL rejects null bytes in JSONB text. LLM output occasionally @@ -1675,9 +1679,10 @@ fn parse_classification_response( categories: &[String], max_per_category: i32, filled_counts: &mut HashMap, -) -> HashMap> { +) -> (HashMap>, Vec) { let max = max_per_category as usize; let mut result: HashMap> = HashMap::new(); + let mut overflow: Vec = Vec::new(); // Build category name → key mapping (case-insensitive) // "Autre" always maps to "category_autre" @@ -1739,6 +1744,9 @@ fn parse_classification_response( result.entry("category_autre".to_string()).or_default().push(articles[index].clone()); *filled_counts.entry("Autre".to_string()).or_insert(0) += 1; assigned_indices.insert(index); + } else { + overflow.push(articles[index].clone()); + assigned_indices.insert(index); } continue; } @@ -1755,11 +1763,13 @@ fn parse_classification_response( if autre_filled < max { result.entry("category_autre".to_string()).or_default().push(article.clone()); *filled_counts.entry("Autre".to_string()).or_insert(0) += 1; + } else { + overflow.push(article.clone()); } } } - result + (result, overflow) } #[cfg(test)] @@ -2415,7 +2425,7 @@ mod tests { ] }); let mut filled = HashMap::new(); - let result = parse_classification_response(&response, &articles, &categories, 4, &mut filled); + let (result, _overflow) = parse_classification_response(&response, &articles, &categories, 4, &mut filled); assert_eq!(result.get("category_0").map(|v| v.len()), Some(1)); assert_eq!(result.get("category_autre").map(|v| v.len()), Some(1)); } @@ -2431,7 +2441,7 @@ mod tests { "assignments": [{"index": 0, "category": "Unknown Category"}] }); let mut filled = HashMap::new(); - let result = parse_classification_response(&response, &articles, &categories, 4, &mut filled); + let (result, _overflow) = parse_classification_response(&response, &articles, &categories, 4, &mut filled); assert_eq!(result.get("category_autre").map(|v| v.len()), Some(1)); } @@ -2447,9 +2457,12 @@ mod tests { "assignments": (0..5).map(|i| serde_json::json!({"index": i, "category": "AI News"})).collect::>() }); let mut filled = HashMap::new(); - let result = parse_classification_response(&response, &articles, &categories, 2, &mut filled); + let (result, overflow) = parse_classification_response(&response, &articles, &categories, 2, &mut filled); assert_eq!(result.get("category_0").map(|v| v.len()), Some(2)); - assert!(result.get("category_autre").map(|v| v.len()).unwrap_or(0) > 0); + assert_eq!(result.get("category_autre").map(|v| v.len()), Some(2)); + // Article at index 4 couldn't fit in AI News (capped at 2) or Autre (capped at 2) + assert_eq!(overflow.len(), 1); + assert_eq!(overflow[0].title, "Art4"); } #[test] @@ -2463,7 +2476,7 @@ mod tests { "assignments": [{"index": 99, "category": "AI News"}] }); let mut filled = HashMap::new(); - let result = parse_classification_response(&response, &articles, &categories, 4, &mut filled); + let (result, _overflow) = parse_classification_response(&response, &articles, &categories, 4, &mut filled); // Index 99 is invalid → article 0 is unclassified → goes to Autre assert_eq!(result.get("category_autre").map(|v| v.len()), Some(1)); } @@ -2479,7 +2492,7 @@ mod tests { "assignments": [{"index": 0, "category": "ai news"}] }); let mut filled = HashMap::new(); - let result = parse_classification_response(&response, &articles, &categories, 4, &mut filled); + let (result, _overflow) = parse_classification_response(&response, &articles, &categories, 4, &mut filled); assert_eq!(result.get("category_0").map(|v| v.len()), Some(1)); }