feat: parse_classification_response collects overflow articles

Returns a (result, overflow) tuple so callers can access articles that
could not fit in any category or Autre. Also adds the
SYNTHESIS_MIN_FILL_RATIO constant for the upcoming fill-up logic.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
master
oabrivard 3 months ago
parent f5f0656604
commit c3e6103ef1

@ -491,7 +491,7 @@ async fn run_generation_inner(
.await?;
// 1e. Parse classification and fill categories
let phase1_classified = parse_classification_response(
let (phase1_classified, phase1_overflow) = parse_classification_response(
&class_response,
&valid_articles,
&classification_categories,
@ -698,7 +698,7 @@ async fn run_generation_inner(
)
.await?;
let phase2_classified = parse_classification_response(
let (phase2_classified, phase2_overflow) = parse_classification_response(
&class_response,
&phase2_articles,
&classification_categories,
@ -770,6 +770,10 @@ async fn run_generation_inner(
// Helper Functions
// ───────────────────────────────────────────────────────────────────
/// Minimum fill ratio for synthesis, expressed as a fraction (0.75 = 75%).
///
/// Intended use: if the total number of articles falls below this fraction of
/// the maximum capacity, overflow articles are added to "Autre" to compensate.
// NOTE(review): no code in this view reads the constant yet; the fill-up logic
// that consumes it is expected to land separately — confirm before relying on it.
const SYNTHESIS_MIN_FILL_RATIO: f64 = 0.75;
/// Recursively strip `\u0000` null bytes from JSON values.
///
/// PostgreSQL rejects null bytes in JSONB text. LLM output occasionally
@ -1675,9 +1679,10 @@ fn parse_classification_response(
categories: &[String],
max_per_category: i32,
filled_counts: &mut HashMap<String, usize>,
) -> HashMap<String, Vec<ScrapedNewsItem>> {
) -> (HashMap<String, Vec<ScrapedNewsItem>>, Vec<ScrapedNewsItem>) {
let max = max_per_category as usize;
let mut result: HashMap<String, Vec<ScrapedNewsItem>> = HashMap::new();
let mut overflow: Vec<ScrapedNewsItem> = Vec::new();
// Build category name → key mapping (case-insensitive)
// "Autre" always maps to "category_autre"
@ -1739,6 +1744,9 @@ fn parse_classification_response(
result.entry("category_autre".to_string()).or_default().push(articles[index].clone());
*filled_counts.entry("Autre".to_string()).or_insert(0) += 1;
assigned_indices.insert(index);
} else {
overflow.push(articles[index].clone());
assigned_indices.insert(index);
}
continue;
}
@ -1755,11 +1763,13 @@ fn parse_classification_response(
if autre_filled < max {
result.entry("category_autre".to_string()).or_default().push(article.clone());
*filled_counts.entry("Autre".to_string()).or_insert(0) += 1;
} else {
overflow.push(article.clone());
}
}
}
result
(result, overflow)
}
#[cfg(test)]
@ -2415,7 +2425,7 @@ mod tests {
]
});
let mut filled = HashMap::new();
let result = parse_classification_response(&response, &articles, &categories, 4, &mut filled);
let (result, _overflow) = parse_classification_response(&response, &articles, &categories, 4, &mut filled);
assert_eq!(result.get("category_0").map(|v| v.len()), Some(1));
assert_eq!(result.get("category_autre").map(|v| v.len()), Some(1));
}
@ -2431,7 +2441,7 @@ mod tests {
"assignments": [{"index": 0, "category": "Unknown Category"}]
});
let mut filled = HashMap::new();
let result = parse_classification_response(&response, &articles, &categories, 4, &mut filled);
let (result, _overflow) = parse_classification_response(&response, &articles, &categories, 4, &mut filled);
assert_eq!(result.get("category_autre").map(|v| v.len()), Some(1));
}
@ -2447,9 +2457,12 @@ mod tests {
"assignments": (0..5).map(|i| serde_json::json!({"index": i, "category": "AI News"})).collect::<Vec<_>>()
});
let mut filled = HashMap::new();
let result = parse_classification_response(&response, &articles, &categories, 2, &mut filled);
let (result, overflow) = parse_classification_response(&response, &articles, &categories, 2, &mut filled);
assert_eq!(result.get("category_0").map(|v| v.len()), Some(2));
assert!(result.get("category_autre").map(|v| v.len()).unwrap_or(0) > 0);
assert_eq!(result.get("category_autre").map(|v| v.len()), Some(2));
// Article at index 4 couldn't fit in AI News (capped at 2) or Autre (capped at 2)
assert_eq!(overflow.len(), 1);
assert_eq!(overflow[0].title, "Art4");
}
#[test]
@ -2463,7 +2476,7 @@ mod tests {
"assignments": [{"index": 99, "category": "AI News"}]
});
let mut filled = HashMap::new();
let result = parse_classification_response(&response, &articles, &categories, 4, &mut filled);
let (result, _overflow) = parse_classification_response(&response, &articles, &categories, 4, &mut filled);
// Index 99 is invalid → article 0 is unclassified → goes to Autre
assert_eq!(result.get("category_autre").map(|v| v.len()), Some(1));
}
@ -2479,7 +2492,7 @@ mod tests {
"assignments": [{"index": 0, "category": "ai news"}]
});
let mut filled = HashMap::new();
let result = parse_classification_response(&response, &articles, &categories, 4, &mut filled);
let (result, _overflow) = parse_classification_response(&response, &articles, &categories, 4, &mut filled);
assert_eq!(result.get("category_0").map(|v| v.len()), Some(1));
}

Loading…
Cancel
Save