@ -491,7 +491,7 @@ async fn run_generation_inner(
. await ? ;
. await ? ;
// 1e. Parse classification and fill categories
// 1e. Parse classification and fill categories
let phase1_classified = parse_classification_response (
let ( phase1_classified , phase1_overflow ) = parse_classification_response (
& class_response ,
& class_response ,
& valid_articles ,
& valid_articles ,
& classification_categories ,
& classification_categories ,
@ -698,7 +698,7 @@ async fn run_generation_inner(
)
)
. await ? ;
. await ? ;
let phase2_classified = parse_classification_response (
let ( phase2_classified , phase2_overflow ) = parse_classification_response (
& class_response ,
& class_response ,
& phase2_articles ,
& phase2_articles ,
& classification_categories ,
& classification_categories ,
@ -770,6 +770,10 @@ async fn run_generation_inner(
// Helper Functions
// Helper Functions
// ───────────────────────────────────────────────────────────────────
// ───────────────────────────────────────────────────────────────────
/// Minimum fill ratio for synthesis, expressed as a fraction (0.75 = 75%) of the
/// maximum capacity. If the total article count falls below this ratio, overflow
/// articles are added to "Autre" to compensate.
///
/// NOTE(review): the code that applies this threshold is not visible next to this
/// definition — confirm how "maximum capacity" is computed at the use site.
const SYNTHESIS_MIN_FILL_RATIO : f64 = 0.75 ;
/// Recursively strip `\u0000` null bytes from JSON values.
/// Recursively strip `\u0000` null bytes from JSON values.
///
///
/// PostgreSQL rejects null bytes in JSONB text. LLM output occasionally
/// PostgreSQL rejects null bytes in JSONB text. LLM output occasionally
@ -1675,9 +1679,10 @@ fn parse_classification_response(
categories : & [ String ] ,
categories : & [ String ] ,
max_per_category : i32 ,
max_per_category : i32 ,
filled_counts : & mut HashMap < String , usize > ,
filled_counts : & mut HashMap < String , usize > ,
) -> HashMap< String , Vec < ScrapedNewsItem > > {
) -> ( HashMap< String , Vec < ScrapedNewsItem > > , Vec < ScrapedNewsItem > ) {
let max = max_per_category as usize ;
let max = max_per_category as usize ;
let mut result : HashMap < String , Vec < ScrapedNewsItem > > = HashMap ::new ( ) ;
let mut result : HashMap < String , Vec < ScrapedNewsItem > > = HashMap ::new ( ) ;
let mut overflow : Vec < ScrapedNewsItem > = Vec ::new ( ) ;
// Build category name → key mapping (case-insensitive)
// Build category name → key mapping (case-insensitive)
// "Autre" always maps to "category_autre"
// "Autre" always maps to "category_autre"
@ -1739,6 +1744,9 @@ fn parse_classification_response(
result . entry ( "category_autre" . to_string ( ) ) . or_default ( ) . push ( articles [ index ] . clone ( ) ) ;
result . entry ( "category_autre" . to_string ( ) ) . or_default ( ) . push ( articles [ index ] . clone ( ) ) ;
* filled_counts . entry ( "Autre" . to_string ( ) ) . or_insert ( 0 ) + = 1 ;
* filled_counts . entry ( "Autre" . to_string ( ) ) . or_insert ( 0 ) + = 1 ;
assigned_indices . insert ( index ) ;
assigned_indices . insert ( index ) ;
} else {
overflow . push ( articles [ index ] . clone ( ) ) ;
assigned_indices . insert ( index ) ;
}
}
continue ;
continue ;
}
}
@ -1755,11 +1763,13 @@ fn parse_classification_response(
if autre_filled < max {
if autre_filled < max {
result . entry ( "category_autre" . to_string ( ) ) . or_default ( ) . push ( article . clone ( ) ) ;
result . entry ( "category_autre" . to_string ( ) ) . or_default ( ) . push ( article . clone ( ) ) ;
* filled_counts . entry ( "Autre" . to_string ( ) ) . or_insert ( 0 ) + = 1 ;
* filled_counts . entry ( "Autre" . to_string ( ) ) . or_insert ( 0 ) + = 1 ;
} else {
overflow . push ( article . clone ( ) ) ;
}
}
}
}
}
}
result
( result , overflow )
}
}
#[ cfg(test) ]
#[ cfg(test) ]
@ -2415,7 +2425,7 @@ mod tests {
]
]
} ) ;
} ) ;
let mut filled = HashMap ::new ( ) ;
let mut filled = HashMap ::new ( ) ;
let result = parse_classification_response ( & response , & articles , & categories , 4 , & mut filled ) ;
let ( result , _overflow ) = parse_classification_response ( & response , & articles , & categories , 4 , & mut filled ) ;
assert_eq! ( result . get ( "category_0" ) . map ( | v | v . len ( ) ) , Some ( 1 ) ) ;
assert_eq! ( result . get ( "category_0" ) . map ( | v | v . len ( ) ) , Some ( 1 ) ) ;
assert_eq! ( result . get ( "category_autre" ) . map ( | v | v . len ( ) ) , Some ( 1 ) ) ;
assert_eq! ( result . get ( "category_autre" ) . map ( | v | v . len ( ) ) , Some ( 1 ) ) ;
}
}
@ -2431,7 +2441,7 @@ mod tests {
"assignments" : [ { "index" : 0 , "category" : "Unknown Category" } ]
"assignments" : [ { "index" : 0 , "category" : "Unknown Category" } ]
} ) ;
} ) ;
let mut filled = HashMap ::new ( ) ;
let mut filled = HashMap ::new ( ) ;
let result = parse_classification_response ( & response , & articles , & categories , 4 , & mut filled ) ;
let ( result , _overflow ) = parse_classification_response ( & response , & articles , & categories , 4 , & mut filled ) ;
assert_eq! ( result . get ( "category_autre" ) . map ( | v | v . len ( ) ) , Some ( 1 ) ) ;
assert_eq! ( result . get ( "category_autre" ) . map ( | v | v . len ( ) ) , Some ( 1 ) ) ;
}
}
@ -2447,9 +2457,12 @@ mod tests {
"assignments" : ( 0 .. 5 ) . map ( | i | serde_json ::json ! ( { "index" : i , "category" : "AI News" } ) ) . collect ::< Vec < _ > > ( )
"assignments" : ( 0 .. 5 ) . map ( | i | serde_json ::json ! ( { "index" : i , "category" : "AI News" } ) ) . collect ::< Vec < _ > > ( )
} ) ;
} ) ;
let mut filled = HashMap ::new ( ) ;
let mut filled = HashMap ::new ( ) ;
let result = parse_classification_response ( & response , & articles , & categories , 2 , & mut filled ) ;
let ( result , overflow ) = parse_classification_response ( & response , & articles , & categories , 2 , & mut filled ) ;
assert_eq! ( result . get ( "category_0" ) . map ( | v | v . len ( ) ) , Some ( 2 ) ) ;
assert_eq! ( result . get ( "category_0" ) . map ( | v | v . len ( ) ) , Some ( 2 ) ) ;
assert! ( result . get ( "category_autre" ) . map ( | v | v . len ( ) ) . unwrap_or ( 0 ) > 0 ) ;
assert_eq! ( result . get ( "category_autre" ) . map ( | v | v . len ( ) ) , Some ( 2 ) ) ;
// Article at index 4 couldn't fit in AI News (capped at 2) or Autre (capped at 2)
assert_eq! ( overflow . len ( ) , 1 ) ;
assert_eq! ( overflow [ 0 ] . title , "Art4" ) ;
}
}
#[ test ]
#[ test ]
@ -2463,7 +2476,7 @@ mod tests {
"assignments" : [ { "index" : 99 , "category" : "AI News" } ]
"assignments" : [ { "index" : 99 , "category" : "AI News" } ]
} ) ;
} ) ;
let mut filled = HashMap ::new ( ) ;
let mut filled = HashMap ::new ( ) ;
let result = parse_classification_response ( & response , & articles , & categories , 4 , & mut filled ) ;
let ( result , _overflow ) = parse_classification_response ( & response , & articles , & categories , 4 , & mut filled ) ;
// Index 99 is invalid → article 0 is unclassified → goes to Autre
// Index 99 is invalid → article 0 is unclassified → goes to Autre
assert_eq! ( result . get ( "category_autre" ) . map ( | v | v . len ( ) ) , Some ( 1 ) ) ;
assert_eq! ( result . get ( "category_autre" ) . map ( | v | v . len ( ) ) , Some ( 1 ) ) ;
}
}
@ -2479,7 +2492,7 @@ mod tests {
"assignments" : [ { "index" : 0 , "category" : "ai news" } ]
"assignments" : [ { "index" : 0 , "category" : "ai news" } ]
} ) ;
} ) ;
let mut filled = HashMap ::new ( ) ;
let mut filled = HashMap ::new ( ) ;
let result = parse_classification_response ( & response , & articles , & categories , 4 , & mut filled ) ;
let ( result , _overflow ) = parse_classification_response ( & response , & articles , & categories , 4 , & mut filled ) ;
assert_eq! ( result . get ( "category_0" ) . map ( | v | v . len ( ) ) , Some ( 1 ) ) ;
assert_eq! ( result . get ( "category_0" ) . map ( | v | v . len ( ) ) , Some ( 1 ) ) ;
}
}