From d508b5b4abc73d5c1b5d9a67d99f5bbd99daa681 Mon Sep 17 00:00:00 2001 From: oabrivard Date: Tue, 24 Mar 2026 01:47:30 +0100 Subject: [PATCH] feat: Autre category support in rewrite schema, final sections, URL restore + remove dead code Co-Authored-By: Claude Opus 4.6 (1M context) --- backend/src/services/synthesis.rs | 270 +++++++++--------------------- 1 file changed, 75 insertions(+), 195 deletions(-) diff --git a/backend/src/services/synthesis.rs b/backend/src/services/synthesis.rs index 1ff12b5..76c3311 100644 --- a/backend/src/services/synthesis.rs +++ b/backend/src/services/synthesis.rs @@ -569,9 +569,13 @@ fn build_rewrite_schema( let mut properties = serde_json::Map::new(); let mut required = Vec::new(); + // User categories for (i, cat_name) in categories.iter().enumerate() { let key = format!("category_{}", i); - let count = scraped.get(&key).map_or(0, |items| items.len() as i32).max(1); + let count = scraped.get(&key).map_or(0, |items| items.len() as i32); + if count == 0 { + continue; // Omit empty categories — no hallucinated articles + } properties.insert( key.clone(), serde_json::json!({ @@ -585,6 +589,24 @@ fn build_rewrite_schema( required.push(serde_json::Value::String(key)); } + // "Autre" category (if it has articles) + if let Some(autre_items) = scraped.get("category_autre") { + let count = autre_items.len() as i32; + if count > 0 { + properties.insert( + "category_autre".to_string(), + serde_json::json!({ + "type": "array", + "description": "Autre", + "items": news_item_schema, + "minItems": count, + "maxItems": count + }), + ); + required.push(serde_json::Value::String("category_autre".to_string())); + } + } + serde_json::json!({ "type": "object", "properties": properties, @@ -946,16 +968,28 @@ fn build_final_sections( ) -> Result, AppError> { let mut sections = Vec::new(); + // User categories for (i, cat_name) in categories.iter().enumerate() { let key = format!("category_{}", i); let items_val = 
raw.get(&key).cloned().unwrap_or(serde_json::json!([])); - let items: Vec = serde_json::from_value(items_val).unwrap_or_default(); + if !items.is_empty() { + sections.push(NewsSection { + title: cat_name.clone(), + items, + }); + } + } - sections.push(NewsSection { - title: cat_name.clone(), - items, - }); + // "Autre" category (if present in LLM output) + if let Some(autre_val) = raw.get("category_autre") { + let items: Vec = serde_json::from_value(autre_val.clone()).unwrap_or_default(); + if !items.is_empty() { + sections.push(NewsSection { + title: "Autre".to_string(), + items, + }); + } } Ok(sections) @@ -972,14 +1006,25 @@ fn restore_scraped_urls( scraped: &std::collections::HashMap>, categories: &[String], ) { - for (i, section) in sections.iter_mut().enumerate() { - let key = format!("category_{}", i); + for section in sections.iter_mut() { + // Determine the category key for this section + let key = if section.title == "Autre" { + "category_autre".to_string() + } else { + // Find the index of this category in the user categories list + categories + .iter() + .position(|c| c == &section.title) + .map(|i| format!("category_{}", i)) + .unwrap_or_default() + }; + if let Some(scraped_items) = scraped.get(&key) { for (j, item) in section.items.iter_mut().enumerate() { if let Some(scraped_item) = scraped_items.get(j) { if item.url != scraped_item.url { tracing::debug!( - category = %categories.get(i).unwrap_or(&key), + category = %section.title, original = %scraped_item.url, hallucinated = %item.url, "Restored hallucinated URL to scraped original" @@ -992,47 +1037,6 @@ fn restore_scraped_urls( } } -/// Minimum ratio of valid URLs (starting with `http`) required to skip the -/// scrape+rewrite pass and use the search pass results directly. -const URL_QUALITY_THRESHOLD: f64 = 0.70; - -/// Check whether the search pass produced sufficiently high-quality URLs. 
-/// -/// Returns `true` if more than 70% of the URLs across all categories start -/// with `http` (indicating they are real web URLs rather than hallucinated -/// or malformed references). -/// -/// If there are no articles at all, returns `false` to fall through to the -/// full pipeline. -fn url_quality_sufficient(parsed: &[(String, Vec)]) -> bool { - let mut total = 0usize; - let mut valid = 0usize; - - for (_cat_key, items) in parsed { - for item in items { - total += 1; - if item.url.starts_with("http") { - valid += 1; - } - } - } - - if total == 0 { - return false; - } - - let ratio = valid as f64 / total as f64; - tracing::debug!( - total_urls = total, - valid_urls = valid, - ratio = ratio, - threshold = URL_QUALITY_THRESHOLD, - "URL quality check" - ); - - ratio >= URL_QUALITY_THRESHOLD -} - /// Sanitize error messages to prevent leaking sensitive information. /// /// Removes potential API keys, internal paths, and other sensitive data. @@ -1367,11 +1371,29 @@ mod tests { let categories = vec!["Annonces majeures".into(), "Recherche".into()]; let sections = build_final_sections(&raw, &categories).unwrap(); - assert_eq!(sections.len(), 2); + // Only 1 section — empty categories are omitted + assert_eq!(sections.len(), 1); assert_eq!(sections[0].title, "Annonces majeures"); assert_eq!(sections[0].items.len(), 1); - assert_eq!(sections[1].title, "Recherche"); - assert_eq!(sections[1].items.len(), 0); + } + + #[test] + fn build_final_sections_includes_autre() { + let raw = serde_json::json!({ + "category_0": [ + {"title": "A", "url": "https://a.com", "summary": "s"} + ], + "category_autre": [ + {"title": "B", "url": "https://b.com", "summary": "s"} + ] + }); + + let categories = vec!["AI News".into()]; + let sections = build_final_sections(&raw, &categories).unwrap(); + + assert_eq!(sections.len(), 2); + assert_eq!(sections[0].title, "AI News"); + assert_eq!(sections[1].title, "Autre"); } // ── sanitize_error_message tests ───────────────────────────── @@ 
-1415,148 +1437,6 @@ mod tests { assert_eq!(sanitized, msg); } - // ── url_quality_sufficient tests ──────────────────────────── - - #[test] - fn url_quality_all_valid_urls() { - let parsed = vec![ - ( - "category_0".into(), - vec![ - NewsItem { - title: "A".into(), - url: "https://example.com/a".into(), - summary: "Sum A".into(), - }, - NewsItem { - title: "B".into(), - url: "https://example.com/b".into(), - summary: "Sum B".into(), - }, - ], - ), - ( - "category_1".into(), - vec![NewsItem { - title: "C".into(), - url: "http://example.org/c".into(), - summary: "Sum C".into(), - }], - ), - ]; - - // 3/3 = 100% valid -> true - assert!(url_quality_sufficient(&parsed)); - } - - #[test] - fn url_quality_above_threshold() { - // 8 valid out of 10 = 80% > 70% - let mut items = Vec::new(); - for i in 0..8 { - items.push(NewsItem { - title: format!("Art {}", i), - url: format!("https://example.com/{}", i), - summary: "Sum".into(), - }); - } - for i in 8..10 { - items.push(NewsItem { - title: format!("Art {}", i), - url: format!("bad-url-{}", i), - summary: "Sum".into(), - }); - } - - let parsed = vec![("category_0".into(), items)]; - assert!(url_quality_sufficient(&parsed)); - } - - #[test] - fn url_quality_exactly_at_threshold() { - // 7 valid out of 10 = 70% >= 70% - let mut items = Vec::new(); - for i in 0..7 { - items.push(NewsItem { - title: format!("Art {}", i), - url: format!("https://example.com/{}", i), - summary: "Sum".into(), - }); - } - for i in 7..10 { - items.push(NewsItem { - title: format!("Art {}", i), - url: format!("bad-url-{}", i), - summary: "Sum".into(), - }); - } - - let parsed = vec![("category_0".into(), items)]; - assert!(url_quality_sufficient(&parsed)); - } - - #[test] - fn url_quality_below_threshold() { - // 6 valid out of 10 = 60% < 70% - let mut items = Vec::new(); - for i in 0..6 { - items.push(NewsItem { - title: format!("Art {}", i), - url: format!("https://example.com/{}", i), - summary: "Sum".into(), - }); - } - for i in 6..10 { - 
items.push(NewsItem { - title: format!("Art {}", i), - url: format!("no-protocol-{}", i), - summary: "Sum".into(), - }); - } - - let parsed = vec![("category_0".into(), items)]; - assert!(!url_quality_sufficient(&parsed)); - } - - #[test] - fn url_quality_all_invalid_urls() { - let parsed = vec![( - "category_0".into(), - vec![ - NewsItem { - title: "A".into(), - url: "not-a-url".into(), - summary: "Sum".into(), - }, - NewsItem { - title: "B".into(), - url: "also-not-a-url".into(), - summary: "Sum".into(), - }, - ], - )]; - - // 0/2 = 0% -> false - assert!(!url_quality_sufficient(&parsed)); - } - - #[test] - fn url_quality_empty_articles() { - let parsed: Vec<(String, Vec)> = vec![ - ("category_0".into(), vec![]), - ("category_1".into(), vec![]), - ]; - - // No articles -> false (fall through to full pipeline) - assert!(!url_quality_sufficient(&parsed)); - } - - #[test] - fn url_quality_empty_categories() { - let parsed: Vec<(String, Vec)> = vec![]; - assert!(!url_quality_sufficient(&parsed)); - } - // ── filter_homepage_urls tests ────────────────────────────── #[test]