feat: Autre category support in rewrite schema, final sections, URL restore + remove dead code

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
master
oabrivard 3 months ago
parent ba7024e280
commit d508b5b4ab

@ -569,9 +569,13 @@ fn build_rewrite_schema(
let mut properties = serde_json::Map::new();
let mut required = Vec::new();
// User categories
for (i, cat_name) in categories.iter().enumerate() {
let key = format!("category_{}", i);
let count = scraped.get(&key).map_or(0, |items| items.len() as i32).max(1);
let count = scraped.get(&key).map_or(0, |items| items.len() as i32);
if count == 0 {
continue; // Omit empty categories — no hallucinated articles
}
properties.insert(
key.clone(),
serde_json::json!({
@ -585,6 +589,24 @@ fn build_rewrite_schema(
required.push(serde_json::Value::String(key));
}
// "Autre" category (if it has articles)
if let Some(autre_items) = scraped.get("category_autre") {
let count = autre_items.len() as i32;
if count > 0 {
properties.insert(
"category_autre".to_string(),
serde_json::json!({
"type": "array",
"description": "Autre",
"items": news_item_schema,
"minItems": count,
"maxItems": count
}),
);
required.push(serde_json::Value::String("category_autre".to_string()));
}
}
serde_json::json!({
"type": "object",
"properties": properties,
@ -946,17 +968,29 @@ fn build_final_sections(
) -> Result<Vec<NewsSection>, AppError> {
let mut sections = Vec::new();
// User categories
for (i, cat_name) in categories.iter().enumerate() {
let key = format!("category_{}", i);
let items_val = raw.get(&key).cloned().unwrap_or(serde_json::json!([]));
let items: Vec<NewsItem> = serde_json::from_value(items_val).unwrap_or_default();
if !items.is_empty() {
sections.push(NewsSection {
title: cat_name.clone(),
items,
});
}
}
// "Autre" category (if present in LLM output)
if let Some(autre_val) = raw.get("category_autre") {
let items: Vec<NewsItem> = serde_json::from_value(autre_val.clone()).unwrap_or_default();
if !items.is_empty() {
sections.push(NewsSection {
title: "Autre".to_string(),
items,
});
}
}
Ok(sections)
}
@ -972,14 +1006,25 @@ fn restore_scraped_urls(
scraped: &std::collections::HashMap<String, Vec<ScrapedNewsItem>>,
categories: &[String],
) {
for (i, section) in sections.iter_mut().enumerate() {
let key = format!("category_{}", i);
for section in sections.iter_mut() {
// Determine the category key for this section
let key = if section.title == "Autre" {
"category_autre".to_string()
} else {
// Find the index of this category in the user categories list
categories
.iter()
.position(|c| c == &section.title)
.map(|i| format!("category_{}", i))
.unwrap_or_default()
};
if let Some(scraped_items) = scraped.get(&key) {
for (j, item) in section.items.iter_mut().enumerate() {
if let Some(scraped_item) = scraped_items.get(j) {
if item.url != scraped_item.url {
tracing::debug!(
category = %categories.get(i).unwrap_or(&key),
category = %section.title,
original = %scraped_item.url,
hallucinated = %item.url,
"Restored hallucinated URL to scraped original"
@ -992,47 +1037,6 @@ fn restore_scraped_urls(
}
}
/// Minimum ratio of valid URLs (starting with `http`) required to skip the
/// scrape+rewrite pass and use the search pass results directly.
///
/// `url_quality_sufficient` compares against this value inclusively (`>=`),
/// so a ratio of exactly 0.70 meets the threshold.
const URL_QUALITY_THRESHOLD: f64 = 0.70;
/// Decide whether the URLs in `parsed` are good enough to be used as-is.
///
/// Every article in every category is counted; an article's URL is "valid"
/// when it starts with `http`. The function returns `true` when the share of
/// valid URLs is at least [`URL_QUALITY_THRESHOLD`] (the comparison is
/// inclusive, so a ratio exactly equal to the threshold passes).
///
/// When `parsed` contains no articles at all the result is `false`.
fn url_quality_sufficient(parsed: &[(String, Vec<NewsItem>)]) -> bool {
    // Single pass over all articles: (articles seen, articles with an http URL).
    let (url_count, http_count) = parsed
        .iter()
        .flat_map(|(_, items)| items.iter())
        .fold((0usize, 0usize), |(seen, ok), item| {
            (seen + 1, ok + usize::from(item.url.starts_with("http")))
        });

    // No articles: report insufficient quality (also avoids dividing by zero).
    if url_count == 0 {
        return false;
    }

    let ratio = http_count as f64 / url_count as f64;

    tracing::debug!(
        total_urls = url_count,
        valid_urls = http_count,
        ratio = ratio,
        threshold = URL_QUALITY_THRESHOLD,
        "URL quality check"
    );

    ratio >= URL_QUALITY_THRESHOLD
}
/// Sanitize error messages to prevent leaking sensitive information.
///
/// Removes potential API keys, internal paths, and other sensitive data.
@ -1367,11 +1371,29 @@ mod tests {
let categories = vec!["Annonces majeures".into(), "Recherche".into()];
let sections = build_final_sections(&raw, &categories).unwrap();
assert_eq!(sections.len(), 2);
// Only 1 section — empty categories are omitted
assert_eq!(sections.len(), 1);
assert_eq!(sections[0].title, "Annonces majeures");
assert_eq!(sections[0].items.len(), 1);
assert_eq!(sections[1].title, "Recherche");
assert_eq!(sections[1].items.len(), 0);
}
/// The "Autre" section is emitted after the user categories when the raw
/// output contains a non-empty `category_autre` array.
#[test]
fn build_final_sections_includes_autre() {
    let categories = vec!["AI News".into()];
    let raw = serde_json::json!({
        "category_0": [{"title": "A", "url": "https://a.com", "summary": "s"}],
        "category_autre": [{"title": "B", "url": "https://b.com", "summary": "s"}]
    });

    let sections = build_final_sections(&raw, &categories).unwrap();

    // Exactly two sections, in order: the user category, then "Autre".
    let titles: Vec<&str> = sections
        .iter()
        .map(|section| section.title.as_str())
        .collect();
    assert_eq!(titles, ["AI News", "Autre"]);
}
// ── sanitize_error_message tests ─────────────────────────────
@ -1415,148 +1437,6 @@ mod tests {
assert_eq!(sanitized, msg);
}
// ── url_quality_sufficient tests ────────────────────────────
/// All URLs across two categories start with `http`, so the check passes.
#[test]
fn url_quality_all_valid_urls() {
    // Small constructor to keep the fixture readable.
    let article = |title: &str, url: &str, summary: &str| NewsItem {
        title: title.into(),
        url: url.into(),
        summary: summary.into(),
    };

    let first_category = vec![
        article("A", "https://example.com/a", "Sum A"),
        article("B", "https://example.com/b", "Sum B"),
    ];
    let second_category = vec![article("C", "http://example.org/c", "Sum C")];

    let parsed = vec![
        (String::from("category_0"), first_category),
        (String::from("category_1"), second_category),
    ];

    // 3 of 3 valid (100%) -> sufficient.
    assert!(url_quality_sufficient(&parsed));
}
/// 8 valid URLs out of 10 (80%) is above the 70% threshold.
#[test]
fn url_quality_above_threshold() {
    // Articles 0..8 get a real-looking URL, articles 8..10 a malformed one.
    let items: Vec<NewsItem> = (0..10)
        .map(|i| NewsItem {
            title: format!("Art {}", i),
            url: if i < 8 {
                format!("https://example.com/{}", i)
            } else {
                format!("bad-url-{}", i)
            },
            summary: "Sum".into(),
        })
        .collect();

    let parsed = vec![(String::from("category_0"), items)];
    assert!(url_quality_sufficient(&parsed));
}
/// 7 valid URLs out of 10 is exactly 70%; the comparison is inclusive, so it passes.
#[test]
fn url_quality_exactly_at_threshold() {
    // Articles 0..7 get a real-looking URL, articles 7..10 a malformed one.
    let items: Vec<NewsItem> = (0..10)
        .map(|i| NewsItem {
            title: format!("Art {}", i),
            url: if i < 7 {
                format!("https://example.com/{}", i)
            } else {
                format!("bad-url-{}", i)
            },
            summary: "Sum".into(),
        })
        .collect();

    let parsed = vec![(String::from("category_0"), items)];
    assert!(url_quality_sufficient(&parsed));
}
/// 6 valid URLs out of 10 (60%) is below the 70% threshold.
#[test]
fn url_quality_below_threshold() {
    // Articles 0..6 get a real-looking URL, articles 6..10 lack a protocol.
    let items: Vec<NewsItem> = (0..10)
        .map(|i| NewsItem {
            title: format!("Art {}", i),
            url: if i < 6 {
                format!("https://example.com/{}", i)
            } else {
                format!("no-protocol-{}", i)
            },
            summary: "Sum".into(),
        })
        .collect();

    let parsed = vec![(String::from("category_0"), items)];
    assert!(!url_quality_sufficient(&parsed));
}
/// No URL starts with `http`, so the check fails.
#[test]
fn url_quality_all_invalid_urls() {
    let items: Vec<NewsItem> = [("A", "not-a-url"), ("B", "also-not-a-url")]
        .iter()
        .map(|&(title, url)| NewsItem {
            title: title.into(),
            url: url.into(),
            summary: "Sum".into(),
        })
        .collect();

    let parsed = vec![(String::from("category_0"), items)];

    // 0 of 2 valid (0%) -> insufficient.
    assert!(!url_quality_sufficient(&parsed));
}
/// Categories exist but hold no articles: the check reports `false`.
#[test]
fn url_quality_empty_articles() {
    let parsed: Vec<(String, Vec<NewsItem>)> = ["category_0", "category_1"]
        .iter()
        .map(|key| (key.to_string(), Vec::new()))
        .collect();

    // Zero articles in total -> false.
    assert!(!url_quality_sufficient(&parsed));
}
/// With no categories at all there are no articles, so the check reports `false`.
#[test]
fn url_quality_empty_categories() {
    let parsed = Vec::<(String, Vec<NewsItem>)>::new();
    assert!(!url_quality_sufficient(&parsed));
}
// ── filter_homepage_urls tests ──────────────────────────────
#[test]

Loading…
Cancel
Save