|
|
|
@ -569,9 +569,13 @@ fn build_rewrite_schema(
|
|
|
|
let mut properties = serde_json::Map::new();
|
|
|
|
let mut properties = serde_json::Map::new();
|
|
|
|
let mut required = Vec::new();
|
|
|
|
let mut required = Vec::new();
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// User categories
|
|
|
|
for (i, cat_name) in categories.iter().enumerate() {
|
|
|
|
for (i, cat_name) in categories.iter().enumerate() {
|
|
|
|
let key = format!("category_{}", i);
|
|
|
|
let key = format!("category_{}", i);
|
|
|
|
let count = scraped.get(&key).map_or(0, |items| items.len() as i32).max(1);
|
|
|
|
let count = scraped.get(&key).map_or(0, |items| items.len() as i32);
|
|
|
|
|
|
|
|
if count == 0 {
|
|
|
|
|
|
|
|
continue; // Omit empty categories — no hallucinated articles
|
|
|
|
|
|
|
|
}
|
|
|
|
properties.insert(
|
|
|
|
properties.insert(
|
|
|
|
key.clone(),
|
|
|
|
key.clone(),
|
|
|
|
serde_json::json!({
|
|
|
|
serde_json::json!({
|
|
|
|
@ -585,6 +589,24 @@ fn build_rewrite_schema(
|
|
|
|
required.push(serde_json::Value::String(key));
|
|
|
|
required.push(serde_json::Value::String(key));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// "Autre" category (if it has articles)
|
|
|
|
|
|
|
|
if let Some(autre_items) = scraped.get("category_autre") {
|
|
|
|
|
|
|
|
let count = autre_items.len() as i32;
|
|
|
|
|
|
|
|
if count > 0 {
|
|
|
|
|
|
|
|
properties.insert(
|
|
|
|
|
|
|
|
"category_autre".to_string(),
|
|
|
|
|
|
|
|
serde_json::json!({
|
|
|
|
|
|
|
|
"type": "array",
|
|
|
|
|
|
|
|
"description": "Autre",
|
|
|
|
|
|
|
|
"items": news_item_schema,
|
|
|
|
|
|
|
|
"minItems": count,
|
|
|
|
|
|
|
|
"maxItems": count
|
|
|
|
|
|
|
|
}),
|
|
|
|
|
|
|
|
);
|
|
|
|
|
|
|
|
required.push(serde_json::Value::String("category_autre".to_string()));
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
serde_json::json!({
|
|
|
|
serde_json::json!({
|
|
|
|
"type": "object",
|
|
|
|
"type": "object",
|
|
|
|
"properties": properties,
|
|
|
|
"properties": properties,
|
|
|
|
@ -946,17 +968,29 @@ fn build_final_sections(
|
|
|
|
) -> Result<Vec<NewsSection>, AppError> {
|
|
|
|
) -> Result<Vec<NewsSection>, AppError> {
|
|
|
|
let mut sections = Vec::new();
|
|
|
|
let mut sections = Vec::new();
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// User categories
|
|
|
|
for (i, cat_name) in categories.iter().enumerate() {
|
|
|
|
for (i, cat_name) in categories.iter().enumerate() {
|
|
|
|
let key = format!("category_{}", i);
|
|
|
|
let key = format!("category_{}", i);
|
|
|
|
let items_val = raw.get(&key).cloned().unwrap_or(serde_json::json!([]));
|
|
|
|
let items_val = raw.get(&key).cloned().unwrap_or(serde_json::json!([]));
|
|
|
|
|
|
|
|
|
|
|
|
let items: Vec<NewsItem> = serde_json::from_value(items_val).unwrap_or_default();
|
|
|
|
let items: Vec<NewsItem> = serde_json::from_value(items_val).unwrap_or_default();
|
|
|
|
|
|
|
|
if !items.is_empty() {
|
|
|
|
sections.push(NewsSection {
|
|
|
|
sections.push(NewsSection {
|
|
|
|
title: cat_name.clone(),
|
|
|
|
title: cat_name.clone(),
|
|
|
|
items,
|
|
|
|
items,
|
|
|
|
});
|
|
|
|
});
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// "Autre" category (if present in LLM output)
|
|
|
|
|
|
|
|
if let Some(autre_val) = raw.get("category_autre") {
|
|
|
|
|
|
|
|
let items: Vec<NewsItem> = serde_json::from_value(autre_val.clone()).unwrap_or_default();
|
|
|
|
|
|
|
|
if !items.is_empty() {
|
|
|
|
|
|
|
|
sections.push(NewsSection {
|
|
|
|
|
|
|
|
title: "Autre".to_string(),
|
|
|
|
|
|
|
|
items,
|
|
|
|
|
|
|
|
});
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
Ok(sections)
|
|
|
|
Ok(sections)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
@ -972,14 +1006,25 @@ fn restore_scraped_urls(
|
|
|
|
scraped: &std::collections::HashMap<String, Vec<ScrapedNewsItem>>,
|
|
|
|
scraped: &std::collections::HashMap<String, Vec<ScrapedNewsItem>>,
|
|
|
|
categories: &[String],
|
|
|
|
categories: &[String],
|
|
|
|
) {
|
|
|
|
) {
|
|
|
|
for (i, section) in sections.iter_mut().enumerate() {
|
|
|
|
for section in sections.iter_mut() {
|
|
|
|
let key = format!("category_{}", i);
|
|
|
|
// Determine the category key for this section
|
|
|
|
|
|
|
|
let key = if section.title == "Autre" {
|
|
|
|
|
|
|
|
"category_autre".to_string()
|
|
|
|
|
|
|
|
} else {
|
|
|
|
|
|
|
|
// Find the index of this category in the user categories list
|
|
|
|
|
|
|
|
categories
|
|
|
|
|
|
|
|
.iter()
|
|
|
|
|
|
|
|
.position(|c| c == &section.title)
|
|
|
|
|
|
|
|
.map(|i| format!("category_{}", i))
|
|
|
|
|
|
|
|
.unwrap_or_default()
|
|
|
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
if let Some(scraped_items) = scraped.get(&key) {
|
|
|
|
if let Some(scraped_items) = scraped.get(&key) {
|
|
|
|
for (j, item) in section.items.iter_mut().enumerate() {
|
|
|
|
for (j, item) in section.items.iter_mut().enumerate() {
|
|
|
|
if let Some(scraped_item) = scraped_items.get(j) {
|
|
|
|
if let Some(scraped_item) = scraped_items.get(j) {
|
|
|
|
if item.url != scraped_item.url {
|
|
|
|
if item.url != scraped_item.url {
|
|
|
|
tracing::debug!(
|
|
|
|
tracing::debug!(
|
|
|
|
category = %categories.get(i).unwrap_or(&key),
|
|
|
|
category = %section.title,
|
|
|
|
original = %scraped_item.url,
|
|
|
|
original = %scraped_item.url,
|
|
|
|
hallucinated = %item.url,
|
|
|
|
hallucinated = %item.url,
|
|
|
|
"Restored hallucinated URL to scraped original"
|
|
|
|
"Restored hallucinated URL to scraped original"
|
|
|
|
@ -992,47 +1037,6 @@ fn restore_scraped_urls(
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/// Minimum ratio of valid URLs (starting with `http`) required to skip the
|
|
|
|
|
|
|
|
/// scrape+rewrite pass and use the search pass results directly.
|
|
|
|
|
|
|
|
const URL_QUALITY_THRESHOLD: f64 = 0.70;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/// Check whether the search pass produced sufficiently high-quality URLs.
|
|
|
|
|
|
|
|
///
|
|
|
|
|
|
|
|
/// Returns `true` if more than 70% of the URLs across all categories start
|
|
|
|
|
|
|
|
/// with `http` (indicating they are real web URLs rather than hallucinated
|
|
|
|
|
|
|
|
/// or malformed references).
|
|
|
|
|
|
|
|
///
|
|
|
|
|
|
|
|
/// If there are no articles at all, returns `false` to fall through to the
|
|
|
|
|
|
|
|
/// full pipeline.
|
|
|
|
|
|
|
|
fn url_quality_sufficient(parsed: &[(String, Vec<NewsItem>)]) -> bool {
|
|
|
|
|
|
|
|
let mut total = 0usize;
|
|
|
|
|
|
|
|
let mut valid = 0usize;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
for (_cat_key, items) in parsed {
|
|
|
|
|
|
|
|
for item in items {
|
|
|
|
|
|
|
|
total += 1;
|
|
|
|
|
|
|
|
if item.url.starts_with("http") {
|
|
|
|
|
|
|
|
valid += 1;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if total == 0 {
|
|
|
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
let ratio = valid as f64 / total as f64;
|
|
|
|
|
|
|
|
tracing::debug!(
|
|
|
|
|
|
|
|
total_urls = total,
|
|
|
|
|
|
|
|
valid_urls = valid,
|
|
|
|
|
|
|
|
ratio = ratio,
|
|
|
|
|
|
|
|
threshold = URL_QUALITY_THRESHOLD,
|
|
|
|
|
|
|
|
"URL quality check"
|
|
|
|
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ratio >= URL_QUALITY_THRESHOLD
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/// Sanitize error messages to prevent leaking sensitive information.
|
|
|
|
/// Sanitize error messages to prevent leaking sensitive information.
|
|
|
|
///
|
|
|
|
///
|
|
|
|
/// Removes potential API keys, internal paths, and other sensitive data.
|
|
|
|
/// Removes potential API keys, internal paths, and other sensitive data.
|
|
|
|
@ -1367,11 +1371,29 @@ mod tests {
|
|
|
|
let categories = vec!["Annonces majeures".into(), "Recherche".into()];
|
|
|
|
let categories = vec!["Annonces majeures".into(), "Recherche".into()];
|
|
|
|
let sections = build_final_sections(&raw, &categories).unwrap();
|
|
|
|
let sections = build_final_sections(&raw, &categories).unwrap();
|
|
|
|
|
|
|
|
|
|
|
|
assert_eq!(sections.len(), 2);
|
|
|
|
// Only 1 section — empty categories are omitted
|
|
|
|
|
|
|
|
assert_eq!(sections.len(), 1);
|
|
|
|
assert_eq!(sections[0].title, "Annonces majeures");
|
|
|
|
assert_eq!(sections[0].title, "Annonces majeures");
|
|
|
|
assert_eq!(sections[0].items.len(), 1);
|
|
|
|
assert_eq!(sections[0].items.len(), 1);
|
|
|
|
assert_eq!(sections[1].title, "Recherche");
|
|
|
|
}
|
|
|
|
assert_eq!(sections[1].items.len(), 0);
|
|
|
|
|
|
|
|
|
|
|
|
#[test]
fn build_final_sections_includes_autre() {
    // One user category plus the catch-all "category_autre" bucket, each
    // holding a single article.
    let categories = vec!["AI News".into()];
    let raw = serde_json::json!({
        "category_0": [
            {"title": "A", "url": "https://a.com", "summary": "s"}
        ],
        "category_autre": [
            {"title": "B", "url": "https://b.com", "summary": "s"}
        ]
    });

    let sections = build_final_sections(&raw, &categories).unwrap();

    // The user category comes first, followed by the "Autre" section.
    assert_eq!(sections.len(), 2);
    let first_title = &sections[0].title;
    let second_title = &sections[1].title;
    assert_eq!(*first_title, "AI News");
    assert_eq!(*second_title, "Autre");
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// ── sanitize_error_message tests ─────────────────────────────
|
|
|
|
// ── sanitize_error_message tests ─────────────────────────────
|
|
|
|
@ -1415,148 +1437,6 @@ mod tests {
|
|
|
|
assert_eq!(sanitized, msg);
|
|
|
|
assert_eq!(sanitized, msg);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// ── url_quality_sufficient tests ────────────────────────────
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#[test]
fn url_quality_all_valid_urls() {
    // Two articles in the first category, one in the second; every URL is
    // an http(s) URL.
    let first_category = vec![
        NewsItem {
            title: "A".into(),
            url: "https://example.com/a".into(),
            summary: "Sum A".into(),
        },
        NewsItem {
            title: "B".into(),
            url: "https://example.com/b".into(),
            summary: "Sum B".into(),
        },
    ];
    let second_category = vec![NewsItem {
        title: "C".into(),
        url: "http://example.org/c".into(),
        summary: "Sum C".into(),
    }];

    let parsed = vec![
        (String::from("category_0"), first_category),
        (String::from("category_1"), second_category),
    ];

    // 3/3 = 100% valid -> true
    let sufficient = url_quality_sufficient(&parsed);
    assert!(sufficient);
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#[test]
fn url_quality_above_threshold() {
    // 8 valid out of 10 = 80% > 70%: articles 0..8 get a real URL, 8..10 a
    // malformed one.
    let items: Vec<NewsItem> = (0..10)
        .map(|idx| {
            let url = if idx < 8 {
                format!("https://example.com/{}", idx)
            } else {
                format!("bad-url-{}", idx)
            };
            NewsItem {
                title: format!("Art {}", idx),
                url,
                summary: "Sum".into(),
            }
        })
        .collect();

    let parsed = vec![(String::from("category_0"), items)];
    assert!(url_quality_sufficient(&parsed));
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#[test]
fn url_quality_exactly_at_threshold() {
    // 7 valid out of 10 = 70% >= 70%: the threshold comparison is inclusive.
    let items: Vec<NewsItem> = (0..10)
        .map(|idx| {
            let url = if idx < 7 {
                format!("https://example.com/{}", idx)
            } else {
                format!("bad-url-{}", idx)
            };
            NewsItem {
                title: format!("Art {}", idx),
                url,
                summary: "Sum".into(),
            }
        })
        .collect();

    let parsed = vec![(String::from("category_0"), items)];
    assert!(url_quality_sufficient(&parsed));
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#[test]
fn url_quality_below_threshold() {
    // 6 valid out of 10 = 60% < 70%: articles 6..10 carry no protocol.
    let items: Vec<NewsItem> = (0..10)
        .map(|idx| {
            let url = if idx < 6 {
                format!("https://example.com/{}", idx)
            } else {
                format!("no-protocol-{}", idx)
            };
            NewsItem {
                title: format!("Art {}", idx),
                url,
                summary: "Sum".into(),
            }
        })
        .collect();

    let parsed = vec![(String::from("category_0"), items)];
    let sufficient = url_quality_sufficient(&parsed);
    assert!(!sufficient);
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#[test]
fn url_quality_all_invalid_urls() {
    // A single category whose two articles both lack a usable URL.
    let broken_items = vec![
        NewsItem {
            title: "A".into(),
            url: "not-a-url".into(),
            summary: "Sum".into(),
        },
        NewsItem {
            title: "B".into(),
            url: "also-not-a-url".into(),
            summary: "Sum".into(),
        },
    ];
    let parsed = vec![(String::from("category_0"), broken_items)];

    // 0/2 = 0% -> false
    let sufficient = url_quality_sufficient(&parsed);
    assert!(!sufficient);
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#[test]
fn url_quality_empty_articles() {
    // Two categories are present, but neither contains any article.
    let parsed: Vec<(String, Vec<NewsItem>)> = ["category_0", "category_1"]
        .iter()
        .map(|key| (key.to_string(), Vec::new()))
        .collect();

    // No articles -> false (fall through to full pipeline)
    let sufficient = url_quality_sufficient(&parsed);
    assert!(!sufficient);
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#[test]
fn url_quality_empty_categories() {
    // With no categories at all there is nothing to count, so the check
    // must report insufficient quality.
    let parsed: Vec<(String, Vec<NewsItem>)> = Vec::new();
    let sufficient = url_quality_sufficient(&parsed);
    assert!(!sufficient);
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// ── filter_homepage_urls tests ──────────────────────────────
|
|
|
|
// ── filter_homepage_urls tests ──────────────────────────────
|
|
|
|
|
|
|
|
|
|
|
|
#[test]
|
|
|
|
#[test]
|
|
|
|
|