feat: add "Articles sans date" category for articles without publication date

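Articles where neither the scraper nor the LLM could extract a date
are now placed in a separate "Articles sans date" section instead of
their classified category. This makes undated articles visible without
mixing them with properly dated content. Undated articles skip
`assign_category`, so the `max_items_per_category` limit is not applied
to them; they are still counted in `source_counts`.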

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
master
oabrivard 3 months ago
parent fb086a706f
commit a89c61c5b6

@ -522,6 +522,24 @@ pub async fn run_generation_inner(
} }
} }
let llm_date = class_response.get("date").and_then(|d| d.as_str()).filter(|s| !s.is_empty()).map(|s| s.to_string());
// Articles without any date go to "Articles sans date" category
if llm_date.is_none() {
let llm_title = class_response.get("title").and_then(|t| t.as_str()).unwrap_or(&page_title).to_string();
let llm_summary = class_response.get("summary").and_then(|s| s.as_str()).unwrap_or("").to_string();
article_scraped.entry("category_no_date".to_string()).or_default().push(NewsItem {
title: llm_title,
url: final_url.clone(),
summary: llm_summary,
date: None,
});
let source_domain = extract_domain(&source_url).unwrap_or_default();
*source_counts.entry(source_domain).or_insert(0) += 1;
continue;
}
let Some((final_cat_key, final_cat_name, llm_title, llm_summary)) = assign_category( let Some((final_cat_key, final_cat_name, llm_title, llm_summary)) = assign_category(
&class_response, &page_title, &user_categories, &classification_categories, &class_response, &page_title, &user_categories, &classification_categories,
&filled_counts, settings.max_items_per_category as usize, &filled_counts, settings.max_items_per_category as usize,
@ -529,7 +547,6 @@ pub async fn run_generation_inner(
continue; continue;
}; };
let llm_date = class_response.get("date").and_then(|d| d.as_str()).filter(|s| !s.is_empty()).map(|s| s.to_string());
article_scraped.entry(final_cat_key).or_default().push(NewsItem { article_scraped.entry(final_cat_key).or_default().push(NewsItem {
title: llm_title, title: llm_title,
url: final_url.clone(), url: final_url.clone(),
@ -747,6 +764,25 @@ pub async fn run_generation_inner(
} }
} }
let llm_date = class_response.get("date").and_then(|d| d.as_str()).filter(|s| !s.is_empty()).map(|s| s.to_string());
// Articles without any date go to "Articles sans date" category
if llm_date.is_none() {
let llm_title = class_response.get("title").and_then(|t| t.as_str()).unwrap_or(&page_title).to_string();
let llm_summary = class_response.get("summary").and_then(|s| s.as_str()).unwrap_or("").to_string();
article_scraped.entry("category_no_date".to_string()).or_default().push(NewsItem {
title: llm_title,
url: final_url.clone(),
summary: llm_summary,
date: None,
});
if let Some(domain) = extract_domain(&final_url) {
*source_counts.entry(domain).or_insert(0) += 1;
}
continue;
}
let Some((final_cat_key, final_cat_name, llm_title, llm_summary)) = assign_category( let Some((final_cat_key, final_cat_name, llm_title, llm_summary)) = assign_category(
&class_response, &page_title, &user_categories, &classification_categories, &class_response, &page_title, &user_categories, &classification_categories,
&filled_counts, settings.max_items_per_category as usize, &filled_counts, settings.max_items_per_category as usize,
@ -754,7 +790,6 @@ pub async fn run_generation_inner(
continue; continue;
}; };
let llm_date = class_response.get("date").and_then(|d| d.as_str()).filter(|s| !s.is_empty()).map(|s| s.to_string());
article_scraped.entry(final_cat_key).or_default().push(NewsItem { article_scraped.entry(final_cat_key).or_default().push(NewsItem {
title: llm_title, title: llm_title,
url: final_url.clone(), url: final_url.clone(),
@ -885,6 +920,11 @@ pub async fn run_generation_inner(
final_sections.push(NewsSection { title: "Divers".to_string(), items: autre_items.clone() }); final_sections.push(NewsSection { title: "Divers".to_string(), items: autre_items.clone() });
} }
} }
if let Some(no_date_items) = article_scraped.get("category_no_date") {
if !no_date_items.is_empty() {
final_sections.push(NewsSection { title: "Articles sans date".to_string(), items: no_date_items.clone() });
}
}
let sections_json = serde_json::to_value(&final_sections).map_err(|e| AppError::Internal(anyhow::anyhow!("Failed to serialize: {}", e)))?; let sections_json = serde_json::to_value(&final_sections).map_err(|e| AppError::Internal(anyhow::anyhow!("Failed to serialize: {}", e)))?;
let sections_json = sanitize_json_null_bytes(sections_json); let sections_json = sanitize_json_null_bytes(sections_json);

Loading…
Cancel
Save