feat: LLM extracts publication date as fallback for article age filtering

The classify prompt now asks the LLM to return a date field (YYYY-MM-DD,
or an empty string when the date is unknown). When the scraper cannot find
a publication date, the LLM-extracted date is used as a fallback to filter
out articles older than max_age_days.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
master
oabrivard 3 months ago
parent 91272ddfc4
commit de25a08d51

@ -88,10 +88,11 @@ pub fn build_article_classify_schema() -> Value {
"type": "object", "type": "object",
"properties": { "properties": {
"title": { "type": "string", "description": "Article title" }, "title": { "type": "string", "description": "Article title" },
"summary": { "type": "string", "description": "4-5 line summary of the article" }, "summary": { "type": "string", "description": "Summary of the article" },
"category": { "type": "string", "description": "Category name from the provided list" } "category": { "type": "string", "description": "Category name from the provided list" },
"date": { "type": "string", "description": "Publication date in YYYY-MM-DD format, or empty string if unknown" }
}, },
"required": ["title", "summary", "category"], "required": ["title", "summary", "category", "date"],
"additionalProperties": false "additionalProperties": false
}) })
} }

@ -178,7 +178,9 @@ pub fn build_article_classify_prompt(
Classe cet article dans la categorie la plus appropriee.\n\ Classe cet article dans la categorie la plus appropriee.\n\
Si aucune categorie ne correspond, utilise \"Autre\".\n\ Si aucune categorie ne correspond, utilise \"Autre\".\n\
{summary_instruction}\n\ {summary_instruction}\n\
Si le titre fourni est vide, genere un titre a partir du contenu.", Si le titre fourni est vide, genere un titre a partir du contenu.\n\
Extrais la date de publication de l'article au format YYYY-MM-DD. \
Si la date n'est pas disponible, retourne une chaine vide.",
title = if title.is_empty() { "(pas de titre)" } else { title }, title = if title.is_empty() { "(pas de titre)" } else { title },
body = body_snippet, body = body_snippet,
categories = categories_list, categories = categories_list,

@ -581,7 +581,7 @@ fn extract_date_from_json_ld(json: &serde_json::Value) -> Option<DateTime<Utc>>
/// Try to parse a date string using multiple common formats. /// Try to parse a date string using multiple common formats.
/// ///
/// Supports RFC 3339 / ISO 8601 and simple date formats. /// Supports RFC 3339 / ISO 8601 and simple date formats.
fn parse_date_string(s: &str) -> Option<DateTime<Utc>> { pub fn parse_date_string(s: &str) -> Option<DateTime<Utc>> {
let s = s.trim(); let s = s.trim();
// Try RFC 3339 / ISO 8601 with timezone // Try RFC 3339 / ISO 8601 with timezone

@ -524,6 +524,23 @@ pub async fn run_generation_inner(
} }
}; };
// Check LLM-extracted date as fallback for articles without a scraper date
if let Some(date_str) = class_response.get("date").and_then(|d| d.as_str()) {
if !date_str.is_empty() {
if let Some(parsed) = scraper::parse_date_string(date_str) {
if scraper::is_article_too_old(Some(parsed), settings.max_age_days as i64) {
tracing::info!(url = %final_url, date = date_str, "Article filtered by LLM-extracted date (too old)");
pending_traces.push(build_trace_entry(user_id, job_id, &ArticleTrace {
url: &final_url, title: &page_title, source_type: "personalized_source",
source_url: Some(&source_url), category: None, synthesis_id: None,
status: "filtered_too_old", scraped_ok: true,
}));
continue;
}
}
}
}
let Some((final_cat_key, final_cat_name, llm_title, llm_summary)) = assign_category( let Some((final_cat_key, final_cat_name, llm_title, llm_summary)) = assign_category(
&class_response, &page_title, &user_categories, &classification_categories, &class_response, &page_title, &user_categories, &classification_categories,
&filled_counts, settings.max_items_per_category as usize, &filled_counts, settings.max_items_per_category as usize,
@ -705,6 +722,23 @@ pub async fn run_generation_inner(
} }
}; };
// Check LLM-extracted date as fallback
if let Some(date_str) = class_response.get("date").and_then(|d| d.as_str()) {
if !date_str.is_empty() {
if let Some(parsed) = scraper::parse_date_string(date_str) {
if scraper::is_article_too_old(Some(parsed), settings.max_age_days as i64) {
tracing::info!(url = %final_url, date = date_str, "Article filtered by LLM-extracted date (too old)");
pending_traces.push(build_trace_entry(user_id, job_id, &ArticleTrace {
url: &final_url, title: &page_title, source_type: "brave_search",
source_url: None, category: None, synthesis_id: None,
status: "filtered_too_old", scraped_ok: true,
}));
continue;
}
}
}
}
let Some((final_cat_key, final_cat_name, llm_title, llm_summary)) = assign_category( let Some((final_cat_key, final_cat_name, llm_title, llm_summary)) = assign_category(
&class_response, &page_title, &user_categories, &classification_categories, &class_response, &page_title, &user_categories, &classification_categories,
&filled_counts, settings.max_items_per_category as usize, &filled_counts, settings.max_items_per_category as usize,

Loading…
Cancel
Save