diff --git a/backend/src/services/llm/schema.rs b/backend/src/services/llm/schema.rs index 428cee2..e47852e 100644 --- a/backend/src/services/llm/schema.rs +++ b/backend/src/services/llm/schema.rs @@ -88,10 +88,11 @@ pub fn build_article_classify_schema() -> Value { "type": "object", "properties": { "title": { "type": "string", "description": "Article title" }, - "summary": { "type": "string", "description": "4-5 line summary of the article" }, - "category": { "type": "string", "description": "Category name from the provided list" } + "summary": { "type": "string", "description": "Summary of the article" }, + "category": { "type": "string", "description": "Category name from the provided list" }, + "date": { "type": "string", "description": "Publication date in YYYY-MM-DD format, or empty string if unknown" } }, - "required": ["title", "summary", "category"], + "required": ["title", "summary", "category", "date"], "additionalProperties": false }) } diff --git a/backend/src/services/prompts.rs b/backend/src/services/prompts.rs index 9f93ee0..648d35e 100644 --- a/backend/src/services/prompts.rs +++ b/backend/src/services/prompts.rs @@ -178,7 +178,9 @@ pub fn build_article_classify_prompt( Classe cet article dans la categorie la plus appropriee.\n\ Si aucune categorie ne correspond, utilise \"Autre\".\n\ {summary_instruction}\n\ - Si le titre fourni est vide, genere un titre a partir du contenu.", + Si le titre fourni est vide, genere un titre a partir du contenu.\n\ + Extrais la date de publication de l'article au format YYYY-MM-DD. \ + Si la date n'est pas disponible, retourne une chaine vide.", title = if title.is_empty() { "(pas de titre)" } else { title }, body = body_snippet, categories = categories_list, diff --git a/backend/src/services/scraper.rs b/backend/src/services/scraper.rs index e03788b..bc63974 100644 --- a/backend/src/services/scraper.rs +++ b/backend/src/services/scraper.rs @@ -581,7 +581,7 @@ fn extract_date_from_json_ld(json: &serde_json::Value) -> Option> /// Try to parse a date string using multiple common formats. /// /// Supports RFC 3339 / ISO 8601 and simple date formats. -fn parse_date_string(s: &str) -> Option> { +pub fn parse_date_string(s: &str) -> Option> { let s = s.trim(); // Try RFC 3339 / ISO 8601 with timezone diff --git a/backend/src/services/synthesis.rs b/backend/src/services/synthesis.rs index 9bd2180..49b5c5c 100644 --- a/backend/src/services/synthesis.rs +++ b/backend/src/services/synthesis.rs @@ -524,6 +524,23 @@ pub async fn run_generation_inner( } }; + // Check LLM-extracted date as fallback for articles without a scraper date + if let Some(date_str) = class_response.get("date").and_then(|d| d.as_str()) { + if !date_str.is_empty() { + if let Some(parsed) = scraper::parse_date_string(date_str) { + if scraper::is_article_too_old(Some(parsed), settings.max_age_days as i64) { + tracing::info!(url = %final_url, date = date_str, "Article filtered by LLM-extracted date (too old)"); + pending_traces.push(build_trace_entry(user_id, job_id, &ArticleTrace { + url: &final_url, title: &page_title, source_type: "personalized_source", + source_url: Some(&source_url), category: None, synthesis_id: None, + status: "filtered_too_old", scraped_ok: true, + })); + continue; + } + } + } + } + let Some((final_cat_key, final_cat_name, llm_title, llm_summary)) = assign_category( &class_response, &page_title, &user_categories, &classification_categories, &filled_counts, settings.max_items_per_category as usize, @@ -705,6 +722,23 @@ pub async fn run_generation_inner( } }; + // Check LLM-extracted date as fallback + if let Some(date_str) = class_response.get("date").and_then(|d| d.as_str()) { + if !date_str.is_empty() { + if let Some(parsed) = scraper::parse_date_string(date_str) { + if scraper::is_article_too_old(Some(parsed), settings.max_age_days as i64) { + tracing::info!(url = %final_url, date = date_str, "Article filtered by LLM-extracted date (too old)"); + pending_traces.push(build_trace_entry(user_id, job_id, &ArticleTrace { + url: &final_url, title: &page_title, source_type: "brave_search", + source_url: None, category: None, synthesis_id: None, + status: "filtered_too_old", scraped_ok: true, + })); + continue; + } + } + } + } + let Some((final_cat_key, final_cat_name, llm_title, llm_summary)) = assign_category( &class_response, &page_title, &user_categories, &classification_categories, &filled_counts, settings.max_items_per_category as usize,