feat: LLM extracts publication date as fallback for article age filtering

The classify prompt now asks the LLM to return a date field (YYYY-MM-DD). When the scraper could not find a publication date, the LLM-extracted date is used as a fallback to filter out articles older than max_age_days.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
3 months ago · de25a08d51
parent 91272ddfc4
commit de25a08d51
4 changed files with 42 additions and 5 deletions
--- a/backend/src/services/llm/schema.rs
+++ b/backend/src/services/llm/schema.rs
@@ -88,10 +88,11 @@ pub fn build_article_classify_schema() -> Value {
        "type": "object",
        "properties": {
            "title": { "type": "string", "description": "Article title" },
-            "summary": { "type": "string", "description": "4-5 line summary of the article" },
-            "category": { "type": "string", "description": "Category name from the provided list" }
+            "summary": { "type": "string", "description": "Summary of the article" },
+            "category": { "type": "string", "description": "Category name from the provided list" },
+            "date": { "type": "string", "description": "Publication date in YYYY-MM-DD format, or empty string if unknown" }
        },
-        "required": ["title", "summary", "category"],
+        "required": ["title", "summary", "category", "date"],
        "additionalProperties": false
    })
 }
--- a/backend/src/services/prompts.rs
+++ b/backend/src/services/prompts.rs
@@ -178,7 +178,9 @@ pub fn build_article_classify_prompt(
         Classe cet article dans la categorie la plus appropriee.\n\
         Si aucune categorie ne correspond, utilise \"Autre\".\n\
         {summary_instruction}\n\
-         Si le titre fourni est vide, genere un titre a partir du contenu.",
+         Si le titre fourni est vide, genere un titre a partir du contenu.\n\
+         Extrais la date de publication de l'article au format YYYY-MM-DD. \
+         Si la date n'est pas disponible, retourne une chaine vide.",
        title = if title.is_empty() { "(pas de titre)" } else { title },
        body = body_snippet,
        categories = categories_list,
--- a/backend/src/services/scraper.rs
+++ b/backend/src/services/scraper.rs
@@ -581,7 +581,7 @@ fn extract_date_from_json_ld(json: &serde_json::Value) -> Option<DateTime<Utc>>
 /// Try to parse a date string using multiple common formats.
 ///
 /// Supports RFC 3339 / ISO 8601 and simple date formats.
-fn parse_date_string(s: &str) -> Option<DateTime<Utc>> {
+pub fn parse_date_string(s: &str) -> Option<DateTime<Utc>> {
    let s = s.trim();

    // Try RFC 3339 / ISO 8601 with timezone
--- a/backend/src/services/synthesis.rs
+++ b/backend/src/services/synthesis.rs
@@ -524,6 +524,23 @@ pub async fn run_generation_inner(
                        }
                    };

+                    // Check LLM-extracted date as fallback for articles without a scraper date
+                    if let Some(date_str) = class_response.get("date").and_then(|d| d.as_str()) {
+                        if !date_str.is_empty() {
+                            if let Some(parsed) = scraper::parse_date_string(date_str) {
+                                if scraper::is_article_too_old(Some(parsed), settings.max_age_days as i64) {
+                                    tracing::info!(url = %final_url, date = date_str, "Article filtered by LLM-extracted date (too old)");
+                                    pending_traces.push(build_trace_entry(user_id, job_id, &ArticleTrace {
+                                        url: &final_url, title: &page_title, source_type: "personalized_source",
+                                        source_url: Some(&source_url), category: None, synthesis_id: None,
+                                        status: "filtered_too_old", scraped_ok: true,
+                                    }));
+                                    continue;
+                                }
+                            }
+                        }
+                    }
+
                    let Some((final_cat_key, final_cat_name, llm_title, llm_summary)) = assign_category(
                        &class_response, &page_title, &user_categories, &classification_categories,
                        &filled_counts, settings.max_items_per_category as usize,
@@ -705,6 +722,23 @@ pub async fn run_generation_inner(
                                }
                            };

+                            // Check LLM-extracted date as fallback
+                            if let Some(date_str) = class_response.get("date").and_then(|d| d.as_str()) {
+                                if !date_str.is_empty() {
+                                    if let Some(parsed) = scraper::parse_date_string(date_str) {
+                                        if scraper::is_article_too_old(Some(parsed), settings.max_age_days as i64) {
+                                            tracing::info!(url = %final_url, date = date_str, "Article filtered by LLM-extracted date (too old)");
+                                            pending_traces.push(build_trace_entry(user_id, job_id, &ArticleTrace {
+                                                url: &final_url, title: &page_title, source_type: "brave_search",
+                                                source_url: None, category: None, synthesis_id: None,
+                                                status: "filtered_too_old", scraped_ok: true,
+                                            }));
+                                            continue;
+                                        }
+                                    }
+                                }
+                            }
+
                            let Some((final_cat_key, final_cat_name, llm_title, llm_summary)) = assign_category(
                                &class_response, &page_title, &user_categories, &classification_categories,
                                &filled_counts, settings.max_items_per_category as usize,