feat: LLM extracts publication date as fallback for article age filtering

The classify prompt now asks the LLM to return a `date` field (YYYY-MM-DD,
or an empty string when the date is unknown). When the scraper cannot find
a publication date, the LLM-extracted date is used to filter out articles
that are older than max_age_days.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
master
oabrivard 3 months ago
parent 91272ddfc4
commit de25a08d51

@ -88,10 +88,11 @@ pub fn build_article_classify_schema() -> Value {
"type": "object",
"properties": {
"title": { "type": "string", "description": "Article title" },
"summary": { "type": "string", "description": "4-5 line summary of the article" },
"category": { "type": "string", "description": "Category name from the provided list" }
"summary": { "type": "string", "description": "Summary of the article" },
"category": { "type": "string", "description": "Category name from the provided list" },
"date": { "type": "string", "description": "Publication date in YYYY-MM-DD format, or empty string if unknown" }
},
"required": ["title", "summary", "category"],
"required": ["title", "summary", "category", "date"],
"additionalProperties": false
})
}

@ -178,7 +178,9 @@ pub fn build_article_classify_prompt(
Classe cet article dans la categorie la plus appropriee.\n\
Si aucune categorie ne correspond, utilise \"Autre\".\n\
{summary_instruction}\n\
Si le titre fourni est vide, genere un titre a partir du contenu.",
Si le titre fourni est vide, genere un titre a partir du contenu.\n\
Extrais la date de publication de l'article au format YYYY-MM-DD. \
Si la date n'est pas disponible, retourne une chaine vide.",
title = if title.is_empty() { "(pas de titre)" } else { title },
body = body_snippet,
categories = categories_list,

@ -581,7 +581,7 @@ fn extract_date_from_json_ld(json: &serde_json::Value) -> Option<DateTime<Utc>>
/// Try to parse a date string using multiple common formats.
///
/// Supports RFC 3339 / ISO 8601 and simple date formats.
fn parse_date_string(s: &str) -> Option<DateTime<Utc>> {
pub fn parse_date_string(s: &str) -> Option<DateTime<Utc>> {
let s = s.trim();
// Try RFC 3339 / ISO 8601 with timezone

@ -524,6 +524,23 @@ pub async fn run_generation_inner(
}
};
// Check LLM-extracted date as fallback for articles without a scraper date
if let Some(date_str) = class_response.get("date").and_then(|d| d.as_str()) {
if !date_str.is_empty() {
if let Some(parsed) = scraper::parse_date_string(date_str) {
if scraper::is_article_too_old(Some(parsed), settings.max_age_days as i64) {
tracing::info!(url = %final_url, date = date_str, "Article filtered by LLM-extracted date (too old)");
pending_traces.push(build_trace_entry(user_id, job_id, &ArticleTrace {
url: &final_url, title: &page_title, source_type: "personalized_source",
source_url: Some(&source_url), category: None, synthesis_id: None,
status: "filtered_too_old", scraped_ok: true,
}));
continue;
}
}
}
}
let Some((final_cat_key, final_cat_name, llm_title, llm_summary)) = assign_category(
&class_response, &page_title, &user_categories, &classification_categories,
&filled_counts, settings.max_items_per_category as usize,
@ -705,6 +722,23 @@ pub async fn run_generation_inner(
}
};
// Check LLM-extracted date as fallback
if let Some(date_str) = class_response.get("date").and_then(|d| d.as_str()) {
if !date_str.is_empty() {
if let Some(parsed) = scraper::parse_date_string(date_str) {
if scraper::is_article_too_old(Some(parsed), settings.max_age_days as i64) {
tracing::info!(url = %final_url, date = date_str, "Article filtered by LLM-extracted date (too old)");
pending_traces.push(build_trace_entry(user_id, job_id, &ArticleTrace {
url: &final_url, title: &page_title, source_type: "brave_search",
source_url: None, category: None, synthesis_id: None,
status: "filtered_too_old", scraped_ok: true,
}));
continue;
}
}
}
}
let Some((final_cat_key, final_cat_name, llm_title, llm_summary)) = assign_category(
&class_response, &page_title, &user_categories, &classification_categories,
&filled_counts, settings.max_items_per_category as usize,

Loading…
Cancel
Save