From a5f42391574d00c47b47d8b5cc0f526ae32c8d8e Mon Sep 17 00:00:00 2001
From: oabrivard
Date: Wed, 25 Mar 2026 09:12:22 +0100
Subject: [PATCH] fix: distinguish filtered_too_old from filtered_empty in article tracing

---
Review note: the callers used to drop every article whose scraped body was
blank; once they key on `drop_reason` alone, a page that scrapes successfully
but yields no text would slip through. Hunk 3 therefore reports a blank body
as "filtered_empty" on the success path as well. Line structure and
indentation of this patch were reconstructed (rustfmt conventions); the
`index` line still carries the original post-image hash.

 backend/src/services/synthesis.rs | 29 ++++++++++++++++++-----------
 1 file changed, 18 insertions(+), 11 deletions(-)

diff --git a/backend/src/services/synthesis.rs b/backend/src/services/synthesis.rs
index 0d35373..df1a5c7 100644
--- a/backend/src/services/synthesis.rs
+++ b/backend/src/services/synthesis.rs
@@ -406,10 +406,10 @@ async fn run_generation_inner(
             }
 
             // Scrape
-            let (body_text, page_title, final_url) = scrape_single_article(&state.http_client, &url, settings.max_age_days as i64).await;
+            let (body_text, page_title, final_url, drop_reason) = scrape_single_article(&state.http_client, &url, settings.max_age_days as i64).await;
 
-            if body_text.trim().is_empty() {
-                trace_article(&state.pool, user_id, job_id, &final_url, &page_title, "personalized_source", Some(&source_url), None, None, "filtered_empty", false).await;
+            if let Some(reason) = drop_reason {
+                trace_article(&state.pool, user_id, job_id, &final_url, &page_title, "personalized_source", Some(&source_url), None, None, reason, false).await;
                 continue;
             }
 
@@ -540,10 +540,10 @@ async fn run_generation_inner(
     // Scrape Phase 2 for validation
     emit_progress(tx, "scraping", "Verification des sources web...", 80);
     for (cat_key, item) in phase2_items {
-        let (body_text, _, final_url) = scrape_single_article(&state.http_client, &item.url, settings.max_age_days as i64).await;
+        let (body_text, _, final_url, drop_reason) = scrape_single_article(&state.http_client, &item.url, settings.max_age_days as i64).await;
 
-        if body_text.trim().is_empty() {
-            trace_article(&state.pool, user_id, job_id, &final_url, &item.title, "web_search", None, None, None, "filtered_empty", false).await;
+        if let Some(reason) = drop_reason {
+            trace_article(&state.pool, user_id, job_id, &final_url, &item.title, "web_search", None, None, None, reason, false).await;
             continue;
         }
 
@@ -931,28 +931,35 @@ fn rotate_sources(sources: Vec, last_source_url:
 /// - Network errors → empty content (article kept)
 /// - Soft 404 → article excluded (empty content)
 /// - Article too old → article excluded (empty content)
+/// - Scraped page with a blank body → article excluded (empty content)
+///
+/// Returns `(body_text, page_title, final_url, drop_reason)`. `drop_reason` is the
+/// trace status to record when the article is rejected (`None` when it is kept).
 async fn scrape_single_article(
     http_client: &reqwest::Client,
     url: &str,
     max_age_days: i64,
-) -> (String, String, String) {
+) -> (String, String, String, Option<&'static str>) {
     match scraper::scrape_url(http_client, url).await {
         Ok(content) => {
             let final_url = content.url.clone();
             if !content.ok || content.is_soft_404 {
                 tracing::warn!(url = url, "Soft 404 or error page detected, skipping content");
-                return (String::new(), String::new(), final_url);
+                return (String::new(), String::new(), final_url, Some("filtered_empty"));
             }
             if scraper::is_article_too_old(content.published_date, max_age_days) {
                 tracing::warn!(url = url, "Article too old, skipping content");
-                return (String::new(), String::new(), final_url);
+                return (String::new(), String::new(), final_url, Some("filtered_too_old"));
             }
             let title = content.title.unwrap_or_default();
-            (content.body_text, title, final_url)
+            // Callers drop articles solely on `drop_reason`, so a page that scraped
+            // fine but yielded no text must still be reported as empty.
+            let drop_reason = content.body_text.trim().is_empty().then_some("filtered_empty");
+            (content.body_text, title, final_url, drop_reason)
         }
         Err(e) => {
             tracing::warn!(url = url, error = %e, "Failed to scrape URL, keeping article with empty content");
-            (String::new(), String::new(), url.to_string())
+            (String::new(), String::new(), url.to_string(), Some("filtered_empty"))
         }
     }
 }