From a5f42391574d00c47b47d8b5cc0f526ae32c8d8e Mon Sep 17 00:00:00 2001
From: oabrivard <olivier@abrivard.fr>
Date: Wed, 25 Mar 2026 09:12:22 +0100
Subject: [PATCH] fix: distinguish filtered_too_old from filtered_empty in
 article tracing

---
 backend/src/services/synthesis.rs | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/backend/src/services/synthesis.rs b/backend/src/services/synthesis.rs
index 0d35373..df1a5c7 100644
--- a/backend/src/services/synthesis.rs
+++ b/backend/src/services/synthesis.rs
@@ -406,10 +406,10 @@ async fn run_generation_inner(
             }
 
             // Scrape
-            let (body_text, page_title, final_url) = scrape_single_article(&state.http_client, &url, settings.max_age_days as i64).await;
+            let (body_text, page_title, final_url, drop_reason) = scrape_single_article(&state.http_client, &url, settings.max_age_days as i64).await;
 
-            if body_text.trim().is_empty() {
-                trace_article(&state.pool, user_id, job_id, &final_url, &page_title, "personalized_source", Some(&source_url), None, None, "filtered_empty", false).await;
+            if let Some(reason) = drop_reason {
+                trace_article(&state.pool, user_id, job_id, &final_url, &page_title, "personalized_source", Some(&source_url), None, None, reason, false).await;
                 continue;
             }
 
@@ -540,10 +540,10 @@ async fn run_generation_inner(
         // Scrape Phase 2 for validation
         emit_progress(tx, "scraping", "Verification des sources web...", 80);
         for (cat_key, item) in phase2_items {
-            let (body_text, _, final_url) = scrape_single_article(&state.http_client, &item.url, settings.max_age_days as i64).await;
+            let (body_text, _, final_url, drop_reason) = scrape_single_article(&state.http_client, &item.url, settings.max_age_days as i64).await;
 
-            if body_text.trim().is_empty() {
-                trace_article(&state.pool, user_id, job_id, &final_url, &item.title, "web_search", None, None, None, "filtered_empty", false).await;
+            if let Some(reason) = drop_reason {
+                trace_article(&state.pool, user_id, job_id, &final_url, &item.title, "web_search", None, None, None, reason, false).await;
                 continue;
             }
 
@@ -931,28 +931,37 @@ fn rotate_sources(sources: Vec<crate::models::source::Source>, last_source_url:
-/// - Network errors → empty content (article kept)
+/// - Network errors → article excluded (`filtered_empty`)
 /// - Soft 404 → article excluded (empty content)
 /// - Article too old → article excluded (empty content)
+/// - Successful scrape with a blank body → article excluded (`filtered_empty`)
+///
+/// Returns `(body_text, page_title, final_url, drop_reason)`. `drop_reason` is the
+/// trace status to record when the article must be dropped (`"filtered_empty"` or
+/// `"filtered_too_old"`), and `None` when the article has usable content.
 async fn scrape_single_article(
     http_client: &reqwest::Client,
     url: &str,
     max_age_days: i64,
-) -> (String, String, String) {
+) -> (String, String, String, Option<&'static str>) {
     match scraper::scrape_url(http_client, url).await {
         Ok(content) => {
             let final_url = content.url.clone();
             if !content.ok || content.is_soft_404 {
                 tracing::warn!(url = url, "Soft 404 or error page detected, skipping content");
-                return (String::new(), String::new(), final_url);
+                return (String::new(), String::new(), final_url, Some("filtered_empty"));
             }
             if scraper::is_article_too_old(content.published_date, max_age_days) {
                 tracing::warn!(url = url, "Article too old, skipping content");
-                return (String::new(), String::new(), final_url);
+                return (String::new(), String::new(), final_url, Some("filtered_too_old"));
             }
             let title = content.title.unwrap_or_default();
-            (content.body_text, title, final_url)
+            // A page that scraped fine but yielded no text must still be dropped: callers
+            // no longer check `body_text` themselves, they only look at the drop reason.
+            let drop_reason = content.body_text.trim().is_empty().then_some("filtered_empty");
+            (content.body_text, title, final_url, drop_reason)
         }
         Err(e) => {
             tracing::warn!(url = url, error = %e, "Failed to scrape URL, keeping article with empty content");
-            (String::new(), String::new(), url.to_string())
+            // Despite the log wording, callers skip every article that carries a drop reason.
+            (String::new(), String::new(), url.to_string(), Some("filtered_empty"))
         }
     }
 }