fix: distinguish filtered_too_old from filtered_empty in article tracing

3 months ago · a5f4239157
parent a760220d44
commit a5f4239157
1 changed files with 13 additions and 11 deletions
--- a/backend/src/services/synthesis.rs
+++ b/backend/src/services/synthesis.rs
@ -406,10 +406,10 @@ async fn run_generation_inner(
            }

            // Scrape
-            let (body_text, page_title, final_url) = scrape_single_article(&state.http_client, &url, settings.max_age_days as i64).await;
+            let (body_text, page_title, final_url, drop_reason) = scrape_single_article(&state.http_client, &url, settings.max_age_days as i64).await;

-            if body_text.trim().is_empty() {
-                trace_article(&state.pool, user_id, job_id, &final_url, &page_title, "personalized_source", Some(&source_url), None, None, "filtered_empty", false).await;
+            if let Some(reason) = drop_reason {
+                trace_article(&state.pool, user_id, job_id, &final_url, &page_title, "personalized_source", Some(&source_url), None, None, reason, false).await;
                continue;
            }

@ -540,10 +540,10 @@ async fn run_generation_inner(
        // Scrape Phase 2 for validation
        emit_progress(tx, "scraping", "Verification des sources web...", 80);
        for (cat_key, item) in phase2_items {
-            let (body_text, _, final_url) = scrape_single_article(&state.http_client, &item.url, settings.max_age_days as i64).await;
+            let (body_text, _, final_url, drop_reason) = scrape_single_article(&state.http_client, &item.url, settings.max_age_days as i64).await;

-            if body_text.trim().is_empty() {
-                trace_article(&state.pool, user_id, job_id, &final_url, &item.title, "web_search", None, None, None, "filtered_empty", false).await;
+            if let Some(reason) = drop_reason {
+                trace_article(&state.pool, user_id, job_id, &final_url, &item.title, "web_search", None, None, None, reason, false).await;
                continue;
            }

@ -931,28 +931,30 @@ fn rotate_sources(sources: Vec<crate::models::source::Source>, last_source_url:
 /// - Network errors → article excluded (empty content)
 /// - Soft 404 → article excluded (empty content)
 /// - Article too old → article excluded (empty content)
+/// Returns `(body_text, page_title, final_url, drop_reason)` for a single article.
+/// `drop_reason` is `Some("filtered_empty")` or `Some("filtered_too_old")` if the article was rejected (`None` if OK).
 async fn scrape_single_article(
    http_client: &reqwest::Client,
    url: &str,
    max_age_days: i64,
-) -> (String, String, String) {
+) -> (String, String, String, Option<&'static str>) {
    match scraper::scrape_url(http_client, url).await {
        Ok(content) => {
            let final_url = content.url.clone();
            if !content.ok || content.is_soft_404 {
                tracing::warn!(url = url, "Soft 404 or error page detected, skipping content");
-                return (String::new(), String::new(), final_url);
+                return (String::new(), String::new(), final_url, Some("filtered_empty"));
            }
            if scraper::is_article_too_old(content.published_date, max_age_days) {
                tracing::warn!(url = url, "Article too old, skipping content");
-                return (String::new(), String::new(), final_url);
+                return (String::new(), String::new(), final_url, Some("filtered_too_old"));
            }
            let title = content.title.unwrap_or_default();
-            (content.body_text, title, final_url)
+            (content.body_text, title, final_url, None)
        }
        Err(e) => {
            tracing::warn!(url = url, error = %e, "Failed to scrape URL, keeping article with empty content");
-            (String::new(), String::new(), url.to_string())
+            (String::new(), String::new(), url.to_string(), Some("filtered_empty"))
        }
    }
 }