fix: distinguish filtered_too_old from filtered_empty in article tracing

master
oabrivard 3 months ago
parent a760220d44
commit a5f4239157

@ -406,10 +406,10 @@ async fn run_generation_inner(
}
// Scrape
let (body_text, page_title, final_url) = scrape_single_article(&state.http_client, &url, settings.max_age_days as i64).await;
let (body_text, page_title, final_url, drop_reason) = scrape_single_article(&state.http_client, &url, settings.max_age_days as i64).await;
if body_text.trim().is_empty() {
trace_article(&state.pool, user_id, job_id, &final_url, &page_title, "personalized_source", Some(&source_url), None, None, "filtered_empty", false).await;
if let Some(reason) = drop_reason {
trace_article(&state.pool, user_id, job_id, &final_url, &page_title, "personalized_source", Some(&source_url), None, None, reason, false).await;
continue;
}
@ -540,10 +540,10 @@ async fn run_generation_inner(
// Scrape Phase 2 for validation
emit_progress(tx, "scraping", "Verification des sources web...", 80);
for (cat_key, item) in phase2_items {
let (body_text, _, final_url) = scrape_single_article(&state.http_client, &item.url, settings.max_age_days as i64).await;
let (body_text, _, final_url, drop_reason) = scrape_single_article(&state.http_client, &item.url, settings.max_age_days as i64).await;
if body_text.trim().is_empty() {
trace_article(&state.pool, user_id, job_id, &final_url, &item.title, "web_search", None, None, None, "filtered_empty", false).await;
if let Some(reason) = drop_reason {
trace_article(&state.pool, user_id, job_id, &final_url, &item.title, "web_search", None, None, None, reason, false).await;
continue;
}
@ -931,28 +931,30 @@ fn rotate_sources(sources: Vec<crate::models::source::Source>, last_source_url:
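/// - Network errors → empty content, reported as `filtered_empty` (callers drop the article)
/// - Soft 404 or error page → article excluded (empty content, `filtered_empty`)
/// - Article too old → article excluded (empty content, `filtered_too_old`)
///
/// Returns `(body_text, page_title, final_url, drop_reason)`.
/// `drop_reason` is `Some(reason)` if the article was rejected and `None` if it was scraped successfully.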
// Scrapes `url` through `scraper::scrape_url` and returns the article body, title and
// final URL, plus (in the 4-tuple form) a static drop-reason string when the article
// is rejected.
// NOTE(review): this span reads as a rendered diff. Where two near-identical lines
// follow each other, the first appears to be the pre-change form and the second its
// replacement; confirm against the repository before compiling.
async fn scrape_single_article(
// HTTP client forwarded unchanged to `scraper::scrape_url`.
http_client: &reqwest::Client,
// URL to scrape; also used as the returned final URL when scraping fails.
url: &str,
// Age limit forwarded to `scraper::is_article_too_old`.
max_age_days: i64,
) -> (String, String, String) {
) -> (String, String, String, Option<&'static str>) {
match scraper::scrape_url(http_client, url).await {
Ok(content) => {
// Use the URL reported by the scraper result rather than the requested one.
let final_url = content.url.clone();
// Non-OK result or soft 404: return empty body/title, reason "filtered_empty".
if !content.ok || content.is_soft_404 {
tracing::warn!(url = url, "Soft 404 or error page detected, skipping content");
return (String::new(), String::new(), final_url);
return (String::new(), String::new(), final_url, Some("filtered_empty"));
}
// Too old according to `scraper::is_article_too_old`: empty body/title,
// reason "filtered_too_old".
if scraper::is_article_too_old(content.published_date, max_age_days) {
tracing::warn!(url = url, "Article too old, skipping content");
return (String::new(), String::new(), final_url);
return (String::new(), String::new(), final_url, Some("filtered_too_old"));
}
// A missing title falls back to the empty string.
let title = content.title.unwrap_or_default();
// NOTE(review): the success path returns `None` even when `body_text` is empty;
// callers that branch only on the drop reason would then no longer filter
// empty-bodied articles. Confirm this is intended.
(content.body_text, title, final_url)
(content.body_text, title, final_url, None)
}
Err(e) => {
// Scrape failure: empty body/title, the requested `url` as final URL, and
// reason "filtered_empty".
// NOTE(review): the log text says the article is kept, yet a drop reason is
// returned. Confirm which behavior is intended and align the message.
tracing::warn!(url = url, error = %e, "Failed to scrape URL, keeping article with empty content");
(String::new(), String::new(), url.to_string())
(String::new(), String::new(), url.to_string(), Some("filtered_empty"))
}
}
}

Loading…
Cancel
Save