|
|
|
|
@ -406,10 +406,10 @@ async fn run_generation_inner(
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Scrape
|
|
|
|
|
let (body_text, page_title, final_url) = scrape_single_article(&state.http_client, &url, settings.max_age_days as i64).await;
|
|
|
|
|
let (body_text, page_title, final_url, drop_reason) = scrape_single_article(&state.http_client, &url, settings.max_age_days as i64).await;
|
|
|
|
|
|
|
|
|
|
if body_text.trim().is_empty() {
|
|
|
|
|
trace_article(&state.pool, user_id, job_id, &final_url, &page_title, "personalized_source", Some(&source_url), None, None, "filtered_empty", false).await;
|
|
|
|
|
if let Some(reason) = drop_reason {
|
|
|
|
|
trace_article(&state.pool, user_id, job_id, &final_url, &page_title, "personalized_source", Some(&source_url), None, None, reason, false).await;
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@ -540,10 +540,10 @@ async fn run_generation_inner(
|
|
|
|
|
// Scrape Phase 2 for validation
|
|
|
|
|
emit_progress(tx, "scraping", "Verification des sources web...", 80);
|
|
|
|
|
for (cat_key, item) in phase2_items {
|
|
|
|
|
let (body_text, _, final_url) = scrape_single_article(&state.http_client, &item.url, settings.max_age_days as i64).await;
|
|
|
|
|
let (body_text, _, final_url, drop_reason) = scrape_single_article(&state.http_client, &item.url, settings.max_age_days as i64).await;
|
|
|
|
|
|
|
|
|
|
if body_text.trim().is_empty() {
|
|
|
|
|
trace_article(&state.pool, user_id, job_id, &final_url, &item.title, "web_search", None, None, None, "filtered_empty", false).await;
|
|
|
|
|
if let Some(reason) = drop_reason {
|
|
|
|
|
trace_article(&state.pool, user_id, job_id, &final_url, &item.title, "web_search", None, None, None, reason, false).await;
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@ -931,28 +931,30 @@ fn rotate_sources(sources: Vec<crate::models::source::Source>, last_source_url:
|
|
|
|
|
/// - Network errors → empty content (article kept)
|
|
|
|
|
/// - Soft 404 → article excluded (empty content)
|
|
|
|
|
/// - Article too old → article excluded (empty content)
|
|
|
|
|
/// Result of scraping a single article.
|
|
|
|
|
/// The 4th value is the drop reason if the article was rejected (None if OK).
|
|
|
|
|
async fn scrape_single_article(
|
|
|
|
|
http_client: &reqwest::Client,
|
|
|
|
|
url: &str,
|
|
|
|
|
max_age_days: i64,
|
|
|
|
|
) -> (String, String, String) {
|
|
|
|
|
) -> (String, String, String, Option<&'static str>) {
|
|
|
|
|
match scraper::scrape_url(http_client, url).await {
|
|
|
|
|
Ok(content) => {
|
|
|
|
|
let final_url = content.url.clone();
|
|
|
|
|
if !content.ok || content.is_soft_404 {
|
|
|
|
|
tracing::warn!(url = url, "Soft 404 or error page detected, skipping content");
|
|
|
|
|
return (String::new(), String::new(), final_url);
|
|
|
|
|
return (String::new(), String::new(), final_url, Some("filtered_empty"));
|
|
|
|
|
}
|
|
|
|
|
if scraper::is_article_too_old(content.published_date, max_age_days) {
|
|
|
|
|
tracing::warn!(url = url, "Article too old, skipping content");
|
|
|
|
|
return (String::new(), String::new(), final_url);
|
|
|
|
|
return (String::new(), String::new(), final_url, Some("filtered_too_old"));
|
|
|
|
|
}
|
|
|
|
|
let title = content.title.unwrap_or_default();
|
|
|
|
|
(content.body_text, title, final_url)
|
|
|
|
|
(content.body_text, title, final_url, None)
|
|
|
|
|
}
|
|
|
|
|
Err(e) => {
|
|
|
|
|
tracing::warn!(url = url, error = %e, "Failed to scrape URL, keeping article with empty content");
|
|
|
|
|
(String::new(), String::new(), url.to_string())
|
|
|
|
|
(String::new(), String::new(), url.to_string(), Some("filtered_empty"))
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|