|
|
|
@ -1119,10 +1119,10 @@ async fn scrape_articles(
|
|
|
|
pct as u8,
|
|
|
|
pct as u8,
|
|
|
|
);
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
|
|
if let Ok((cat_key, item, (scraped_content, page_title))) = join_result {
|
|
|
|
if let Ok((cat_key, item, (scraped_content, page_title, final_url))) = join_result {
|
|
|
|
let scraped_item = ScrapedNewsItem {
|
|
|
|
let scraped_item = ScrapedNewsItem {
|
|
|
|
title: item.title,
|
|
|
|
title: item.title,
|
|
|
|
url: item.url,
|
|
|
|
url: final_url,
|
|
|
|
summary: item.summary,
|
|
|
|
summary: item.summary,
|
|
|
|
original_title: page_title,
|
|
|
|
original_title: page_title,
|
|
|
|
scraped_content,
|
|
|
|
scraped_content,
|
|
|
|
@ -1172,8 +1172,8 @@ async fn scrape_flat_urls(
|
|
|
|
let url = url.clone();
|
|
|
|
let url = url.clone();
|
|
|
|
let mad = max_age_days;
|
|
|
|
let mad = max_age_days;
|
|
|
|
join_set.spawn(async move {
|
|
|
|
join_set.spawn(async move {
|
|
|
|
let (scraped_content, page_title) = scrape_single_article(&client, &url, mad).await;
|
|
|
|
let (scraped_content, page_title, final_url) = scrape_single_article(&client, &url, mad).await;
|
|
|
|
(url, scraped_content, page_title)
|
|
|
|
(url, scraped_content, page_title, final_url)
|
|
|
|
});
|
|
|
|
});
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
@ -1188,10 +1188,10 @@ async fn scrape_flat_urls(
|
|
|
|
pct as u8,
|
|
|
|
pct as u8,
|
|
|
|
);
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
|
|
if let Ok((url, scraped_content, page_title)) = join_result {
|
|
|
|
if let Ok((_original_url, scraped_content, page_title, final_url)) = join_result {
|
|
|
|
results.push(ScrapedNewsItem {
|
|
|
|
results.push(ScrapedNewsItem {
|
|
|
|
title: page_title.clone(),
|
|
|
|
title: page_title.clone(),
|
|
|
|
url,
|
|
|
|
url: final_url, // Use redirect-resolved URL
|
|
|
|
summary: String::new(), // No LLM summary yet
|
|
|
|
summary: String::new(), // No LLM summary yet
|
|
|
|
original_title: page_title,
|
|
|
|
original_title: page_title,
|
|
|
|
scraped_content,
|
|
|
|
scraped_content,
|
|
|
|
@ -1203,8 +1203,8 @@ async fn scrape_flat_urls(
|
|
|
|
let url = url.clone();
|
|
|
|
let url = url.clone();
|
|
|
|
let mad = max_age_days;
|
|
|
|
let mad = max_age_days;
|
|
|
|
join_set.spawn(async move {
|
|
|
|
join_set.spawn(async move {
|
|
|
|
let (scraped_content, page_title) = scrape_single_article(&client, &url, mad).await;
|
|
|
|
let (scraped_content, page_title, final_url) = scrape_single_article(&client, &url, mad).await;
|
|
|
|
(url, scraped_content, page_title)
|
|
|
|
(url, scraped_content, page_title, final_url)
|
|
|
|
});
|
|
|
|
});
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
@ -1212,7 +1212,7 @@ async fn scrape_flat_urls(
|
|
|
|
results
|
|
|
|
results
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/// Scrape a single article URL, returning (body_text, page_title) or empty strings on failure.
|
|
|
|
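/// Scrape a single article URL, returning (body_text, page_title, final_url). On failure, body_text and page_title are empty, while final_url is still set (the URL reported by the scraper, or the input `url` if the request itself failed).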
|
|
|
|
///
|
|
|
|
///
|
|
|
|
/// Handles all failure modes gracefully:
|
|
|
|
/// Handles all failure modes gracefully:
|
|
|
|
/// - Network errors → empty content (article kept)
|
|
|
|
/// - Network errors → empty content (article kept)
|
|
|
|
@ -1222,25 +1222,24 @@ async fn scrape_single_article(
|
|
|
|
http_client: &reqwest::Client,
|
|
|
|
http_client: &reqwest::Client,
|
|
|
|
url: &str,
|
|
|
|
url: &str,
|
|
|
|
max_age_days: i64,
|
|
|
|
max_age_days: i64,
|
|
|
|
) -> (String, String) {
|
|
|
|
) -> (String, String, String) {
|
|
|
|
match scraper::scrape_url(http_client, url).await {
|
|
|
|
match scraper::scrape_url(http_client, url).await {
|
|
|
|
Ok(content) => {
|
|
|
|
Ok(content) => {
|
|
|
|
|
|
|
|
let final_url = content.url.clone();
|
|
|
|
if !content.ok || content.is_soft_404 {
|
|
|
|
if !content.ok || content.is_soft_404 {
|
|
|
|
tracing::warn!(url = url, "Soft 404 or error page detected, skipping content");
|
|
|
|
tracing::warn!(url = url, "Soft 404 or error page detected, skipping content");
|
|
|
|
return (String::new(), String::new());
|
|
|
|
return (String::new(), String::new(), final_url);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if scraper::is_article_too_old(content.published_date, max_age_days) {
|
|
|
|
if scraper::is_article_too_old(content.published_date, max_age_days) {
|
|
|
|
tracing::warn!(url = url, "Article too old, skipping content");
|
|
|
|
tracing::warn!(url = url, "Article too old, skipping content");
|
|
|
|
return (String::new(), String::new());
|
|
|
|
return (String::new(), String::new(), final_url);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
let title = content.title.unwrap_or_default();
|
|
|
|
let title = content.title.unwrap_or_default();
|
|
|
|
(content.body_text, title)
|
|
|
|
(content.body_text, title, final_url)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
Err(e) => {
|
|
|
|
Err(e) => {
|
|
|
|
tracing::warn!(url = url, error = %e, "Failed to scrape URL, keeping article with empty content");
|
|
|
|
tracing::warn!(url = url, error = %e, "Failed to scrape URL, keeping article with empty content");
|
|
|
|
(String::new(), String::new())
|
|
|
|
(String::new(), String::new(), url.to_string())
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|