feat: ScrapedContent url+head_html fields, Arc<dyn LlmProvider>, 3-tuple scrape returns

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
master
oabrivard 3 months ago
parent e483789d1b
commit 23f121a58d

@ -1,6 +1,8 @@
//! Provider factory: creates the correct `LlmProvider` implementation //! Provider factory: creates the correct `LlmProvider` implementation
//! based on the provider name and the user's decrypted API key. //! based on the provider name and the user's decrypted API key.
use std::sync::Arc;
use super::anthropic::AnthropicProvider; use super::anthropic::AnthropicProvider;
use super::gemini::GeminiProvider; use super::gemini::GeminiProvider;
use super::openai::OpenAiProvider; use super::openai::OpenAiProvider;
@ -37,12 +39,12 @@ fn build_llm_client() -> Result<reqwest::Client, AppError> {
pub fn create_provider( pub fn create_provider(
provider_name: &str, provider_name: &str,
api_key: String, api_key: String,
) -> Result<Box<dyn LlmProvider>, AppError> { ) -> Result<Arc<dyn LlmProvider>, AppError> {
let http_client = build_llm_client()?; let http_client = build_llm_client()?;
match provider_name { match provider_name {
"gemini" => Ok(Box::new(GeminiProvider::new(api_key, http_client))), "gemini" => Ok(Arc::new(GeminiProvider::new(api_key, http_client))),
"openai" => Ok(Box::new(OpenAiProvider::new(api_key, http_client))), "openai" => Ok(Arc::new(OpenAiProvider::new(api_key, http_client))),
"anthropic" => Ok(Box::new(AnthropicProvider::new(api_key, http_client))), "anthropic" => Ok(Arc::new(AnthropicProvider::new(api_key, http_client))),
_ => Err(AppError::BadRequest(format!( _ => Err(AppError::BadRequest(format!(
"Unknown provider: '{}'", "Unknown provider: '{}'",
provider_name provider_name

@ -48,6 +48,10 @@ pub struct ScrapedContent {
pub body_text: String, pub body_text: String,
/// Whether the page appears to be a soft-404 (error page with 200 status). /// Whether the page appears to be a soft-404 (error page with 200 status).
pub is_soft_404: bool, pub is_soft_404: bool,
/// Final URL after following redirects.
pub url: String,
/// Raw <head> section from the HTML, preserved for LLM extraction.
pub head_html: String,
} }
/// Build a `reqwest::Client` configured for scraping. /// Build a `reqwest::Client` configured for scraping.
@ -139,6 +143,8 @@ pub async fn scrape_url(
published_date: None, published_date: None,
body_text: String::new(), body_text: String::new(),
is_soft_404: false, is_soft_404: false,
url: response.url().to_string(),
head_html: String::new(),
}); });
} }
@ -174,6 +180,14 @@ pub async fn scrape_url(
} }
let html_text = String::from_utf8_lossy(&bytes); let html_text = String::from_utf8_lossy(&bytes);
// Extract <head> section for potential LLM use
let head_html = {
    // Page markup is untrusted input, so this extraction must never panic.
    //
    // HTML tag names are case-insensitive, so matching is done on an ASCII-lowercased copy.
    // ASCII lowercasing never changes byte lengths, which means every offset found in
    // `lowered` is also a valid char-boundary offset in `html_text`.
    let lowered = html_text.to_ascii_lowercase();
    let bytes = lowered.as_bytes();

    // Find a real `<head>` opening tag: the byte after `<head` must end the tag name
    // (`>`, `/`, or whitespace), otherwise `<header ...>` in the body would match.
    // When no opening tag exists (it is optional in HTML), fall back to the start of
    // the document so everything up to `</head>` is still captured.
    let start = lowered
        .match_indices("<head")
        .map(|(idx, _)| idx)
        .find(|&idx| {
            matches!(
                bytes.get(idx + "<head".len()),
                Some(b) if *b == b'>' || *b == b'/' || b.is_ascii_whitespace()
            )
        })
        .unwrap_or(0);

    // Search for the closing tag only *after* `start`, which guarantees `start <= end`.
    // Searching the whole document independently could yield `end < start` and panic
    // when slicing. A missing closing tag produces an empty string.
    let end = lowered[start..]
        .find("</head>")
        .map_or(start, |rel| start + rel + "</head>".len());

    // Checked slicing as a last line of defence against any out-of-range indices.
    html_text.get(start..end).unwrap_or_default().to_string()
};
let document = Html::parse_document(&html_text); let document = Html::parse_document(&html_text);
// Extract page title // Extract page title
@ -199,6 +213,8 @@ pub async fn scrape_url(
published_date, published_date,
body_text, body_text,
is_soft_404, is_soft_404,
url: final_url.to_string(),
head_html,
}) })
} }

@ -1119,10 +1119,10 @@ async fn scrape_articles(
pct as u8, pct as u8,
); );
if let Ok((cat_key, item, (scraped_content, page_title))) = join_result { if let Ok((cat_key, item, (scraped_content, page_title, final_url))) = join_result {
let scraped_item = ScrapedNewsItem { let scraped_item = ScrapedNewsItem {
title: item.title, title: item.title,
url: item.url, url: final_url,
summary: item.summary, summary: item.summary,
original_title: page_title, original_title: page_title,
scraped_content, scraped_content,
@ -1172,8 +1172,8 @@ async fn scrape_flat_urls(
let url = url.clone(); let url = url.clone();
let mad = max_age_days; let mad = max_age_days;
join_set.spawn(async move { join_set.spawn(async move {
let (scraped_content, page_title) = scrape_single_article(&client, &url, mad).await; let (scraped_content, page_title, final_url) = scrape_single_article(&client, &url, mad).await;
(url, scraped_content, page_title) (url, scraped_content, page_title, final_url)
}); });
} }
} }
@ -1188,10 +1188,10 @@ async fn scrape_flat_urls(
pct as u8, pct as u8,
); );
if let Ok((url, scraped_content, page_title)) = join_result { if let Ok((_original_url, scraped_content, page_title, final_url)) = join_result {
results.push(ScrapedNewsItem { results.push(ScrapedNewsItem {
title: page_title.clone(), title: page_title.clone(),
url, url: final_url, // Use redirect-resolved URL
summary: String::new(), // No LLM summary yet summary: String::new(), // No LLM summary yet
original_title: page_title, original_title: page_title,
scraped_content, scraped_content,
@ -1203,8 +1203,8 @@ async fn scrape_flat_urls(
let url = url.clone(); let url = url.clone();
let mad = max_age_days; let mad = max_age_days;
join_set.spawn(async move { join_set.spawn(async move {
let (scraped_content, page_title) = scrape_single_article(&client, &url, mad).await; let (scraped_content, page_title, final_url) = scrape_single_article(&client, &url, mad).await;
(url, scraped_content, page_title) (url, scraped_content, page_title, final_url)
}); });
} }
} }
@ -1212,7 +1212,7 @@ async fn scrape_flat_urls(
results results
} }
/// Scrape a single article URL, returning (body_text, page_title) or empty strings on failure. /// Scrape a single article URL, returning (body_text, page_title, final_url) or empty strings on failure.
/// ///
/// Handles all failure modes gracefully: /// Handles all failure modes gracefully:
/// - Network errors → empty content (article kept) /// - Network errors → empty content (article kept)
@ -1222,25 +1222,24 @@ async fn scrape_single_article(
http_client: &reqwest::Client, http_client: &reqwest::Client,
url: &str, url: &str,
max_age_days: i64, max_age_days: i64,
) -> (String, String) { ) -> (String, String, String) {
match scraper::scrape_url(http_client, url).await { match scraper::scrape_url(http_client, url).await {
Ok(content) => { Ok(content) => {
let final_url = content.url.clone();
if !content.ok || content.is_soft_404 { if !content.ok || content.is_soft_404 {
tracing::warn!(url = url, "Soft 404 or error page detected, skipping content"); tracing::warn!(url = url, "Soft 404 or error page detected, skipping content");
return (String::new(), String::new()); return (String::new(), String::new(), final_url);
} }
if scraper::is_article_too_old(content.published_date, max_age_days) { if scraper::is_article_too_old(content.published_date, max_age_days) {
tracing::warn!(url = url, "Article too old, skipping content"); tracing::warn!(url = url, "Article too old, skipping content");
return (String::new(), String::new()); return (String::new(), String::new(), final_url);
} }
let title = content.title.unwrap_or_default(); let title = content.title.unwrap_or_default();
(content.body_text, title) (content.body_text, title, final_url)
} }
Err(e) => { Err(e) => {
tracing::warn!(url = url, error = %e, "Failed to scrape URL, keeping article with empty content"); tracing::warn!(url = url, error = %e, "Failed to scrape URL, keeping article with empty content");
(String::new(), String::new()) (String::new(), String::new(), url.to_string())
} }
} }
} }

Loading…
Cancel
Save