feat: ScrapedContent url+head_html fields, Arc<dyn LlmProvider>, 3-tuple scrape returns

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
master
oabrivard 3 months ago
parent e483789d1b
commit 23f121a58d

@ -1,6 +1,8 @@
//! Provider factory: creates the correct `LlmProvider` implementation
//! based on the provider name and the user's decrypted API key.
use std::sync::Arc;
use super::anthropic::AnthropicProvider;
use super::gemini::GeminiProvider;
use super::openai::OpenAiProvider;
@ -37,12 +39,12 @@ fn build_llm_client() -> Result<reqwest::Client, AppError> {
pub fn create_provider(
provider_name: &str,
api_key: String,
) -> Result<Box<dyn LlmProvider>, AppError> {
) -> Result<Arc<dyn LlmProvider>, AppError> {
let http_client = build_llm_client()?;
match provider_name {
"gemini" => Ok(Box::new(GeminiProvider::new(api_key, http_client))),
"openai" => Ok(Box::new(OpenAiProvider::new(api_key, http_client))),
"anthropic" => Ok(Box::new(AnthropicProvider::new(api_key, http_client))),
"gemini" => Ok(Arc::new(GeminiProvider::new(api_key, http_client))),
"openai" => Ok(Arc::new(OpenAiProvider::new(api_key, http_client))),
"anthropic" => Ok(Arc::new(AnthropicProvider::new(api_key, http_client))),
_ => Err(AppError::BadRequest(format!(
"Unknown provider: '{}'",
provider_name

@ -48,6 +48,10 @@ pub struct ScrapedContent {
pub body_text: String,
/// Whether the page appears to be a soft-404 (error page with 200 status).
pub is_soft_404: bool,
/// Final URL after following redirects.
pub url: String,
/// Raw `<head>` section from the HTML, preserved for LLM extraction.
pub head_html: String,
}
/// Build a `reqwest::Client` configured for scraping.
@ -139,6 +143,8 @@ pub async fn scrape_url(
published_date: None,
body_text: String::new(),
is_soft_404: false,
url: response.url().to_string(),
head_html: String::new(),
});
}
@ -174,6 +180,14 @@ pub async fn scrape_url(
}
let html_text = String::from_utf8_lossy(&bytes);
// Extract <head> section for potential LLM use
let head_html = {
    // `<head` is matched case-sensitively as a prefix; fall back to offset 0 when absent.
    let start = html_text.find("<head").unwrap_or(0);
    // Look for the closing tag only at or after `start`. Searching the whole document
    // could yield an `end` smaller than `start` (a stray `</head>` ahead of the first
    // `<head` match), and slicing a string with start > end panics.
    let end = html_text[start..]
        .find("</head>")
        .map_or(start, |offset| start + offset + "</head>".len());
    // With no closing tag after `start`, `end == start` and the result is an empty string.
    html_text[start..end].to_string()
};
let document = Html::parse_document(&html_text);
// Extract page title
@ -199,6 +213,8 @@ pub async fn scrape_url(
published_date,
body_text,
is_soft_404,
url: final_url.to_string(),
head_html,
})
}

@ -1119,10 +1119,10 @@ async fn scrape_articles(
pct as u8,
);
if let Ok((cat_key, item, (scraped_content, page_title))) = join_result {
if let Ok((cat_key, item, (scraped_content, page_title, final_url))) = join_result {
let scraped_item = ScrapedNewsItem {
title: item.title,
url: item.url,
url: final_url,
summary: item.summary,
original_title: page_title,
scraped_content,
@ -1172,8 +1172,8 @@ async fn scrape_flat_urls(
let url = url.clone();
let mad = max_age_days;
join_set.spawn(async move {
let (scraped_content, page_title) = scrape_single_article(&client, &url, mad).await;
(url, scraped_content, page_title)
let (scraped_content, page_title, final_url) = scrape_single_article(&client, &url, mad).await;
(url, scraped_content, page_title, final_url)
});
}
}
@ -1188,10 +1188,10 @@ async fn scrape_flat_urls(
pct as u8,
);
if let Ok((url, scraped_content, page_title)) = join_result {
if let Ok((_original_url, scraped_content, page_title, final_url)) = join_result {
results.push(ScrapedNewsItem {
title: page_title.clone(),
url,
url: final_url, // Use redirect-resolved URL
summary: String::new(), // No LLM summary yet
original_title: page_title,
scraped_content,
@ -1203,8 +1203,8 @@ async fn scrape_flat_urls(
let url = url.clone();
let mad = max_age_days;
join_set.spawn(async move {
let (scraped_content, page_title) = scrape_single_article(&client, &url, mad).await;
(url, scraped_content, page_title)
let (scraped_content, page_title, final_url) = scrape_single_article(&client, &url, mad).await;
(url, scraped_content, page_title, final_url)
});
}
}
@ -1212,7 +1212,7 @@ async fn scrape_flat_urls(
results
}
/// Scrape a single article URL, returning (body_text, page_title) or empty strings on failure.
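/// Scrape a single article URL, returning (body_text, page_title, final_url).
/// On failure or skipped content, body_text and page_title are empty; final_url is the
/// redirect-resolved URL when the fetch succeeded, otherwise the input URL.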
///
/// Handles all failure modes gracefully:
/// - Network errors → empty content (article kept)
@ -1222,25 +1222,24 @@ async fn scrape_single_article(
http_client: &reqwest::Client,
url: &str,
max_age_days: i64,
) -> (String, String) {
) -> (String, String, String) {
match scraper::scrape_url(http_client, url).await {
Ok(content) => {
let final_url = content.url.clone();
if !content.ok || content.is_soft_404 {
tracing::warn!(url = url, "Soft 404 or error page detected, skipping content");
return (String::new(), String::new());
return (String::new(), String::new(), final_url);
}
if scraper::is_article_too_old(content.published_date, max_age_days) {
tracing::warn!(url = url, "Article too old, skipping content");
return (String::new(), String::new());
return (String::new(), String::new(), final_url);
}
let title = content.title.unwrap_or_default();
(content.body_text, title)
(content.body_text, title, final_url)
}
Err(e) => {
tracing::warn!(url = url, error = %e, "Failed to scrape URL, keeping article with empty content");
(String::new(), String::new())
(String::new(), String::new(), url.to_string())
}
}
}

Loading…
Cancel
Save