From 23f121a58d1d65fb9dfd284b7f29ad109b97280c Mon Sep 17 00:00:00 2001 From: oabrivard Date: Tue, 24 Mar 2026 10:42:07 +0100 Subject: [PATCH] feat: ScrapedContent url+head_html fields, Arc, 3-tuple scrape returns Co-Authored-By: Claude Opus 4.6 (1M context) --- backend/src/services/llm/factory.rs | 10 ++++++---- backend/src/services/scraper.rs | 16 +++++++++++++++ backend/src/services/synthesis.rs | 31 ++++++++++++++--------------- 3 files changed, 37 insertions(+), 20 deletions(-) diff --git a/backend/src/services/llm/factory.rs b/backend/src/services/llm/factory.rs index f9f2316..c86a3a5 100644 --- a/backend/src/services/llm/factory.rs +++ b/backend/src/services/llm/factory.rs @@ -1,6 +1,8 @@ //! Provider factory: creates the correct `LlmProvider` implementation //! based on the provider name and the user's decrypted API key. +use std::sync::Arc; + use super::anthropic::AnthropicProvider; use super::gemini::GeminiProvider; use super::openai::OpenAiProvider; @@ -37,12 +39,12 @@ fn build_llm_client() -> Result { pub fn create_provider( provider_name: &str, api_key: String, -) -> Result, AppError> { +) -> Result, AppError> { let http_client = build_llm_client()?; match provider_name { - "gemini" => Ok(Box::new(GeminiProvider::new(api_key, http_client))), - "openai" => Ok(Box::new(OpenAiProvider::new(api_key, http_client))), - "anthropic" => Ok(Box::new(AnthropicProvider::new(api_key, http_client))), + "gemini" => Ok(Arc::new(GeminiProvider::new(api_key, http_client))), + "openai" => Ok(Arc::new(OpenAiProvider::new(api_key, http_client))), + "anthropic" => Ok(Arc::new(AnthropicProvider::new(api_key, http_client))), _ => Err(AppError::BadRequest(format!( "Unknown provider: '{}'", provider_name diff --git a/backend/src/services/scraper.rs b/backend/src/services/scraper.rs index bb0b721..11452e9 100644 --- a/backend/src/services/scraper.rs +++ b/backend/src/services/scraper.rs @@ -48,6 +48,10 @@ pub struct ScrapedContent { pub body_text: String, /// Whether the page 
appears to be a soft-404 (error page with 200 status). pub is_soft_404: bool, + /// Final URL after following redirects. + pub url: String, + /// Raw <head> section from the HTML, preserved for LLM extraction. + pub head_html: String, } /// Build a `reqwest::Client` configured for scraping. @@ -139,6 +143,8 @@ pub async fn scrape_url( published_date: None, body_text: String::new(), is_soft_404: false, + url: response.url().to_string(), + head_html: String::new(), }); } @@ -174,6 +180,14 @@ pub async fn scrape_url( } let html_text = String::from_utf8_lossy(&bytes); + + // Extract <head> section for potential LLM use + let head_html = { + let start = html_text.find("<head").unwrap_or(0); + let end = html_text.find("</head>").map(|i| i + 7).unwrap_or(start); + html_text[start..end].to_string() + }; + let document = Html::parse_document(&html_text); // Extract page title @@ -199,6 +213,8 @@ pub async fn scrape_url( published_date, body_text, is_soft_404, + url: final_url.to_string(), + head_html, }) } diff --git a/backend/src/services/synthesis.rs b/backend/src/services/synthesis.rs index 98ed162..d1154ca 100644 --- a/backend/src/services/synthesis.rs +++ b/backend/src/services/synthesis.rs @@ -1119,10 +1119,10 @@ async fn scrape_articles( pct as u8, ); - if let Ok((cat_key, item, (scraped_content, page_title))) = join_result { + if let Ok((cat_key, item, (scraped_content, page_title, final_url))) = join_result { let scraped_item = ScrapedNewsItem { title: item.title, - url: item.url, + url: final_url, summary: item.summary, original_title: page_title, scraped_content, @@ -1172,8 +1172,8 @@ async fn scrape_flat_urls( let url = url.clone(); let mad = max_age_days; join_set.spawn(async move { - let (scraped_content, page_title) = scrape_single_article(&client, &url, mad).await; - (url, scraped_content, page_title) + let (scraped_content, page_title, final_url) = scrape_single_article(&client, &url, mad).await; + (url, scraped_content, page_title, final_url) }); } } @@ -1188,10 +1188,10 @@ async fn scrape_flat_urls( pct as u8, ); - if let 
Ok((url, scraped_content, page_title)) = join_result { + if let Ok((_original_url, scraped_content, page_title, final_url)) = join_result { results.push(ScrapedNewsItem { title: page_title.clone(), - url, + url: final_url, // Use redirect-resolved URL summary: String::new(), // No LLM summary yet original_title: page_title, scraped_content, @@ -1203,8 +1203,8 @@ async fn scrape_flat_urls( let url = url.clone(); let mad = max_age_days; join_set.spawn(async move { - let (scraped_content, page_title) = scrape_single_article(&client, &url, mad).await; - (url, scraped_content, page_title) + let (scraped_content, page_title, final_url) = scrape_single_article(&client, &url, mad).await; + (url, scraped_content, page_title, final_url) }); } } @@ -1212,7 +1212,7 @@ async fn scrape_flat_urls( results } -/// Scrape a single article URL, returning (body_text, page_title) or empty strings on failure. +/// Scrape a single article URL, returning (body_text, page_title, final_url) or empty strings on failure. 
/// /// Handles all failure modes gracefully: /// - Network errors → empty content (article kept) @@ -1222,25 +1222,24 @@ async fn scrape_single_article( http_client: &reqwest::Client, url: &str, max_age_days: i64, -) -> (String, String) { +) -> (String, String, String) { match scraper::scrape_url(http_client, url).await { Ok(content) => { + let final_url = content.url.clone(); if !content.ok || content.is_soft_404 { tracing::warn!(url = url, "Soft 404 or error page detected, skipping content"); - return (String::new(), String::new()); + return (String::new(), String::new(), final_url); } - if scraper::is_article_too_old(content.published_date, max_age_days) { tracing::warn!(url = url, "Article too old, skipping content"); - return (String::new(), String::new()); + return (String::new(), String::new(), final_url); } - let title = content.title.unwrap_or_default(); - (content.body_text, title) + (content.body_text, title, final_url) } Err(e) => { tracing::warn!(url = url, error = %e, "Failed to scrape URL, keeping article with empty content"); - (String::new(), String::new()) + (String::new(), String::new(), url.to_string()) } } }