feat: ScrapedContent url+head_html fields, Arc<dyn LlmProvider>, 3-tuple scrape returns

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
3 months ago · 23f121a58d
parent e483789d1b
commit 23f121a58d
3 changed files with 37 additions and 20 deletions
--- a/backend/src/services/llm/factory.rs
+++ b/backend/src/services/llm/factory.rs
@ -1,6 +1,8 @@
 //! Provider factory: creates the correct `LlmProvider` implementation
 //! based on the provider name and the user's decrypted API key.

+use std::sync::Arc;
+
 use super::anthropic::AnthropicProvider;
 use super::gemini::GeminiProvider;
 use super::openai::OpenAiProvider;
@ -37,12 +39,12 @@ fn build_llm_client() -> Result<reqwest::Client, AppError> {
 pub fn create_provider(
    provider_name: &str,
    api_key: String,
-) -> Result<Box<dyn LlmProvider>, AppError> {
+) -> Result<Arc<dyn LlmProvider>, AppError> {
    let http_client = build_llm_client()?;
    match provider_name {
-        "gemini" => Ok(Box::new(GeminiProvider::new(api_key, http_client))),
-        "openai" => Ok(Box::new(OpenAiProvider::new(api_key, http_client))),
-        "anthropic" => Ok(Box::new(AnthropicProvider::new(api_key, http_client))),
+        "gemini" => Ok(Arc::new(GeminiProvider::new(api_key, http_client))),
+        "openai" => Ok(Arc::new(OpenAiProvider::new(api_key, http_client))),
+        "anthropic" => Ok(Arc::new(AnthropicProvider::new(api_key, http_client))),
        _ => Err(AppError::BadRequest(format!(
            "Unknown provider: '{}'",
            provider_name
--- a/backend/src/services/scraper.rs
+++ b/backend/src/services/scraper.rs
@ -48,6 +48,10 @@ pub struct ScrapedContent {
    pub body_text: String,
    /// Whether the page appears to be a soft-404 (error page with 200 status).
    pub is_soft_404: bool,
+    /// Final URL after following redirects.
+    pub url: String,
+    /// Raw <head> section from the HTML, preserved for LLM extraction.
+    pub head_html: String,
 }

 /// Build a `reqwest::Client` configured for scraping.
@ -139,6 +143,8 @@ pub async fn scrape_url(
            published_date: None,
            body_text: String::new(),
            is_soft_404: false,
+            url: response.url().to_string(),
+            head_html: String::new(),
        });
    }

@ -174,6 +180,14 @@ pub async fn scrape_url(
    }

    let html_text = String::from_utf8_lossy(&bytes);
+
+    // Extract <head> section for potential LLM use
+    let head_html = {
+        let start = html_text.find("<head").unwrap_or(0);
+        let end = html_text.find("</head>").map(|i| i + 7).unwrap_or(start);
+        html_text[start..end].to_string()
+    };
+
    let document = Html::parse_document(&html_text);

    // Extract page title
@ -199,6 +213,8 @@ pub async fn scrape_url(
        published_date,
        body_text,
        is_soft_404,
+        url: final_url.to_string(),
+        head_html,
    })
 }

--- a/backend/src/services/synthesis.rs
+++ b/backend/src/services/synthesis.rs
@ -1119,10 +1119,10 @@ async fn scrape_articles(
            pct as u8,
        );

-        if let Ok((cat_key, item, (scraped_content, page_title))) = join_result {
+        if let Ok((cat_key, item, (scraped_content, page_title, final_url))) = join_result {
            let scraped_item = ScrapedNewsItem {
                title: item.title,
-                url: item.url,
+                url: final_url,
                summary: item.summary,
                original_title: page_title,
                scraped_content,
@ -1172,8 +1172,8 @@ async fn scrape_flat_urls(
            let url = url.clone();
            let mad = max_age_days;
            join_set.spawn(async move {
-                let (scraped_content, page_title) = scrape_single_article(&client, &url, mad).await;
-                (url, scraped_content, page_title)
+                let (scraped_content, page_title, final_url) = scrape_single_article(&client, &url, mad).await;
+                (url, scraped_content, page_title, final_url)
            });
        }
    }
@ -1188,10 +1188,10 @@ async fn scrape_flat_urls(
            pct as u8,
        );

-        if let Ok((url, scraped_content, page_title)) = join_result {
+        if let Ok((_original_url, scraped_content, page_title, final_url)) = join_result {
            results.push(ScrapedNewsItem {
                title: page_title.clone(),
-                url,
+                url: final_url, // Use redirect-resolved URL
                summary: String::new(), // No LLM summary yet
                original_title: page_title,
                scraped_content,
@ -1203,8 +1203,8 @@ async fn scrape_flat_urls(
            let url = url.clone();
            let mad = max_age_days;
            join_set.spawn(async move {
-                let (scraped_content, page_title) = scrape_single_article(&client, &url, mad).await;
-                (url, scraped_content, page_title)
+                let (scraped_content, page_title, final_url) = scrape_single_article(&client, &url, mad).await;
+                (url, scraped_content, page_title, final_url)
            });
        }
    }
@ -1212,7 +1212,7 @@ async fn scrape_flat_urls(
    results
 }

-/// Scrape a single article URL, returning (body_text, page_title) or empty strings on failure.
+/// Scrape a single article URL, returning (body_text, page_title, final_url); on failure body_text and page_title are empty and final_url is the scraped URL if available, else the requested URL.
 ///
 /// Handles all failure modes gracefully:
 /// - Network errors → empty content (article kept)
@ -1222,25 +1222,24 @@ async fn scrape_single_article(
    http_client: &reqwest::Client,
    url: &str,
    max_age_days: i64,
-) -> (String, String) {
+) -> (String, String, String) {
    match scraper::scrape_url(http_client, url).await {
        Ok(content) => {
+            let final_url = content.url.clone();
            if !content.ok || content.is_soft_404 {
                tracing::warn!(url = url, "Soft 404 or error page detected, skipping content");
-                return (String::new(), String::new());
+                return (String::new(), String::new(), final_url);
            }
-
            if scraper::is_article_too_old(content.published_date, max_age_days) {
                tracing::warn!(url = url, "Article too old, skipping content");
-                return (String::new(), String::new());
+                return (String::new(), String::new(), final_url);
            }
-
            let title = content.title.unwrap_or_default();
-            (content.body_text, title)
+            (content.body_text, title, final_url)
        }
        Err(e) => {
            tracing::warn!(url = url, error = %e, "Failed to scrape URL, keeping article with empty content");
-            (String::new(), String::new())
+            (String::new(), String::new(), url.to_string())
        }
    }
 }