From 23f121a58d1d65fb9dfd284b7f29ad109b97280c Mon Sep 17 00:00:00 2001
From: oabrivard <olivier@abrivard.fr>
Date: Tue, 24 Mar 2026 10:42:07 +0100
Subject: [PATCH] feat: ScrapedContent url+head_html fields, Arc<dyn
 LlmProvider>, 3-tuple scrape returns

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 backend/src/services/llm/factory.rs | 10 ++++++----
 backend/src/services/scraper.rs     | 16 +++++++++++++++
 backend/src/services/synthesis.rs   | 31 ++++++++++++++---------------
 3 files changed, 37 insertions(+), 20 deletions(-)
diff --git a/backend/src/services/llm/factory.rs b/backend/src/services/llm/factory.rs
index f9f2316..c86a3a5 100644
--- a/backend/src/services/llm/factory.rs
+++ b/backend/src/services/llm/factory.rs
@@ -1,6 +1,8 @@
 //! Provider factory: creates the correct `LlmProvider` implementation
 //! based on the provider name and the user's decrypted API key.
 
+use std::sync::Arc;
+
 use super::anthropic::AnthropicProvider;
 use super::gemini::GeminiProvider;
 use super::openai::OpenAiProvider;
@@ -37,12 +39,12 @@ fn build_llm_client() -> Result<reqwest::Client, AppError> {
 pub fn create_provider(
     provider_name: &str,
     api_key: String,
-) -> Result<Box<dyn LlmProvider>, AppError> {
+) -> Result<Arc<dyn LlmProvider>, AppError> {
     let http_client = build_llm_client()?;
     match provider_name {
-        "gemini" => Ok(Box::new(GeminiProvider::new(api_key, http_client))),
-        "openai" => Ok(Box::new(OpenAiProvider::new(api_key, http_client))),
-        "anthropic" => Ok(Box::new(AnthropicProvider::new(api_key, http_client))),
+        "gemini" => Ok(Arc::new(GeminiProvider::new(api_key, http_client))),
+        "openai" => Ok(Arc::new(OpenAiProvider::new(api_key, http_client))),
+        "anthropic" => Ok(Arc::new(AnthropicProvider::new(api_key, http_client))),
         _ => Err(AppError::BadRequest(format!(
             "Unknown provider: '{}'",
             provider_name
diff --git a/backend/src/services/scraper.rs b/backend/src/services/scraper.rs
index bb0b721..11452e9 100644
--- a/backend/src/services/scraper.rs
+++ b/backend/src/services/scraper.rs
@@ -48,6 +48,10 @@ pub struct ScrapedContent {
     pub body_text: String,
     /// Whether the page appears to be a soft-404 (error page with 200 status).
     pub is_soft_404: bool,
+    /// Final URL after following redirects.
+    pub url: String,
+    /// Raw <head> section from the HTML, preserved for LLM extraction.
+    pub head_html: String,
 }
 
 /// Build a `reqwest::Client` configured for scraping.
@@ -139,6 +143,8 @@ pub async fn scrape_url(
             published_date: None,
             body_text: String::new(),
             is_soft_404: false,
+            url: response.url().to_string(),
+            head_html: String::new(),
         });
     }
 
@@ -174,6 +180,14 @@ pub async fn scrape_url(
     }
 
     let html_text = String::from_utf8_lossy(&bytes);
+
+    // Extract <head> for potential LLM use; look for </head> only after `start` so start <= end and the slice cannot panic.
+    let head_html = {
+        let start = html_text.find("<head").unwrap_or(0);
+        let end = html_text[start..].find("</head>").map_or(start, |i| start + i + 7);
+        html_text[start..end].to_string()
+    };
+
     let document = Html::parse_document(&html_text);
 
     // Extract page title
@@ -199,6 +213,8 @@ pub async fn scrape_url(
         published_date,
         body_text,
         is_soft_404,
+        url: final_url.to_string(),
+        head_html,
     })
 }
 
diff --git a/backend/src/services/synthesis.rs b/backend/src/services/synthesis.rs
index 98ed162..d1154ca 100644
--- a/backend/src/services/synthesis.rs
+++ b/backend/src/services/synthesis.rs
@@ -1119,10 +1119,10 @@ async fn scrape_articles(
             pct as u8,
         );
 
-        if let Ok((cat_key, item, (scraped_content, page_title))) = join_result {
+        if let Ok((cat_key, item, (scraped_content, page_title, final_url))) = join_result {
             let scraped_item = ScrapedNewsItem {
                 title: item.title,
-                url: item.url,
+                url: final_url,
                 summary: item.summary,
                 original_title: page_title,
                 scraped_content,
@@ -1172,8 +1172,8 @@ async fn scrape_flat_urls(
             let url = url.clone();
             let mad = max_age_days;
             join_set.spawn(async move {
-                let (scraped_content, page_title) = scrape_single_article(&client, &url, mad).await;
-                (url, scraped_content, page_title)
+                let (scraped_content, page_title, final_url) = scrape_single_article(&client, &url, mad).await;
+                (url, scraped_content, page_title, final_url)
             });
         }
     }
@@ -1188,10 +1188,10 @@ async fn scrape_flat_urls(
             pct as u8,
         );
 
-        if let Ok((url, scraped_content, page_title)) = join_result {
+        if let Ok((_original_url, scraped_content, page_title, final_url)) = join_result {
             results.push(ScrapedNewsItem {
                 title: page_title.clone(),
-                url,
+                url: final_url, // Use redirect-resolved URL
                 summary: String::new(), // No LLM summary yet
                 original_title: page_title,
                 scraped_content,
@@ -1203,8 +1203,8 @@ async fn scrape_flat_urls(
             let url = url.clone();
             let mad = max_age_days;
             join_set.spawn(async move {
-                let (scraped_content, page_title) = scrape_single_article(&client, &url, mad).await;
-                (url, scraped_content, page_title)
+                let (scraped_content, page_title, final_url) = scrape_single_article(&client, &url, mad).await;
+                (url, scraped_content, page_title, final_url)
             });
         }
     }
@@ -1212,7 +1212,7 @@ async fn scrape_flat_urls(
     results
 }
 
-/// Scrape a single article URL, returning (body_text, page_title) or empty strings on failure.
+/// Scrape a single article URL, returning (body_text, page_title, final_url). On failure the first two are empty strings; final_url is still set (the requested URL if the scrape itself errors).
 ///
 /// Handles all failure modes gracefully:
 /// - Network errors → empty content (article kept)
@@ -1222,25 +1222,24 @@ async fn scrape_single_article(
     http_client: &reqwest::Client,
     url: &str,
     max_age_days: i64,
-) -> (String, String) {
+) -> (String, String, String) {
     match scraper::scrape_url(http_client, url).await {
         Ok(content) => {
+            let final_url = content.url.clone();
             if !content.ok || content.is_soft_404 {
                 tracing::warn!(url = url, "Soft 404 or error page detected, skipping content");
-                return (String::new(), String::new());
+                return (String::new(), String::new(), final_url);
             }
-
             if scraper::is_article_too_old(content.published_date, max_age_days) {
                 tracing::warn!(url = url, "Article too old, skipping content");
-                return (String::new(), String::new());
+                return (String::new(), String::new(), final_url);
             }
-
             let title = content.title.unwrap_or_default();
-            (content.body_text, title)
+            (content.body_text, title, final_url)
         }
         Err(e) => {
             tracing::warn!(url = url, error = %e, "Failed to scrape URL, keeping article with empty content");
-            (String::new(), String::new())
+            (String::new(), String::new(), url.to_string())
         }
     }
 }