From d508ea962075db6783475e6fcd4d13798372aabe Mon Sep 17 00:00:00 2001
From: oabrivard <olivier@abrivard.fr>
Date: Tue, 24 Mar 2026 10:35:03 +0100
Subject: [PATCH] =?UTF-8?q?docs:=20revise=20LLM=20scraping=20plan=20?=
 =?UTF-8?q?=E2=80=94=20fix=20Arc=20provider,=20head=5Fhtml,=20concurrency,?=
 =?UTF-8?q?=20tests?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../plans/2026-03-24-llm-scraping.md          | 442 ++++++++++--------
 1 file changed, 248 insertions(+), 194 deletions(-)
diff --git a/docs/superpowers/plans/2026-03-24-llm-scraping.md b/docs/superpowers/plans/2026-03-24-llm-scraping.md
index c3887a7..99c6bae 100644
--- a/docs/superpowers/plans/2026-03-24-llm-scraping.md
+++ b/docs/superpowers/plans/2026-03-24-llm-scraping.md
@@ -1,12 +1,12 @@
-# LLM-Assisted Scraping — Implementation Plan
+# LLM-Assisted Scraping — Implementation Plan (Revised)
 
 > **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
 
 **Goal:** Add two optional LLM-powered scraping enhancements: LLM link extraction from source pages and LLM article content extraction — controlled by user settings.
 
-**Architecture:** Two boolean settings control two independent LLM scraping paths. Each has a fallback to existing heuristic-based extraction. `ScrapedContent` gains a `url` field for redirect-resolved URLs. New prompt/schema builders for both LLM calls.
+**Architecture:** Two boolean settings control two independent LLM scraping paths. Each has a fallback to existing heuristic-based extraction. `ScrapedContent` gains `url` and `head_html` fields. `create_provider` returns `Arc<dyn LlmProvider>` for safe sharing across concurrent tasks. When LLM extraction is enabled, concurrency is reduced to max 5.
 
-**Tech Stack:** Rust (reqwest, scraper crate, serde_json), existing LLM providers via `generate_rewrite_pass`
+**Tech Stack:** Rust (reqwest, scraper crate, serde_json, `std::sync::Arc`), existing LLM providers via `generate_rewrite_pass`
 
 **Spec:** `docs/superpowers/specs/2026-03-24-llm-scraping-design.md`
 
@@ -36,38 +36,69 @@ Add to `From<UserSettings> for SettingsResponse`, `Default for UserSettings` (bo
 
 - [ ] **Step 3: Add to DB queries in `db/settings.rs`**
 
-Add both fields to `SettingsRow`, `TryFrom<SettingsRow>`, and both SQL queries (`get_or_create_default` + `upsert`). Follow the pattern of the last column added.
+Add both fields to `SettingsRow`, `TryFrom<SettingsRow>`, and both SQL queries (`get_or_create_default` and `upsert`). Follow the pattern of the last column added.
 
 - [ ] **Step 4: Update test fixtures**
 
-Add `use_llm_for_source_links: false, use_llm_for_article_extraction: false` to:
-- `valid_request()` in `models/settings.rs` tests
-- `test_settings()` in `services/prompts.rs` tests
+Add `use_llm_for_source_links: false, use_llm_for_article_extraction: false` to `valid_request()` in the `models/settings.rs` tests and to `test_settings()` in the `services/prompts.rs` tests.
 
 - [ ] **Step 5: Update CLAUDE.md migration count to 14**
 
 - [ ] **Step 6: Run tests + commit**
 
-Run: `cd backend && cargo test --lib`
-
 ```bash
+cd backend && cargo test --lib
 git add backend/migrations/20260324000014_add_llm_scraping_settings.sql backend/src/models/settings.rs backend/src/db/settings.rs backend/src/services/prompts.rs CLAUDE.md
 git commit -m "feat: add use_llm_for_source_links and use_llm_for_article_extraction settings"
 ```
 
 ---
 
-### Task 2: Add `url` field to `ScrapedContent` + update `scrape_single_article`
+### Task 2: Add `url` and `head_html` to `ScrapedContent` + `Arc<dyn LlmProvider>` + update scraping functions
 
 **Files:**
 - Modify: `backend/src/services/scraper.rs`
+- Modify: `backend/src/services/llm/factory.rs`
+- Modify: `backend/src/services/llm/mod.rs` (trait needs `Send + Sync`)
 - Modify: `backend/src/services/synthesis.rs`
+- Modify: `backend/src/handlers/api_keys.rs`
+
+- [ ] **Step 1: Add `url` and `head_html` to `ScrapedContent` in `scraper.rs`**
+
+```rust
+pub struct ScrapedContent {
+    pub ok: bool,
+    pub status: u16,
+    pub title: Option<String>,
+    pub published_date: Option<DateTime<Utc>>,
+    pub body_text: String,
+    pub is_soft_404: bool,
+    pub url: String,
+    pub head_html: String,
+}
+```
+
+In `scrape_url`, before parsing the document, extract `<head>`:
+```rust
+let html_text = String::from_utf8_lossy(&bytes);
 
-- [ ] **Step 1: Add `url` to `ScrapedContent` in `scraper.rs`**
+// Extract <head> section for potential LLM use
+let head_html = extract_head_section(&html_text);
 
-Add `pub url: String` to the `ScrapedContent` struct (after `is_soft_404`).
+let document = Html::parse_document(&html_text);
+```
 
-In `scrape_url`, populate it from `final_url`:
+Add helper:
+```rust
+/// Extract the <head>...</head> section from raw HTML.
+fn extract_head_section(html: &str) -> String {
+    let start = html.find("<head").unwrap_or(0);
+    let end = html[start..].find("</head>").map(|i| start + i + 7).unwrap_or(start);
+    html[start..end].to_string()
+}
+```
+
+Populate in the return:
 ```rust
 Ok(ScrapedContent {
     ok: !is_soft_404,
@@ -77,12 +108,47 @@ Ok(ScrapedContent {
     body_text,
     is_soft_404,
     url: final_url.to_string(),
+    head_html,
 })
 ```
 
-- [ ] **Step 2: Update `scrape_single_article` to return `(String, String, String)`**
+- [ ] **Step 2: Change `create_provider` to return `Arc<dyn LlmProvider>`**
 
-In `synthesis.rs`, change `scrape_single_article` return type from `(String, String)` to `(String, String, String)` — `(body_text, page_title, final_url)`:
+In `backend/src/services/llm/factory.rs`, change the return type:
+```rust
+use std::sync::Arc;
+
+pub fn create_provider(
+    provider_name: &str,
+    api_key: String,
+) -> Result<Arc<dyn LlmProvider>, AppError> {
+    let http_client = build_llm_client()?;
+    match provider_name {
+        "gemini" => Ok(Arc::new(GeminiProvider::new(api_key, http_client))),
+        "openai" => Ok(Arc::new(OpenAiProvider::new(api_key, http_client))),
+        "anthropic" => Ok(Arc::new(AnthropicProvider::new(api_key, http_client))),
+        _ => Err(AppError::BadRequest(format!("Unknown provider: '{}'", provider_name))),
+    }
+}
+```
+
+Update all factory tests to use `Arc` (they call methods on the provider, which works the same).
+
+Ensure `LlmProvider` trait in `llm/mod.rs` has `Send + Sync` bounds:
+```rust
+#[async_trait]
+pub trait LlmProvider: Send + Sync {
+```
+
+- [ ] **Step 3: Update all callers of `create_provider`**
+
+In `synthesis.rs` `run_generation_inner`: `let provider = create_provider(...)` now returns an `Arc<dyn LlmProvider>`. Existing method calls such as `provider.generate_search_pass(...)` need no changes, because `Arc<T>` auto-derefs to `T`.
+
+In `handlers/api_keys.rs`: `let llm_provider = factory::create_provider(...)` — no call-site changes are needed; method calls on the returned `Arc` resolve through deref.
+
+- [ ] **Step 4: Update `scrape_single_article` to return a 3-tuple**
+
+Change return type from `(String, String)` to `(String, String, String)` — `(body_text, page_title, final_url)`:
 
 ```rust
 async fn scrape_single_article(
@@ -92,69 +158,36 @@ async fn scrape_single_article(
 ) -> (String, String, String) {
     match scraper::scrape_url(http_client, url).await {
         Ok(content) => {
+            let final_url = content.url.clone();
             if !content.ok || content.is_soft_404 {
-                tracing::warn!(url = url, "Soft 404 or error page detected, skipping content");
-                return (String::new(), String::new(), content.url);
+                return (String::new(), String::new(), final_url);
             }
-
             if scraper::is_article_too_old(content.published_date, max_age_days) {
-                tracing::warn!(url = url, "Article too old, skipping content");
-                return (String::new(), String::new(), content.url);
+                return (String::new(), String::new(), final_url);
             }
-
             let title = content.title.unwrap_or_default();
-            (content.body_text, title, content.url)
+            (content.body_text, title, final_url)
         }
         Err(e) => {
-            tracing::warn!(url = url, error = %e, "Failed to scrape URL, keeping article with empty content");
+            tracing::warn!(url = url, error = %e, "Failed to scrape URL");
             (String::new(), String::new(), url.to_string())
         }
     }
 }
 ```
 
-- [ ] **Step 3: Update callers of `scrape_single_article`**
-
-In `scrape_articles` and `scrape_flat_urls`, update destructuring from `(scraped_content, page_title)` to `(scraped_content, page_title, final_url)`. Use `final_url` to set `ScrapedNewsItem.url` instead of the input URL:
-
-In `scrape_articles` (inside the `join_set.spawn`):
-```rust
-let scraped = scrape_single_article(&client, &url, mad).await;
-(cat_key, item, scraped)
-```
-And in the result handler:
-```rust
-if let Ok((cat_key, item, (scraped_content, page_title, final_url))) = join_result {
-    let scraped_item = ScrapedNewsItem {
-        title: item.title,
-        url: final_url, // Use redirect-resolved URL instead of item.url
-        summary: item.summary,
-        original_title: page_title,
-        scraped_content,
-    };
-```
+- [ ] **Step 5: Update callers of `scrape_single_article`**
 
-Same pattern in `scrape_flat_urls`:
-```rust
-if let Ok((url, scraped_content, page_title, final_url)) = join_result {
-    results.push(ScrapedNewsItem {
-        title: page_title.clone(),
-        url: final_url, // Use redirect-resolved URL
-        summary: String::new(),
-        original_title: page_title,
-        scraped_content,
-    });
-```
+In `scrape_articles`: update spawn closure to return `(cat_key, item, (scraped_content, page_title, final_url))`. In result handler, use `final_url` for `ScrapedNewsItem.url`.
 
-Note: the `join_set.spawn` closure must also capture and return `final_url`. Update the spawn to return 4-tuple: `(url, scraped_content, page_title, final_url)`.
+In `scrape_flat_urls`: update spawn closure to return `(original_url, scraped_content, page_title, final_url)`. Use `final_url` for `ScrapedNewsItem.url`.
 
-- [ ] **Step 4: Run tests + commit**
-
-Run: `cd backend && cargo test --lib`
+- [ ] **Step 6: Run tests + commit**
 
 ```bash
-git add backend/src/services/scraper.rs backend/src/services/synthesis.rs
-git commit -m "feat: add url field to ScrapedContent, use redirect-resolved URLs"
+cd backend && cargo test --lib
+git add backend/src/services/scraper.rs backend/src/services/llm/factory.rs backend/src/services/llm/mod.rs backend/src/services/synthesis.rs backend/src/handlers/api_keys.rs
+git commit -m "feat: ScrapedContent url+head_html fields, Arc<dyn LlmProvider>, 3-tuple scrape returns"
 ```
 
 ---
@@ -165,14 +198,10 @@ git commit -m "feat: add url field to ScrapedContent, use redirect-resolved URLs
 - Modify: `backend/src/services/prompts.rs`
 - Modify: `backend/src/services/llm/schema.rs`
 
-- [ ] **Step 1: Add `build_link_extraction_prompt` to `prompts.rs`**
+- [ ] **Step 1: Add `build_link_extraction_prompt` and `build_article_extraction_prompt` to `prompts.rs`**
 
 ```rust
 /// Build a prompt for LLM-assisted link extraction from a source page.
-///
-/// # Arguments
-/// * `head_html` — the <head> section of the page
-/// * `body_html` — first 8000 chars of the <body> section
 pub fn build_link_extraction_prompt(head_html: &str, body_html: &str) -> (String, String) {
     let system_prompt =
         "Tu es un assistant qui analyse des pages web. \
@@ -180,6 +209,8 @@ pub fn build_link_extraction_prompt(head_html: &str, body_html: &str) -> (String
          Reponds uniquement au format JSON demande."
             .to_string();
 
+    let body_truncated: String = body_html.chars().take(8000).collect();
+
     let user_prompt = format!(
         "Voici le contenu HTML d'une page de blog ou de site d'actualites.\n\n\
          <head>\n{head}\n</head>\n\n\
@@ -188,21 +219,13 @@ pub fn build_link_extraction_prompt(head_html: &str, body_html: &str) -> (String
          (pas les liens de navigation, tags, categories, login, pages statiques, etc.).\n\
          Retourne les URLs completes dans le format JSON demande.",
         head = head_html,
-        body = body_html,
+        body = body_truncated,
     );
 
     (system_prompt, user_prompt)
 }
-```
-
-- [ ] **Step 2: Add `build_article_extraction_prompt` to `prompts.rs`**
 
-```rust
 /// Build a prompt for LLM-assisted article content extraction.
-///
-/// # Arguments
-/// * `head_html` — the <head> section (contains meta tags, og:*, canonical)
-/// * `body_text` — cleaned body text from existing HTML stripping
 pub fn build_article_extraction_prompt(head_html: &str, body_text: &str) -> (String, String) {
     let system_prompt =
         "Tu es un assistant qui analyse des articles web. \
@@ -228,10 +251,11 @@ pub fn build_article_extraction_prompt(head_html: &str, body_text: &str) -> (Str
 }
 ```
 
-- [ ] **Step 3: Add schemas to `schema.rs`**
+Note: `build_link_extraction_prompt` truncates the body with `.chars().take(8000)`, which counts characters rather than bytes and therefore never splits a multi-byte UTF-8 sequence.
+
+- [ ] **Step 2: Add schemas to `schema.rs`**
 
 ```rust
-/// Build a JSON Schema for LLM link extraction response.
 pub fn build_link_extraction_schema() -> Value {
     serde_json::json!({
         "type": "object",
@@ -246,7 +270,6 @@ pub fn build_link_extraction_schema() -> Value {
     })
 }
 
-/// Build a JSON Schema for LLM article content extraction response.
 pub fn build_article_extraction_schema() -> Value {
     serde_json::json!({
         "type": "object",
@@ -262,22 +285,32 @@ pub fn build_article_extraction_schema() -> Value {
 }
 ```
 
-- [ ] **Step 4: Add tests**
+- [ ] **Step 3: Add tests for prompts and schemas**
 
 In `prompts.rs` tests:
 ```rust
     #[test]
     fn link_extraction_prompt_includes_html() {
-        let (_, user) = build_link_extraction_prompt("<title>Blog</title>", "<a href='/post'>P</a>");
+        let (sys, user) = build_link_extraction_prompt("<title>Blog</title>", "<a href='/post'>P</a>");
         assert!(user.contains("<title>Blog</title>"));
         assert!(user.contains("articles"));
+        assert!(sys.contains("liens"));
+    }
+
+    #[test]
+    fn link_extraction_prompt_truncates_body() {
+        let long_body = "x".repeat(20000);
+        let (_, user) = build_link_extraction_prompt("", &long_body);
+        // Should not contain the full 20000 chars
+        assert!(user.len() < 15000);
     }
 
     #[test]
     fn article_extraction_prompt_includes_content() {
-        let (_, user) = build_article_extraction_prompt("<meta name='date' content='2026'>", "Article body text here");
-        assert!(user.contains("Article body text here"));
+        let (_, user) = build_article_extraction_prompt("<meta name='date'>", "Article body here");
+        assert!(user.contains("Article body here"));
         assert!(user.contains("published_date"));
+        assert!(user.contains("is_error_page"));
     }
 ```
 
@@ -291,7 +324,7 @@ In `schema.rs` tests:
     }
 
     #[test]
-    fn article_extraction_schema_has_all_fields() {
+    fn article_extraction_schema_strict_mode_compatible() {
         let schema = build_article_extraction_schema();
         let props = schema["properties"].as_object().unwrap();
         assert!(props.contains_key("title"));
@@ -299,14 +332,15 @@ In `schema.rs` tests:
         assert!(props.contains_key("body_text"));
         assert!(props.contains_key("is_error_page"));
         assert_eq!(schema["additionalProperties"], false);
+        // published_date is string (not ["string", "null"]) for OpenAI strict mode
+        assert_eq!(props["published_date"]["type"], "string");
     }
 ```
 
-- [ ] **Step 5: Run tests + commit**
-
-Run: `cd backend && cargo test --lib`
+- [ ] **Step 4: Run tests + commit**
 
 ```bash
+cd backend && cargo test --lib
 git add backend/src/services/prompts.rs backend/src/services/llm/schema.rs
 git commit -m "feat: add LLM prompts and schemas for link and article extraction"
 ```
@@ -318,37 +352,34 @@ git commit -m "feat: add LLM prompts and schemas for link and article extraction
 **Files:**
 - Modify: `backend/src/services/source_scraper.rs`
 
-- [ ] **Step 1: Update `extract_article_links` to accept optional LLM provider**
-
-Add a new public function `extract_article_links_with_llm` that accepts LLM parameters. The existing `extract_article_links` stays unchanged for non-LLM path.
+- [ ] **Step 1: Add `extract_article_links_with_llm`**
 
 ```rust
+use std::sync::Arc;
 use crate::services::llm::LlmProvider;
 use crate::services::llm::schema::build_link_extraction_schema;
 use crate::services::prompts::build_link_extraction_prompt;
 
 /// Extract article links using LLM analysis of the page HTML.
 ///
-/// Falls back to heuristic extraction if the LLM call fails or returns empty results.
+/// Falls back to heuristic extraction if the LLM call fails or returns empty.
 pub async fn extract_article_links_with_llm(
     http_client: &reqwest::Client,
     source_url: &str,
     max_links: usize,
-    provider: &dyn LlmProvider,
+    provider: &Arc<dyn LlmProvider>,
     model: &str,
 ) -> Result<Vec<String>, AppError> {
     let base_url = Url::parse(source_url)
         .map_err(|e| AppError::BadRequest(format!("Invalid source URL: {}", e)))?;
     let base_domain = base_url.host_str().unwrap_or("").to_lowercase();
 
-    // Fetch the page
     let response = http_client.get(source_url).send().await.map_err(|e| {
         tracing::warn!(url = source_url, error = %e, "Failed to fetch source page");
         AppError::Internal(anyhow::anyhow!("Failed to fetch source page"))
     })?;
 
     if !response.status().is_success() {
-        tracing::warn!(url = source_url, status = %response.status(), "Source page returned non-200");
         return Ok(Vec::new());
     }
 
@@ -356,67 +387,60 @@ pub async fn extract_article_links_with_llm(
         AppError::Internal(anyhow::anyhow!("Failed to read source page body: {}", e))
     })?;
 
-    // Extract <head> and first 8000 chars of <body> for the LLM
     let (head_html, body_html) = extract_head_and_body(&html_text);
-
     let (system, user) = build_link_extraction_prompt(&head_html, &body_html);
     let schema = build_link_extraction_schema();
 
     match provider.generate_rewrite_pass(model, &system, &user, &schema).await {
-        Ok(response) => {
-            let urls = response
+        Ok(llm_response) => {
+            let urls: Vec<String> = llm_response
                 .get("urls")
                 .and_then(|u| u.as_array())
                 .map(|arr| {
                     arr.iter()
                         .filter_map(|v| v.as_str())
                         .filter_map(|href| {
-                            // Resolve relative URLs
                             let resolved = base_url.join(href).ok()?;
-                            // Filter: http/https only, same domain
                             if resolved.scheme() != "http" && resolved.scheme() != "https" {
                                 return None;
                             }
-                            let domain = resolved.host_str()?.to_lowercase();
-                            if domain != base_domain {
+                            if resolved.host_str()?.to_lowercase() != base_domain {
                                 return None;
                             }
                             Some(resolved.to_string())
                         })
-                        .collect::<Vec<_>>()
+                        .collect()
                 })
                 .unwrap_or_default();
 
             if urls.is_empty() {
-                tracing::warn!(url = source_url, "LLM returned no links, falling back to heuristic extraction");
+                tracing::warn!(url = source_url, "LLM returned no links, falling back to heuristic");
                 let fallback = extract_links_from_html(&html_text, &base_url, &base_domain);
                 Ok(fallback.into_iter().take(max_links).collect())
             } else {
-                // Deduplicate
                 let mut seen = std::collections::HashSet::new();
                 let deduped: Vec<String> = urls.into_iter().filter(|u| seen.insert(u.clone())).collect();
                 Ok(deduped.into_iter().take(max_links).collect())
             }
         }
         Err(e) => {
-            tracing::warn!(url = source_url, error = %e, "LLM link extraction failed, falling back to heuristic");
+            tracing::warn!(url = source_url, error = %e, "LLM link extraction failed, falling back");
             let fallback = extract_links_from_html(&html_text, &base_url, &base_domain);
             Ok(fallback.into_iter().take(max_links).collect())
         }
     }
 }
 
-/// Extract the <head> section and first N chars of <body> from HTML.
-fn extract_head_and_body(html: &str) -> (String, String) {
+/// Extract <head> section and first 8000 chars of <body> from HTML (UTF-8 safe).
+pub fn extract_head_and_body(html: &str) -> (String, String) {
     let head_start = html.find("<head").unwrap_or(0);
     let head_end = html.find("</head>").map(|i| i + 7).unwrap_or(head_start);
     let head = &html[head_start..head_end];
 
     let body_start = html.find("<body").unwrap_or(head_end);
-    let body_end = (body_start + 8000).min(html.len());
-    let body = &html[body_start..body_end];
+    let body: String = html[body_start..].chars().take(8000).collect();
 
-    (head.to_string(), body.to_string())
+    (head.to_string(), body)
 }
 ```
 
@@ -432,19 +456,18 @@ fn extract_head_and_body(html: &str) -> (String, String) {
     }
 
     #[test]
-    fn extract_head_and_body_truncates_body() {
+    fn extract_head_and_body_truncates_body_safely() {
         let long_body = "x".repeat(20000);
         let html = format!("<head></head><body>{}</body>", long_body);
         let (_, body) = extract_head_and_body(&html);
-        assert!(body.len() <= 8006); // <body> tag + 8000 chars
+        assert_eq!(body.chars().count(), 8000);
     }
 ```
 
 - [ ] **Step 3: Run tests + commit**
 
-Run: `cd backend && cargo test --lib`
-
 ```bash
+cd backend && cargo test --lib
 git add backend/src/services/source_scraper.rs
 git commit -m "feat: LLM-assisted source link extraction with heuristic fallback"
 ```
@@ -456,25 +479,19 @@ git commit -m "feat: LLM-assisted source link extraction with heuristic fallback
 **Files:**
 - Modify: `backend/src/services/synthesis.rs`
 
-- [ ] **Step 1: Add `scrape_single_article_with_llm` function**
+- [ ] **Step 1: Add `scrape_single_article_with_llm`**
 
-Add a new async function alongside `scrape_single_article`:
+This function receives the LLM provider via `Arc` and uses `head_html` from `ScrapedContent`:
 
 ```rust
-/// Scrape an article URL using LLM for content extraction.
-///
-/// Falls back to heuristic extraction if the LLM call fails.
 async fn scrape_single_article_with_llm(
     http_client: &reqwest::Client,
     url: &str,
     max_age_days: i64,
-    provider: &dyn crate::services::llm::LlmProvider,
-    model: &str,
+    provider: Arc<dyn crate::services::llm::LlmProvider>,
+    model: String,
 ) -> (String, String, String) {
-    // First, do the HTTP fetch (same as regular scraping)
-    let fetch_result = scraper::scrape_url(http_client, url).await;
-
-    let content = match fetch_result {
+    let content = match scraper::scrape_url(http_client, url).await {
         Ok(c) => c,
         Err(e) => {
             tracing::warn!(url = url, error = %e, "Failed to fetch URL for LLM extraction");
@@ -488,44 +505,35 @@ async fn scrape_single_article_with_llm(
         return (String::new(), String::new(), final_url);
     }
 
-    // Extract <head> from the raw HTML for the LLM
-    // We need to re-fetch the raw HTML or extract it from the scraper
-    // Since scraper already parsed it, we'll use the existing body_text + title as input
-    let head_html = String::new(); // The scraper doesn't preserve <head> — use empty
-    let body_text = &content.body_text;
-
     let (system, user) = crate::services::prompts::build_article_extraction_prompt(
-        &head_html,
-        body_text,
+        &content.head_html,
+        &content.body_text,
     );
     let schema = crate::services::llm::schema::build_article_extraction_schema();
 
-    match provider.generate_rewrite_pass(model, &system, &user, &schema).await {
+    match provider.generate_rewrite_pass(&model, &system, &user, &schema).await {
         Ok(response) => {
             let title = response.get("title").and_then(|t| t.as_str()).unwrap_or("").to_string();
-            let extracted_body = response.get("body_text").and_then(|b| b.as_str()).unwrap_or("").to_string();
+            let body = response.get("body_text").and_then(|b| b.as_str()).unwrap_or("").to_string();
             let is_error = response.get("is_error_page").and_then(|e| e.as_bool()).unwrap_or(false);
             let date_str = response.get("published_date").and_then(|d| d.as_str()).unwrap_or("");
 
-            if is_error || extracted_body.trim().is_empty() {
+            if is_error || body.trim().is_empty() {
                 return (String::new(), String::new(), final_url);
             }
 
-            // Check date if provided
             if !date_str.is_empty() {
                 if let Ok(date) = chrono::DateTime::parse_from_rfc3339(date_str) {
                     if scraper::is_article_too_old(Some(date.with_timezone(&chrono::Utc)), max_age_days) {
-                        tracing::warn!(url = url, "LLM-extracted article too old");
                         return (String::new(), String::new(), final_url);
                     }
                 }
             }
 
-            (extracted_body, title, final_url)
+            (body, title, final_url)
         }
         Err(e) => {
-            tracing::warn!(url = url, error = %e, "LLM article extraction failed, using heuristic fallback");
-            // Fall back to existing heuristic data
+            tracing::warn!(url = url, error = %e, "LLM extraction failed, using heuristic fallback");
             if scraper::is_article_too_old(content.published_date, max_age_days) {
                 return (String::new(), String::new(), final_url);
             }
@@ -536,40 +544,72 @@ async fn scrape_single_article_with_llm(
 }
 ```
 
-- [ ] **Step 2: Update pipeline to use LLM extraction when enabled**
+Note: `provider: Arc<dyn LlmProvider>` and `model: String` are owned values (`Send + 'static`, given the trait's `Send + Sync` bound), so they can be moved into spawned tasks.
 
-In `run_generation_inner`, the scraping calls need to branch based on `settings.use_llm_for_article_extraction`. The simplest approach: update `scrape_flat_urls` and `scrape_articles` to accept an optional provider+model, and use `scrape_single_article_with_llm` when provided.
+- [ ] **Step 2: Update `scrape_flat_urls` and `scrape_articles` for LLM dispatch**
 
-Add a wrapper that the pipeline calls:
+Add a parameter `llm: Option<(Arc<dyn LlmProvider>, String)>` to both functions. When `Some`, use `scrape_single_article_with_llm` instead of `scrape_single_article`. Set `max_concurrent = 5` when LLM is enabled, `10` otherwise.
 
+In the spawn closures, clone the `Arc` and `String`:
 ```rust
-/// Scrape a single article, optionally using LLM extraction.
-async fn scrape_article_dispatch(
-    http_client: &reqwest::Client,
-    url: &str,
-    max_age_days: i64,
-    llm: Option<(&dyn crate::services::llm::LlmProvider, &str)>,
-) -> (String, String, String) {
-    match llm {
-        Some((provider, model)) => {
-            scrape_single_article_with_llm(http_client, url, max_age_days, provider, model).await
-        }
-        None => scrape_single_article(http_client, url, max_age_days).await,
-    }
+if let Some((ref provider, ref model)) = llm {
+    let provider = Arc::clone(provider);
+    let model = model.clone();
+    join_set.spawn(async move {
+        let scraped = scrape_single_article_with_llm(&client, &url, mad, provider, model).await;
+        // ...
+    });
+} else {
+    join_set.spawn(async move {
+        let scraped = scrape_single_article(&client, &url, mad).await;
+        // ...
+    });
 }
 ```
 
-Update `scrape_flat_urls` and `scrape_articles` to use `scrape_article_dispatch`. The provider and model are passed from `run_generation_inner` based on `settings.use_llm_for_article_extraction`.
+Add progress reporting for LLM extraction:
+```rust
+let progress_label = if llm.is_some() {
+    format!("Extraction IA des articles ({}/{})...", completed, total)
+} else {
+    format!("Verification des sources ({}/{})...", completed, total)
+};
+emit_progress(tx, "scraping", &progress_label, pct as u8);
+```
 
-Similarly, update the Phase 1 source scraping in `run_generation_inner` to call `extract_article_links_with_llm` vs `extract_article_links` based on `settings.use_llm_for_source_links`.
+- [ ] **Step 3: Update `run_generation_inner` to pass LLM params**
 
-- [ ] **Step 3: Run tests + commit**
+In Phase 1 and Phase 2 scraping calls, pass the LLM option:
+```rust
+let llm_for_scraping = if settings.use_llm_for_article_extraction {
+    Some((Arc::clone(&provider), model_research.clone()))
+} else {
+    None
+};
+```
+
+Pass `llm_for_scraping` to `scrape_flat_urls` and `scrape_articles`.
+
+Similarly for source link extraction:
+```rust
+if settings.use_llm_for_source_links {
+    source_scraper::extract_article_links_with_llm(
+        &state.http_client, &source.url, max_links_per_source,
+        &provider, &model_research,
+    ).await
+} else {
+    source_scraper::extract_article_links(
+        &state.http_client, &source.url, max_links_per_source,
+    ).await
+}
+```
 
-Run: `cd backend && cargo test --lib`
+- [ ] **Step 4: Run tests + commit**
 
 ```bash
+cd backend && cargo test --lib
 git add backend/src/services/synthesis.rs
-git commit -m "feat: LLM-assisted article extraction with heuristic fallback"
+git commit -m "feat: LLM-assisted article extraction with Arc provider and heuristic fallback"
 ```
 
 ---
@@ -581,32 +621,29 @@ git commit -m "feat: LLM-assisted article extraction with heuristic fallback"
 - Modify: `frontend/src/i18n/fr.ts`
 - Modify: `frontend/src/pages/Settings.tsx`
 
-- [ ] **Step 1: Add fields to types**
+- [ ] **Step 1: Add fields to types + DEFAULT_SETTINGS**
 
-In `frontend/src/types.ts`, add to `UserSettings`:
 ```typescript
+// In UserSettings interface:
 use_llm_for_source_links: boolean;
 use_llm_for_article_extraction: boolean;
-```
 
-Add to `DEFAULT_SETTINGS`:
-```typescript
+// In DEFAULT_SETTINGS:
 use_llm_for_source_links: false,
 use_llm_for_article_extraction: false,
 ```
 
 - [ ] **Step 2: Add i18n labels**
 
-In `frontend/src/i18n/fr.ts`:
 ```typescript
 'settings.advancedExtraction': 'Extraction avancee',
 'settings.useLlmForSourceLinks': "Utiliser l'IA pour extraire les liens",
 'settings.useLlmForArticleExtraction': "Utiliser l'IA pour extraire le contenu",
 ```
 
-- [ ] **Step 3: Add checkboxes to Settings page**
+- [ ] **Step 3: Add checkboxes in Settings page**
 
-In `frontend/src/pages/Settings.tsx`, add a new section after the existing generation settings (after the grid with maxAgeDays/maxItemsPerCategory/maxArticlesPerSource/diversityWindow):
+Add after the generation settings grid, before the search agent behavior section:
 
 ```tsx
           {/* Advanced extraction */}
@@ -655,9 +692,8 @@ In `frontend/src/pages/Settings.tsx`, add a new section after the existing gener
 
 - [ ] **Step 4: Run frontend tests + commit**
 
-Run: `cd frontend && npx tsc --noEmit && npx vitest run`
-
 ```bash
+cd frontend && npx tsc --noEmit && npx vitest run
 git add frontend/src/types.ts frontend/src/i18n/fr.ts frontend/src/pages/Settings.tsx
 git commit -m "feat: add LLM scraping toggles to Settings page"
 ```
@@ -671,15 +707,23 @@ git commit -m "feat: add LLM scraping toggles to Settings page"
 
 - [ ] **Step 1: Update settings payload**
 
-Add the new boolean fields to the PUT settings call:
+Add to the PUT settings body:
 ```typescript
 use_llm_for_source_links: false,
 use_llm_for_article_extraction: false,
 ```
 
-- [ ] **Step 2: Add comprehensive validation after synthesis fetch**
+- [ ] **Step 2: Add comprehensive validation using `request` fixture**
 
-After the existing structure validation, add:
+Update the test function signature to include the `request` fixture:
+```typescript
+test('full generation pipeline produces valid synthesis', async ({
+    page,
+    request,
+}) => {
+```
+
+Add after existing structure validation:
 
 ```typescript
     // Comprehensive synthesis validation
@@ -688,20 +732,15 @@ After the existing structure validation, add:
 
     for (const section of synthesis.sections) {
       for (const item of section.items) {
-        // Collect URLs for duplicate check
         allUrls.push(item.url);
-
-        // Count domains for source diversity check
         try {
           const domain = new URL(item.url).hostname;
           domainCounts[domain] = (domainCounts[domain] || 0) + 1;
         } catch {}
       }
 
-      // Category article count check
-      if (section.title !== 'Autre') {
-        expect(section.items.length).toBeLessThanOrEqual(4); // max_items_per_category
-      }
+      // Category article count check (including Autre)
+      expect(section.items.length).toBeLessThanOrEqual(4); // max_items_per_category
     }
 
     // No duplicate URLs across all sections
@@ -713,20 +752,12 @@ After the existing structure validation, add:
       expect(count).toBeLessThanOrEqual(3);
     }
 
-    // Verify article links actually work (HTTP 200)
-    // Test a sample of up to 3 URLs to avoid slowness
+    // Verify a sample of article links actually work (using Playwright request API, no CORS issues)
     const sampleUrls = allUrls.slice(0, 3);
     for (const articleUrl of sampleUrls) {
-      const linkCheck = await page.evaluate(async (url: string) => {
-        try {
-          const resp = await fetch(url, { method: 'HEAD', redirect: 'follow' });
-          return resp.status;
-        } catch {
-          return 0;
-        }
-      }, articleUrl);
-      expect(linkCheck).toBeGreaterThanOrEqual(200);
-      expect(linkCheck).toBeLessThan(400);
+      const resp = await request.head(articleUrl);
+      expect(resp.status()).toBeGreaterThanOrEqual(200);
+      expect(resp.status()).toBeLessThan(400);
     }
 ```
 
@@ -742,5 +773,28 @@ sleep 25 && npx tsx seed.ts && npx playwright test generation-live --reporter=li
 
 ```bash
 git add e2e/tests/generation-live.spec.ts
-git commit -m "test: comprehensive E2E synthesis validation (duplicates, links, counts)"
+git commit -m "test: comprehensive E2E synthesis validation (duplicates, links, counts, domains)"
+```
+
+---
+
+### Task 8: Update integration test
+
+**Files:**
+- Modify: `backend/tests/api_syntheses_test.rs`
+
+- [ ] **Step 1: Update settings payload in `generate_pipeline_resolves_model_from_admin_config`**
+
+Add the new boolean fields to the PUT settings body:
+```rust
+"use_llm_for_source_links": false,
+"use_llm_for_article_extraction": false,
+```
+
+- [ ] **Step 2: Run integration test compilation check + commit**
+
+```bash
+cd backend && cargo test --no-run
+git add backend/tests/api_syntheses_test.rs
+git commit -m "test: update integration test with LLM scraping settings"
 ```