From d508ea962075db6783475e6fcd4d13798372aabe Mon Sep 17 00:00:00 2001 From: oabrivard Date: Tue, 24 Mar 2026 10:35:03 +0100 Subject: [PATCH] =?UTF-8?q?docs:=20revise=20LLM=20scraping=20plan=20?= =?UTF-8?q?=E2=80=94=20fix=20Arc=20provider,=20head=5Fhtml,=20concurrency,?= =?UTF-8?q?=20tests?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../plans/2026-03-24-llm-scraping.md | 442 ++++++++++-------- 1 file changed, 248 insertions(+), 194 deletions(-) diff --git a/docs/superpowers/plans/2026-03-24-llm-scraping.md b/docs/superpowers/plans/2026-03-24-llm-scraping.md index c3887a7..99c6bae 100644 --- a/docs/superpowers/plans/2026-03-24-llm-scraping.md +++ b/docs/superpowers/plans/2026-03-24-llm-scraping.md @@ -1,12 +1,12 @@ -# LLM-Assisted Scraping — Implementation Plan +# LLM-Assisted Scraping — Implementation Plan (Revised) > **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. **Goal:** Add two optional LLM-powered scraping enhancements: LLM link extraction from source pages and LLM article content extraction — controlled by user settings. -**Architecture:** Two boolean settings control two independent LLM scraping paths. Each has a fallback to existing heuristic-based extraction. `ScrapedContent` gains a `url` field for redirect-resolved URLs. New prompt/schema builders for both LLM calls. +**Architecture:** Two boolean settings control two independent LLM scraping paths. Each has a fallback to existing heuristic-based extraction. `ScrapedContent` gains `url` and `head_html` fields. `create_provider` returns `Arc` for safe sharing across concurrent tasks. When LLM extraction is enabled, concurrency is reduced to max 5. 
-**Tech Stack:** Rust (reqwest, scraper crate, serde_json), existing LLM providers via `generate_rewrite_pass` +**Tech Stack:** Rust (reqwest, scraper crate, serde_json, Arc), existing LLM providers via `generate_rewrite_pass` **Spec:** `docs/superpowers/specs/2026-03-24-llm-scraping-design.md` @@ -36,38 +36,69 @@ Add to `From for SettingsResponse`, `Default for UserSettings` (bo - [ ] **Step 3: Add to DB queries in `db/settings.rs`** -Add both fields to `SettingsRow`, `TryFrom`, and both SQL queries (`get_or_create_default` + `upsert`). Follow the pattern of the last column added. +Add both fields to `SettingsRow`, `TryFrom`, and both SQL queries. Follow the pattern of the last column added. - [ ] **Step 4: Update test fixtures** -Add `use_llm_for_source_links: false, use_llm_for_article_extraction: false` to: -- `valid_request()` in `models/settings.rs` tests -- `test_settings()` in `services/prompts.rs` tests +Add `use_llm_for_source_links: false, use_llm_for_article_extraction: false` to `valid_request()` in settings tests and `test_settings()` in prompts tests. 
- [ ] **Step 5: Update CLAUDE.md migration count to 14** - [ ] **Step 6: Run tests + commit** -Run: `cd backend && cargo test --lib` - ```bash +cd backend && cargo test --lib git add backend/migrations/20260324000014_add_llm_scraping_settings.sql backend/src/models/settings.rs backend/src/db/settings.rs backend/src/services/prompts.rs CLAUDE.md git commit -m "feat: add use_llm_for_source_links and use_llm_for_article_extraction settings" ``` --- -### Task 2: Add `url` field to `ScrapedContent` + update `scrape_single_article` +### Task 2: Add `url` and `head_html` to `ScrapedContent` + `Arc<dyn LlmProvider>` + update scraping functions **Files:** - Modify: `backend/src/services/scraper.rs` +- Modify: `backend/src/services/llm/factory.rs` +- Modify: `backend/src/services/llm/mod.rs` (trait needs `Send + Sync`) - Modify: `backend/src/services/synthesis.rs` +- Modify: `backend/src/handlers/api_keys.rs` + +- [ ] **Step 1: Add `url` and `head_html` to `ScrapedContent` in `scraper.rs`** + +```rust +pub struct ScrapedContent { + pub ok: bool, + pub status: u16, + pub title: Option<String>, + pub published_date: Option<DateTime<Utc>>, + pub body_text: String, + pub is_soft_404: bool, + pub url: String, + pub head_html: String, +} +``` + +In `scrape_url`, before parsing the document, extract `<head>`: +```rust +let html_text = String::from_utf8_lossy(&bytes); -- [ ] **Step 1: Add `url` to `ScrapedContent` in `scraper.rs`** +// Extract <head> section for potential LLM use +let head_html = extract_head_section(&html_text); -Add `pub url: String` to the `ScrapedContent` struct (after `is_soft_404`). +let document = Html::parse_document(&html_text); +``` -In `scrape_url`, populate it from `final_url`: +Add helper: +```rust +/// Extract the <head>...</head> section from raw HTML. 
+fn extract_head_section(html: &str) -> String { + let start = html.find("<head").unwrap_or(0); + let end = html.find("</head>").map(|i| i + 7).unwrap_or(start); + html[start..end].to_string() +} +``` + +Populate in the return: ```rust Ok(ScrapedContent { ok: !is_soft_404, @@ -77,12 +108,47 @@ Ok(ScrapedContent { body_text, is_soft_404, url: final_url.to_string(), + head_html, }) ``` -- [ ] **Step 2: Update `scrape_single_article` to return `(String, String, String)`** +- [ ] **Step 2: Change `create_provider` to return `Arc<dyn LlmProvider>`** -In `synthesis.rs`, change `scrape_single_article` return type from `(String, String)` to `(String, String, String)` — `(body_text, page_title, final_url)`: +In `backend/src/services/llm/factory.rs`, change the return type: +```rust +use std::sync::Arc; + +pub fn create_provider( + provider_name: &str, + api_key: String, +) -> Result<Arc<dyn LlmProvider>, AppError> { + let http_client = build_llm_client()?; + match provider_name { + "gemini" => Ok(Arc::new(GeminiProvider::new(api_key, http_client))), + "openai" => Ok(Arc::new(OpenAiProvider::new(api_key, http_client))), + "anthropic" => Ok(Arc::new(AnthropicProvider::new(api_key, http_client))), + _ => Err(AppError::BadRequest(format!("Unknown provider: '{}'", provider_name))), + } +} +``` + +Update all factory tests to use `Arc<dyn LlmProvider>` (they call methods on the provider, which works the same). + +Ensure `LlmProvider` trait in `llm/mod.rs` has `Send + Sync` bounds: +```rust +#[async_trait] +pub trait LlmProvider: Send + Sync { +``` + +- [ ] **Step 3: Update all callers of `create_provider`** + +In `synthesis.rs` `run_generation_inner`: `let provider = create_provider(...)` — now returns `Arc<dyn LlmProvider>`. Method calls on `Arc<dyn LlmProvider>` work via auto-deref. Update `provider.generate_search_pass(...)` calls — they should work as-is since `Arc<T>` derefs to `T`. + +In `handlers/api_keys.rs`: `let llm_provider = factory::create_provider(...)` — same, just works via deref. 
+ +- [ ] **Step 4: Update `scrape_single_article` to return 3-tuple** + +Change return type from `(String, String)` to `(String, String, String)` — `(body_text, page_title, final_url)`: ```rust async fn scrape_single_article( @@ -92,69 +158,36 @@ async fn scrape_single_article( ) -> (String, String, String) { match scraper::scrape_url(http_client, url).await { Ok(content) => { + let final_url = content.url.clone(); if !content.ok || content.is_soft_404 { - tracing::warn!(url = url, "Soft 404 or error page detected, skipping content"); - return (String::new(), String::new(), content.url); + return (String::new(), String::new(), final_url); } - if scraper::is_article_too_old(content.published_date, max_age_days) { - tracing::warn!(url = url, "Article too old, skipping content"); - return (String::new(), String::new(), content.url); + return (String::new(), String::new(), final_url); } - let title = content.title.unwrap_or_default(); - (content.body_text, title, content.url) + (content.body_text, title, final_url) } Err(e) => { - tracing::warn!(url = url, error = %e, "Failed to scrape URL, keeping article with empty content"); + tracing::warn!(url = url, error = %e, "Failed to scrape URL"); (String::new(), String::new(), url.to_string()) } } } ``` -- [ ] **Step 3: Update callers of `scrape_single_article`** - -In `scrape_articles` and `scrape_flat_urls`, update destructuring from `(scraped_content, page_title)` to `(scraped_content, page_title, final_url)`. 
Use `final_url` to set `ScrapedNewsItem.url` instead of the input URL: - -In `scrape_articles` (inside the `join_set.spawn`): -```rust -let scraped = scrape_single_article(&client, &url, mad).await; -(cat_key, item, scraped) -``` -And in the result handler: -```rust -if let Ok((cat_key, item, (scraped_content, page_title, final_url))) = join_result { - let scraped_item = ScrapedNewsItem { - title: item.title, - url: final_url, // Use redirect-resolved URL instead of item.url - summary: item.summary, - original_title: page_title, - scraped_content, - }; -``` +- [ ] **Step 5: Update callers of `scrape_single_article`** -Same pattern in `scrape_flat_urls`: -```rust -if let Ok((url, scraped_content, page_title, final_url)) = join_result { - results.push(ScrapedNewsItem { - title: page_title.clone(), - url: final_url, // Use redirect-resolved URL - summary: String::new(), - original_title: page_title, - scraped_content, - }); -``` +In `scrape_articles`: update spawn closure to return `(cat_key, item, (scraped_content, page_title, final_url))`. In result handler, use `final_url` for `ScrapedNewsItem.url`. -Note: the `join_set.spawn` closure must also capture and return `final_url`. Update the spawn to return 4-tuple: `(url, scraped_content, page_title, final_url)`. +In `scrape_flat_urls`: update spawn closure to return `(original_url, scraped_content, page_title, final_url)`. Use `final_url` for `ScrapedNewsItem.url`. 
-- [ ] **Step 4: Run tests + commit** - -Run: `cd backend && cargo test --lib` +- [ ] **Step 6: Run tests + commit** ```bash -git add backend/src/services/scraper.rs backend/src/services/synthesis.rs -git commit -m "feat: add url field to ScrapedContent, use redirect-resolved URLs" +cd backend && cargo test --lib +git add backend/src/services/scraper.rs backend/src/services/llm/factory.rs backend/src/services/llm/mod.rs backend/src/services/synthesis.rs backend/src/handlers/api_keys.rs +git commit -m "feat: ScrapedContent url+head_html fields, Arc, 3-tuple scrape returns" ``` --- @@ -165,14 +198,10 @@ git commit -m "feat: add url field to ScrapedContent, use redirect-resolved URLs - Modify: `backend/src/services/prompts.rs` - Modify: `backend/src/services/llm/schema.rs` -- [ ] **Step 1: Add `build_link_extraction_prompt` to `prompts.rs`** +- [ ] **Step 1: Add `build_link_extraction_prompt` and `build_article_extraction_prompt` to `prompts.rs`** ```rust /// Build a prompt for LLM-assisted link extraction from a source page. -/// -/// # Arguments -/// * `head_html` — the section of the page -/// * `body_html` — first 8000 chars of the section pub fn build_link_extraction_prompt(head_html: &str, body_html: &str) -> (String, String) { let system_prompt = "Tu es un assistant qui analyse des pages web. \ @@ -180,6 +209,8 @@ pub fn build_link_extraction_prompt(head_html: &str, body_html: &str) -> (String Reponds uniquement au format JSON demande." 
.to_string(); + let body_truncated: String = body_html.chars().take(8000).collect(); + let user_prompt = format!( "Voici le contenu HTML d'une page de blog ou de site d'actualites.\n\n\ \n{head}\n\n\n\ @@ -188,21 +219,13 @@ pub fn build_link_extraction_prompt(head_html: &str, body_html: &str) -> (String (pas les liens de navigation, tags, categories, login, pages statiques, etc.).\n\ Retourne les URLs completes dans le format JSON demande.", head = head_html, - body = body_html, + body = body_truncated, ); (system_prompt, user_prompt) } -``` - -- [ ] **Step 2: Add `build_article_extraction_prompt` to `prompts.rs`** -```rust /// Build a prompt for LLM-assisted article content extraction. -/// -/// # Arguments -/// * `head_html` — the section (contains meta tags, og:*, canonical) -/// * `body_text` — cleaned body text from existing HTML stripping pub fn build_article_extraction_prompt(head_html: &str, body_text: &str) -> (String, String) { let system_prompt = "Tu es un assistant qui analyse des articles web. \ @@ -228,10 +251,11 @@ pub fn build_article_extraction_prompt(head_html: &str, body_text: &str) -> (Str } ``` -- [ ] **Step 3: Add schemas to `schema.rs`** +Note: `build_link_extraction_prompt` truncates body using `.chars().take(8000)` (UTF-8 safe). + +- [ ] **Step 2: Add schemas to `schema.rs`** ```rust -/// Build a JSON Schema for LLM link extraction response. pub fn build_link_extraction_schema() -> Value { serde_json::json!({ "type": "object", @@ -246,7 +270,6 @@ pub fn build_link_extraction_schema() -> Value { }) } -/// Build a JSON Schema for LLM article content extraction response. 
pub fn build_article_extraction_schema() -> Value { serde_json::json!({ "type": "object", @@ -262,22 +285,32 @@ pub fn build_article_extraction_schema() -> Value { } ``` -- [ ] **Step 4: Add tests** +- [ ] **Step 3: Add tests for prompts and schemas** In `prompts.rs` tests: ```rust #[test] fn link_extraction_prompt_includes_html() { - let (_, user) = build_link_extraction_prompt("Blog", "P"); + let (sys, user) = build_link_extraction_prompt("Blog", "P"); assert!(user.contains("Blog")); assert!(user.contains("articles")); + assert!(sys.contains("liens")); + } + + #[test] + fn link_extraction_prompt_truncates_body() { + let long_body = "x".repeat(20000); + let (_, user) = build_link_extraction_prompt("", &long_body); + // Should not contain the full 20000 chars + assert!(user.len() < 15000); } #[test] fn article_extraction_prompt_includes_content() { - let (_, user) = build_article_extraction_prompt("", "Article body text here"); - assert!(user.contains("Article body text here")); + let (_, user) = build_article_extraction_prompt("", "Article body here"); + assert!(user.contains("Article body here")); assert!(user.contains("published_date")); + assert!(user.contains("is_error_page")); } ``` @@ -291,7 +324,7 @@ In `schema.rs` tests: } #[test] - fn article_extraction_schema_has_all_fields() { + fn article_extraction_schema_strict_mode_compatible() { let schema = build_article_extraction_schema(); let props = schema["properties"].as_object().unwrap(); assert!(props.contains_key("title")); @@ -299,14 +332,15 @@ In `schema.rs` tests: assert!(props.contains_key("body_text")); assert!(props.contains_key("is_error_page")); assert_eq!(schema["additionalProperties"], false); + // published_date is string (not ["string", "null"]) for OpenAI strict mode + assert_eq!(props["published_date"]["type"], "string"); } ``` -- [ ] **Step 5: Run tests + commit** - -Run: `cd backend && cargo test --lib` +- [ ] **Step 4: Run tests + commit** ```bash +cd backend && cargo test --lib git add 
backend/src/services/prompts.rs backend/src/services/llm/schema.rs git commit -m "feat: add LLM prompts and schemas for link and article extraction" ``` @@ -318,37 +352,34 @@ git commit -m "feat: add LLM prompts and schemas for link and article extraction **Files:** - Modify: `backend/src/services/source_scraper.rs` -- [ ] **Step 1: Update `extract_article_links` to accept optional LLM provider** - -Add a new public function `extract_article_links_with_llm` that accepts LLM parameters. The existing `extract_article_links` stays unchanged for non-LLM path. +- [ ] **Step 1: Add `extract_article_links_with_llm`** ```rust +use std::sync::Arc; use crate::services::llm::LlmProvider; use crate::services::llm::schema::build_link_extraction_schema; use crate::services::prompts::build_link_extraction_prompt; /// Extract article links using LLM analysis of the page HTML. /// -/// Falls back to heuristic extraction if the LLM call fails or returns empty results. +/// Falls back to heuristic extraction if the LLM call fails or returns empty. 
pub async fn extract_article_links_with_llm( http_client: &reqwest::Client, source_url: &str, max_links: usize, - provider: &dyn LlmProvider, + provider: &Arc<dyn LlmProvider>, model: &str, ) -> Result<Vec<String>, AppError> { let base_url = Url::parse(source_url) .map_err(|e| AppError::BadRequest(format!("Invalid source URL: {}", e)))?; let base_domain = base_url.host_str().unwrap_or("").to_lowercase(); - // Fetch the page let response = http_client.get(source_url).send().await.map_err(|e| { tracing::warn!(url = source_url, error = %e, "Failed to fetch source page"); AppError::Internal(anyhow::anyhow!("Failed to fetch source page")) })?; if !response.status().is_success() { - tracing::warn!(url = source_url, status = %response.status(), "Source page returned non-200"); return Ok(Vec::new()); } @@ -356,67 +387,60 @@ pub async fn extract_article_links_with_llm( AppError::Internal(anyhow::anyhow!("Failed to read source page body: {}", e)) })?; - // Extract <head> and first 8000 chars of <body> for the LLM let (head_html, body_html) = extract_head_and_body(&html_text); - let (system, user) = build_link_extraction_prompt(&head_html, &body_html); let schema = build_link_extraction_schema(); match provider.generate_rewrite_pass(model, &system, &user, &schema).await { - Ok(response) => { - let urls = response + Ok(llm_response) => { + let urls: Vec<String> = llm_response .get("urls") .and_then(|u| u.as_array()) .map(|arr| { arr.iter() .filter_map(|v| v.as_str()) .filter_map(|href| { - // Resolve relative URLs let resolved = base_url.join(href).ok()?; - // Filter: http/https only, same domain if resolved.scheme() != "http" && resolved.scheme() != "https" { return None; } - let domain = resolved.host_str()?.to_lowercase(); - if domain != base_domain { + if resolved.host_str()?.to_lowercase() != base_domain { return None; } Some(resolved.to_string()) }) - .collect::<Vec<_>>() + .collect() }) .unwrap_or_default(); if urls.is_empty() { - tracing::warn!(url = source_url, "LLM returned no links, falling back to heuristic extraction"); 
+ tracing::warn!(url = source_url, "LLM returned no links, falling back to heuristic"); let fallback = extract_links_from_html(&html_text, &base_url, &base_domain); Ok(fallback.into_iter().take(max_links).collect()) } else { - // Deduplicate let mut seen = std::collections::HashSet::new(); let deduped: Vec = urls.into_iter().filter(|u| seen.insert(u.clone())).collect(); Ok(deduped.into_iter().take(max_links).collect()) } } Err(e) => { - tracing::warn!(url = source_url, error = %e, "LLM link extraction failed, falling back to heuristic"); + tracing::warn!(url = source_url, error = %e, "LLM link extraction failed, falling back"); let fallback = extract_links_from_html(&html_text, &base_url, &base_domain); Ok(fallback.into_iter().take(max_links).collect()) } } } -/// Extract the section and first N chars of from HTML. -fn extract_head_and_body(html: &str) -> (String, String) { +/// Extract section and first 8000 chars of from HTML (UTF-8 safe). +pub fn extract_head_and_body(html: &str) -> (String, String) { let head_start = html.find("").map(|i| i + 7).unwrap_or(head_start); let head = &html[head_start..head_end]; let body_start = html.find(" (String, String) { } #[test] - fn extract_head_and_body_truncates_body() { + fn extract_head_and_body_truncates_body_safely() { let long_body = "x".repeat(20000); let html = format!("{}", long_body); let (_, body) = extract_head_and_body(&html); - assert!(body.len() <= 8006); // tag + 8000 chars + assert_eq!(body.chars().count(), 8000); } ``` - [ ] **Step 3: Run tests + commit** -Run: `cd backend && cargo test --lib` - ```bash +cd backend && cargo test --lib git add backend/src/services/source_scraper.rs git commit -m "feat: LLM-assisted source link extraction with heuristic fallback" ``` @@ -456,25 +479,19 @@ git commit -m "feat: LLM-assisted source link extraction with heuristic fallback **Files:** - Modify: `backend/src/services/synthesis.rs` -- [ ] **Step 1: Add `scrape_single_article_with_llm` function** +- [ ] **Step 1: Add 
`scrape_single_article_with_llm`** -Add a new async function alongside `scrape_single_article`: +This function receives the LLM provider via `Arc<dyn LlmProvider>` and uses `head_html` from `ScrapedContent`: ```rust -/// Scrape an article URL using LLM for content extraction. -/// -/// Falls back to heuristic extraction if the LLM call fails. async fn scrape_single_article_with_llm( http_client: &reqwest::Client, url: &str, max_age_days: i64, - provider: &dyn crate::services::llm::LlmProvider, - model: &str, + provider: Arc<dyn LlmProvider>, + model: String, ) -> (String, String, String) { - // First, do the HTTP fetch (same as regular scraping) - let fetch_result = scraper::scrape_url(http_client, url).await; - - let content = match fetch_result { + let content = match scraper::scrape_url(http_client, url).await { Ok(c) => c, Err(e) => { tracing::warn!(url = url, error = %e, "Failed to fetch URL for LLM extraction"); @@ -488,44 +505,35 @@ async fn scrape_single_article_with_llm( return (String::new(), String::new(), final_url); } - // Extract <head> from the raw HTML for the LLM - // We need to re-fetch the raw HTML or extract it from the scraper - // Since scraper already parsed it, we'll use the existing body_text + title as input - let head_html = String::new(); // The scraper doesn't preserve <head> — use empty - let body_text = &content.body_text; - let (system, user) = crate::services::prompts::build_article_extraction_prompt( - &head_html, - body_text, + &content.head_html, + &content.body_text, ); let schema = crate::services::llm::schema::build_article_extraction_schema(); - match provider.generate_rewrite_pass(model, &system, &user, &schema).await { + match provider.generate_rewrite_pass(&model, &system, &user, &schema).await { Ok(response) => { let title = response.get("title").and_then(|t| t.as_str()).unwrap_or("").to_string(); - let extracted_body = response.get("body_text").and_then(|b| b.as_str()).unwrap_or("").to_string(); + let body = response.get("body_text").and_then(|b| 
b.as_str()).unwrap_or("").to_string(); let is_error = response.get("is_error_page").and_then(|e| e.as_bool()).unwrap_or(false); let date_str = response.get("published_date").and_then(|d| d.as_str()).unwrap_or(""); - if is_error || extracted_body.trim().is_empty() { + if is_error || body.trim().is_empty() { return (String::new(), String::new(), final_url); } - // Check date if provided if !date_str.is_empty() { if let Ok(date) = chrono::DateTime::parse_from_rfc3339(date_str) { if scraper::is_article_too_old(Some(date.with_timezone(&chrono::Utc)), max_age_days) { - tracing::warn!(url = url, "LLM-extracted article too old"); return (String::new(), String::new(), final_url); } } } - (extracted_body, title, final_url) + (body, title, final_url) } Err(e) => { - tracing::warn!(url = url, error = %e, "LLM article extraction failed, using heuristic fallback"); - // Fall back to existing heuristic data + tracing::warn!(url = url, error = %e, "LLM extraction failed, using heuristic fallback"); if scraper::is_article_too_old(content.published_date, max_age_days) { return (String::new(), String::new(), final_url); } @@ -536,40 +544,72 @@ async fn scrape_single_article_with_llm( } ``` -- [ ] **Step 2: Update pipeline to use LLM extraction when enabled** +Note: `provider: Arc<dyn LlmProvider>` and `model: String` — both are `'static` and can be moved into spawned tasks. -In `run_generation_inner`, the scraping calls need to branch based on `settings.use_llm_for_article_extraction`. The simplest approach: update `scrape_flat_urls` and `scrape_articles` to accept an optional provider+model, and use `scrape_single_article_with_llm` when provided. +- [ ] **Step 2: Update `scrape_flat_urls` and `scrape_articles` for LLM dispatch** -Add a wrapper that the pipeline calls: +Add a parameter `llm: Option<(Arc<dyn LlmProvider>, String)>` to both functions. When `Some`, use `scrape_single_article_with_llm` instead of `scrape_single_article`. Set `max_concurrent = 5` when LLM is enabled, `10` otherwise. 
+In the spawn closures, clone the `Arc` and `String`: ```rust -/// Scrape a single article, optionally using LLM extraction. -async fn scrape_article_dispatch( - http_client: &reqwest::Client, - url: &str, - max_age_days: i64, - llm: Option<(&dyn crate::services::llm::LlmProvider, &str)>, -) -> (String, String, String) { - match llm { - Some((provider, model)) => { - scrape_single_article_with_llm(http_client, url, max_age_days, provider, model).await - } - None => scrape_single_article(http_client, url, max_age_days).await, - } +if let Some((ref provider, ref model)) = llm { + let provider = Arc::clone(provider); + let model = model.clone(); + join_set.spawn(async move { + let scraped = scrape_single_article_with_llm(&client, &url, mad, provider, model).await; + // ... + }); +} else { + join_set.spawn(async move { + let scraped = scrape_single_article(&client, &url, mad).await; + // ... + }); } ``` -Update `scrape_flat_urls` and `scrape_articles` to use `scrape_article_dispatch`. The provider and model are passed from `run_generation_inner` based on `settings.use_llm_for_article_extraction`. +Add progress reporting for LLM extraction: +```rust +let progress_label = if llm.is_some() { + format!("Extraction IA des articles ({}/{})...", completed, total) +} else { + format!("Verification des sources ({}/{})...", completed, total) +}; +emit_progress(tx, "scraping", &progress_label, pct as u8); +``` -Similarly, update the Phase 1 source scraping in `run_generation_inner` to call `extract_article_links_with_llm` vs `extract_article_links` based on `settings.use_llm_for_source_links`. 
+- [ ] **Step 3: Update `run_generation_inner` to pass LLM params** -- [ ] **Step 3: Run tests + commit** +In Phase 1 and Phase 2 scraping calls, pass the LLM option: +```rust +let llm_for_scraping = if settings.use_llm_for_article_extraction { + Some((Arc::clone(&provider), model_research.clone())) +} else { + None +}; +``` + +Pass `llm_for_scraping` to `scrape_flat_urls` and `scrape_articles`. + +Similarly for source link extraction: +```rust +if settings.use_llm_for_source_links { + source_scraper::extract_article_links_with_llm( + &state.http_client, &source.url, max_links_per_source, + &provider, &model_research, + ).await +} else { + source_scraper::extract_article_links( + &state.http_client, &source.url, max_links_per_source, + ).await +} +``` -Run: `cd backend && cargo test --lib` +- [ ] **Step 4: Run tests + commit** ```bash +cd backend && cargo test --lib git add backend/src/services/synthesis.rs -git commit -m "feat: LLM-assisted article extraction with heuristic fallback" +git commit -m "feat: LLM-assisted article extraction with Arc provider and heuristic fallback" ``` --- @@ -581,32 +621,29 @@ git commit -m "feat: LLM-assisted article extraction with heuristic fallback" - Modify: `frontend/src/i18n/fr.ts` - Modify: `frontend/src/pages/Settings.tsx` -- [ ] **Step 1: Add fields to types** +- [ ] **Step 1: Add fields to types + DEFAULT_SETTINGS** -In `frontend/src/types.ts`, add to `UserSettings`: ```typescript +// In UserSettings interface: use_llm_for_source_links: boolean; use_llm_for_article_extraction: boolean; -``` -Add to `DEFAULT_SETTINGS`: -```typescript +// In DEFAULT_SETTINGS: use_llm_for_source_links: false, use_llm_for_article_extraction: false, ``` - [ ] **Step 2: Add i18n labels** -In `frontend/src/i18n/fr.ts`: ```typescript 'settings.advancedExtraction': 'Extraction avancee', 'settings.useLlmForSourceLinks': "Utiliser l'IA pour extraire les liens", 'settings.useLlmForArticleExtraction': "Utiliser l'IA pour extraire le contenu", ``` -- [ 
] **Step 3: Add checkboxes to Settings page** +- [ ] **Step 3: Add checkboxes in Settings page** -In `frontend/src/pages/Settings.tsx`, add a new section after the existing generation settings (after the grid with maxAgeDays/maxItemsPerCategory/maxArticlesPerSource/diversityWindow): +Add after the generation settings grid, before the search agent behavior section: ```tsx {/* Advanced extraction */} @@ -655,9 +692,8 @@ In `frontend/src/pages/Settings.tsx`, add a new section after the existing gener - [ ] **Step 4: Run frontend tests + commit** -Run: `cd frontend && npx tsc --noEmit && npx vitest run` - ```bash +cd frontend && npx tsc --noEmit && npx vitest run git add frontend/src/types.ts frontend/src/i18n/fr.ts frontend/src/pages/Settings.tsx git commit -m "feat: add LLM scraping toggles to Settings page" ``` @@ -671,15 +707,23 @@ git commit -m "feat: add LLM scraping toggles to Settings page" - [ ] **Step 1: Update settings payload** -Add the new boolean fields to the PUT settings call: +Add to the PUT settings body: ```typescript use_llm_for_source_links: false, use_llm_for_article_extraction: false, ``` -- [ ] **Step 2: Add comprehensive validation after synthesis fetch** +- [ ] **Step 2: Add comprehensive validation using `request` fixture** -After the existing structure validation, add: +Update the test function signature to include the `request` fixture: +```typescript +test('full generation pipeline produces valid synthesis', async ({ + page, + request, +}) => { +``` + +Add after existing structure validation: ```typescript // Comprehensive synthesis validation @@ -688,20 +732,15 @@ After the existing structure validation, add: for (const section of synthesis.sections) { for (const item of section.items) { - // Collect URLs for duplicate check allUrls.push(item.url); - - // Count domains for source diversity check try { const domain = new URL(item.url).hostname; domainCounts[domain] = (domainCounts[domain] || 0) + 1; } catch {} } - // Category article count 
check - if (section.title !== 'Autre') { - expect(section.items.length).toBeLessThanOrEqual(4); // max_items_per_category - } + // Category article count check (including Autre) + expect(section.items.length).toBeLessThanOrEqual(4); // max_items_per_category } // No duplicate URLs across all sections @@ -713,20 +752,12 @@ After the existing structure validation, add: expect(count).toBeLessThanOrEqual(3); } - // Verify article links actually work (HTTP 200) - // Test a sample of up to 3 URLs to avoid slowness + // Verify a sample of article links actually work (using Playwright request API, no CORS issues) const sampleUrls = allUrls.slice(0, 3); for (const articleUrl of sampleUrls) { - const linkCheck = await page.evaluate(async (url: string) => { - try { - const resp = await fetch(url, { method: 'HEAD', redirect: 'follow' }); - return resp.status; - } catch { - return 0; - } - }, articleUrl); - expect(linkCheck).toBeGreaterThanOrEqual(200); - expect(linkCheck).toBeLessThan(400); + const resp = await request.head(articleUrl); + expect(resp.status()).toBeGreaterThanOrEqual(200); + expect(resp.status()).toBeLessThan(400); } ``` @@ -742,5 +773,28 @@ sleep 25 && npx tsx seed.ts && npx playwright test generation-live --reporter=li ```bash git add e2e/tests/generation-live.spec.ts -git commit -m "test: comprehensive E2E synthesis validation (duplicates, links, counts)" +git commit -m "test: comprehensive E2E synthesis validation (duplicates, links, counts, domains)" +``` + +--- + +### Task 8: Update integration test + +**Files:** +- Modify: `backend/tests/api_syntheses_test.rs` + +- [ ] **Step 1: Update settings payload in `generate_pipeline_resolves_model_from_admin_config`** + +Add the new boolean fields to the PUT settings body: +```rust +"use_llm_for_source_links": false, +"use_llm_for_article_extraction": false, +``` + +- [ ] **Step 2: Run integration test compilation check + commit** + +```bash +cd backend && cargo test --no-run +git add 
backend/tests/api_syntheses_test.rs +git commit -m "test: update integration test with LLM scraping settings" ```