From 175483dfe329b6bf20b790d48ed71aa0d2472ee1 Mon Sep 17 00:00:00 2001 From: oabrivard Date: Tue, 24 Mar 2026 10:27:36 +0100 Subject: [PATCH] docs: add spec and plan for LLM-assisted scraping Two optional LLM enhancements: link extraction from source pages and article content extraction. Plan needs revision for Arc threading and HTML preservation before implementation. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../plans/2026-03-24-llm-scraping.md | 746 ++++++++++++++++++ .../specs/2026-03-24-llm-scraping-design.md | 131 +++ 2 files changed, 877 insertions(+) create mode 100644 docs/superpowers/plans/2026-03-24-llm-scraping.md create mode 100644 docs/superpowers/specs/2026-03-24-llm-scraping-design.md diff --git a/docs/superpowers/plans/2026-03-24-llm-scraping.md b/docs/superpowers/plans/2026-03-24-llm-scraping.md new file mode 100644 index 0000000..c3887a7 --- /dev/null +++ b/docs/superpowers/plans/2026-03-24-llm-scraping.md @@ -0,0 +1,746 @@ +# LLM-Assisted Scraping — Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Add two optional LLM-powered scraping enhancements: LLM link extraction from source pages and LLM article content extraction — controlled by user settings. + +**Architecture:** Two boolean settings control two independent LLM scraping paths. Each has a fallback to existing heuristic-based extraction. `ScrapedContent` gains a `url` field for redirect-resolved URLs. New prompt/schema builders for both LLM calls. 
+ +**Tech Stack:** Rust (reqwest, scraper crate, serde_json), existing LLM providers via `generate_rewrite_pass` + +**Spec:** `docs/superpowers/specs/2026-03-24-llm-scraping-design.md` + +--- + +### Task 1: Migration + backend model (2 bool settings) + +**Files:** +- Create: `backend/migrations/20260324000014_add_llm_scraping_settings.sql` +- Modify: `backend/src/models/settings.rs` +- Modify: `backend/src/db/settings.rs` +- Modify: `backend/src/services/prompts.rs` (test fixture) +- Modify: `CLAUDE.md` + +- [ ] **Step 1: Create migration** + +```sql +ALTER TABLE settings ADD COLUMN use_llm_for_source_links BOOLEAN NOT NULL DEFAULT false; +ALTER TABLE settings ADD COLUMN use_llm_for_article_extraction BOOLEAN NOT NULL DEFAULT false; +``` + +- [ ] **Step 2: Add fields to all structs in `models/settings.rs`** + +Add `pub use_llm_for_source_links: bool` and `pub use_llm_for_article_extraction: bool` to `UserSettings`, `SettingsResponse`, `UpdateSettingsRequest` (after `source_diversity_window`). + +Add to `From<UserSettings> for SettingsResponse`, `Default for UserSettings` (both `false`). No validation needed for bools. + +- [ ] **Step 3: Add to DB queries in `db/settings.rs`** + +Add both fields to `SettingsRow`, `TryFrom`, and both SQL queries (`get_or_create_default` + `upsert`). Follow the pattern of the last column added.
+ +- [ ] **Step 4: Update test fixtures** + +Add `use_llm_for_source_links: false, use_llm_for_article_extraction: false` to: +- `valid_request()` in `models/settings.rs` tests +- `test_settings()` in `services/prompts.rs` tests + +- [ ] **Step 5: Update CLAUDE.md migration count to 14** + +- [ ] **Step 6: Run tests + commit** + +Run: `cd backend && cargo test --lib` + +```bash +git add backend/migrations/20260324000014_add_llm_scraping_settings.sql backend/src/models/settings.rs backend/src/db/settings.rs backend/src/services/prompts.rs CLAUDE.md +git commit -m "feat: add use_llm_for_source_links and use_llm_for_article_extraction settings" +``` + +--- + +### Task 2: Add `url` field to `ScrapedContent` + update `scrape_single_article` + +**Files:** +- Modify: `backend/src/services/scraper.rs` +- Modify: `backend/src/services/synthesis.rs` + +- [ ] **Step 1: Add `url` to `ScrapedContent` in `scraper.rs`** + +Add `pub url: String` to the `ScrapedContent` struct (after `is_soft_404`). + +In `scrape_url`, populate it from `final_url`: +```rust +Ok(ScrapedContent { + ok: !is_soft_404, + status, + title, + published_date, + body_text, + is_soft_404, + url: final_url.to_string(), +}) +``` + +- [ ] **Step 2: Update `scrape_single_article` to return `(String, String, String)`** + +In `synthesis.rs`, change `scrape_single_article` return type from `(String, String)` to `(String, String, String)` — `(body_text, page_title, final_url)`: + +```rust +async fn scrape_single_article( + http_client: &reqwest::Client, + url: &str, + max_age_days: i64, +) -> (String, String, String) { + match scraper::scrape_url(http_client, url).await { + Ok(content) => { + if !content.ok || content.is_soft_404 { + tracing::warn!(url = url, "Soft 404 or error page detected, skipping content"); + return (String::new(), String::new(), content.url); + } + + if scraper::is_article_too_old(content.published_date, max_age_days) { + tracing::warn!(url = url, "Article too old, skipping content"); + return 
(String::new(), String::new(), content.url); + } + + let title = content.title.unwrap_or_default(); + (content.body_text, title, content.url) + } + Err(e) => { + tracing::warn!(url = url, error = %e, "Failed to scrape URL, keeping article with empty content"); + (String::new(), String::new(), url.to_string()) + } + } +} +``` + +- [ ] **Step 3: Update callers of `scrape_single_article`** + +In `scrape_articles` and `scrape_flat_urls`, update destructuring from `(scraped_content, page_title)` to `(scraped_content, page_title, final_url)`. Use `final_url` to set `ScrapedNewsItem.url` instead of the input URL: + +In `scrape_articles` (inside the `join_set.spawn`): +```rust +let scraped = scrape_single_article(&client, &url, mad).await; +(cat_key, item, scraped) +``` +And in the result handler: +```rust +if let Ok((cat_key, item, (scraped_content, page_title, final_url))) = join_result { + let scraped_item = ScrapedNewsItem { + title: item.title, + url: final_url, // Use redirect-resolved URL instead of item.url + summary: item.summary, + original_title: page_title, + scraped_content, + }; +``` + +Same pattern in `scrape_flat_urls`: +```rust +if let Ok((url, scraped_content, page_title, final_url)) = join_result { + results.push(ScrapedNewsItem { + title: page_title.clone(), + url: final_url, // Use redirect-resolved URL + summary: String::new(), + original_title: page_title, + scraped_content, + }); +``` + +Note: the `join_set.spawn` closure must also capture and return `final_url`. Update the spawn to return 4-tuple: `(url, scraped_content, page_title, final_url)`. 
+ +- [ ] **Step 4: Run tests + commit** + +Run: `cd backend && cargo test --lib` + +```bash +git add backend/src/services/scraper.rs backend/src/services/synthesis.rs +git commit -m "feat: add url field to ScrapedContent, use redirect-resolved URLs" +``` + +--- + +### Task 3: LLM prompts and schemas for both extraction types + +**Files:** +- Modify: `backend/src/services/prompts.rs` +- Modify: `backend/src/services/llm/schema.rs` + +- [ ] **Step 1: Add `build_link_extraction_prompt` to `prompts.rs`** + +```rust +/// Build a prompt for LLM-assisted link extraction from a source page. +/// +/// # Arguments +/// * `head_html` — the section of the page +/// * `body_html` — first 8000 chars of the section +pub fn build_link_extraction_prompt(head_html: &str, body_html: &str) -> (String, String) { + let system_prompt = + "Tu es un assistant qui analyse des pages web. \ + Tu dois identifier les liens vers des articles d'actualite. \ + Reponds uniquement au format JSON demande." + .to_string(); + + let user_prompt = format!( + "Voici le contenu HTML d'une page de blog ou de site d'actualites.\n\n\ + \n{head}\n\n\n\ + \n{body}\n\n\n\ + Extrais UNIQUEMENT les URLs qui pointent vers des articles \ + (pas les liens de navigation, tags, categories, login, pages statiques, etc.).\n\ + Retourne les URLs completes dans le format JSON demande.", + head = head_html, + body = body_html, + ); + + (system_prompt, user_prompt) +} +``` + +- [ ] **Step 2: Add `build_article_extraction_prompt` to `prompts.rs`** + +```rust +/// Build a prompt for LLM-assisted article content extraction. +/// +/// # Arguments +/// * `head_html` — the section (contains meta tags, og:*, canonical) +/// * `body_text` — cleaned body text from existing HTML stripping +pub fn build_article_extraction_prompt(head_html: &str, body_text: &str) -> (String, String) { + let system_prompt = + "Tu es un assistant qui analyse des articles web. \ + Tu dois extraire les informations structurees de l'article. 
\ + Reponds uniquement au format JSON demande." + .to_string(); + + let user_prompt = format!( + "Voici le contenu d'une page web.\n\n\ + \n{head}\n\n\n\ + Contenu textuel de la page :\n{body}\n\n\ + Extrais les informations suivantes :\n\ + - title : le titre de l'article\n\ + - published_date : la date de publication au format ISO 8601 (YYYY-MM-DDTHH:MM:SSZ), \ + ou une chaine vide si introuvable\n\ + - body_text : le contenu principal de l'article (pas la navigation, pas les pubs)\n\ + - is_error_page : true si c'est une page d'erreur/404, false sinon", + head = head_html, + body = body_text, + ); + + (system_prompt, user_prompt) +} +``` + +- [ ] **Step 3: Add schemas to `schema.rs`** + +```rust +/// Build a JSON Schema for LLM link extraction response. +pub fn build_link_extraction_schema() -> Value { + serde_json::json!({ + "type": "object", + "properties": { + "urls": { + "type": "array", + "items": { "type": "string" } + } + }, + "required": ["urls"], + "additionalProperties": false + }) +} + +/// Build a JSON Schema for LLM article content extraction response. 
+pub fn build_article_extraction_schema() -> Value { + serde_json::json!({ + "type": "object", + "properties": { + "title": { "type": "string", "description": "Article title" }, + "published_date": { "type": "string", "description": "ISO 8601 date or empty string if not found" }, + "body_text": { "type": "string", "description": "Main article content" }, + "is_error_page": { "type": "boolean", "description": "True if this is an error/404 page" } + }, + "required": ["title", "published_date", "body_text", "is_error_page"], + "additionalProperties": false + }) +} +``` + +- [ ] **Step 4: Add tests** + +In `prompts.rs` tests: +```rust + #[test] + fn link_extraction_prompt_includes_html() { + let (_, user) = build_link_extraction_prompt("Blog", "P"); + assert!(user.contains("Blog")); + assert!(user.contains("articles")); + } + + #[test] + fn article_extraction_prompt_includes_content() { + let (_, user) = build_article_extraction_prompt("", "Article body text here"); + assert!(user.contains("Article body text here")); + assert!(user.contains("published_date")); + } +``` + +In `schema.rs` tests: +```rust + #[test] + fn link_extraction_schema_has_urls_array() { + let schema = build_link_extraction_schema(); + assert_eq!(schema["properties"]["urls"]["type"], "array"); + assert_eq!(schema["additionalProperties"], false); + } + + #[test] + fn article_extraction_schema_has_all_fields() { + let schema = build_article_extraction_schema(); + let props = schema["properties"].as_object().unwrap(); + assert!(props.contains_key("title")); + assert!(props.contains_key("published_date")); + assert!(props.contains_key("body_text")); + assert!(props.contains_key("is_error_page")); + assert_eq!(schema["additionalProperties"], false); + } +``` + +- [ ] **Step 5: Run tests + commit** + +Run: `cd backend && cargo test --lib` + +```bash +git add backend/src/services/prompts.rs backend/src/services/llm/schema.rs +git commit -m "feat: add LLM prompts and schemas for link and article 
extraction" +``` + +--- + +### Task 4: LLM-assisted source link extraction in `source_scraper.rs` + +**Files:** +- Modify: `backend/src/services/source_scraper.rs` + +- [ ] **Step 1: Update `extract_article_links` to accept optional LLM provider** + +Add a new public function `extract_article_links_with_llm` that accepts LLM parameters. The existing `extract_article_links` stays unchanged for non-LLM path. + +```rust +use crate::services::llm::LlmProvider; +use crate::services::llm::schema::build_link_extraction_schema; +use crate::services::prompts::build_link_extraction_prompt; + +/// Extract article links using LLM analysis of the page HTML. +/// +/// Falls back to heuristic extraction if the LLM call fails or returns empty results. +pub async fn extract_article_links_with_llm( + http_client: &reqwest::Client, + source_url: &str, + max_links: usize, + provider: &dyn LlmProvider, + model: &str, +) -> Result, AppError> { + let base_url = Url::parse(source_url) + .map_err(|e| AppError::BadRequest(format!("Invalid source URL: {}", e)))?; + let base_domain = base_url.host_str().unwrap_or("").to_lowercase(); + + // Fetch the page + let response = http_client.get(source_url).send().await.map_err(|e| { + tracing::warn!(url = source_url, error = %e, "Failed to fetch source page"); + AppError::Internal(anyhow::anyhow!("Failed to fetch source page")) + })?; + + if !response.status().is_success() { + tracing::warn!(url = source_url, status = %response.status(), "Source page returned non-200"); + return Ok(Vec::new()); + } + + let html_text = response.text().await.map_err(|e| { + AppError::Internal(anyhow::anyhow!("Failed to read source page body: {}", e)) + })?; + + // Extract and first 8000 chars of for the LLM + let (head_html, body_html) = extract_head_and_body(&html_text); + + let (system, user) = build_link_extraction_prompt(&head_html, &body_html); + let schema = build_link_extraction_schema(); + + match provider.generate_rewrite_pass(model, &system, &user, 
&schema).await { + Ok(response) => { + let urls = response + .get("urls") + .and_then(|u| u.as_array()) + .map(|arr| { + arr.iter() + .filter_map(|v| v.as_str()) + .filter_map(|href| { + // Resolve relative URLs + let resolved = base_url.join(href).ok()?; + // Filter: http/https only, same domain + if resolved.scheme() != "http" && resolved.scheme() != "https" { + return None; + } + let domain = resolved.host_str()?.to_lowercase(); + if domain != base_domain { + return None; + } + Some(resolved.to_string()) + }) + .collect::>() + }) + .unwrap_or_default(); + + if urls.is_empty() { + tracing::warn!(url = source_url, "LLM returned no links, falling back to heuristic extraction"); + let fallback = extract_links_from_html(&html_text, &base_url, &base_domain); + Ok(fallback.into_iter().take(max_links).collect()) + } else { + // Deduplicate + let mut seen = std::collections::HashSet::new(); + let deduped: Vec = urls.into_iter().filter(|u| seen.insert(u.clone())).collect(); + Ok(deduped.into_iter().take(max_links).collect()) + } + } + Err(e) => { + tracing::warn!(url = source_url, error = %e, "LLM link extraction failed, falling back to heuristic"); + let fallback = extract_links_from_html(&html_text, &base_url, &base_domain); + Ok(fallback.into_iter().take(max_links).collect()) + } + } +} + +/// Extract the section and first N chars of from HTML. +fn extract_head_and_body(html: &str) -> (String, String) { + let head_start = html.find("").map(|i| i + 7).unwrap_or(head_start); + let head = &html[head_start..head_end]; + + let body_start = html.find("T")); + assert!(body.contains("

Content

")); + } + + #[test] + fn extract_head_and_body_truncates_body() { + let long_body = "x".repeat(20000); + let html = format!("{}", long_body); + let (_, body) = extract_head_and_body(&html); + assert!(body.len() <= 8006); // tag + 8000 chars + } +``` + +- [ ] **Step 3: Run tests + commit** + +Run: `cd backend && cargo test --lib` + +```bash +git add backend/src/services/source_scraper.rs +git commit -m "feat: LLM-assisted source link extraction with heuristic fallback" +``` + +--- + +### Task 5: LLM-assisted article extraction in synthesis pipeline + +**Files:** +- Modify: `backend/src/services/synthesis.rs` + +- [ ] **Step 1: Add `scrape_single_article_with_llm` function** + +Add a new async function alongside `scrape_single_article`: + +```rust +/// Scrape an article URL using LLM for content extraction. +/// +/// Falls back to heuristic extraction if the LLM call fails. +async fn scrape_single_article_with_llm( + http_client: &reqwest::Client, + url: &str, + max_age_days: i64, + provider: &dyn crate::services::llm::LlmProvider, + model: &str, +) -> (String, String, String) { + // First, do the HTTP fetch (same as regular scraping) + let fetch_result = scraper::scrape_url(http_client, url).await; + + let content = match fetch_result { + Ok(c) => c, + Err(e) => { + tracing::warn!(url = url, error = %e, "Failed to fetch URL for LLM extraction"); + return (String::new(), String::new(), url.to_string()); + } + }; + + let final_url = content.url.clone(); + + if !content.ok || content.is_soft_404 { + return (String::new(), String::new(), final_url); + } + + // Extract from the raw HTML for the LLM + // We need to re-fetch the raw HTML or extract it from the scraper + // Since scraper already parsed it, we'll use the existing body_text + title as input + let head_html = String::new(); // The scraper doesn't preserve — use empty + let body_text = &content.body_text; + + let (system, user) = crate::services::prompts::build_article_extraction_prompt( + &head_html, + 
body_text, + ); + let schema = crate::services::llm::schema::build_article_extraction_schema(); + + match provider.generate_rewrite_pass(model, &system, &user, &schema).await { + Ok(response) => { + let title = response.get("title").and_then(|t| t.as_str()).unwrap_or("").to_string(); + let extracted_body = response.get("body_text").and_then(|b| b.as_str()).unwrap_or("").to_string(); + let is_error = response.get("is_error_page").and_then(|e| e.as_bool()).unwrap_or(false); + let date_str = response.get("published_date").and_then(|d| d.as_str()).unwrap_or(""); + + if is_error || extracted_body.trim().is_empty() { + return (String::new(), String::new(), final_url); + } + + // Check date if provided + if !date_str.is_empty() { + if let Ok(date) = chrono::DateTime::parse_from_rfc3339(date_str) { + if scraper::is_article_too_old(Some(date.with_timezone(&chrono::Utc)), max_age_days) { + tracing::warn!(url = url, "LLM-extracted article too old"); + return (String::new(), String::new(), final_url); + } + } + } + + (extracted_body, title, final_url) + } + Err(e) => { + tracing::warn!(url = url, error = %e, "LLM article extraction failed, using heuristic fallback"); + // Fall back to existing heuristic data + if scraper::is_article_too_old(content.published_date, max_age_days) { + return (String::new(), String::new(), final_url); + } + let title = content.title.unwrap_or_default(); + (content.body_text, title, final_url) + } + } +} +``` + +- [ ] **Step 2: Update pipeline to use LLM extraction when enabled** + +In `run_generation_inner`, the scraping calls need to branch based on `settings.use_llm_for_article_extraction`. The simplest approach: update `scrape_flat_urls` and `scrape_articles` to accept an optional provider+model, and use `scrape_single_article_with_llm` when provided. + +Add a wrapper that the pipeline calls: + +```rust +/// Scrape a single article, optionally using LLM extraction. 
+async fn scrape_article_dispatch( + http_client: &reqwest::Client, + url: &str, + max_age_days: i64, + llm: Option<(&dyn crate::services::llm::LlmProvider, &str)>, +) -> (String, String, String) { + match llm { + Some((provider, model)) => { + scrape_single_article_with_llm(http_client, url, max_age_days, provider, model).await + } + None => scrape_single_article(http_client, url, max_age_days).await, + } +} +``` + +Update `scrape_flat_urls` and `scrape_articles` to use `scrape_article_dispatch`. The provider and model are passed from `run_generation_inner` based on `settings.use_llm_for_article_extraction`. + +Similarly, update the Phase 1 source scraping in `run_generation_inner` to call `extract_article_links_with_llm` vs `extract_article_links` based on `settings.use_llm_for_source_links`. + +- [ ] **Step 3: Run tests + commit** + +Run: `cd backend && cargo test --lib` + +```bash +git add backend/src/services/synthesis.rs +git commit -m "feat: LLM-assisted article extraction with heuristic fallback" +``` + +--- + +### Task 6: Frontend settings + +**Files:** +- Modify: `frontend/src/types.ts` +- Modify: `frontend/src/i18n/fr.ts` +- Modify: `frontend/src/pages/Settings.tsx` + +- [ ] **Step 1: Add fields to types** + +In `frontend/src/types.ts`, add to `UserSettings`: +```typescript +use_llm_for_source_links: boolean; +use_llm_for_article_extraction: boolean; +``` + +Add to `DEFAULT_SETTINGS`: +```typescript +use_llm_for_source_links: false, +use_llm_for_article_extraction: false, +``` + +- [ ] **Step 2: Add i18n labels** + +In `frontend/src/i18n/fr.ts`: +```typescript +'settings.advancedExtraction': 'Extraction avancee', +'settings.useLlmForSourceLinks': "Utiliser l'IA pour extraire les liens", +'settings.useLlmForArticleExtraction': "Utiliser l'IA pour extraire le contenu", +``` + +- [ ] **Step 3: Add checkboxes to Settings page** + +In `frontend/src/pages/Settings.tsx`, add a new section after the existing generation settings (after the grid with 
maxAgeDays/maxItemsPerCategory/maxArticlesPerSource/diversityWindow): + +```tsx + {/* Advanced extraction */} +
+

+ {t('settings.advancedExtraction')} +

+
+
+ + setSettings((prev) => ({ + ...prev, + use_llm_for_source_links: e.currentTarget.checked, + })) + } + class="h-4 w-4 text-indigo-600 focus:ring-indigo-500 border-gray-300 rounded" + /> + +
+
+ + setSettings((prev) => ({ + ...prev, + use_llm_for_article_extraction: e.currentTarget.checked, + })) + } + class="h-4 w-4 text-indigo-600 focus:ring-indigo-500 border-gray-300 rounded" + /> + +
+
+
+``` + +- [ ] **Step 4: Run frontend tests + commit** + +Run: `cd frontend && npx tsc --noEmit && npx vitest run` + +```bash +git add frontend/src/types.ts frontend/src/i18n/fr.ts frontend/src/pages/Settings.tsx +git commit -m "feat: add LLM scraping toggles to Settings page" +``` + +--- + +### Task 7: Update E2E test with comprehensive synthesis validation + +**Files:** +- Modify: `e2e/tests/generation-live.spec.ts` + +- [ ] **Step 1: Update settings payload** + +Add the new boolean fields to the PUT settings call: +```typescript +use_llm_for_source_links: false, +use_llm_for_article_extraction: false, +``` + +- [ ] **Step 2: Add comprehensive validation after synthesis fetch** + +After the existing structure validation, add: + +```typescript + // Comprehensive synthesis validation + const allUrls: string[] = []; + const domainCounts: Record = {}; + + for (const section of synthesis.sections) { + for (const item of section.items) { + // Collect URLs for duplicate check + allUrls.push(item.url); + + // Count domains for source diversity check + try { + const domain = new URL(item.url).hostname; + domainCounts[domain] = (domainCounts[domain] || 0) + 1; + } catch {} + } + + // Category article count check + if (section.title !== 'Autre') { + expect(section.items.length).toBeLessThanOrEqual(4); // max_items_per_category + } + } + + // No duplicate URLs across all sections + const uniqueUrls = new Set(allUrls); + expect(uniqueUrls.size).toBe(allUrls.length); + + // No domain exceeds max_articles_per_source (3) + for (const [domain, count] of Object.entries(domainCounts)) { + expect(count).toBeLessThanOrEqual(3); + } + + // Verify article links actually work (HTTP 200) + // Test a sample of up to 3 URLs to avoid slowness + const sampleUrls = allUrls.slice(0, 3); + for (const articleUrl of sampleUrls) { + const linkCheck = await page.evaluate(async (url: string) => { + try { + const resp = await fetch(url, { method: 'HEAD', redirect: 'follow' }); + return resp.status; + 
} catch { + return 0; + } + }, articleUrl); + expect(linkCheck).toBeGreaterThanOrEqual(200); + expect(linkCheck).toBeLessThan(400); + } +``` + +- [ ] **Step 3: Run E2E test** + +```bash +cd e2e && docker compose -f docker-compose.test.yml down +docker compose -f docker-compose.test.yml up --build -d +sleep 25 && npx tsx seed.ts && npx playwright test generation-live --reporter=list +``` + +- [ ] **Step 4: Commit** + +```bash +git add e2e/tests/generation-live.spec.ts +git commit -m "test: comprehensive E2E synthesis validation (duplicates, links, counts)" +``` diff --git a/docs/superpowers/specs/2026-03-24-llm-scraping-design.md b/docs/superpowers/specs/2026-03-24-llm-scraping-design.md new file mode 100644 index 0000000..ced922a --- /dev/null +++ b/docs/superpowers/specs/2026-03-24-llm-scraping-design.md @@ -0,0 +1,131 @@ +# Design: LLM-Assisted Scraping — Link Extraction & Article Content Extraction + +**Date**: 2026-03-24 +**Scope**: Two optional LLM-powered enhancements to the scraping pipeline, controlled by user settings + +--- + +## Context + +The current scraping pipeline uses HTML parsing heuristics to extract article links from source pages and article content from individual pages. These heuristics fail on JavaScript-rendered pages, unusual HTML structures, and complex layouts. Two optional LLM-powered alternatives improve extraction quality when enabled. + +## New User Settings + +Two independent boolean toggles: + +- `use_llm_for_source_links: bool` (default `false`) — "Utiliser l'IA pour extraire les liens" +- `use_llm_for_article_extraction: bool` (default `false`) — "Utiliser l'IA pour extraire le contenu" + +Fully independent — user can enable either, both, or neither. 
+ +**Migration:** `ALTER TABLE settings ADD COLUMN use_llm_for_source_links BOOLEAN NOT NULL DEFAULT false; ALTER TABLE settings ADD COLUMN use_llm_for_article_extraction BOOLEAN NOT NULL DEFAULT false;` + +**Frontend:** Two checkboxes in Settings page under a new "Extraction avancee" section. + +## ScrapedContent URL Field + +Add `pub url: String` to the `ScrapedContent` struct. Populated with the final URL after redirects (from `response.url().to_string()`). + +**Pipeline impact:** `scrape_single_article` returns `(String, String, String)` — `(body_text, page_title, final_url)` instead of `(String, String)`. The callers (`scrape_flat_urls`, `scrape_articles`) use `final_url` to set `ScrapedNewsItem.url`, replacing the original input URL with the validated redirect-resolved URL. This URL becomes the canonical article URL used throughout — replacing the LLM-provided URL in the synthesis via `restore_scraped_urls`. + +## Option 1: LLM-Assisted Source Link Extraction + +When `use_llm_for_source_links` is enabled: + +1. Fetch the source page HTML (same as today) +2. Extract `<head>` + first 8000 chars of `<body>` for the LLM +3. **LLM prompt:** "Here is the HTML of a blog/news page. Extract only the URLs that point to actual articles (not navigation, tags, categories, login pages, etc.). Return a JSON array of URLs." +4. **LLM schema:** `{ "type": "object", "properties": { "urls": { "type": "array", "items": { "type": "string" } } }, "required": ["urls"], "additionalProperties": false }` +5. Parse the LLM response: + - Resolve relative URLs against the source URL + - Filter: only keep http/https URLs, skip malformed URLs (use `Url::parse`) + - Filter: same domain only (match existing heuristic behavior) + - Deduplicate, limit to `max_links` +6. **Fallback:** if the LLM call fails OR returns an empty array (`{"urls": []}`), fall back to the existing `extract_links_from_html`. Log a warning. + +When disabled, the existing HTML parsing + heuristic filtering is used (unchanged).
+ +**LLM dispatch:** Uses `model_research` via `provider.generate_rewrite_pass`. + +## Option 2: LLM-Assisted Article Content Extraction + +When `use_llm_for_article_extraction` is enabled: + +1. Fetch the article page (same as today — HTTP request, SSRF check, body size limit, streaming) +2. Capture the final URL after redirects (for `ScrapedContent.url`) +3. Extract `<head>` section and clean body text using existing HTML stripping +4. Send both to the LLM with a structured extraction prompt +5. **LLM prompt:** "Extract the following from this article: title, publication date (ISO 8601 format, or empty string if not found), body text (main article content only, no navigation or ads), and whether this is a real article or an error/404 page." +6. **LLM schema:** (OpenAI strict mode compatible — no union types, `published_date` uses empty string instead of null) +```json +{ + "type": "object", + "properties": { + "title": { "type": "string", "description": "Article title" }, + "published_date": { "type": "string", "description": "ISO 8601 date or empty string if not found" }, + "body_text": { "type": "string", "description": "Main article content" }, + "is_error_page": { "type": "boolean", "description": "True if this is an error/404 page" } + }, + "required": ["title", "published_date", "body_text", "is_error_page"], + "additionalProperties": false +} +``` +7. Parse the LLM response into `ScrapedContent` fields: + - `title` → `ScrapedContent.title` (wrapped in `Some`) + - `published_date` → if non-empty, parse ISO 8601 → `ScrapedContent.published_date`; if empty string → `None` + - `body_text` → `ScrapedContent.body_text` + - `is_error_page` → `ScrapedContent.is_soft_404` + - `url` → from `response.url()` (not from LLM) + - `ok` → `true` if `!is_error_page` and `body_text` is non-empty + - `status` → from HTTP response status +8.
**Fallback:** if the LLM call fails (network error, JSON parse failure, schema validation error, timeout), fall back to the existing HTML parsing. Log a warning. + +When disabled, the existing scraper logic is used (unchanged), with the new `url` field populated from `response.url()`. + +**Cost:** ~$0.001 per article with gpt-4o-mini. For 16 articles, ~$0.016 total. + +**Concurrency:** LLM extraction calls run with bounded concurrency (max 5) to avoid hitting provider rate limits. + +**Progress reporting:** During per-article LLM extraction, emit progress updates: "Extraction IA des articles (N/M)..." + +## Files to Modify + +- **Create:** migration `20260324000014_add_llm_scraping_settings.sql` +- **Modify:** `backend/src/models/settings.rs` — add 2 bool fields to `UserSettings`, `SettingsResponse`, `UpdateSettingsRequest`, `Default`, validation (none needed for bools) +- **Modify:** `backend/src/db/settings.rs` — add to `SettingsRow`, `TryFrom`, both SQL queries +- **Modify:** `backend/src/services/scraper.rs` — add `url: String` to `ScrapedContent`, populate from `response.url()` +- **Modify:** `backend/src/services/source_scraper.rs` — add LLM-assisted link extraction path, accept provider + model + schema params +- **Modify:** `backend/src/services/synthesis.rs` — pass settings + provider to scraper functions, update `scrape_single_article` to return `(String, String, String)` — `(body_text, page_title, final_url)` — and accept optional LLM provider, add LLM extraction path +- **Modify:** `backend/src/services/prompts.rs` — add `build_link_extraction_prompt` and `build_article_extraction_prompt` +- **Modify:** `backend/src/services/llm/schema.rs` — add `build_link_extraction_schema` and `build_article_extraction_schema` +- **Modify:** `frontend/src/types.ts` — add 2 bool fields to `UserSettings` + `DEFAULT_SETTINGS` +- **Modify:** `frontend/src/i18n/fr.ts` — add labels +- **Modify:** `frontend/src/pages/Settings.tsx` — add 2 checkboxes in "Extraction avancee" section +- **Modify:** `CLAUDE.md` — update
migration count +- **Modify:** `frontend/src/__tests__/fixtures.ts` — add 2 bool fields to MOCK_SETTINGS if manually constructed +- **Modify:** `backend/tests/api_syntheses_test.rs` — update integration test for new settings fields +- **Modify:** `e2e/tests/generation-live.spec.ts` — update settings payload, add comprehensive synthesis validation +- **Add:** unit tests in `source_scraper.rs` — LLM link extraction, fallback +- **Add:** unit tests in `synthesis.rs` — LLM article extraction, fallback +- **Add:** unit tests in `prompts.rs` — link extraction and article extraction prompts + +## E2E Synthesis Validation + +The E2E test generates a synthesis and validates: +- No duplicate URLs across all sections +- All article URLs return HTTP 200 (fetch each to verify links work) +- Each user-defined category has ≤ `max_items_per_category` articles +- Each source domain appears ≤ `max_articles_per_source` times globally +- No empty titles or summaries +- No Wikipedia/hallucinated URLs +- "Autre" section (if present) respects max limit +- Every summary is non-trivial (> 50 chars) + +## What Does NOT Change + +- LLM providers — reused as-is (classification uses `generate_rewrite_pass`) +- Database schema for syntheses — no changes +- Frontend synthesis display — no changes +- Rewrite pass — operates on `ScrapedContent` data regardless of extraction method +- `limit_articles_per_source`, `dedup_by_url`, `filter_homepage_urls` — unchanged +- Classification pipeline (Phase 1/Phase 2) — unchanged, just receives better data