From 2299790986fee8c4df6348391ff9ae886070d055 Mon Sep 17 00:00:00 2001 From: oabrivard Date: Wed, 25 Mar 2026 10:41:43 +0100 Subject: [PATCH] docs: add implementation plan for pipeline improvements Co-Authored-By: Claude Opus 4.6 (1M context) --- .../plans/2026-03-25-pipeline-improvements.md | 514 ++++++++++++++++++ 1 file changed, 514 insertions(+) create mode 100644 docs/superpowers/plans/2026-03-25-pipeline-improvements.md diff --git a/docs/superpowers/plans/2026-03-25-pipeline-improvements.md b/docs/superpowers/plans/2026-03-25-pipeline-improvements.md new file mode 100644 index 0000000..26aa1da --- /dev/null +++ b/docs/superpowers/plans/2026-03-25-pipeline-improvements.md @@ -0,0 +1,514 @@ +# Pipeline Improvements — Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Three pipeline improvements: remove sources from web search prompt, add article_url to LLM call logs, send structured link pairs to LLM for link extraction. + +**Architecture:** Three independent changes touching the synthesis pipeline. Each can be implemented and tested separately. Changes span backend (Rust/sqlx) and frontend (SolidJS/TypeScript). 
+ +**Tech Stack:** Rust (Axum, sqlx), SolidJS, PostgreSQL + +**Spec:** `docs/superpowers/specs/2026-03-25-pipeline-improvements-design.md` + +--- + +### Task 1: Remove personalized sources from Phase 2 web search prompt + +**Files:** +- Modify: `backend/src/services/synthesis.rs:567` + +- [ ] **Step 1: Change the `build_search_prompt` call to pass empty sources** + +In `backend/src/services/synthesis.rs`, find the Phase 2 `build_search_prompt` call (around line 567): + +```rust +// Before: +let (sys_prompt, usr_prompt) = crate::services::prompts::build_search_prompt(&settings, &sources, &current_date, &[], Some(&category_gaps)); + +// After: +let (sys_prompt, usr_prompt) = crate::services::prompts::build_search_prompt(&settings, &[], &current_date, &[], Some(&category_gaps)); +``` + +- [ ] **Step 2: Run tests to verify nothing breaks** + +Run: `cd backend && cargo test --lib` +Expected: All tests pass (the `build_search_prompt` function already handles empty sources correctly) + +- [ ] **Step 3: Commit** + +```bash +git add backend/src/services/synthesis.rs +git commit -m "fix: remove personalized sources from Phase 2 web search prompt" +``` + +--- + +### Task 2: Add `article_url` column to LLM call logs + +**Files:** +- Create: `backend/migrations/20260325000021_add_article_url_to_llm_log.sql` +- Modify: `backend/src/db/llm_call_log.rs` +- Modify: `backend/src/services/synthesis.rs` +- Modify: `backend/src/services/source_scraper.rs` +- Modify: `frontend/src/types.ts` +- Modify: `frontend/src/pages/LlmLogs.tsx` +- Modify: `frontend/src/i18n/fr.ts` +- Modify: `CLAUDE.md` + +- [ ] **Step 1: Create migration** + +Create `backend/migrations/20260325000021_add_article_url_to_llm_log.sql`: + +```sql +ALTER TABLE llm_call_log ADD COLUMN article_url TEXT; +``` + +- [ ] **Step 2: Update `llm_call_log.rs` — insert function** + +In `backend/src/db/llm_call_log.rs`, update the `insert` function to accept and bind `article_url`: + +```rust +#[allow(clippy::too_many_arguments)] +pub async 
fn insert( + pool: &PgPool, + user_id: Uuid, + job_id: Uuid, + call_type: &str, + model: &str, + system_prompt: &str, + user_prompt: &str, + response_body: &str, + duration_ms: i32, + article_url: Option<&str>, +) -> Result<(), AppError> { + sqlx::query( + r#" + INSERT INTO llm_call_log (user_id, job_id, call_type, model, system_prompt, user_prompt, response_body, duration_ms, article_url) + VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9) + "#, + ) + .bind(user_id) + .bind(job_id) + .bind(call_type) + .bind(model) + .bind(system_prompt) + .bind(user_prompt) + .bind(response_body) + .bind(duration_ms) + .bind(article_url) + .execute(pool) + .await?; + Ok(()) +} +``` + +- [ ] **Step 3: Update `LlmCallLogRow` and `list_by_job_id` query** + +Add `article_url` to the struct and SELECT: + +```rust +#[derive(Debug, Clone, Serialize, sqlx::FromRow)] +pub struct LlmCallLogRow { + pub id: Uuid, + pub call_type: String, + pub model: String, + pub system_prompt: String, + pub user_prompt: String, + pub response_body: String, + pub duration_ms: i32, + pub article_url: Option<String>, + pub created_at: DateTime<Utc>, +} +``` + +Update the SELECT in `list_by_job_id`: + +```rust +SELECT id, call_type, model, system_prompt, user_prompt, response_body, duration_ms, article_url, created_at +``` + +- [ ] **Step 4: Update `log_llm_call` helper in `synthesis.rs`** + +In `backend/src/services/synthesis.rs`, update the `log_llm_call` helper (around line 752): + +```rust +async fn log_llm_call( + pool: &sqlx::PgPool, + user_id: Uuid, + job_id: Uuid, + call_type: &str, + model: &str, + system_prompt: &str, + user_prompt: &str, + response: &serde_json::Value, + duration_ms: u64, + article_url: Option<&str>, +) { + let response_str = serde_json::to_string_pretty(response).unwrap_or_default(); + db::llm_call_log::insert( + pool, user_id, job_id, call_type, model, + system_prompt, user_prompt, &response_str, duration_ms as i32, + article_url, + ) + .await + .ok(); +} +``` + +- [ ] **Step 5: Update all 
`log_llm_call` call sites in `synthesis.rs`** + +Find the `log_llm_call` call for Phase 2 search (around line 572) and add `None`: + +```rust +log_llm_call(&state.pool, user_id, job_id, "search", &model_websearch, &sys_prompt, &usr_prompt, &raw_results, llm_duration, None).await; +``` + +- [ ] **Step 6: Update the classify `insert` call in `synthesis.rs`** + +Find the direct `crate::db::llm_call_log::insert` call in the classify spawn (around line 488). Add the article URL: + +```rust +// The `url` variable is already in scope in this closure (it's the article URL) +crate::db::llm_call_log::insert(&pool, uid, jid, "classify_summarize", &mdl, &sys, &usr, &resp_str, duration as i32, Some(&url)).await.ok(); +``` + +- [ ] **Step 7: Update the `insert` call in `source_scraper.rs`** + +In `backend/src/services/source_scraper.rs`, find the `crate::db::llm_call_log::insert` call (around line 172) and add `None`: + +```rust +crate::db::llm_call_log::insert( + pool, uid, jid, "link_extraction", model, + &system, &user, &response_str, llm_duration as i32, + None, +).await.ok(); +``` + +- [ ] **Step 8: Update frontend types** + +In `frontend/src/types.ts`, add `article_url` to `LlmCallLogEntry`: + +```typescript +export interface LlmCallLogEntry { + id: string; + call_type: string; + model: string; + system_prompt: string; + user_prompt: string; + response_body: string; + duration_ms: number; + article_url: string | null; + created_at: string; +} +``` + +- [ ] **Step 9: Add i18n label** + +In `frontend/src/i18n/fr.ts`, add after `'llmLogs.back'`: + +```typescript +'llmLogs.articleUrl': 'Article', +``` + +- [ ] **Step 10: Display article URL in `LlmLogs.tsx`** + +In `frontend/src/pages/LlmLogs.tsx`, add after the header `div` (the one with badge, model, duration — around line 95), before the `
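<details>`: + +```tsx +<Show when={entry.article_url}> + <div> + <span>{t('llmLogs.articleUrl')}:</span> + <a href={entry.article_url!} target="_blank" rel="noopener noreferrer"> + {entry.article_url} + </a> + </div> +</Show>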
+``` + +Also add the `classify_summarize` badge mapping since it's missing: + +```typescript +const CALL_TYPE_BADGE: Record<string, string> = { + search: 'bg-blue-100 text-blue-800', + classify_summarize: 'bg-purple-100 text-purple-800', + classification_phase1: 'bg-purple-100 text-purple-800', + classification_phase2: 'bg-purple-100 text-purple-800', + rewrite: 'bg-green-100 text-green-800', + link_extraction: 'bg-orange-100 text-orange-800', + article_extraction: 'bg-orange-100 text-orange-800', +}; +``` + +- [ ] **Step 11: Update CLAUDE.md migration count** + +Change `## Database (20 migrations)` to `## Database (21 migrations)`. + +- [ ] **Step 12: Build and test** + +Run: `cd backend && cargo build && cargo test --lib` +Run: `cd frontend && npx tsc --noEmit` +Expected: All pass + +- [ ] **Step 13: Commit** + +```bash +git add backend/migrations/20260325000021_add_article_url_to_llm_log.sql \ + backend/src/db/llm_call_log.rs \ + backend/src/services/synthesis.rs \ + backend/src/services/source_scraper.rs \ + frontend/src/types.ts \ + frontend/src/pages/LlmLogs.tsx \ + frontend/src/i18n/fr.ts \ + CLAUDE.md +git commit -m "feat: add article_url to LLM call logs for classify tracing" +``` + +--- + +### Task 3: Send structured link pairs to LLM instead of raw HTML + +**Files:** +- Modify: `backend/src/services/source_scraper.rs` +- Modify: `backend/src/services/prompts.rs` + +- [ ] **Step 1: Add `extract_links_as_pairs` function** + +In `backend/src/services/source_scraper.rs`, add after `extract_links_from_html`: + +```rust +/// Extract all links from HTML as (href, anchor_text) pairs for LLM analysis. +/// +/// Minimal filtering: same-domain, http/https, non-empty path. +/// No article-pattern filtering — the LLM decides which are articles. 
+pub fn extract_links_as_pairs( + html: &str, + base_url: &Url, +) -> Vec<(String, String)> { + let base_domain = base_url.host_str().unwrap_or("").to_lowercase(); + let document = Html::parse_document(html); + let selector = Selector::parse("a[href]").unwrap(); + let mut pairs = Vec::new(); + + for element in document.select(&selector) { + if let Some(href) = element.value().attr("href") { + let resolved = match base_url.join(href) { + Ok(u) => u, + Err(_) => continue, + }; + + if resolved.scheme() != "http" && resolved.scheme() != "https" { + continue; + } + + let link_domain = resolved.host_str().unwrap_or("").to_lowercase(); + if link_domain != base_domain { + continue; + } + + let path = resolved.path(); + if path.is_empty() || path == "/" { + continue; + } + + let anchor_text: String = element.text().collect::<Vec<_>>().join(" "); + let anchor_text = anchor_text.trim().to_string(); + + pairs.push((resolved.to_string(), anchor_text)); + } + } + + pairs +} +``` + +- [ ] **Step 2: Add tests for `extract_links_as_pairs`** + +In the `#[cfg(test)] mod tests` block in `source_scraper.rs`, add: + +```rust +#[test] +fn extract_pairs_returns_href_and_text() { + let html = r#" + + <a href="/blog/article-1">Breaking AI News</a> + <a href="/blog/article-2">GPT-6 Released</a> + "#; + let base = base_url("https://example.com/blog"); + let pairs = extract_links_as_pairs(html, &base); + assert_eq!(pairs.len(), 2); + assert!(pairs[0].0.contains("/blog/article-1")); + assert_eq!(pairs[0].1, "Breaking AI News"); + assert!(pairs[1].0.contains("/blog/article-2")); + assert_eq!(pairs[1].1, "GPT-6 Released"); +} + +#[test] +fn extract_pairs_filters_external_links() { + let html = r#"<a href="https://other.com/page">External</a>"#; + let base = base_url("https://example.com"); + let pairs = extract_links_as_pairs(html, &base); + assert!(pairs.is_empty()); +} + +#[test] +fn extract_pairs_filters_root_path() { + let html = r#"<a href="/">Home</a>"#; + let base = base_url("https://example.com"); + let pairs = extract_links_as_pairs(html, &base); + assert!(pairs.is_empty()); +} + +#[test] +fn 
extract_pairs_handles_empty_anchor_text() { + let html = r#"<a href="/blog/post"></a>"#; + let base = base_url("https://example.com"); + let pairs = extract_links_as_pairs(html, &base); + assert_eq!(pairs.len(), 1); + assert_eq!(pairs[0].1, ""); +} +``` + +- [ ] **Step 3: Run new tests** + +Run: `cd backend && cargo test --lib source_scraper` +Expected: All source_scraper tests pass (including the 4 new ones) + +- [ ] **Step 4: Update `extract_article_links_with_llm` to use structured pairs** + +In `backend/src/services/source_scraper.rs`, update `extract_article_links_with_llm`. Replace the section that calls `extract_body_html` and `build_link_extraction_prompt` (around lines 158-160): + +```rust +// Before: +let body_html = extract_body_html(&html_text); +let (system, user) = build_link_extraction_prompt(&body_html); + +// After: +let pairs = extract_links_as_pairs(&html_text, &base_url); +let links_text = format_links_for_llm(&pairs); +let (system, user) = build_link_extraction_prompt(&links_text); +``` + +Add the formatting helper in `source_scraper.rs` (before `extract_article_links_with_llm`): + +```rust +/// Format link pairs as a text list for the LLM prompt. +/// Caps at 200 links to limit token usage. +fn format_links_for_llm(pairs: &[(String, String)]) -> String { + pairs + .iter() + .take(200) + .map(|(href, text)| { + if text.is_empty() { + format!("- {}", href) + } else { + format!("- {} | \"{}\"", href, text) + } + }) + .collect::<Vec<_>>() + .join("\n") +} +``` + +- [ ] **Step 5: Update `build_link_extraction_prompt` in `prompts.rs`** + +In `backend/src/services/prompts.rs`, update `build_link_extraction_prompt`: + +```rust +/// Build a prompt for LLM-assisted link extraction from a source page. +/// +/// Receives a pre-formatted list of (href, anchor_text) pairs, not raw HTML. +pub fn build_link_extraction_prompt(links_text: &str) -> (String, String) { + let system_prompt = + "Tu es un assistant qui analyse des listes de liens. 
\ + Tu dois identifier les liens vers des articles d'actualite. \ + Reponds uniquement au format JSON demande." + .to_string(); + + let user_prompt = format!( + "Voici une liste de liens extraits d'une page de blog ou de site d'actualites.\n\n\ + {links}\n\n\ + Selectionne UNIQUEMENT les URLs qui pointent vers des articles \ + (pas les liens de navigation, tags, categories, login, pages statiques, topics, \ + archive, companies, events, company, event, collections, etc.).\n\ + Retourne les URLs completes, sans les modifier, dans le format JSON demande. \ + Ne change jamais les URLs retournees, et ne les tronque jamais.", + links = links_text, + ); + + (system_prompt, user_prompt) +} +``` + +- [ ] **Step 6: Remove `extract_body_html` and its tests** + +In `backend/src/services/source_scraper.rs`: +- Delete the `extract_body_html` function +- Delete the tests `extract_body_html_gets_body_content` and `extract_body_html_truncates_safely` + +- [ ] **Step 7: Update prompt tests in `prompts.rs`** + +In `backend/src/services/prompts.rs`, update the existing link extraction tests: + +```rust +#[test] +fn link_extraction_prompt_includes_links() { + let links = "- https://example.com/post-1 | \"Breaking News\"\n- https://example.com/post-2 | \"Update\""; + let (sys, user) = build_link_extraction_prompt(links); + assert!(user.contains("https://example.com/post-1")); + assert!(user.contains("Breaking News")); + assert!(sys.contains("liens")); +} + +#[test] +fn link_extraction_prompt_empty_links() { + let (_, user) = build_link_extraction_prompt(""); + assert!(user.contains("articles")); +} +``` + +Remove the old `link_extraction_prompt_truncates_body` test. 
+ +- [ ] **Step 8: Add test for `format_links_for_llm`** + +In `source_scraper.rs` tests: + +```rust +#[test] +fn format_links_for_llm_formats_correctly() { + let pairs = vec![ + ("https://example.com/a".to_string(), "Article One".to_string()), + ("https://example.com/b".to_string(), "".to_string()), + ]; + let result = format_links_for_llm(&pairs); + assert!(result.contains("- https://example.com/a | \"Article One\"")); + assert!(result.contains("- https://example.com/b")); + assert!(!result.contains("| \"\"")); +} + +#[test] +fn format_links_for_llm_caps_at_200() { + let pairs: Vec<(String, String)> = (0..300) + .map(|i| (format!("https://example.com/{}", i), format!("Link {}", i))) + .collect(); + let result = format_links_for_llm(&pairs); + let line_count = result.lines().count(); + assert_eq!(line_count, 200); +} +``` + +- [ ] **Step 9: Run all tests** + +Run: `cd backend && cargo test --lib` +Expected: All tests pass + +- [ ] **Step 10: Commit** + +```bash +git add backend/src/services/source_scraper.rs backend/src/services/prompts.rs +git commit -m "feat: send structured link pairs to LLM instead of raw HTML body" +```