From 2299790986fee8c4df6348391ff9ae886070d055 Mon Sep 17 00:00:00 2001
From: oabrivard <olivier@abrivard.fr>
Date: Wed, 25 Mar 2026 10:41:43 +0100
Subject: [PATCH] docs: add implementation plan for pipeline improvements

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .../plans/2026-03-25-pipeline-improvements.md | 514 ++++++++++++++++++
 1 file changed, 514 insertions(+)
 create mode 100644 docs/superpowers/plans/2026-03-25-pipeline-improvements.md

diff --git a/docs/superpowers/plans/2026-03-25-pipeline-improvements.md b/docs/superpowers/plans/2026-03-25-pipeline-improvements.md
new file mode 100644
index 0000000..26aa1da
--- /dev/null
+++ b/docs/superpowers/plans/2026-03-25-pipeline-improvements.md
@@ -0,0 +1,514 @@
+# Pipeline Improvements — Implementation Plan
+
+> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
+
+**Goal:** Three pipeline improvements: remove personalized sources from the Phase 2 web search prompt, add an `article_url` column to LLM call logs, and send structured (href, anchor text) link pairs to the LLM for link extraction instead of raw HTML.
+
+**Architecture:** Three independent changes touching the synthesis pipeline. Each can be implemented and tested separately. Changes span backend (Rust/sqlx) and frontend (SolidJS/TypeScript).
+
+**Tech Stack:** Rust (Axum, sqlx), SolidJS, PostgreSQL
+
+**Spec:** `docs/superpowers/specs/2026-03-25-pipeline-improvements-design.md`
+
+---
+
+### Task 1: Remove personalized sources from Phase 2 web search prompt
+
+**Files:**
+- Modify: `backend/src/services/synthesis.rs:567`
+
+- [ ] **Step 1: Change the `build_search_prompt` call to pass empty sources**
+
+In `backend/src/services/synthesis.rs`, find the Phase 2 `build_search_prompt` call (around line 567):
+
+```rust
+// Before:
+let (sys_prompt, usr_prompt) = crate::services::prompts::build_search_prompt(&settings, &sources, &current_date, &[], Some(&category_gaps));
+
+// After:
+let (sys_prompt, usr_prompt) = crate::services::prompts::build_search_prompt(&settings, &[], &current_date, &[], Some(&category_gaps));
+```
+
+- [ ] **Step 2: Run tests to verify nothing breaks**
+
+Run: `cd backend && cargo test --lib`
+Expected: All tests pass (the `build_search_prompt` function already handles empty sources correctly)
+
+- [ ] **Step 3: Commit**
+
+```bash
+git add backend/src/services/synthesis.rs
+git commit -m "fix: remove personalized sources from Phase 2 web search prompt"
+```
+
+---
+
+### Task 2: Add `article_url` column to LLM call logs
+
+**Files:**
+- Create: `backend/migrations/20260325000021_add_article_url_to_llm_log.sql`
+- Modify: `backend/src/db/llm_call_log.rs`
+- Modify: `backend/src/services/synthesis.rs`
+- Modify: `backend/src/services/source_scraper.rs`
+- Modify: `frontend/src/types.ts`
+- Modify: `frontend/src/pages/LlmLogs.tsx`
+- Modify: `frontend/src/i18n/fr.ts`
+- Modify: `CLAUDE.md`
+
+- [ ] **Step 1: Create migration**
+
+Create `backend/migrations/20260325000021_add_article_url_to_llm_log.sql`:
+
+```sql
+ALTER TABLE llm_call_log ADD COLUMN article_url TEXT;
+```
+
+- [ ] **Step 2: Update `llm_call_log.rs` — insert function**
+
+In `backend/src/db/llm_call_log.rs`, update the `insert` function to accept and bind `article_url`:
+
+```rust
+#[allow(clippy::too_many_arguments)]
+pub async fn insert(
+    pool: &PgPool,
+    user_id: Uuid,
+    job_id: Uuid,
+    call_type: &str,
+    model: &str,
+    system_prompt: &str,
+    user_prompt: &str,
+    response_body: &str,
+    duration_ms: i32,
+    article_url: Option<&str>,
+) -> Result<(), AppError> {
+    sqlx::query(
+        r#"
+        INSERT INTO llm_call_log (user_id, job_id, call_type, model, system_prompt, user_prompt, response_body, duration_ms, article_url)
+        VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)
+        "#,
+    )
+    .bind(user_id)
+    .bind(job_id)
+    .bind(call_type)
+    .bind(model)
+    .bind(system_prompt)
+    .bind(user_prompt)
+    .bind(response_body)
+    .bind(duration_ms)
+    .bind(article_url)
+    .execute(pool)
+    .await?;
+    Ok(())
+}
+```
+
+- [ ] **Step 3: Update `LlmCallLogRow` and `list_by_job_id` query**
+
+Add `article_url` to the struct and SELECT:
+
+```rust
+#[derive(Debug, Clone, Serialize, sqlx::FromRow)]
+pub struct LlmCallLogRow {
+    pub id: Uuid,
+    pub call_type: String,
+    pub model: String,
+    pub system_prompt: String,
+    pub user_prompt: String,
+    pub response_body: String,
+    pub duration_ms: i32,
+    pub article_url: Option<String>,
+    pub created_at: DateTime<Utc>,
+}
+```
+
+Update the SELECT in `list_by_job_id`:
+
+```rust
+SELECT id, call_type, model, system_prompt, user_prompt, response_body, duration_ms, article_url, created_at
+```
+
+- [ ] **Step 4: Update `log_llm_call` helper in `synthesis.rs`**
+
+In `backend/src/services/synthesis.rs`, update the `log_llm_call` helper (around line 752):
+
+```rust
+async fn log_llm_call(
+    pool: &sqlx::PgPool,
+    user_id: Uuid,
+    job_id: Uuid,
+    call_type: &str,
+    model: &str,
+    system_prompt: &str,
+    user_prompt: &str,
+    response: &serde_json::Value,
+    duration_ms: u64,
+    article_url: Option<&str>,
+) {
+    let response_str = serde_json::to_string_pretty(response).unwrap_or_default();
+    db::llm_call_log::insert(
+        pool, user_id, job_id, call_type, model,
+        system_prompt, user_prompt, &response_str, duration_ms as i32,
+        article_url,
+    )
+    .await
+    .ok();
+}
+```
+
+- [ ] **Step 5: Update all `log_llm_call` call sites in `synthesis.rs`**
+
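+The new `article_url` parameter is required, so every `log_llm_call` call site must pass it. Find the call for Phase 2 search (around line 572) and add `None` as the last argument; do the same at any other call site the compiler reports: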
+
+```rust
+log_llm_call(&state.pool, user_id, job_id, "search", &model_websearch, &sys_prompt, &usr_prompt, &raw_results, llm_duration, None).await;
+```
+
+- [ ] **Step 6: Update the classify `insert` call in `synthesis.rs`**
+
+Find the direct `crate::db::llm_call_log::insert` call in the classify spawn (around line 488) and pass the article URL as the new last argument:
+
+```rust
+// The `url` variable is already in scope in this closure (it's the article URL)
+crate::db::llm_call_log::insert(&pool, uid, jid, "classify_summarize", &mdl, &sys, &usr, &resp_str, duration as i32, Some(&url)).await.ok();
+```
+
+- [ ] **Step 7: Update the `insert` call in `source_scraper.rs`**
+
+In `backend/src/services/source_scraper.rs`, find the `crate::db::llm_call_log::insert` call (around line 172) and add `None`:
+
+```rust
+crate::db::llm_call_log::insert(
+    pool, uid, jid, "link_extraction", model,
+    &system, &user, &response_str, llm_duration as i32,
+    None,
+).await.ok();
+```
+
+- [ ] **Step 8: Update frontend types**
+
+In `frontend/src/types.ts`, add `article_url` to `LlmCallLogEntry`:
+
+```typescript
+export interface LlmCallLogEntry {
+  id: string;
+  call_type: string;
+  model: string;
+  system_prompt: string;
+  user_prompt: string;
+  response_body: string;
+  duration_ms: number;
+  article_url: string | null;
+  created_at: string;
+}
+```
+
+- [ ] **Step 9: Add i18n label**
+
+In `frontend/src/i18n/fr.ts`, add after `'llmLogs.back'`:
+
+```typescript
+'llmLogs.articleUrl': 'Article',
+```
+
+- [ ] **Step 10: Display article URL in `LlmLogs.tsx`**
+
+In `frontend/src/pages/LlmLogs.tsx`, add after the header `div` (the one with badge, model, duration — around line 95), before the `<div class="divide-y divide-gray-100">`:
+
+```tsx
+<Show when={entry.article_url}>
+  <div class="px-5 py-1 text-sm">
+    <span class="text-gray-500">{t('llmLogs.articleUrl')}: </span>
+    <a
+      href={entry.article_url!}
+      target="_blank"
+      rel="noopener noreferrer"
+      class="text-indigo-600 hover:text-indigo-800 underline break-all"
+    >
+      {entry.article_url}
+    </a>
+  </div>
+</Show>
+```
+
+Also add the missing `classify_summarize` entry to the `CALL_TYPE_BADGE` map:
+
+```typescript
+const CALL_TYPE_BADGE: Record<string, string> = {
+  search: 'bg-blue-100 text-blue-800',
+  classify_summarize: 'bg-purple-100 text-purple-800',
+  classification_phase1: 'bg-purple-100 text-purple-800',
+  classification_phase2: 'bg-purple-100 text-purple-800',
+  rewrite: 'bg-green-100 text-green-800',
+  link_extraction: 'bg-orange-100 text-orange-800',
+  article_extraction: 'bg-orange-100 text-orange-800',
+};
+```
+
+- [ ] **Step 11: Update CLAUDE.md migration count**
+
+Change `## Database (20 migrations)` to `## Database (21 migrations)`.
+
+- [ ] **Step 12: Build and test**
+
+Run: `cd backend && cargo build && cargo test --lib`
+Run: `cd frontend && npx tsc --noEmit`
+Expected: All pass
+
+- [ ] **Step 13: Commit**
+
+```bash
+git add backend/migrations/20260325000021_add_article_url_to_llm_log.sql \
+  backend/src/db/llm_call_log.rs \
+  backend/src/services/synthesis.rs \
+  backend/src/services/source_scraper.rs \
+  frontend/src/types.ts \
+  frontend/src/pages/LlmLogs.tsx \
+  frontend/src/i18n/fr.ts \
+  CLAUDE.md
+git commit -m "feat: add article_url to LLM call logs for classify tracing"
+```
+
+---
+
+### Task 3: Send structured link pairs to LLM instead of raw HTML
+
+**Files:**
+- Modify: `backend/src/services/source_scraper.rs`
+- Modify: `backend/src/services/prompts.rs`
+
+- [ ] **Step 1: Add `extract_links_as_pairs` function**
+
+In `backend/src/services/source_scraper.rs`, add after `extract_links_from_html`:
+
+```rust
+/// Extract all links from HTML as (href, anchor_text) pairs for LLM analysis.
+///
+/// Minimal filtering: same-domain, http/https, path other than empty or `/`.
+/// No article-pattern filtering — the LLM decides which are articles.
+pub fn extract_links_as_pairs(
+    html: &str,
+    base_url: &Url,
+) -> Vec<(String, String)> {
+    let base_domain = base_url.host_str().unwrap_or("").to_lowercase();
+    let document = Html::parse_document(html);
+    let selector = Selector::parse("a[href]").unwrap();
+    let mut pairs = Vec::new();
+
+    for element in document.select(&selector) {
+        if let Some(href) = element.value().attr("href") {
+            let resolved = match base_url.join(href) {
+                Ok(u) => u,
+                Err(_) => continue,
+            };
+
+            if resolved.scheme() != "http" && resolved.scheme() != "https" {
+                continue;
+            }
+
+            let link_domain = resolved.host_str().unwrap_or("").to_lowercase();
+            if link_domain != base_domain {
+                continue;
+            }
+
+            let path = resolved.path();
+            if path.is_empty() || path == "/" {
+                continue;
+            }
+
+            let anchor_text: String = element.text().collect::<Vec<_>>().join(" ");
+            let anchor_text = anchor_text.trim().to_string();
+
+            pairs.push((resolved.to_string(), anchor_text));
+        }
+    }
+
+    pairs
+}
+```
+
+- [ ] **Step 2: Add tests for `extract_links_as_pairs`**
+
+In the `#[cfg(test)] mod tests` block in `source_scraper.rs`, add:
+
+```rust
+#[test]
+fn extract_pairs_returns_href_and_text() {
+    let html = r#"
+    <html><body>
+        <a href="/blog/article-1">Breaking AI News</a>
+        <a href="/blog/article-2">GPT-6 Released</a>
+    </body></html>"#;
+    let base = base_url("https://example.com/blog");
+    let pairs = extract_links_as_pairs(html, &base);
+    assert_eq!(pairs.len(), 2);
+    assert!(pairs[0].0.contains("/blog/article-1"));
+    assert_eq!(pairs[0].1, "Breaking AI News");
+    assert!(pairs[1].0.contains("/blog/article-2"));
+    assert_eq!(pairs[1].1, "GPT-6 Released");
+}
+
+#[test]
+fn extract_pairs_filters_external_links() {
+    let html = r#"<a href="https://other.com/article">External</a>"#;
+    let base = base_url("https://example.com");
+    let pairs = extract_links_as_pairs(html, &base);
+    assert!(pairs.is_empty());
+}
+
+#[test]
+fn extract_pairs_filters_root_path() {
+    let html = r#"<a href="/">Home</a>"#;
+    let base = base_url("https://example.com");
+    let pairs = extract_links_as_pairs(html, &base);
+    assert!(pairs.is_empty());
+}
+
+#[test]
+fn extract_pairs_handles_empty_anchor_text() {
+    let html = r#"<a href="/article"><img src="pic.jpg"/></a>"#;
+    let base = base_url("https://example.com");
+    let pairs = extract_links_as_pairs(html, &base);
+    assert_eq!(pairs.len(), 1);
+    assert_eq!(pairs[0].1, "");
+}
+```
+
+- [ ] **Step 3: Run new tests**
+
+Run: `cd backend && cargo test --lib source_scraper`
+Expected: All source_scraper tests pass (including the 4 new ones)
+
+- [ ] **Step 4: Update `extract_article_links_with_llm` to use structured pairs**
+
+In `backend/src/services/source_scraper.rs`, update `extract_article_links_with_llm`. Replace the section that calls `extract_body_html` and `build_link_extraction_prompt` (around lines 158-160):
+
+```rust
+// Before:
+let body_html = extract_body_html(&html_text);
+let (system, user) = build_link_extraction_prompt(&body_html);
+
+// After:
+let pairs = extract_links_as_pairs(&html_text, &base_url);
+let links_text = format_links_for_llm(&pairs);
+let (system, user) = build_link_extraction_prompt(&links_text);
+```
+
+Add the formatting helper in `source_scraper.rs` (before `extract_article_links_with_llm`):
+
+```rust
+/// Format link pairs as a text list for the LLM prompt.
+/// Caps at 200 links to limit token usage.
+fn format_links_for_llm(pairs: &[(String, String)]) -> String {
+    pairs
+        .iter()
+        .take(200)
+        .map(|(href, text)| {
+            if text.is_empty() {
+                format!("- {}", href)
+            } else {
+                format!("- {} | \"{}\"", href, text)
+            }
+        })
+        .collect::<Vec<_>>()
+        .join("\n")
+}
+```
+
+- [ ] **Step 5: Update `build_link_extraction_prompt` in `prompts.rs`**
+
+In `backend/src/services/prompts.rs`, update `build_link_extraction_prompt`:
+
+```rust
+/// Build a prompt for LLM-assisted link extraction from a source page.
+///
+/// Receives a pre-formatted list of (href, anchor_text) pairs, not raw HTML.
+pub fn build_link_extraction_prompt(links_text: &str) -> (String, String) {
+    let system_prompt =
+        "Tu es un assistant qui analyse des listes de liens. \
+         Tu dois identifier les liens vers des articles d'actualite. \
+         Reponds uniquement au format JSON demande."
+            .to_string();
+
+    let user_prompt = format!(
+        "Voici une liste de liens extraits d'une page de blog ou de site d'actualites.\n\n\
+         {links}\n\n\
+         Selectionne UNIQUEMENT les URLs qui pointent vers des articles \
+         (pas les liens de navigation, tags, categories, login, pages statiques, topics, \
+         archive, companies, events, company, event, collections, etc.).\n\
+         Retourne les URLs completes, sans les modifier, dans le format JSON demande. \
+         Ne change jamais les URLs retournees, et ne les tronque jamais.",
+        links = links_text,
+    );
+
+    (system_prompt, user_prompt)
+}
+```
+
+- [ ] **Step 6: Remove `extract_body_html` and its tests**
+
+In `backend/src/services/source_scraper.rs`:
+- Delete the `extract_body_html` function
+- Delete the tests `extract_body_html_gets_body_content` and `extract_body_html_truncates_safely`
+
+- [ ] **Step 7: Update prompt tests in `prompts.rs`**
+
+In `backend/src/services/prompts.rs`, update the existing link extraction tests:
+
+```rust
+#[test]
+fn link_extraction_prompt_includes_links() {
+    let links = "- https://example.com/post-1 | \"Breaking News\"\n- https://example.com/post-2 | \"Update\"";
+    let (sys, user) = build_link_extraction_prompt(links);
+    assert!(user.contains("https://example.com/post-1"));
+    assert!(user.contains("Breaking News"));
+    assert!(sys.contains("liens"));
+}
+
+#[test]
+fn link_extraction_prompt_empty_links() {
+    let (_, user) = build_link_extraction_prompt("");
+    assert!(user.contains("articles"));
+}
+```
+
+Remove the old `link_extraction_prompt_truncates_body` test.
+
+- [ ] **Step 8: Add test for `format_links_for_llm`**
+
+In the `#[cfg(test)] mod tests` block in `source_scraper.rs`, add:
+
+```rust
+#[test]
+fn format_links_for_llm_formats_correctly() {
+    let pairs = vec![
+        ("https://example.com/a".to_string(), "Article One".to_string()),
+        ("https://example.com/b".to_string(), "".to_string()),
+    ];
+    let result = format_links_for_llm(&pairs);
+    assert!(result.contains("- https://example.com/a | \"Article One\""));
+    assert!(result.contains("- https://example.com/b"));
+    assert!(!result.contains("| \"\""));
+}
+
+#[test]
+fn format_links_for_llm_caps_at_200() {
+    let pairs: Vec<(String, String)> = (0..300)
+        .map(|i| (format!("https://example.com/{}", i), format!("Link {}", i)))
+        .collect();
+    let result = format_links_for_llm(&pairs);
+    let line_count = result.lines().count();
+    assert_eq!(line_count, 200);
+}
+```
+
+- [ ] **Step 9: Run all tests**
+
+Run: `cd backend && cargo test --lib`
+Expected: All tests pass
+
+- [ ] **Step 10: Commit**
+
+```bash
+git add backend/src/services/source_scraper.rs backend/src/services/prompts.rs
+git commit -m "feat: send structured link pairs to LLM instead of raw HTML body"
+```