From 5a0495b02a47d421bd636f8cc271aff75f125862 Mon Sep 17 00:00:00 2001 From: oabrivard Date: Tue, 24 Mar 2026 18:48:31 +0100 Subject: [PATCH] docs: add article tracing implementation plan (7 tasks) --- .../plans/2026-03-24-article-tracing.md | 620 ++++++++++++++++++ 1 file changed, 620 insertions(+) create mode 100644 docs/superpowers/plans/2026-03-24-article-tracing.md diff --git a/docs/superpowers/plans/2026-03-24-article-tracing.md b/docs/superpowers/plans/2026-03-24-article-tracing.md new file mode 100644 index 0000000..1fb1017 --- /dev/null +++ b/docs/superpowers/plans/2026-03-24-article-tracing.md @@ -0,0 +1,620 @@ +# Article Tracing — Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Track the origin and status of every article candidate in the pipeline, with frontend viewers for debugging synthesis quality. + +**Architecture:** Enrich `article_history` table with metadata (source_type, status, job_id). Insert dropped articles at each filtering step. Two new API endpoints. Two frontend views (global history + per-synthesis provenance). 
+ +**Tech Stack:** Rust (sqlx), SolidJS, PostgreSQL + +**Spec:** `docs/superpowers/specs/2026-03-24-article-tracing-design.md` + +--- + +### Task 1: Migration — enrich article_history + syntheses.job_id + +**Files:** +- Create: `backend/migrations/20260324000016_enrich_article_history.sql` +- Modify: `CLAUDE.md` + +- [ ] **Step 1: Create migration** + +```sql +-- Enrich article_history with tracing metadata +ALTER TABLE article_history ADD COLUMN title TEXT NOT NULL DEFAULT ''; +ALTER TABLE article_history ADD COLUMN source_type TEXT NOT NULL DEFAULT 'unknown'; +ALTER TABLE article_history ADD COLUMN source_url TEXT; +ALTER TABLE article_history ADD COLUMN category TEXT; +ALTER TABLE article_history ADD COLUMN synthesis_id UUID REFERENCES syntheses(id) ON DELETE SET NULL; +ALTER TABLE article_history ADD COLUMN status TEXT NOT NULL DEFAULT 'used'; +ALTER TABLE article_history ADD COLUMN scraped_ok BOOLEAN NOT NULL DEFAULT true; +ALTER TABLE article_history ADD COLUMN job_id UUID NOT NULL DEFAULT gen_random_uuid(); + +-- Drop unique index — table is now a trace log +DROP INDEX idx_article_history_user_url; +CREATE INDEX idx_article_history_user_url ON article_history(user_id, url_hash); +CREATE INDEX idx_article_history_job_id ON article_history(job_id); + +-- Store job_id on syntheses for direct provenance lookup +ALTER TABLE syntheses ADD COLUMN job_id UUID; +``` + +- [ ] **Step 2: Update CLAUDE.md migration count to 16** + +- [ ] **Step 3: Commit** + +```bash +git add backend/migrations/20260324000016_enrich_article_history.sql CLAUDE.md +git commit -m "feat: enrich article_history with tracing metadata + syntheses.job_id" +``` + +--- + +### Task 2: DB module — ArticleHistoryEntry + insert/query functions + +**Files:** +- Modify: `backend/src/db/article_history.rs` + +- [ ] **Step 1: Add `ArticleHistoryEntry` struct and `insert_entry` function** + +Add a struct for inserting trace entries and a function to insert them: + +```rust +use chrono::{DateTime, Utc}; + 
+/// Entry for inserting into article_history with full tracing metadata. +pub struct ArticleHistoryEntry { + pub user_id: Uuid, + pub url: String, + pub url_hash: String, + pub title: String, + pub source_type: String, + pub source_url: Option<String>, + pub category: Option<String>, + pub synthesis_id: Option<Uuid>, + pub status: String, + pub scraped_ok: bool, + pub job_id: Uuid, +} + +/// Insert a single article history entry with full tracing metadata. +pub async fn insert_entry(pool: &PgPool, entry: &ArticleHistoryEntry) -> Result<(), AppError> { + sqlx::query( + r#" + INSERT INTO article_history (user_id, url_hash, url, title, source_type, source_url, category, synthesis_id, status, scraped_ok, job_id) + VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11) + "#, + ) + .bind(entry.user_id) + .bind(&entry.url_hash) + .bind(&entry.url) + .bind(&entry.title) + .bind(&entry.source_type) + .bind(&entry.source_url) + .bind(&entry.category) + .bind(entry.synthesis_id) + .bind(&entry.status) + .bind(entry.scraped_ok) + .bind(entry.job_id) + .execute(pool) + .await?; + Ok(()) +} +``` + +- [ ] **Step 2: Add `ArticleHistoryRow` and query functions** + +```rust +/// Row returned from article_history queries. +#[derive(Debug, Clone, serde::Serialize, sqlx::FromRow)] +pub struct ArticleHistoryRow { + pub id: Uuid, + pub url: String, + pub title: String, + pub source_type: String, + pub source_url: Option<String>, + pub category: Option<String>, + pub synthesis_id: Option<Uuid>, + pub status: String, + pub scraped_ok: bool, + pub job_id: Uuid, + pub created_at: DateTime<Utc>, +} + +/// List article history with optional filters, paginated. 
+pub async fn list_history( + pool: &PgPool, + user_id: Uuid, + limit: i64, + offset: i64, + status_filter: Option<&str>, + source_type_filter: Option<&str>, +) -> Result<Vec<ArticleHistoryRow>, AppError> { + let rows = sqlx::query_as::<_, ArticleHistoryRow>( + r#" + SELECT id, url, title, source_type, source_url, category, synthesis_id, status, scraped_ok, job_id, created_at + FROM article_history + WHERE user_id = $1 + AND ($4::TEXT IS NULL OR status = $4) + AND ($5::TEXT IS NULL OR source_type = $5) + ORDER BY created_at DESC + LIMIT $2 OFFSET $3 + "#, + ) + .bind(user_id) + .bind(limit) + .bind(offset) + .bind(status_filter) + .bind(source_type_filter) + .fetch_all(pool) + .await?; + Ok(rows) +} + +/// Count article history entries with optional filters. +pub async fn count_history( + pool: &PgPool, + user_id: Uuid, + status_filter: Option<&str>, + source_type_filter: Option<&str>, +) -> Result<i64, AppError> { + let row = sqlx::query_scalar::<_, i64>( + r#" + SELECT COUNT(*) FROM article_history + WHERE user_id = $1 + AND ($2::TEXT IS NULL OR status = $2) + AND ($3::TEXT IS NULL OR source_type = $3) + "#, + ) + .bind(user_id) + .bind(status_filter) + .bind(source_type_filter) + .fetch_one(pool) + .await?; + Ok(row) +} + +/// List all article history entries for a generation job. 
+pub async fn list_by_job_id( + pool: &PgPool, + user_id: Uuid, + job_id: Uuid, +) -> Result<Vec<ArticleHistoryRow>, AppError> { + let rows = sqlx::query_as::<_, ArticleHistoryRow>( + r#" + SELECT id, url, title, source_type, source_url, category, synthesis_id, status, scraped_ok, job_id, created_at + FROM article_history + WHERE user_id = $1 AND job_id = $2 + ORDER BY created_at ASC + "#, + ) + .bind(user_id) + .bind(job_id) + .fetch_all(pool) + .await?; + Ok(rows) +} +``` + +- [ ] **Step 3: Update `cleanup_old` to preserve used entries** + +Change the DELETE query to only remove entries where `synthesis_id IS NULL`: + +```rust +pub async fn cleanup_old(pool: &PgPool, user_id: Uuid, days: i32) -> Result<u64, AppError> { + let result = sqlx::query( + "DELETE FROM article_history WHERE user_id = $1 AND created_at < now() - make_interval(days => $2) AND synthesis_id IS NULL", + ) + .bind(user_id) + .bind(days) + .execute(pool) + .await?; + Ok(result.rows_affected()) +} +``` + +- [ ] **Step 4: Run tests + commit** + +```bash +cd backend && cargo test --lib && cargo build +git add backend/src/db/article_history.rs +git commit -m "feat: article history entry struct + insert/query/cleanup functions" +``` + +--- + +### Task 3: Update syntheses DB — save job_id + +**Files:** +- Modify: `backend/src/db/syntheses.rs` +- Modify: `backend/src/models/synthesis.rs` + +- [ ] **Step 1: Add `job_id` to Synthesis model** + +In `models/synthesis.rs`, add `pub job_id: Option<Uuid>` to the `Synthesis` struct. It's `Option` because old syntheses won't have it. 
+ +- [ ] **Step 2: Update `create` function to accept and save job_id** + +In `db/syntheses.rs`, change the `create` function signature to accept `job_id: Uuid`: + +```rust +pub async fn create( + pool: &PgPool, + user_id: Uuid, + week: &str, + sections_json: &serde_json::Value, + job_id: Uuid, +) -> Result<Synthesis, AppError> { + let row = sqlx::query_as::<_, Synthesis>( + r#" + INSERT INTO syntheses (user_id, week, sections, status, job_id) + VALUES ($1, $2, $3, 'completed', $4) + RETURNING id, user_id, week, sections, status, created_at, job_id + "#, + ) + .bind(user_id) + .bind(week) + .bind(sections_json) + .bind(job_id) + .fetch_one(pool) + .await?; + Ok(row) +} +``` + +Also update the `list_for_user` and `get_by_id` queries to include `job_id` in their column lists, so that rows still map onto the extended `Synthesis` struct. + +- [ ] **Step 3: Update caller in synthesis.rs** + +In `run_generation_inner`, the call to `db::syntheses::create(...)` needs to pass `job_id` (rename `_job_id` to `job_id` in the function signature). Search for `db::syntheses::create` and add `job_id` as the last argument. + +- [ ] **Step 4: Run tests + commit** + +```bash +cd backend && cargo test --lib && cargo build +git add backend/src/db/syntheses.rs backend/src/models/synthesis.rs backend/src/services/synthesis.rs +git commit -m "feat: save job_id on syntheses for provenance lookup" +``` + +--- + +### Task 4: Pipeline instrumentation — insert dropped articles at each filtering step + +**Files:** +- Modify: `backend/src/services/synthesis.rs` +- Modify: `backend/src/models/synthesis.rs` + +This is the largest task. At each filtering step in `run_generation_inner`, insert dropped articles into `article_history`. + +- [ ] **Step 1: Add `source_url` to ScrapedNewsItem** + +In `models/synthesis.rs`, add `pub source_url: Option<String>` to `ScrapedNewsItem` (after `scraped_content`). Add `#[serde(default)]` to make it optional during deserialization. 
+ +Update all places that construct `ScrapedNewsItem` in `synthesis.rs`: +- In `scrape_flat_urls` result handler — set `source_url: None` (will be enhanced later) +- In `scrape_articles` result handler — set `source_url: None` + +- [ ] **Step 2: Thread source_url through Phase 1** + +Change `candidate_urls` from `Vec<String>` to `Vec<(String, String)>` — `(article_url, source_page_url)`. Update the source scraping loop to pair each link with its source URL. Update `scrape_flat_urls` to accept `&[(String, String)]` and set `source_url` on each `ScrapedNewsItem`. + +- [ ] **Step 3: Add helper function to insert trace entries** + +Add a convenience function in `synthesis.rs` to reduce boilerplate: + +```rust +/// Insert a trace entry into article_history. +async fn trace_article( + pool: &sqlx::PgPool, + user_id: Uuid, + job_id: Uuid, + url: &str, + title: &str, + source_type: &str, + source_url: Option<&str>, + category: Option<&str>, + synthesis_id: Option<Uuid>, + status: &str, + scraped_ok: bool, +) { + let entry = db::article_history::ArticleHistoryEntry { + user_id, + url: url.to_string(), + url_hash: hash_article_url(url), + title: title.to_string(), + source_type: source_type.to_string(), + source_url: source_url.map(|s| s.to_string()), + category: category.map(|s| s.to_string()), + synthesis_id, + status: status.to_string(), + scraped_ok, + job_id, + }; + db::article_history::insert_entry(pool, &entry).await.ok(); +} +``` + +- [ ] **Step 4: Instrument Phase 1 filtering steps** + +At each Phase 1 filtering point, call `trace_article` for dropped articles. 
Key insertion points: + +After empty content filter: +```rust +for article in &scraped_articles { + if article.scraped_content.trim().is_empty() { + trace_article(&state.pool, user_id, job_id, &article.url, &article.title, + "personalized_source", article.source_url.as_deref(), None, None, + "filtered_empty", false).await; + } +} +``` + +After history filter (articles that matched existing history): +```rust +// Articles removed by history filter +for article in &valid_articles_before_filter { + if existing_hashes.contains(&hash_article_url(&article.url)) { + trace_article(&state.pool, user_id, job_id, &article.url, &article.title, + "personalized_source", article.source_url.as_deref(), None, None, + "filtered_history", true).await; + } +} +``` + +Similar patterns for source diversity drops, retry drops, etc. + +- [ ] **Step 5: Instrument Phase 2 filtering steps** + +Same pattern for Phase 2 with `source_type: "web_search"` and `source_url: None`. + +- [ ] **Step 6: Insert used articles after save** + +Replace the old `insert_urls` call with `trace_article` calls for each used article: + +```rust +if settings.article_history_days > 0 { + for section in &final_sections { + for item in &section.items { + trace_article(&state.pool, user_id, job_id, &item.url, &item.title, + "used", None, Some(&section.title), Some(synthesis.id), + "used", true).await; + } + } +} +``` + +- [ ] **Step 7: Run tests + commit** + +```bash +cd backend && cargo test --lib && cargo build +git add backend/src/services/synthesis.rs backend/src/models/synthesis.rs +git commit -m "feat: instrument pipeline with article tracing at every filtering step" +``` + +--- + +### Task 5: API endpoints — history listing + provenance + +**Files:** +- Create: `backend/src/handlers/article_history.rs` +- Modify: `backend/src/handlers/mod.rs` +- Modify: `backend/src/router.rs` + +- [ ] **Step 1: Create handler module** + +Create `backend/src/handlers/article_history.rs` with two handlers: + +```rust +//! 
Handlers for article history and provenance endpoints. + +use axum::extract::{Path, Query, State}; +use axum::http::StatusCode; +use axum::response::IntoResponse; +use axum::Json; +use serde::Deserialize; +use uuid::Uuid; + +use crate::app_state::AppState; +use crate::db; +use crate::errors::AppError; +use crate::middleware::auth::AuthUser; + +#[derive(Deserialize)] +pub struct HistoryQuery { + pub limit: Option<i64>, + pub offset: Option<i64>, + pub status: Option<String>, + pub source_type: Option<String>, +} + +/// GET /api/v1/article-history +pub async fn list_history( + auth_user: AuthUser, + State(state): State<AppState>, + Query(params): Query<HistoryQuery>, +) -> Result<impl IntoResponse, AppError> { + let limit = params.limit.unwrap_or(50).clamp(1, 200); + let offset = params.offset.unwrap_or(0).max(0); + + let items = db::article_history::list_history( + &state.pool, auth_user.id, limit, offset, + params.status.as_deref(), params.source_type.as_deref(), + ).await?; + + let total = db::article_history::count_history( + &state.pool, auth_user.id, + params.status.as_deref(), params.source_type.as_deref(), + ).await?; + + Ok(Json(serde_json::json!({ + "items": items, + "total": total + }))) +} + +/// GET /api/v1/syntheses/:id/provenance +pub async fn get_provenance( + auth_user: AuthUser, + State(state): State<AppState>, + Path(synthesis_id): Path<Uuid>, +) -> Result<impl IntoResponse, AppError> { + // Get the synthesis to find its job_id + let synthesis = db::syntheses::get_by_id_for_user(&state.pool, synthesis_id, auth_user.id) + .await? + .ok_or_else(|| AppError::NotFound("Synthesis not found".into()))?; + + let job_id = synthesis.job_id.ok_or_else(|| { + AppError::NotFound("No tracing data available for this synthesis".into()) + })?; + + let items = db::article_history::list_by_job_id(&state.pool, auth_user.id, job_id).await?; + + Ok(Json(items)) +} +``` + +- [ ] **Step 2: Register handler module and add routes** + +In `handlers/mod.rs`, add `pub mod article_history;`. 
+ +In `router.rs`, add routes in the authenticated section: +```rust +.route("/api/v1/article-history", get(handlers::article_history::list_history)) +.route("/api/v1/syntheses/:id/provenance", get(handlers::article_history::get_provenance)) +``` + +- [ ] **Step 3: Run tests + commit** + +```bash +cd backend && cargo test --lib && cargo build +git add backend/src/handlers/article_history.rs backend/src/handlers/mod.rs backend/src/router.rs +git commit -m "feat: API endpoints for article history listing and provenance" +``` + +--- + +### Task 6: Frontend — article history page + provenance section + +**Files:** +- Create: `frontend/src/pages/ArticleHistory.tsx` +- Create: `frontend/src/api/articleHistory.ts` +- Modify: `frontend/src/pages/SynthesisDetail.tsx` +- Modify: `frontend/src/App.tsx` +- Modify: `frontend/src/pages/Settings.tsx` +- Modify: `frontend/src/i18n/fr.ts` +- Modify: `frontend/src/types.ts` + +- [ ] **Step 1: Add types** + +In `types.ts`: +```typescript +export interface ArticleHistoryEntry { + id: string; + url: string; + title: string; + source_type: string; + source_url: string | null; + category: string | null; + synthesis_id: string | null; + status: string; + scraped_ok: boolean; + job_id: string; + created_at: string; +} + +export interface ArticleHistoryResponse { + items: ArticleHistoryEntry[]; + total: number; +} +``` + +- [ ] **Step 2: Add API client** + +Create `frontend/src/api/articleHistory.ts`: +```typescript +import { api } from './client'; +import type { ArticleHistoryResponse, ArticleHistoryEntry } from '~/types'; + +export const articleHistoryApi = { + list: (params: { limit?: number; offset?: number; status?: string; source_type?: string } = {}): Promise<ArticleHistoryResponse> => { + const query = new URLSearchParams(); + if (params.limit) query.set('limit', String(params.limit)); + if (params.offset) query.set('offset', String(params.offset)); + if (params.status) query.set('status', params.status); + if (params.source_type) query.set('source_type', 
params.source_type); + return api.get(`/article-history?${query.toString()}`); + }, + + getProvenance: (synthesisId: string): Promise<ArticleHistoryEntry[]> => + api.get(`/syntheses/${synthesisId}/provenance`), +}; +``` + +- [ ] **Step 3: Add i18n labels** + +In `fr.ts`, add labels for the history page and provenance section (article history title, column headers, status badges, filter labels, provenance section title, empty state messages). + +- [ ] **Step 4: Create ArticleHistory page** + +Create `frontend/src/pages/ArticleHistory.tsx` — a page with: +- Filter dropdowns for status and source_type +- Paginated table showing article history entries +- Color-coded status badges +- Clickable URLs and synthesis links + +- [ ] **Step 5: Add route and Settings link** + +In `App.tsx`, add route: `<Route path="/article-history" component={ArticleHistory} />` +In `Settings.tsx`, add a button/link to navigate to `/article-history`. + +- [ ] **Step 6: Add provenance section to SynthesisDetail** + +In `SynthesisDetail.tsx`, add a collapsible "Provenance" section at the bottom that calls `articleHistoryApi.getProvenance(id)` and displays the trace table. 
+ +- [ ] **Step 7: Run frontend tests + commit** + +```bash +cd frontend && npx tsc --noEmit && npx vitest run +git add frontend/src/types.ts frontend/src/api/articleHistory.ts frontend/src/pages/ArticleHistory.tsx frontend/src/pages/SynthesisDetail.tsx frontend/src/App.tsx frontend/src/pages/Settings.tsx frontend/src/i18n/fr.ts +git commit -m "feat: article history page + provenance section in synthesis detail" +``` + +--- + +### Task 7: E2E test — verify provenance after generation + +**Files:** +- Modify: `e2e/tests/generation-live.spec.ts` + +- [ ] **Step 1: Add provenance verification** + +After the synthesis is generated and validated, call the provenance endpoint and verify: + +```typescript + // Verify provenance data exists + const provResp = await apiCall(page, 'GET', `/api/v1/syntheses/${synthesisId}/provenance`); + expect(provResp.status).toBe(200); + const provenance = provResp.data; + expect(Array.isArray(provenance)).toBe(true); + expect(provenance.length).toBeGreaterThan(0); + + // At least some entries should be 'used' + const usedEntries = provenance.filter((e: any) => e.status === 'used'); + expect(usedEntries.length).toBeGreaterThan(0); + + // Every used entry should have a synthesis_id + for (const entry of usedEntries) { + expect(entry.synthesis_id).toBe(synthesisId); + expect(entry.job_id).toBeTruthy(); + } +``` + +- [ ] **Step 2: Run E2E + commit** + +```bash +cd e2e && docker compose -f docker-compose.test.yml down && docker compose -f docker-compose.test.yml up --build -d +sleep 25 && npx tsx seed.ts && npx playwright test generation-live --reporter=list +git add e2e/tests/generation-live.spec.ts +git commit -m "test: verify provenance endpoint returns tracing data after generation" +```