From da602d850d738ba0917f48a35e40ecf456e8c00c Mon Sep 17 00:00:00 2001 From: oabrivard Date: Fri, 27 Mar 2026 08:41:19 +0100 Subject: [PATCH] test: add pipeline tests for source diversity and history dedup (GAP-05, GAP-07) Co-Authored-By: Claude Opus 4.6 (1M context) --- backend/tests/pipeline_test.rs | 249 +++++++++++++++++++++++++++++++++ 1 file changed, 249 insertions(+) diff --git a/backend/tests/pipeline_test.rs b/backend/tests/pipeline_test.rs index 135d3ef..a620ad8 100644 --- a/backend/tests/pipeline_test.rs +++ b/backend/tests/pipeline_test.rs @@ -250,3 +250,252 @@ async fn category_overflow_spills_to_autre() { assert!(autre_section.is_some(), "Overflow should create Autre section"); } } + +// ── GAP-03: Brave Search pipeline path ──────────────────────────────── +// +// The Brave Search code path (`use_brave_search: true`) cannot be tested in +// integration tests without a real Brave API key. The pipeline calls +// `resolve_brave_key` which decrypts a key stored via `user_api_keys`, so we +// would need working AES-256-GCM encryption round-tripping in the test harness. +// The LLM-based web search fallback (Phase 2 without Brave) is already covered +// by `phase2_search_fills_gaps_when_no_sources`. + +// ── GAP-05: Source diversity cap ────────────────────────────────────── + +#[tokio::test] +async fn source_diversity_limits_articles_per_source() { + let app = common::TestApp::new().await; + let mock_server = setup_mock_server().await; + + // Create user with default settings first, then override max_articles_per_source + let email = format!("diversity-{}@test.com", uuid::Uuid::new_v4()); + let (user_id, session) = app.create_authenticated_user(&email).await; + + // Create theme + let theme_body = serde_json::json!({ + "name": "Diversity Theme", + "theme": "Intelligence Artificielle", + "categories": ["AI News"], + "max_items_per_category": 10, + "max_age_days": 365, + "summary_length": 1 + }); + let (theme_status, theme_resp) = app + .post_with_session("/api/v1/themes", &theme_body, &session) + .await; + assert_eq!(theme_status.as_u16(), 201, "Theme creation should succeed"); + let theme_id: uuid::Uuid = theme_resp["id"].as_str().unwrap().parse().unwrap(); + + // Update settings: max_articles_per_source = 1 so only 1 article from + // the mock server domain is accepted (the source page has 3 links). + let settings = serde_json::json!({ + "max_articles_per_source": 1, + "max_links_per_source": 8, + "use_brave_search": false, + "article_history_days": 0, + "batch_size": 5, + "source_extraction_window": 3, + "search_agent_behavior": "", + "ai_provider": "", + "ai_model": "", + "ai_model_websearch": "", + "rate_limit_max_requests": null, + "rate_limit_time_window_seconds": null + }); + let (settings_status, _) = app + .put_with_session("/api/v1/settings", &settings, &session) + .await; + assert_eq!(settings_status.as_u16(), 200, "Settings save should succeed"); + + // Add source pointing to mock server + let source_url = format!("{}/blog", mock_server.uri()); + let source = serde_json::json!({ + "title": "Diversity Source", + "url": source_url, + "theme_id": theme_id.to_string() + }); + let (source_status, _) = app + .post_with_session("/api/v1/sources", &source, &session) + .await; + assert!(source_status.is_success(), "Source creation should succeed"); + + // Run pipeline + let mock_provider = MockLlmProvider::new() + .with_default_category("AI News") + .into_arc(); + + let job_id = uuid::Uuid::new_v4(); + let (tx, _rx) = make_progress_channel(); + + let state = ai_synth_backend::app_state::AppState::new( + app.config.clone(), + app.pool.clone(), + reqwest::Client::new(), + ); + + let result = synthesis::run_generation_inner( + job_id, &state, user_id, theme_id, &tx, Some(mock_provider), + ) + .await; + + assert!(result.is_ok(), "Generation should succeed: {:?}", result.err()); + + // Verify that some articles were filtered by source diversity + let diversity_count: (i64,) = sqlx::query_as( + "SELECT COUNT(*) FROM article_history WHERE user_id = $1 AND job_id = $2 AND status = 'filtered_diversity'" + ) + .bind(user_id) + .bind(job_id) + .fetch_one(&app.pool) + .await + .unwrap(); + + // The mock source page has 3 article links, all from the same domain. + // With max_articles_per_source=1, at least 2 should be filtered. + assert!( + diversity_count.0 > 0, + "Should have diversity-filtered articles (got 0)" + ); + + // Verify the synthesis only contains 1 article (the cap) + let synthesis_id = result.unwrap(); + let row: (serde_json::Value,) = + sqlx::query_as("SELECT sections FROM syntheses WHERE id = $1") + .bind(synthesis_id) + .fetch_one(&app.pool) + .await + .unwrap(); + + let sections: Vec = serde_json::from_value(row.0).unwrap(); + let total_items: usize = sections + .iter() + .filter_map(|s| s["items"].as_array()) + .map(|items| items.len()) + .sum(); + assert_eq!( + total_items, 1, + "With max_articles_per_source=1, only 1 article should appear in the synthesis" + ); +} + +// ── GAP-07: Article history dedup across syntheses ──────────────────── + +#[tokio::test] +async fn article_history_dedup_prevents_repeat_articles() { + let app = common::TestApp::new().await; + let mock_server = setup_mock_server().await; + + let email = format!("dedup-{}@test.com", uuid::Uuid::new_v4()); + let (user_id, session) = app.create_authenticated_user(&email).await; + + // Create theme + let theme_body = serde_json::json!({ + "name": "Dedup Theme", + "theme": "Intelligence Artificielle", + "categories": ["AI News"], + "max_items_per_category": 10, + "max_age_days": 365, + "summary_length": 1 + }); + let (theme_status, theme_resp) = app + .post_with_session("/api/v1/themes", &theme_body, &session) + .await; + assert_eq!(theme_status.as_u16(), 201, "Theme creation should succeed"); + let theme_id: uuid::Uuid = theme_resp["id"].as_str().unwrap().parse().unwrap(); + + // Settings with article_history_days > 0 to enable dedup + let settings = serde_json::json!({ + "max_articles_per_source": 10, + "max_links_per_source": 8, + "use_brave_search": false, + "article_history_days": 90, + "batch_size": 5, + "source_extraction_window": 3, + "search_agent_behavior": "", + "ai_provider": "", + "ai_model": "", + "ai_model_websearch": "", + "rate_limit_max_requests": null, + "rate_limit_time_window_seconds": null + }); + let (settings_status, _) = app + .put_with_session("/api/v1/settings", &settings, &session) + .await; + assert_eq!(settings_status.as_u16(), 200, "Settings save should succeed"); + + // Add source + let source_url = format!("{}/blog", mock_server.uri()); + let source = serde_json::json!({ + "title": "Dedup Source", + "url": source_url, + "theme_id": theme_id.to_string() + }); + let (source_status, _) = app + .post_with_session("/api/v1/sources", &source, &session) + .await; + assert!(source_status.is_success(), "Source creation should succeed"); + + // ── First generation ────────────────────────────────────────────── + let mock1 = MockLlmProvider::new() + .with_default_category("AI News") + .into_arc(); + let job1 = uuid::Uuid::new_v4(); + let (tx1, _rx1) = make_progress_channel(); + let state1 = ai_synth_backend::app_state::AppState::new( + app.config.clone(), + app.pool.clone(), + reqwest::Client::new(), + ); + + let result1 = synthesis::run_generation_inner( + job1, &state1, user_id, theme_id, &tx1, Some(mock1), + ) + .await; + assert!(result1.is_ok(), "First generation should succeed: {:?}", result1.err()); + + // Verify first run produced articles + let used_count_1: (i64,) = sqlx::query_as( + "SELECT COUNT(*) FROM article_history WHERE user_id = $1 AND job_id = $2 AND status = 'used'" + ) + .bind(user_id) + .bind(job1) + .fetch_one(&app.pool) + .await + .unwrap(); + assert!(used_count_1.0 > 0, "First run should produce used articles"); + + // ── Second generation — same source, same articles ──────────────── + let mock2 = MockLlmProvider::new() + .with_default_category("AI News") + .into_arc(); + let job2 = uuid::Uuid::new_v4(); + let (tx2, _rx2) = make_progress_channel(); + let state2 = ai_synth_backend::app_state::AppState::new( + app.config.clone(), + app.pool.clone(), + reqwest::Client::new(), + ); + + // The second run scrapes the same URLs, which are already in article_history. + // They should be filtered out as "filtered_history". + let _result2 = synthesis::run_generation_inner( + job2, &state2, user_id, theme_id, &tx2, Some(mock2), + ) + .await; + // The second run may succeed (empty synthesis) or fail (no valid articles). + // Either way, history-dedup entries must exist. + + let dedup_count: (i64,) = sqlx::query_as( + "SELECT COUNT(*) FROM article_history WHERE user_id = $1 AND job_id = $2 AND status = 'filtered_history'" + ) + .bind(user_id) + .bind(job2) + .fetch_one(&app.pool) + .await + .unwrap(); + + assert!( + dedup_count.0 > 0, + "Second run should have history-deduped articles (got 0)" + ); +}