mod common; use ai_synth_backend::services::llm::mock::MockLlmProvider; use ai_synth_backend::services::synthesis; use std::sync::atomic::AtomicBool; use std::sync::Arc; use tokio::sync::watch; use wiremock::matchers::{method, path}; use wiremock::{Mock, MockServer, ResponseTemplate}; async fn setup_mock_server() -> MockServer { let server = MockServer::start().await; // Source page with links to articles (for Phase 1 heuristic extraction) let base = server.uri(); Mock::given(method("GET")) .and(path("/blog")) .respond_with(ResponseTemplate::new(200).set_body_string(format!( r#" Article One Article Two Article Three "# ))) .mount(&server) .await; // Article pages for i in 1..=5 { Mock::given(method("GET")) .and(path(format!("/article-{}", i))) .respond_with(ResponseTemplate::new(200).set_body_string(format!( r#" Test Article {i}

This is the content of test article {i} about artificial intelligence.

"# ))) .mount(&server) .await; } server } async fn setup_user_with_settings( app: &common::TestApp, categories: Vec<&str>, max_items: i32, ) -> (uuid::Uuid, String, uuid::Uuid) { let email = format!("pipeline-{}@test.com", uuid::Uuid::new_v4()); let (user_id, session) = app.create_authenticated_user(&email).await; let settings = serde_json::json!({ "max_articles_per_source": 10, "max_links_per_source": 8, "use_brave_search": false, "article_history_days": 90, "batch_size": 5, "source_extraction_window": 3, "search_agent_behavior": "", "ai_provider": "", "ai_model": "", "ai_model_websearch": "", "rate_limit_max_requests": null, "rate_limit_time_window_seconds": null }); let (status, _) = app.put_with_session("/api/v1/settings", &settings, &session).await; assert_eq!(status.as_u16(), 200, "Settings save should succeed"); // Create a theme for the pipeline let categories_json: Vec = categories.iter().map(|c| serde_json::json!(c)).collect(); let theme_body = serde_json::json!({ "name": "Test Theme", "theme": "Intelligence Artificielle", "categories": categories_json, "max_items_per_category": max_items, "max_age_days": 365, "summary_length": 3 }); let (theme_status, theme_resp) = app.post_with_session("/api/v1/themes", &theme_body, &session).await; assert_eq!(theme_status.as_u16(), 201, "Theme creation should succeed"); let theme_id: uuid::Uuid = theme_resp["id"].as_str().unwrap().parse().unwrap(); (user_id, session, theme_id) } fn make_progress_channel() -> (Arc>, watch::Receiver) { let (tx, rx) = watch::channel(synthesis::ProgressEvent::Progress { step: "init".into(), message: "Starting...".into(), percent: 0, }); (Arc::new(tx), rx) } #[tokio::test] async fn phase1_heuristic_extraction_classifies_articles() { let app = common::TestApp::new().await; let mock_server = setup_mock_server().await; let (user_id, session, theme_id) = setup_user_with_settings(&app, vec!["AI News"], 4).await; // Add a source pointing to wiremock (same host as article URLs) let source_url = format!("{}/blog", mock_server.uri()); let source = serde_json::json!({"title": "Test Source", "url": source_url, "theme_id": theme_id.to_string()}); let (status, _) = app.post_with_session("/api/v1/sources", &source, &session).await; assert!(status.is_success()); let mock_provider = MockLlmProvider::new() .with_default_category("AI News") .into_arc(); let job_id = uuid::Uuid::new_v4(); let (tx, _rx) = make_progress_channel(); let state = ai_synth_backend::app_state::AppState::new( app.config.clone(), app.pool.clone(), reqwest::Client::new(), ); let result = synthesis::run_generation_inner( job_id, &state, user_id, theme_id, &tx, Some(mock_provider), &AtomicBool::new(false), ).await; assert!(result.is_ok(), "Generation should succeed: {:?}", result.err()); let synthesis_id = result.unwrap(); // Verify synthesis was saved with articles let row: (serde_json::Value,) = sqlx::query_as( "SELECT sections FROM syntheses WHERE id = $1" ) .bind(synthesis_id) .fetch_one(&app.pool) .await .expect("Synthesis should exist"); let sections: Vec = serde_json::from_value(row.0).unwrap(); assert!(!sections.is_empty(), "Should have at least one section"); let first_section = §ions[0]; assert_eq!(first_section["title"], "AI News"); let items = first_section["items"].as_array().unwrap(); assert!(!items.is_empty(), "AI News section should have articles"); // Verify article history was recorded let history_count: (i64,) = sqlx::query_as( "SELECT COUNT(*) FROM article_history WHERE user_id = $1 AND job_id = $2" ) .bind(user_id) .bind(job_id) .fetch_one(&app.pool) .await .unwrap(); assert!(history_count.0 > 0, "Article history should have entries"); } #[tokio::test] async fn phase2_search_fills_gaps_when_no_sources() { let app = common::TestApp::new().await; let mock_server = setup_mock_server().await; // No sources -- Phase 1 produces nothing let (user_id, _session, theme_id) = setup_user_with_settings(&app, vec!["AI News"], 2).await; let mock_provider = MockLlmProvider::new() .with_default_category("AI News") .with_search_urls(vec![ format!("{}/article-1", mock_server.uri()), format!("{}/article-2", mock_server.uri()), ]) .into_arc(); let job_id = uuid::Uuid::new_v4(); let (tx, _rx) = make_progress_channel(); let state = ai_synth_backend::app_state::AppState::new( app.config.clone(), app.pool.clone(), reqwest::Client::new(), ); let result = synthesis::run_generation_inner( job_id, &state, user_id, theme_id, &tx, Some(mock_provider), &AtomicBool::new(false), ).await; assert!(result.is_ok(), "Generation should succeed: {:?}", result.err()); // Verify synthesis has articles from Phase 2 let synthesis_id = result.unwrap(); let row: (serde_json::Value,) = sqlx::query_as( "SELECT sections FROM syntheses WHERE id = $1" ) .bind(synthesis_id) .fetch_one(&app.pool) .await .unwrap(); let sections: Vec = serde_json::from_value(row.0).unwrap(); assert!(!sections.is_empty(), "Should have sections from Phase 2 search"); } #[tokio::test] async fn category_overflow_spills_to_autre() { let app = common::TestApp::new().await; let mock_server = setup_mock_server().await; // max_items_per_category=1, but LLM classifies all articles to "AI News" let (user_id, session, theme_id) = setup_user_with_settings(&app, vec!["AI News"], 1).await; let source_url = format!("{}/blog", mock_server.uri()); let source = serde_json::json!({"title": "Test Source", "url": source_url, "theme_id": theme_id.to_string()}); app.post_with_session("/api/v1/sources", &source, &session).await; let mock_provider = MockLlmProvider::new() .with_default_category("AI News") .into_arc(); let job_id = uuid::Uuid::new_v4(); let (tx, _rx) = make_progress_channel(); let state = ai_synth_backend::app_state::AppState::new( app.config.clone(), app.pool.clone(), reqwest::Client::new(), ); let result = synthesis::run_generation_inner( job_id, &state, user_id, theme_id, &tx, Some(mock_provider), &AtomicBool::new(false), ).await; assert!(result.is_ok(), "Generation should succeed"); let synthesis_id = result.unwrap(); let row: (serde_json::Value,) = sqlx::query_as( "SELECT sections FROM syntheses WHERE id = $1" ) .bind(synthesis_id) .fetch_one(&app.pool) .await .unwrap(); let sections: Vec = serde_json::from_value(row.0).unwrap(); // With max_items_per_category=1 and 3 articles all classified as "AI News": // - 1 goes to AI News // - Overflow goes to Autre let ai_section = sections.iter().find(|s| s["title"] == "AI News"); let divers_section = sections.iter().find(|s| s["title"] == "Divers"); assert!(ai_section.is_some(), "Should have AI News section"); let ai_items = ai_section.unwrap()["items"].as_array().unwrap(); assert_eq!(ai_items.len(), 1, "AI News should have exactly 1 item (max)"); if sections.len() > 1 { assert!(divers_section.is_some(), "Overflow should create Divers section"); } } // ── GAP-03: Brave Search pipeline path ──────────────────────────────── // // The Brave Search code path (`use_brave_search: true`) cannot be tested in // integration tests without a real Brave API key. The pipeline calls // `resolve_brave_key` which decrypts a key stored via `user_api_keys`, so we // would need working AES-256-GCM encryption round-tripping in the test harness. // The LLM-based web search fallback (Phase 2 without Brave) is already covered // by `phase2_search_fills_gaps_when_no_sources`. // ── GAP-05: Source diversity cap ────────────────────────────────────── #[tokio::test] async fn source_diversity_limits_articles_per_source() { let app = common::TestApp::new().await; let mock_server = setup_mock_server().await; // Create user with default settings first, then override max_articles_per_source let email = format!("diversity-{}@test.com", uuid::Uuid::new_v4()); let (user_id, session) = app.create_authenticated_user(&email).await; // Create theme let theme_body = serde_json::json!({ "name": "Diversity Theme", "theme": "Intelligence Artificielle", "categories": ["AI News"], "max_items_per_category": 10, "max_age_days": 365, "summary_length": 1 }); let (theme_status, theme_resp) = app .post_with_session("/api/v1/themes", &theme_body, &session) .await; assert_eq!(theme_status.as_u16(), 201, "Theme creation should succeed"); let theme_id: uuid::Uuid = theme_resp["id"].as_str().unwrap().parse().unwrap(); // Update settings: max_articles_per_source = 1 so only 1 article from // the mock server domain is accepted (the source page has 3 links). let settings = serde_json::json!({ "max_articles_per_source": 1, "max_links_per_source": 8, "use_brave_search": false, "article_history_days": 0, "batch_size": 1, "source_extraction_window": 3, "search_agent_behavior": "", "ai_provider": "", "ai_model": "", "ai_model_websearch": "", "rate_limit_max_requests": null, "rate_limit_time_window_seconds": null }); let (settings_status, _) = app .put_with_session("/api/v1/settings", &settings, &session) .await; assert_eq!(settings_status.as_u16(), 200, "Settings save should succeed"); // Add source pointing to mock server let source_url = format!("{}/blog", mock_server.uri()); let source = serde_json::json!({ "title": "Diversity Source", "url": source_url, "theme_id": theme_id.to_string() }); let (source_status, _) = app .post_with_session("/api/v1/sources", &source, &session) .await; assert!(source_status.is_success(), "Source creation should succeed"); // Run pipeline let mock_provider = MockLlmProvider::new() .with_default_category("AI News") .into_arc(); let job_id = uuid::Uuid::new_v4(); let (tx, _rx) = make_progress_channel(); let state = ai_synth_backend::app_state::AppState::new( app.config.clone(), app.pool.clone(), reqwest::Client::new(), ); let result = synthesis::run_generation_inner( job_id, &state, user_id, theme_id, &tx, Some(mock_provider), &AtomicBool::new(false), ) .await; assert!(result.is_ok(), "Generation should succeed: {:?}", result.err()); // Verify that some articles were filtered by source diversity let diversity_count: (i64,) = sqlx::query_as( "SELECT COUNT(*) FROM article_history WHERE user_id = $1 AND job_id = $2 AND status = 'filtered_diversity'" ) .bind(user_id) .bind(job_id) .fetch_one(&app.pool) .await .unwrap(); // The mock source page has 3 article links, all from the same domain. // With max_articles_per_source=1, at least 2 should be filtered. assert!( diversity_count.0 > 0, "Should have diversity-filtered articles (got 0)" ); // Verify the synthesis only contains 1 article (the cap) let synthesis_id = result.unwrap(); let row: (serde_json::Value,) = sqlx::query_as("SELECT sections FROM syntheses WHERE id = $1") .bind(synthesis_id) .fetch_one(&app.pool) .await .unwrap(); let sections: Vec = serde_json::from_value(row.0).unwrap(); let total_items: usize = sections .iter() .filter_map(|s| s["items"].as_array()) .map(|items| items.len()) .sum(); assert_eq!( total_items, 1, "With max_articles_per_source=1, only 1 article should appear in the synthesis" ); } // ── GAP-07: Article history dedup across syntheses ──────────────────── #[tokio::test] async fn article_history_dedup_prevents_repeat_articles() { let app = common::TestApp::new().await; let mock_server = setup_mock_server().await; let email = format!("dedup-{}@test.com", uuid::Uuid::new_v4()); let (user_id, session) = app.create_authenticated_user(&email).await; // Create theme let theme_body = serde_json::json!({ "name": "Dedup Theme", "theme": "Intelligence Artificielle", "categories": ["AI News"], "max_items_per_category": 10, "max_age_days": 365, "summary_length": 1 }); let (theme_status, theme_resp) = app .post_with_session("/api/v1/themes", &theme_body, &session) .await; assert_eq!(theme_status.as_u16(), 201, "Theme creation should succeed"); let theme_id: uuid::Uuid = theme_resp["id"].as_str().unwrap().parse().unwrap(); // Settings with article_history_days > 0 to enable dedup let settings = serde_json::json!({ "max_articles_per_source": 10, "max_links_per_source": 8, "use_brave_search": false, "article_history_days": 90, "batch_size": 5, "source_extraction_window": 3, "search_agent_behavior": "", "ai_provider": "", "ai_model": "", "ai_model_websearch": "", "rate_limit_max_requests": null, "rate_limit_time_window_seconds": null }); let (settings_status, _) = app .put_with_session("/api/v1/settings", &settings, &session) .await; assert_eq!(settings_status.as_u16(), 200, "Settings save should succeed"); // Add source let source_url = format!("{}/blog", mock_server.uri()); let source = serde_json::json!({ "title": "Dedup Source", "url": source_url, "theme_id": theme_id.to_string() }); let (source_status, _) = app .post_with_session("/api/v1/sources", &source, &session) .await; assert!(source_status.is_success(), "Source creation should succeed"); // ── First generation ────────────────────────────────────────────── let mock1 = MockLlmProvider::new() .with_default_category("AI News") .into_arc(); let job1 = uuid::Uuid::new_v4(); let (tx1, _rx1) = make_progress_channel(); let state1 = ai_synth_backend::app_state::AppState::new( app.config.clone(), app.pool.clone(), reqwest::Client::new(), ); let result1 = synthesis::run_generation_inner( job1, &state1, user_id, theme_id, &tx1, Some(mock1), &AtomicBool::new(false), ) .await; assert!(result1.is_ok(), "First generation should succeed: {:?}", result1.err()); // Verify first run produced articles let used_count_1: (i64,) = sqlx::query_as( "SELECT COUNT(*) FROM article_history WHERE user_id = $1 AND job_id = $2 AND status = 'used'" ) .bind(user_id) .bind(job1) .fetch_one(&app.pool) .await .unwrap(); assert!(used_count_1.0 > 0, "First run should produce used articles"); // ── Second generation — same source, same articles ──────────────── let mock2 = MockLlmProvider::new() .with_default_category("AI News") .into_arc(); let job2 = uuid::Uuid::new_v4(); let (tx2, _rx2) = make_progress_channel(); let state2 = ai_synth_backend::app_state::AppState::new( app.config.clone(), app.pool.clone(), reqwest::Client::new(), ); // The second run scrapes the same URLs, which are already in article_history. // They should be filtered out as "filtered_history". let _result2 = synthesis::run_generation_inner( job2, &state2, user_id, theme_id, &tx2, Some(mock2), &AtomicBool::new(false), ) .await; // The second run may succeed (empty synthesis) or fail (no valid articles). // Either way, history-dedup entries must exist. let dedup_count: (i64,) = sqlx::query_as( "SELECT COUNT(*) FROM article_history WHERE user_id = $1 AND job_id = $2 AND status = 'filtered_history'" ) .bind(user_id) .bind(job2) .fetch_one(&app.pool) .await .unwrap(); assert!( dedup_count.0 > 0, "Second run should have history-deduped articles (got 0)" ); } // ── GAP-08: Preferred source ordering ───────────────────────────────── // // The pipeline places preferred sources before non-preferred ones in the // processing order (`ordered_sources = [preferred, non_preferred].concat()`). // Within each wave, articles from preferred source URLs are also placed first // before being shuffled independently within their group (preferred_urls and // other_urls are each shuffled separately). // // Deterministic ordering at the individual article level cannot be guaranteed // because articles within the preferred group are randomly shuffled. // // What we can verify deterministically: // - With a preferred source and a non-preferred source, both contribute // articles to the synthesis (i.e., preferred ordering does not prevent // non-preferred sources from being processed). // - The article_history table records entries from both sources. // // A test that tries to assert "article from preferred source appears before // article from non-preferred source in the synthesis" would be flaky due to // the intentional shuffle within each group. #[tokio::test] async fn preferred_sources_processed_first() { let app = common::TestApp::new().await; // Set up two source pages on the mock server: // /blog-a (preferred) with /article-pref // /blog-b (non-preferred) with /article-norm let server = wiremock::MockServer::start().await; let base = server.uri(); // Preferred source page Mock::given(method("GET")) .and(path("/blog-a")) .respond_with(ResponseTemplate::new(200).set_body_string(format!( r#"Preferred Article"# ))) .mount(&server) .await; // Non-preferred source page Mock::given(method("GET")) .and(path("/blog-b")) .respond_with(ResponseTemplate::new(200).set_body_string(format!( r#"Normal Article"# ))) .mount(&server) .await; // Article pages Mock::given(method("GET")) .and(path("/article-pref")) .respond_with(ResponseTemplate::new(200).set_body_string( r#"Preferred Article

This is a preferred article about artificial intelligence research.

"# .to_string(), )) .mount(&server) .await; Mock::given(method("GET")) .and(path("/article-norm")) .respond_with(ResponseTemplate::new(200).set_body_string( r#"Normal Article

This is a normal article about machine learning and AI systems.

"# .to_string(), )) .mount(&server) .await; let email = format!("preferred-order-{}@test.com", uuid::Uuid::new_v4()); let (user_id, session) = app.create_authenticated_user(&email).await; // Create theme let theme_body = serde_json::json!({ "name": "Preferred Test Theme", "theme": "Intelligence Artificielle", "categories": ["AI News"], "max_items_per_category": 10, "max_age_days": 365, "summary_length": 1 }); let (theme_status, theme_resp) = app .post_with_session("/api/v1/themes", &theme_body, &session) .await; assert_eq!(theme_status.as_u16(), 201, "Theme creation should succeed"); let theme_id: uuid::Uuid = theme_resp["id"].as_str().unwrap().parse().unwrap(); // Settings: batch_size=1, source_extraction_window=10 so both sources // are processed in a single wave, article_history_days=90 to enable tracing let settings = serde_json::json!({ "max_articles_per_source": 10, "max_links_per_source": 8, "use_brave_search": false, "article_history_days": 90, "batch_size": 1, "source_extraction_window": 10, "search_agent_behavior": "", "ai_provider": "", "ai_model": "", "ai_model_websearch": "", "rate_limit_max_requests": null, "rate_limit_time_window_seconds": null }); let (settings_status, _) = app .put_with_session("/api/v1/settings", &settings, &session) .await; assert_eq!(settings_status.as_u16(), 200, "Settings save should succeed"); // Create source A (will be marked preferred) let source_a_body = serde_json::json!({ "title": "Source A (preferred)", "url": format!("{}/blog-a", base), "theme_id": theme_id.to_string() }); let (status_a, resp_a) = app .post_with_session("/api/v1/sources", &source_a_body, &session) .await; assert!(status_a.is_success(), "Source A creation should succeed"); let source_a_id = resp_a["id"].as_str().expect("Source A should have an id"); // Create source B (non-preferred) let source_b_body = serde_json::json!({ "title": "Source B (normal)", "url": format!("{}/blog-b", base), "theme_id": theme_id.to_string() }); let (status_b, _) = app .post_with_session("/api/v1/sources", &source_b_body, &session) .await; assert!(status_b.is_success(), "Source B creation should succeed"); // Mark source A as preferred let pref_body = serde_json::json!({ "source_ids": [source_a_id], "theme_id": theme_id }); let (pref_status, _) = app .put_with_session("/api/v1/sources/preferred", &pref_body, &session) .await; assert_eq!(pref_status.as_u16(), 200, "Setting preferred sources should succeed"); // Run the pipeline let mock_provider = MockLlmProvider::new() .with_default_category("AI News") .into_arc(); let job_id = uuid::Uuid::new_v4(); let (tx, _rx) = make_progress_channel(); let state = ai_synth_backend::app_state::AppState::new( app.config.clone(), app.pool.clone(), reqwest::Client::new(), ); let result = synthesis::run_generation_inner( job_id, &state, user_id, theme_id, &tx, Some(mock_provider), &AtomicBool::new(false), ) .await; assert!(result.is_ok(), "Generation should succeed: {:?}", result.err()); // Verify both sources contributed articles to article_history. // This confirms the preferred-first ordering does not prevent non-preferred // sources from being processed. Asserting the exact order within the // synthesis is not done here because articles within each group are // randomly shuffled by the pipeline. let used_count: (i64,) = sqlx::query_as( "SELECT COUNT(*) FROM article_history WHERE user_id = $1 AND job_id = $2 AND status = 'used'", ) .bind(user_id) .bind(job_id) .fetch_one(&app.pool) .await .unwrap(); assert!( used_count.0 >= 2, "Both preferred and non-preferred source articles should appear in history (got {})", used_count.0 ); // Verify the synthesis has articles from both sources let synthesis_id = result.unwrap(); let row: (serde_json::Value,) = sqlx::query_as("SELECT sections FROM syntheses WHERE id = $1") .bind(synthesis_id) .fetch_one(&app.pool) .await .unwrap(); let sections: Vec = serde_json::from_value(row.0).unwrap(); let total_items: usize = sections .iter() .filter_map(|s| s["items"].as_array()) .map(|items| items.len()) .sum(); assert!( total_items >= 2, "Synthesis should contain articles from both sources (got {})", total_items ); }