diff --git a/backend/src/db/article_history.rs b/backend/src/db/article_history.rs index 80bc29c..ba8ff55 100644 --- a/backend/src/db/article_history.rs +++ b/backend/src/db/article_history.rs @@ -201,6 +201,15 @@ pub async fn get_last_source_url( Ok(result) } +/// Delete ALL article history entries for a user (no `synthesis_id` filter) and return the number of rows removed. +pub async fn delete_all_for_user(pool: &PgPool, user_id: Uuid) -> Result { + let result = sqlx::query("DELETE FROM article_history WHERE user_id = $1") + .bind(user_id) + .execute(pool) + .await?; + Ok(result.rows_affected()) +} + /// Delete history entries older than N days for this user. /// /// Only removes entries where synthesis_id IS NULL (dropped articles). diff --git a/backend/src/handlers/article_history.rs b/backend/src/handlers/article_history.rs index b64e07b..23d578d 100644 --- a/backend/src/handlers/article_history.rs +++ b/backend/src/handlers/article_history.rs @@ -54,6 +54,18 @@ pub async fn list_history( }))) } +/// DELETE /api/v1/article-history +/// +/// Deletes ALL article history entries for the authenticated user, logs the count, and responds with JSON `{ "deleted": n }` where n is the number of rows removed. 
+pub async fn clear_history( + auth_user: AuthUser, + State(state): State, +) -> Result { + let deleted = db::article_history::delete_all_for_user(&state.pool, auth_user.id).await?; + tracing::info!(user_id = %auth_user.id, deleted = deleted, "Cleared article history"); + Ok(Json(serde_json::json!({ "deleted": deleted }))) +} + /// GET /api/v1/syntheses/:id/provenance /// /// Returns all article history entries for the generation run diff --git a/backend/src/router.rs b/backend/src/router.rs index 589cf8b..81b2b12 100644 --- a/backend/src/router.rs +++ b/backend/src/router.rs @@ -55,7 +55,7 @@ pub fn build_router(state: AppState, config: &AppConfig) -> Router { .route("/syntheses/generate", post(handlers::generation::trigger_generate)) .route("/syntheses/generate/{job_id}/progress", get(handlers::generation::progress_stream)) // Article history & provenance routes (authenticated) - .route("/article-history", get(handlers::article_history::list_history)) + .route("/article-history", get(handlers::article_history::list_history).delete(handlers::article_history::clear_history)) .route("/syntheses/{id}/provenance", get(handlers::article_history::get_provenance)) // LLM call log routes (authenticated) .route("/llm-logs/{job_id}", get(handlers::llm_logs::get_logs)) diff --git a/backend/src/services/synthesis.rs b/backend/src/services/synthesis.rs index 52d295b..202d7f0 100644 --- a/backend/src/services/synthesis.rs +++ b/backend/src/services/synthesis.rs @@ -281,32 +281,79 @@ async fn run_generation_inner( let last_source = db::article_history::get_last_source_url(&state.pool, user_id).await.unwrap_or(None); let rotated_sources = rotate_sources(sources.clone(), last_source.as_deref()); - let max_sources = rotated_sources.len().min(10); - let max_links = 10usize; - - // 1a. 
Extract article links + filter against history - let mut candidate_urls: Vec<(String, String)> = Vec::new(); // (article_url, source_url) - - for source in rotated_sources.iter().take(max_sources) { - let links = if settings.use_llm_for_source_links { - source_scraper::extract_article_links_with_llm( - &state.http_client, &source.url, max_links, &provider, &model_research, - ).await - } else { - source_scraper::extract_article_links( - &state.http_client, &source.url, max_links, - ).await - }; + let max_links = 15usize; + + // 1a. Extract article links from source pages (parallel, max 5 concurrent) + let mut candidate_urls: Vec<(String, String)> = Vec::new(); + { + let mut join_set = tokio::task::JoinSet::new(); + let mut pending = rotated_sources.iter().peekable(); + let max_concurrent = 5; + + // Seed initial tasks + for _ in 0..max_concurrent { + if let Some(source) = pending.next() { + let client = state.http_client.clone(); + let source_url = source.url.clone(); + let source_title = source.title.clone(); + let use_llm = settings.use_llm_for_source_links; + let provider_clone = std::sync::Arc::clone(&provider); + let model = model_research.clone(); + let max_l = max_links; + join_set.spawn(async move { + let links = if use_llm { + source_scraper::extract_article_links_with_llm( + &client, &source_url, max_l, &provider_clone, &model, + ).await + } else { + source_scraper::extract_article_links( + &client, &source_url, max_l, + ).await + }; + (source_url, source_title, links) + }); + } + } - if let Ok(links) = links { - tracing::info!(source = %source.title, links = links.len(), "Extracted links from source"); - for link in links { - if seen_urls.insert(link.to_lowercase()) { - candidate_urls.push((link, source.url.clone())); + while let Some(join_result) = join_set.join_next().await { + if let Ok((source_url, source_title, links_result)) = join_result { + match links_result { + Ok(links) => { + tracing::info!(source = %source_title, links = links.len(), 
"Extracted links from source"); + for link in links { + if seen_urls.insert(link.to_lowercase()) { + candidate_urls.push((link, source_url.clone())); + } + } + } + Err(e) => { + tracing::warn!(source = %source_title, error = %e, "Failed to extract links"); + } } } - } else if let Err(e) = links { - tracing::warn!(source = %source.title, error = %e, "Failed to extract links"); + + // Spawn next task + if let Some(source) = pending.next() { + let client = state.http_client.clone(); + let source_url = source.url.clone(); + let source_title = source.title.clone(); + let use_llm = settings.use_llm_for_source_links; + let provider_clone = std::sync::Arc::clone(&provider); + let model = model_research.clone(); + let max_l = max_links; + join_set.spawn(async move { + let links = if use_llm { + source_scraper::extract_article_links_with_llm( + &client, &source_url, max_l, &provider_clone, &model, + ).await + } else { + source_scraper::extract_article_links( + &client, &source_url, max_l, + ).await + }; + (source_url, source_title, links) + }); + } } } @@ -324,6 +371,10 @@ async fn run_generation_inner( } } + // Shuffle candidates to interleave articles from different sources + use rand::seq::SliceRandom; + candidate_urls.shuffle(&mut rand::thread_rng()); + // Track url -> source for (url, source_url) in &candidate_urls { url_source.insert(url.clone(), source_url.clone());