From e056ef9d3e1144ed63dfa8a893fb1bea23fccee9 Mon Sep 17 00:00:00 2001 From: oabrivard Date: Thu, 26 Mar 2026 01:14:35 +0100 Subject: [PATCH] refactor: extract assign_category and filter_phase2_url helpers from synthesis pipeline Co-Authored-By: Claude Opus 4.6 (1M context) --- backend/src/services/synthesis.rs | 214 ++++++++++++++---------------- 1 file changed, 96 insertions(+), 118 deletions(-) diff --git a/backend/src/services/synthesis.rs b/backend/src/services/synthesis.rs index 5d500ec..16a426b 100644 --- a/backend/src/services/synthesis.rs +++ b/backend/src/services/synthesis.rs @@ -486,31 +486,11 @@ async fn run_generation_inner( } }; - let llm_title = class_response.get("title").and_then(|t| t.as_str()).unwrap_or(&page_title).to_string(); - let llm_summary = class_response.get("summary").and_then(|s| s.as_str()).unwrap_or("").to_string(); - let mut llm_category = class_response.get("category").and_then(|c| c.as_str()).unwrap_or("Autre").to_string(); - - if !classification_categories.iter().any(|c| c.to_lowercase() == llm_category.to_lowercase()) { - llm_category = "Autre".to_string(); - } - - let cat_key = if llm_category.to_lowercase() == "autre" { - "category_autre".to_string() - } else { - user_categories.iter().position(|c| c.to_lowercase() == llm_category.to_lowercase()) - .map(|i| format!("category_{}", i)) - .unwrap_or_else(|| "category_autre".to_string()) - }; - - let cat_filled = filled_counts.get(&llm_category).copied().unwrap_or(0); - let (final_cat_key, final_cat_name) = if cat_filled >= settings.max_items_per_category as usize && llm_category.to_lowercase() != "autre" { - let autre_filled = filled_counts.get("Autre").copied().unwrap_or(0); - if autre_filled >= settings.max_items_per_category as usize { - continue; - } - ("category_autre".to_string(), "Autre".to_string()) - } else { - (cat_key, llm_category) + let Some((final_cat_key, final_cat_name, llm_title, llm_summary)) = assign_category( + &class_response, &page_title, &user_categories, 
&classification_categories, + &filled_counts, settings.max_items_per_category as usize, + ) else { + continue; }; article_scraped.entry(final_cat_key).or_default().push(NewsItem { @@ -558,43 +538,15 @@ async fn run_generation_inner( // Filter Brave results let mut brave_urls: Vec = Vec::new(); for result in &brave_results { - let url_lower = result.url.to_lowercase(); - - // Homepage filter - if let Ok(parsed_url) = url::Url::parse(&result.url) { - let path = parsed_url.path(); - if path.is_empty() || path == "/" { - trace_article(&state.pool, user_id, job_id, &result.url, &result.title, "brave_search", None, None, None, "filtered_homepage", false).await; - continue; - } - } - - // Cross-phase dedup - if seen_urls.contains(&url_lower) { - trace_article(&state.pool, user_id, job_id, &result.url, &result.title, "brave_search", None, None, None, "filtered_cross_phase_dedup", false).await; + if let Some(reason) = filter_phase2_url( + &state.pool, user_id, &result.url, &seen_urls, &source_counts, + settings.article_history_days, settings.max_articles_per_source as usize, + ).await { + trace_article(&state.pool, user_id, job_id, &result.url, &result.title, "brave_search", None, None, None, reason, false).await; continue; } - // History dedup - if settings.article_history_days > 0 { - let hash = hash_article_url(&result.url); - let exists = db::article_history::check_urls_exist(&state.pool, user_id, std::slice::from_ref(&hash)).await.unwrap_or_default(); - if exists.contains(&hash) { - trace_article(&state.pool, user_id, job_id, &result.url, &result.title, "brave_search", None, None, None, "filtered_history", false).await; - continue; - } - } - - // Source diversity - if let Some(domain) = extract_domain(&result.url) { - let count = source_counts.get(&domain).copied().unwrap_or(0); - if count >= settings.max_articles_per_source as usize { - trace_article(&state.pool, user_id, job_id, &result.url, &result.title, "brave_search", None, None, None, "filtered_diversity", 
false).await; - continue; - } - } - - seen_urls.insert(url_lower); + seen_urls.insert(result.url.to_lowercase()); url_source.insert(result.url.clone(), "brave_search".to_string()); brave_urls.push(result.url.clone()); } @@ -693,31 +645,11 @@ async fn run_generation_inner( } }; - let llm_title = class_response.get("title").and_then(|t| t.as_str()).unwrap_or(&page_title).to_string(); - let llm_summary = class_response.get("summary").and_then(|s| s.as_str()).unwrap_or("").to_string(); - let mut llm_category = class_response.get("category").and_then(|c| c.as_str()).unwrap_or("Autre").to_string(); - - if !classification_categories.iter().any(|c| c.to_lowercase() == llm_category.to_lowercase()) { - llm_category = "Autre".to_string(); - } - - let cat_key = if llm_category.to_lowercase() == "autre" { - "category_autre".to_string() - } else { - user_categories.iter().position(|c| c.to_lowercase() == llm_category.to_lowercase()) - .map(|i| format!("category_{}", i)) - .unwrap_or_else(|| "category_autre".to_string()) - }; - - let cat_filled = filled_counts.get(&llm_category).copied().unwrap_or(0); - let (final_cat_key, final_cat_name) = if cat_filled >= settings.max_items_per_category as usize && llm_category.to_lowercase() != "autre" { - let autre_filled = filled_counts.get("Autre").copied().unwrap_or(0); - if autre_filled >= settings.max_items_per_category as usize { - continue; - } - ("category_autre".to_string(), "Autre".to_string()) - } else { - (cat_key, llm_category) + let Some((final_cat_key, final_cat_name, llm_title, llm_summary)) = assign_category( + &class_response, &page_title, &user_categories, &classification_categories, + &filled_counts, settings.max_items_per_category as usize, + ) else { + continue; }; article_scraped.entry(final_cat_key).or_default().push(NewsItem { @@ -763,43 +695,15 @@ async fn run_generation_inner( for (cat_key, items) in parsed { for item in items { - let url_lower = item.url.to_lowercase(); - - // Homepage filter - if let 
Ok(parsed_url) = url::Url::parse(&item.url) { - let path = parsed_url.path(); - if path.is_empty() || path == "/" { - trace_article(&state.pool, user_id, job_id, &item.url, &item.title, "web_search", None, None, None, "filtered_homepage", false).await; - continue; - } - } - - // Cross-phase dedup - if seen_urls.contains(&url_lower) { - trace_article(&state.pool, user_id, job_id, &item.url, &item.title, "web_search", None, None, None, "filtered_cross_phase_dedup", false).await; + if let Some(reason) = filter_phase2_url( + &state.pool, user_id, &item.url, &seen_urls, &source_counts, + settings.article_history_days, settings.max_articles_per_source as usize, + ).await { + trace_article(&state.pool, user_id, job_id, &item.url, &item.title, "web_search", None, None, None, reason, false).await; continue; } - // History dedup - if settings.article_history_days > 0 { - let hash = hash_article_url(&item.url); - let exists = db::article_history::check_urls_exist(&state.pool, user_id, std::slice::from_ref(&hash)).await.unwrap_or_default(); - if exists.contains(&hash) { - trace_article(&state.pool, user_id, job_id, &item.url, &item.title, "web_search", None, None, None, "filtered_history", false).await; - continue; - } - } - - // Source limit - if let Some(domain) = extract_domain(&item.url) { - let count = source_counts.get(&domain).copied().unwrap_or(0); - if count >= settings.max_articles_per_source as usize { - trace_article(&state.pool, user_id, job_id, &item.url, &item.title, "web_search", None, None, None, "filtered_diversity", false).await; - continue; - } - } - - seen_urls.insert(url_lower); + seen_urls.insert(item.url.to_lowercase()); phase2_items.push((cat_key.clone(), item)); } } @@ -1046,6 +950,80 @@ fn extract_domain(url: &str) -> Option { .and_then(|u| u.host_str().map(|h| h.to_lowercase())) } +/// Assign an article to a category based on LLM classification response. +/// Returns `Some((cat_key, cat_name, title, summary))` or `None` if all categories full. 
+///
+/// Field extraction: `title` falls back to `page_title`, `summary` to an empty string and
+/// `category` to `"Autre"` whenever the field is missing or is not a JSON string.
+///
+/// The category named by the LLM is matched case-insensitively against
+/// `classification_categories` and replaced by the spelling stored there (or by `"Autre"`
+/// when nothing matches), so the `filled_counts` lookup and the returned name do not
+/// depend on the casing the LLM happened to use.
+///
+/// # Arguments
+/// * `llm_response` - JSON value read for the `title`, `summary` and `category` fields.
+/// * `page_title` - title used when the response carries no usable `title`.
+/// * `user_categories` - the position of the category in this slice gives the
+///   `category_{index}` key; a category absent from it gets the key `category_autre`.
+/// * `classification_categories` - names the LLM answer is validated against.
+/// * `filled_counts` - number of items already placed, looked up by category name.
+/// * `max_items_per_category` - count at which a category is treated as full.
+///
+/// # Returns
+/// `None` only when the chosen category is full and `"Autre"` is full as well; a full
+/// category otherwise overflows into `("category_autre", "Autre")`.
+fn assign_category(
+    llm_response: &serde_json::Value,
+    page_title: &str,
+    user_categories: &[String],
+    classification_categories: &[String],
+    filled_counts: &HashMap<String, usize>,
+    max_items_per_category: usize,
+) -> Option<(String, String, String, String)> {
+    const FALLBACK_NAME: &str = "Autre";
+    const FALLBACK_KEY: &str = "category_autre";
+
+    // String value of a top-level field, if the field exists and really is a string.
+    fn text_field<'a>(response: &'a serde_json::Value, field: &str) -> Option<&'a str> {
+        response.get(field).and_then(|value| value.as_str())
+    }
+
+    let llm_title = text_field(llm_response, "title").unwrap_or(page_title).to_string();
+    let llm_summary = text_field(llm_response, "summary").unwrap_or("").to_string();
+
+    // Canonicalise the answer: keep the spelling from `classification_categories`, not the
+    // LLM's, so that "tech" and "Tech" share one `filled_counts` entry.
+    let requested = text_field(llm_response, "category").unwrap_or(FALLBACK_NAME).to_lowercase();
+    let llm_category = classification_categories
+        .iter()
+        .find(|known| known.to_lowercase() == requested)
+        .cloned()
+        .unwrap_or_else(|| FALLBACK_NAME.to_string());
+
+    // Lower-cased once and reused for every comparison below.
+    let wanted = llm_category.to_lowercase();
+    let is_fallback = wanted == "autre";
+
+    let cat_key = if is_fallback {
+        FALLBACK_KEY.to_string()
+    } else {
+        user_categories
+            .iter()
+            .position(|candidate| candidate.to_lowercase() == wanted)
+            .map(|index| format!("category_{}", index))
+            .unwrap_or_else(|| FALLBACK_KEY.to_string())
+    };
+
+    let filled = |name: &str| filled_counts.get(name).copied().unwrap_or(0);
+
+    // A full regular category overflows into "Autre", unless that one is full too.
+    if !is_fallback && filled(&llm_category) >= max_items_per_category {
+        if filled(FALLBACK_NAME) >= max_items_per_category {
+            return None;
+        }
+        return Some((FALLBACK_KEY.to_string(), FALLBACK_NAME.to_string(), llm_title, llm_summary));
+    }
+
+    // NOTE(review): an article classified directly as "Autre" is accepted here even when
+    // "Autre" is already full; only overflow into it is capped. Confirm this is intended.
+    Some((cat_key, llm_category, llm_title, llm_summary))
+}
+
+/// Check if a Phase 2 URL passes all filters.
+/// Returns the filter reason if rejected, None if accepted.
+///
+/// The checks run in a fixed order and the first one that matches decides the reason:
+///
+/// 1. `"filtered_homepage"` - the URL parses and its path is empty or `/`. A URL that
+///    does not parse skips this check instead of being rejected.
+/// 2. `"filtered_cross_phase_dedup"` - the lower-cased URL is already in `seen_urls`.
+/// 3. `"filtered_history"` - only when `article_history_days > 0`: the hash of the URL is
+///    reported as existing in the article history of `user_id`.
+/// 4. `"filtered_diversity"` - the count stored in `source_counts` for the URL's domain
+///    has reached `max_articles_per_source`.
+///
+/// `seen_urls` and `source_counts` are only read; recording an accepted URL is left to
+/// the caller.
+async fn filter_phase2_url(
+    pool: &sqlx::PgPool,
+    user_id: Uuid,
+    url: &str,
+    seen_urls: &std::collections::HashSet<String>,
+    source_counts: &HashMap<String, usize>,
+    article_history_days: i32,
+    max_articles_per_source: usize,
+) -> Option<&'static str> {
+    // 1. Homepage: a bare site root is not an article.
+    let is_homepage = url::Url::parse(url)
+        .map(|parsed| matches!(parsed.path(), "" | "/"))
+        .unwrap_or(false);
+    if is_homepage {
+        return Some("filtered_homepage");
+    }
+
+    // 2. URL already collected, compared on the lower-cased form.
+    if seen_urls.contains(&url.to_lowercase()) {
+        return Some("filtered_cross_phase_dedup");
+    }
+
+    // 3. URL already present in the user's history.
+    if article_history_days > 0 {
+        let hash = hash_article_url(url);
+        // A failed lookup falls back to the default value (presumably an empty collection),
+        // so a database error lets the URL through rather than rejecting it.
+        let known = db::article_history::check_urls_exist(pool, user_id, std::slice::from_ref(&hash))
+            .await
+            .unwrap_or_default();
+        if known.contains(&hash) {
+            return Some("filtered_history");
+        }
+    }
+
+    // 4. Source diversity: skipped when no domain can be extracted from the URL.
+    if let Some(domain) = extract_domain(url) {
+        let already_used = source_counts.get(&domain).copied().unwrap_or(0);
+        if already_used >= max_articles_per_source {
+            return Some("filtered_diversity");
+        }
+    }
+
+    None
+}
+
 /// Normalize an article URL for consistent history hashing.
 ///
 /// Strips fragments, trailing slashes, and known tracking query parameters