From 68b1956059d30df6b71e450b83f78facfcd6634d Mon Sep 17 00:00:00 2001 From: oabrivard Date: Sat, 28 Mar 2026 20:16:13 +0100 Subject: [PATCH] refactor: extract synthesis helpers (assign_category, filter_phase2_url, tracing) into helpers.rs Co-Authored-By: Claude Opus 4.6 (1M context) --- backend/src/services/synthesis/helpers.rs | 286 +++++++++++++++ .../{synthesis.rs => synthesis/mod.rs} | 335 +----------------- 2 files changed, 292 insertions(+), 329 deletions(-) create mode 100644 backend/src/services/synthesis/helpers.rs rename backend/src/services/{synthesis.rs => synthesis/mod.rs} (83%) diff --git a/backend/src/services/synthesis/helpers.rs b/backend/src/services/synthesis/helpers.rs new file mode 100644 index 0000000..1adab50 --- /dev/null +++ b/backend/src/services/synthesis/helpers.rs @@ -0,0 +1,286 @@ +//! Helper functions for the synthesis pipeline. +//! +//! Contains article tracing, URL normalization/hashing, category assignment, +//! and Phase 2 URL filtering logic. + +use std::collections::HashMap; + +use uuid::Uuid; + +use crate::db; +use crate::util::token::hash_token; + +/// Structured parameters for article history tracing. +pub(crate) struct ArticleTrace<'a> { + pub url: &'a str, + pub title: &'a str, + pub source_type: &'a str, + pub source_url: Option<&'a str>, + pub category: Option<&'a str>, + pub synthesis_id: Option, + pub status: &'a str, + pub scraped_ok: bool, + pub published_date: Option<&'a str>, +} + +/// Build an article history entry from trace parameters (no DB call). +pub(crate) fn build_trace_entry( + user_id: Uuid, + job_id: Uuid, + trace: &ArticleTrace<'_>, +) -> db::article_history::ArticleHistoryEntry { + db::article_history::ArticleHistoryEntry { + user_id, + url: trace.url.to_string(), + url_hash: hash_article_url(trace.url), + title: trace.title.to_string(), + source_type: trace.source_type.to_string(), + source_url: trace.source_url.map(|s| s.to_string()), + category: trace.category.map(|s| s.to_string()), + synthesis_id: trace.synthesis_id, + status: trace.status.to_string(), + scraped_ok: trace.scraped_ok, + job_id, + published_date: trace.published_date.map(|s| s.to_string()), + } +} + +/// Extract the domain (host) from a URL, or None if unparseable. +pub(crate) fn extract_domain(url: &str) -> Option { + url::Url::parse(url) + .ok() + .and_then(|u| u.host_str().map(|h| h.to_lowercase())) +} + +/// Assign an article to a category based on LLM classification response. +/// Returns `Some((cat_key, cat_name, title, summary))` or `None` if all categories full. +pub(crate) fn assign_category( + llm_response: &serde_json::Value, + page_title: &str, + user_categories: &[String], + classification_categories: &[String], + filled_counts: &HashMap, + max_items_per_category: usize, +) -> Option<(String, String, String, String)> { + let llm_title = llm_response.get("title").and_then(|t| t.as_str()).unwrap_or(page_title).to_string(); + let llm_summary = llm_response.get("summary").and_then(|s| s.as_str()).unwrap_or("").to_string(); + let mut llm_category = llm_response.get("category").and_then(|c| c.as_str()).unwrap_or("Divers").to_string(); + + if !classification_categories.iter().any(|c| c.to_lowercase() == llm_category.to_lowercase()) { + llm_category = "Divers".to_string(); + } + + let cat_key = if llm_category.to_lowercase() == "autre" { + "category_autre".to_string() + } else { + user_categories.iter().position(|c| c.to_lowercase() == llm_category.to_lowercase()) + .map(|i| format!("category_{}", i)) + .unwrap_or_else(|| "category_autre".to_string()) + }; + + let cat_filled = filled_counts.get(&llm_category).copied().unwrap_or(0); + if cat_filled >= max_items_per_category && llm_category.to_lowercase() != "autre" { + let autre_filled = filled_counts.get("Divers").copied().unwrap_or(0); + if autre_filled >= max_items_per_category { + return None; + } + Some(("category_autre".to_string(), "Divers".to_string(), llm_title, llm_summary)) + } else { + Some((cat_key, llm_category, llm_title, llm_summary)) + } +} + +/// Check if a Phase 2 URL passes all filters. +/// Returns the filter reason if rejected, None if accepted. +pub(crate) async fn filter_phase2_url( + pool: &sqlx::PgPool, + user_id: Uuid, + url: &str, + seen_urls: &std::collections::HashSet, + source_counts: &HashMap, + article_history_days: i32, + max_articles_per_source: usize, +) -> Option<&'static str> { + if let Ok(parsed_url) = url::Url::parse(url) { + let path = parsed_url.path(); + if path.is_empty() || path == "/" { + return Some("filtered_homepage"); + } + } + if seen_urls.contains(&url.to_lowercase()) { + return Some("filtered_cross_phase_dedup"); + } + if article_history_days > 0 { + let hash = hash_article_url(url); + let exists = db::article_history::check_urls_exist(pool, user_id, std::slice::from_ref(&hash)).await.unwrap_or_default(); + if exists.contains(&hash) { + return Some("filtered_history"); + } + } + if let Some(domain) = extract_domain(url) { + let count = source_counts.get(&domain).copied().unwrap_or(0); + if count >= max_articles_per_source { + return Some("filtered_diversity"); + } + } + None +} + +/// Normalize an article URL for consistent history hashing. +/// +/// Strips fragments, trailing slashes, and known tracking query parameters +/// so that the same article with different UTM tags is recognized as a duplicate. +pub(crate) fn normalize_article_url(url_str: &str) -> String { + let Ok(mut parsed) = url::Url::parse(url_str) else { + return url_str.to_lowercase(); + }; + + // Strip fragment + parsed.set_fragment(None); + + // Strip known tracking query parameters + let tracking_params: &[&str] = &[ + "utm_source", "utm_medium", "utm_campaign", "utm_term", "utm_content", + "ref", "source", "fbclid", "gclid", + ]; + + let filtered_pairs: Vec<(String, String)> = parsed + .query_pairs() + .filter(|(key, _)| !tracking_params.contains(&key.as_ref())) + .map(|(k, v)| (k.into_owned(), v.into_owned())) + .collect(); + + if filtered_pairs.is_empty() { + parsed.set_query(None); + } else { + let query_string = filtered_pairs + .iter() + .map(|(k, v)| format!("{}={}", k, v)) + .collect::>() + .join("&"); + parsed.set_query(Some(&query_string)); + } + + // Strip trailing slash (unless path is just "/") + let path = parsed.path().to_string(); + if path.len() > 1 && path.ends_with('/') { + parsed.set_path(&path[..path.len() - 1]); + } + + parsed.to_string().to_lowercase() +} + +/// Compute the hash of a normalized article URL for history lookup. +pub(crate) fn hash_article_url(url: &str) -> String { + let normalized = normalize_article_url(url); + hash_token(&normalized) +} + +#[cfg(test)] +mod tests { + use super::*; + + // ── hash_article_url tests ───────────────────────────────────── + + #[test] + fn hash_article_url_deterministic() { + let h1 = hash_article_url("https://example.com/article?utm_source=twitter"); + let h2 = hash_article_url("https://example.com/article?utm_source=newsletter"); + assert_eq!(h1, h2, "Same article with different UTM params should hash the same"); + } + + #[test] + fn hash_article_url_different_articles() { + let h1 = hash_article_url("https://example.com/article-1"); + let h2 = hash_article_url("https://example.com/article-2"); + assert_ne!(h1, h2); + } + + // ── assign_category tests ─────────────────────────────────── + + #[test] + fn assign_category_maps_to_correct_category() { + let response = serde_json::json!({ + "title": "Test Article", + "summary": "Test summary", + "category": "AI News", + "date": "2026-03-25", + "is_article": true + }); + let user_cats = vec!["AI News".to_string(), "Research".to_string()]; + let class_cats = vec![ + "AI News".to_string(), + "Research".to_string(), + "Divers".to_string(), + ]; + let filled = std::collections::HashMap::new(); + + let result = + assign_category(&response, "Fallback Title", &user_cats, &class_cats, &filled, 4); + assert!(result.is_some()); + let (cat_key, cat_name, title, _summary) = result.unwrap(); + assert_eq!(cat_key, "category_0"); + assert_eq!(cat_name, "AI News"); + assert_eq!(title, "Test Article"); + } + + #[test] + fn assign_category_overflows_to_divers() { + let response = serde_json::json!({ + "title": "Overflow Article", + "summary": "...", + "category": "AI News", + "date": "", + "is_article": true + }); + let user_cats = vec!["AI News".to_string()]; + let class_cats = vec!["AI News".to_string(), "Divers".to_string()]; + let mut filled = std::collections::HashMap::new(); + filled.insert("AI News".to_string(), 4usize); // already full + + let result = assign_category(&response, "", &user_cats, &class_cats, &filled, 4); + assert!(result.is_some()); + let (cat_key, cat_name, _, _) = result.unwrap(); + assert_eq!(cat_key, "category_autre"); + assert_eq!(cat_name, "Divers"); + } + + #[test] + fn assign_category_returns_none_when_all_full() { + let response = serde_json::json!({ + "title": "No Room", + "summary": "...", + "category": "AI News", + "date": "", + "is_article": true + }); + let user_cats = vec!["AI News".to_string()]; + let class_cats = vec!["AI News".to_string(), "Divers".to_string()]; + let mut filled = std::collections::HashMap::new(); + filled.insert("AI News".to_string(), 4usize); + filled.insert("Divers".to_string(), 4usize); + + let result = assign_category(&response, "", &user_cats, &class_cats, &filled, 4); + assert!(result.is_none()); + } + + #[test] + fn assign_category_unknown_category_maps_to_divers() { + let response = serde_json::json!({ + "title": "Unknown Cat", + "summary": "...", + "category": "Nonexistent Category", + "date": "", + "is_article": true + }); + let user_cats = vec!["AI News".to_string()]; + let class_cats = vec!["AI News".to_string(), "Divers".to_string()]; + let filled = std::collections::HashMap::new(); + + let result = assign_category(&response, "", &user_cats, &class_cats, &filled, 4); + assert!(result.is_some()); + let (cat_key, cat_name, _, _) = result.unwrap(); + assert_eq!(cat_key, "category_autre"); + assert_eq!(cat_name, "Divers"); + } +} diff --git a/backend/src/services/synthesis.rs b/backend/src/services/synthesis/mod.rs similarity index 83% rename from backend/src/services/synthesis.rs rename to backend/src/services/synthesis/mod.rs index 91bf664..9a67df9 100644 --- a/backend/src/services/synthesis.rs +++ b/backend/src/services/synthesis/mod.rs @@ -28,6 +28,12 @@ use crate::services::llm::factory::create_provider; use crate::services::scraper; use crate::services::source_scraper; +mod helpers; +pub(crate) use helpers::{ + ArticleTrace, assign_category, build_trace_entry, extract_domain, + filter_phase2_url, hash_article_url, +}; + // Re-export for downstream consumers that previously imported from synthesis. pub use crate::services::job_store::{emit_progress, JobStore, ProgressEvent}; @@ -824,41 +830,6 @@ async fn scrape_and_classify_batch( Ok(()) } -/// Structured parameters for article history tracing. -struct ArticleTrace<'a> { - url: &'a str, - title: &'a str, - source_type: &'a str, - source_url: Option<&'a str>, - category: Option<&'a str>, - synthesis_id: Option, - status: &'a str, - scraped_ok: bool, - published_date: Option<&'a str>, -} - -/// Build an article history entry from trace parameters (no DB call). -fn build_trace_entry( - user_id: Uuid, - job_id: Uuid, - trace: &ArticleTrace<'_>, -) -> db::article_history::ArticleHistoryEntry { - db::article_history::ArticleHistoryEntry { - user_id, - url: trace.url.to_string(), - url_hash: hash_article_url(trace.url), - title: trace.title.to_string(), - source_type: trace.source_type.to_string(), - source_url: trace.source_url.map(|s| s.to_string()), - category: trace.category.map(|s| s.to_string()), - synthesis_id: trace.synthesis_id, - status: trace.status.to_string(), - scraped_ok: trace.scraped_ok, - job_id, - published_date: trace.published_date.map(|s| s.to_string()), - } -} - /// Log an LLM call with full prompt, response, and timing. #[allow(clippy::too_many_arguments)] async fn log_llm_call( @@ -961,137 +932,6 @@ async fn check_rate_limit( } } -/// Extract the domain (host) from a URL, or None if unparseable. -fn extract_domain(url: &str) -> Option { - url::Url::parse(url) - .ok() - .and_then(|u| u.host_str().map(|h| h.to_lowercase())) -} - -/// Assign an article to a category based on LLM classification response. -/// Returns `Some((cat_key, cat_name, title, summary))` or `None` if all categories full. -fn assign_category( - llm_response: &serde_json::Value, - page_title: &str, - user_categories: &[String], - classification_categories: &[String], - filled_counts: &HashMap, - max_items_per_category: usize, -) -> Option<(String, String, String, String)> { - let llm_title = llm_response.get("title").and_then(|t| t.as_str()).unwrap_or(page_title).to_string(); - let llm_summary = llm_response.get("summary").and_then(|s| s.as_str()).unwrap_or("").to_string(); - let mut llm_category = llm_response.get("category").and_then(|c| c.as_str()).unwrap_or("Divers").to_string(); - - if !classification_categories.iter().any(|c| c.to_lowercase() == llm_category.to_lowercase()) { - llm_category = "Divers".to_string(); - } - - let cat_key = if llm_category.to_lowercase() == "autre" { - "category_autre".to_string() - } else { - user_categories.iter().position(|c| c.to_lowercase() == llm_category.to_lowercase()) - .map(|i| format!("category_{}", i)) - .unwrap_or_else(|| "category_autre".to_string()) - }; - - let cat_filled = filled_counts.get(&llm_category).copied().unwrap_or(0); - if cat_filled >= max_items_per_category && llm_category.to_lowercase() != "autre" { - let autre_filled = filled_counts.get("Divers").copied().unwrap_or(0); - if autre_filled >= max_items_per_category { - return None; - } - Some(("category_autre".to_string(), "Divers".to_string(), llm_title, llm_summary)) - } else { - Some((cat_key, llm_category, llm_title, llm_summary)) - } -} - -/// Check if a Phase 2 URL passes all filters. -/// Returns the filter reason if rejected, None if accepted. -async fn filter_phase2_url( - pool: &sqlx::PgPool, - user_id: Uuid, - url: &str, - seen_urls: &std::collections::HashSet, - source_counts: &HashMap, - article_history_days: i32, - max_articles_per_source: usize, -) -> Option<&'static str> { - if let Ok(parsed_url) = url::Url::parse(url) { - let path = parsed_url.path(); - if path.is_empty() || path == "/" { - return Some("filtered_homepage"); - } - } - if seen_urls.contains(&url.to_lowercase()) { - return Some("filtered_cross_phase_dedup"); - } - if article_history_days > 0 { - let hash = hash_article_url(url); - let exists = db::article_history::check_urls_exist(pool, user_id, std::slice::from_ref(&hash)).await.unwrap_or_default(); - if exists.contains(&hash) { - return Some("filtered_history"); - } - } - if let Some(domain) = extract_domain(url) { - let count = source_counts.get(&domain).copied().unwrap_or(0); - if count >= max_articles_per_source { - return Some("filtered_diversity"); - } - } - None -} - -/// Normalize an article URL for consistent history hashing. -/// -/// Strips fragments, trailing slashes, and known tracking query parameters -/// so that the same article with different UTM tags is recognized as a duplicate. -fn normalize_article_url(url_str: &str) -> String { - let Ok(mut parsed) = url::Url::parse(url_str) else { - return url_str.to_lowercase(); - }; - - // Strip fragment - parsed.set_fragment(None); - - // Strip known tracking query parameters - let tracking_params: &[&str] = &[ - "utm_source", "utm_medium", "utm_campaign", "utm_term", "utm_content", - "ref", "source", "fbclid", "gclid", - ]; - - let filtered_pairs: Vec<(String, String)> = parsed - .query_pairs() - .filter(|(key, _)| !tracking_params.contains(&key.as_ref())) - .map(|(k, v)| (k.into_owned(), v.into_owned())) - .collect(); - - if filtered_pairs.is_empty() { - parsed.set_query(None); - } else { - let query_string = filtered_pairs - .iter() - .map(|(k, v)| format!("{}={}", k, v)) - .collect::>() - .join("&"); - parsed.set_query(Some(&query_string)); - } - - // Strip trailing slash (unless path is just "/") - let path = parsed.path().to_string(); - if path.len() > 1 && path.ends_with('/') { - parsed.set_path(&path[..path.len() - 1]); - } - - parsed.to_string().to_lowercase() -} - -/// Compute the hash of a normalized article URL for history lookup. -fn hash_article_url(url: &str) -> String { - let normalized = normalize_article_url(url); - crate::util::token::hash_token(&normalized) -} - /// Decrypt the Brave Search API key for a user. async fn resolve_brave_key( state: &AppState, @@ -1408,82 +1248,6 @@ mod tests { assert_eq!(sanitized, json); } - // ── normalize_article_url tests ───────────────────────────── - - #[test] - fn normalize_strips_fragment() { - assert_eq!( - normalize_article_url("https://example.com/article#section"), - "https://example.com/article" - ); - } - - #[test] - fn normalize_strips_utm_params() { - assert_eq!( - normalize_article_url("https://example.com/article?utm_source=twitter&utm_medium=social"), - "https://example.com/article" - ); - } - - #[test] - fn normalize_keeps_non_tracking_params() { - let result = normalize_article_url("https://example.com/search?q=test&utm_source=twitter"); - assert!(result.contains("q=test")); - assert!(!result.contains("utm_source")); - } - - #[test] - fn normalize_strips_trailing_slash() { - assert_eq!( - normalize_article_url("https://example.com/article/"), - "https://example.com/article" - ); - } - - #[test] - fn normalize_keeps_root_slash() { - assert_eq!( - normalize_article_url("https://example.com/"), - "https://example.com/" - ); - } - - #[test] - fn normalize_lowercases() { - assert_eq!( - normalize_article_url("https://Example.COM/Article"), - "https://example.com/article" - ); - } - - #[test] - fn normalize_strips_fbclid() { - let result = normalize_article_url("https://example.com/post?fbclid=abc123"); - assert!(!result.contains("fbclid")); - assert!(!result.contains("?")); - } - - #[test] - fn normalize_handles_invalid_url() { - let result = normalize_article_url("not a url at all"); - assert_eq!(result, "not a url at all"); - } - - #[test] - fn hash_article_url_deterministic() { - let h1 = hash_article_url("https://example.com/article?utm_source=twitter"); - let h2 = hash_article_url("https://example.com/article?utm_source=newsletter"); - assert_eq!(h1, h2, "Same article with different UTM params should hash the same"); - } - - #[test] - fn hash_article_url_different_articles() { - let h1 = hash_article_url("https://example.com/article-1"); - let h2 = hash_article_url("https://example.com/article-2"); - assert_ne!(h1, h2); - } - // ── rotate_sources tests ────────────────────────────────── #[test] @@ -1526,91 +1290,4 @@ mod tests { assert!(result.ends_with("...")); } - // ── assign_category tests ─────────────────────────────────── - - #[test] - fn assign_category_maps_to_correct_category() { - let response = serde_json::json!({ - "title": "Test Article", - "summary": "Test summary", - "category": "AI News", - "date": "2026-03-25", - "is_article": true - }); - let user_cats = vec!["AI News".to_string(), "Research".to_string()]; - let class_cats = vec![ - "AI News".to_string(), - "Research".to_string(), - "Divers".to_string(), - ]; - let filled = std::collections::HashMap::new(); - - let result = - assign_category(&response, "Fallback Title", &user_cats, &class_cats, &filled, 4); - assert!(result.is_some()); - let (cat_key, cat_name, title, _summary) = result.unwrap(); - assert_eq!(cat_key, "category_0"); - assert_eq!(cat_name, "AI News"); - assert_eq!(title, "Test Article"); - } - - #[test] - fn assign_category_overflows_to_divers() { - let response = serde_json::json!({ - "title": "Overflow Article", - "summary": "...", - "category": "AI News", - "date": "", - "is_article": true - }); - let user_cats = vec!["AI News".to_string()]; - let class_cats = vec!["AI News".to_string(), "Divers".to_string()]; - let mut filled = std::collections::HashMap::new(); - filled.insert("AI News".to_string(), 4usize); // already full - - let result = assign_category(&response, "", &user_cats, &class_cats, &filled, 4); - assert!(result.is_some()); - let (cat_key, cat_name, _, _) = result.unwrap(); - assert_eq!(cat_key, "category_autre"); - assert_eq!(cat_name, "Divers"); - } - - #[test] - fn assign_category_returns_none_when_all_full() { - let response = serde_json::json!({ - "title": "No Room", - "summary": "...", - "category": "AI News", - "date": "", - "is_article": true - }); - let user_cats = vec!["AI News".to_string()]; - let class_cats = vec!["AI News".to_string(), "Divers".to_string()]; - let mut filled = std::collections::HashMap::new(); - filled.insert("AI News".to_string(), 4usize); - filled.insert("Divers".to_string(), 4usize); - - let result = assign_category(&response, "", &user_cats, &class_cats, &filled, 4); - assert!(result.is_none()); - } - - #[test] - fn assign_category_unknown_category_maps_to_divers() { - let response = serde_json::json!({ - "title": "Unknown Cat", - "summary": "...", - "category": "Nonexistent Category", - "date": "", - "is_article": true - }); - let user_cats = vec!["AI News".to_string()]; - let class_cats = vec!["AI News".to_string(), "Divers".to_string()]; - let filled = std::collections::HashMap::new(); - - let result = assign_category(&response, "", &user_cats, &class_cats, &filled, 4); - assert!(result.is_some()); - let (cat_key, cat_name, _, _) = result.unwrap(); - assert_eq!(cat_key, "category_autre"); - assert_eq!(cat_name, "Divers"); - } }