refactor: extract synthesis helpers (assign_category, filter_phase2_url, tracing) into helpers.rs

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
master
oabrivard 2 months ago
parent b60a55993c
commit 68b1956059

@ -0,0 +1,286 @@
//! Helper functions for the synthesis pipeline.
//!
//! Contains article tracing, URL normalization/hashing, category assignment,
//! and Phase 2 URL filtering logic.
use std::collections::HashMap;
use uuid::Uuid;
use crate::db;
use crate::util::token::hash_token;
/// Structured parameters for article history tracing.
pub(crate) struct ArticleTrace<'a> {
pub url: &'a str,
pub title: &'a str,
pub source_type: &'a str,
pub source_url: Option<&'a str>,
pub category: Option<&'a str>,
pub synthesis_id: Option<Uuid>,
pub status: &'a str,
pub scraped_ok: bool,
pub published_date: Option<&'a str>,
}
/// Build an article history entry from trace parameters (no DB call).
pub(crate) fn build_trace_entry(
user_id: Uuid,
job_id: Uuid,
trace: &ArticleTrace<'_>,
) -> db::article_history::ArticleHistoryEntry {
db::article_history::ArticleHistoryEntry {
user_id,
url: trace.url.to_string(),
url_hash: hash_article_url(trace.url),
title: trace.title.to_string(),
source_type: trace.source_type.to_string(),
source_url: trace.source_url.map(|s| s.to_string()),
category: trace.category.map(|s| s.to_string()),
synthesis_id: trace.synthesis_id,
status: trace.status.to_string(),
scraped_ok: trace.scraped_ok,
job_id,
published_date: trace.published_date.map(|s| s.to_string()),
}
}
/// Extract the domain (host) from a URL, or None if unparseable.
pub(crate) fn extract_domain(url: &str) -> Option<String> {
url::Url::parse(url)
.ok()
.and_then(|u| u.host_str().map(|h| h.to_lowercase()))
}
/// Assign an article to a category based on LLM classification response.
/// Returns `Some((cat_key, cat_name, title, summary))` or `None` if all categories full.
pub(crate) fn assign_category(
llm_response: &serde_json::Value,
page_title: &str,
user_categories: &[String],
classification_categories: &[String],
filled_counts: &HashMap<String, usize>,
max_items_per_category: usize,
) -> Option<(String, String, String, String)> {
let llm_title = llm_response.get("title").and_then(|t| t.as_str()).unwrap_or(page_title).to_string();
let llm_summary = llm_response.get("summary").and_then(|s| s.as_str()).unwrap_or("").to_string();
let mut llm_category = llm_response.get("category").and_then(|c| c.as_str()).unwrap_or("Divers").to_string();
if !classification_categories.iter().any(|c| c.to_lowercase() == llm_category.to_lowercase()) {
llm_category = "Divers".to_string();
}
let cat_key = if llm_category.to_lowercase() == "autre" {
"category_autre".to_string()
} else {
user_categories.iter().position(|c| c.to_lowercase() == llm_category.to_lowercase())
.map(|i| format!("category_{}", i))
.unwrap_or_else(|| "category_autre".to_string())
};
let cat_filled = filled_counts.get(&llm_category).copied().unwrap_or(0);
if cat_filled >= max_items_per_category && llm_category.to_lowercase() != "autre" {
let autre_filled = filled_counts.get("Divers").copied().unwrap_or(0);
if autre_filled >= max_items_per_category {
return None;
}
Some(("category_autre".to_string(), "Divers".to_string(), llm_title, llm_summary))
} else {
Some((cat_key, llm_category, llm_title, llm_summary))
}
}
/// Check if a Phase 2 URL passes all filters.
/// Returns the filter reason if rejected, None if accepted.
pub(crate) async fn filter_phase2_url(
pool: &sqlx::PgPool,
user_id: Uuid,
url: &str,
seen_urls: &std::collections::HashSet<String>,
source_counts: &HashMap<String, usize>,
article_history_days: i32,
max_articles_per_source: usize,
) -> Option<&'static str> {
if let Ok(parsed_url) = url::Url::parse(url) {
let path = parsed_url.path();
if path.is_empty() || path == "/" {
return Some("filtered_homepage");
}
}
if seen_urls.contains(&url.to_lowercase()) {
return Some("filtered_cross_phase_dedup");
}
if article_history_days > 0 {
let hash = hash_article_url(url);
let exists = db::article_history::check_urls_exist(pool, user_id, std::slice::from_ref(&hash)).await.unwrap_or_default();
if exists.contains(&hash) {
return Some("filtered_history");
}
}
if let Some(domain) = extract_domain(url) {
let count = source_counts.get(&domain).copied().unwrap_or(0);
if count >= max_articles_per_source {
return Some("filtered_diversity");
}
}
None
}
/// Normalize an article URL for consistent history hashing.
///
/// Strips fragments, trailing slashes, and known tracking query parameters
/// so that the same article with different UTM tags is recognized as a duplicate.
pub(crate) fn normalize_article_url(url_str: &str) -> String {
let Ok(mut parsed) = url::Url::parse(url_str) else {
return url_str.to_lowercase();
};
// Strip fragment
parsed.set_fragment(None);
// Strip known tracking query parameters
let tracking_params: &[&str] = &[
"utm_source", "utm_medium", "utm_campaign", "utm_term", "utm_content",
"ref", "source", "fbclid", "gclid",
];
let filtered_pairs: Vec<(String, String)> = parsed
.query_pairs()
.filter(|(key, _)| !tracking_params.contains(&key.as_ref()))
.map(|(k, v)| (k.into_owned(), v.into_owned()))
.collect();
if filtered_pairs.is_empty() {
parsed.set_query(None);
} else {
let query_string = filtered_pairs
.iter()
.map(|(k, v)| format!("{}={}", k, v))
.collect::<Vec<_>>()
.join("&");
parsed.set_query(Some(&query_string));
}
// Strip trailing slash (unless path is just "/")
let path = parsed.path().to_string();
if path.len() > 1 && path.ends_with('/') {
parsed.set_path(&path[..path.len() - 1]);
}
parsed.to_string().to_lowercase()
}
/// Compute the hash of a normalized article URL for history lookup.
pub(crate) fn hash_article_url(url: &str) -> String {
let normalized = normalize_article_url(url);
hash_token(&normalized)
}
#[cfg(test)]
mod tests {
use super::*;
// ── hash_article_url tests ─────────────────────────────────────
#[test]
fn hash_article_url_deterministic() {
let h1 = hash_article_url("https://example.com/article?utm_source=twitter");
let h2 = hash_article_url("https://example.com/article?utm_source=newsletter");
assert_eq!(h1, h2, "Same article with different UTM params should hash the same");
}
#[test]
fn hash_article_url_different_articles() {
let h1 = hash_article_url("https://example.com/article-1");
let h2 = hash_article_url("https://example.com/article-2");
assert_ne!(h1, h2);
}
// ── assign_category tests ───────────────────────────────────
#[test]
fn assign_category_maps_to_correct_category() {
let response = serde_json::json!({
"title": "Test Article",
"summary": "Test summary",
"category": "AI News",
"date": "2026-03-25",
"is_article": true
});
let user_cats = vec!["AI News".to_string(), "Research".to_string()];
let class_cats = vec![
"AI News".to_string(),
"Research".to_string(),
"Divers".to_string(),
];
let filled = std::collections::HashMap::new();
let result =
assign_category(&response, "Fallback Title", &user_cats, &class_cats, &filled, 4);
assert!(result.is_some());
let (cat_key, cat_name, title, _summary) = result.unwrap();
assert_eq!(cat_key, "category_0");
assert_eq!(cat_name, "AI News");
assert_eq!(title, "Test Article");
}
#[test]
fn assign_category_overflows_to_divers() {
let response = serde_json::json!({
"title": "Overflow Article",
"summary": "...",
"category": "AI News",
"date": "",
"is_article": true
});
let user_cats = vec!["AI News".to_string()];
let class_cats = vec!["AI News".to_string(), "Divers".to_string()];
let mut filled = std::collections::HashMap::new();
filled.insert("AI News".to_string(), 4usize); // already full
let result = assign_category(&response, "", &user_cats, &class_cats, &filled, 4);
assert!(result.is_some());
let (cat_key, cat_name, _, _) = result.unwrap();
assert_eq!(cat_key, "category_autre");
assert_eq!(cat_name, "Divers");
}
#[test]
fn assign_category_returns_none_when_all_full() {
let response = serde_json::json!({
"title": "No Room",
"summary": "...",
"category": "AI News",
"date": "",
"is_article": true
});
let user_cats = vec!["AI News".to_string()];
let class_cats = vec!["AI News".to_string(), "Divers".to_string()];
let mut filled = std::collections::HashMap::new();
filled.insert("AI News".to_string(), 4usize);
filled.insert("Divers".to_string(), 4usize);
let result = assign_category(&response, "", &user_cats, &class_cats, &filled, 4);
assert!(result.is_none());
}
#[test]
fn assign_category_unknown_category_maps_to_divers() {
let response = serde_json::json!({
"title": "Unknown Cat",
"summary": "...",
"category": "Nonexistent Category",
"date": "",
"is_article": true
});
let user_cats = vec!["AI News".to_string()];
let class_cats = vec!["AI News".to_string(), "Divers".to_string()];
let filled = std::collections::HashMap::new();
let result = assign_category(&response, "", &user_cats, &class_cats, &filled, 4);
assert!(result.is_some());
let (cat_key, cat_name, _, _) = result.unwrap();
assert_eq!(cat_key, "category_autre");
assert_eq!(cat_name, "Divers");
}
}

@ -28,6 +28,12 @@ use crate::services::llm::factory::create_provider;
use crate::services::scraper;
use crate::services::source_scraper;
mod helpers;
pub(crate) use helpers::{
ArticleTrace, assign_category, build_trace_entry, extract_domain,
filter_phase2_url, hash_article_url,
};
// Re-export for downstream consumers that previously imported from synthesis.
pub use crate::services::job_store::{emit_progress, JobStore, ProgressEvent};
@ -824,41 +830,6 @@ async fn scrape_and_classify_batch(
Ok(())
}
/// Structured parameters for article history tracing.
struct ArticleTrace<'a> {
url: &'a str,
title: &'a str,
source_type: &'a str,
source_url: Option<&'a str>,
category: Option<&'a str>,
synthesis_id: Option<Uuid>,
status: &'a str,
scraped_ok: bool,
published_date: Option<&'a str>,
}
/// Build an article history entry from trace parameters (no DB call).
fn build_trace_entry(
user_id: Uuid,
job_id: Uuid,
trace: &ArticleTrace<'_>,
) -> db::article_history::ArticleHistoryEntry {
db::article_history::ArticleHistoryEntry {
user_id,
url: trace.url.to_string(),
url_hash: hash_article_url(trace.url),
title: trace.title.to_string(),
source_type: trace.source_type.to_string(),
source_url: trace.source_url.map(|s| s.to_string()),
category: trace.category.map(|s| s.to_string()),
synthesis_id: trace.synthesis_id,
status: trace.status.to_string(),
scraped_ok: trace.scraped_ok,
job_id,
published_date: trace.published_date.map(|s| s.to_string()),
}
}
/// Log an LLM call with full prompt, response, and timing.
#[allow(clippy::too_many_arguments)]
async fn log_llm_call(
@ -961,137 +932,6 @@ async fn check_rate_limit(
}
}
/// Extract the domain (host) from a URL, or None if unparseable.
fn extract_domain(url: &str) -> Option<String> {
url::Url::parse(url)
.ok()
.and_then(|u| u.host_str().map(|h| h.to_lowercase()))
}
/// Assign an article to a category based on LLM classification response.
/// Returns `Some((cat_key, cat_name, title, summary))` or `None` if all categories full.
fn assign_category(
llm_response: &serde_json::Value,
page_title: &str,
user_categories: &[String],
classification_categories: &[String],
filled_counts: &HashMap<String, usize>,
max_items_per_category: usize,
) -> Option<(String, String, String, String)> {
let llm_title = llm_response.get("title").and_then(|t| t.as_str()).unwrap_or(page_title).to_string();
let llm_summary = llm_response.get("summary").and_then(|s| s.as_str()).unwrap_or("").to_string();
let mut llm_category = llm_response.get("category").and_then(|c| c.as_str()).unwrap_or("Divers").to_string();
if !classification_categories.iter().any(|c| c.to_lowercase() == llm_category.to_lowercase()) {
llm_category = "Divers".to_string();
}
let cat_key = if llm_category.to_lowercase() == "autre" {
"category_autre".to_string()
} else {
user_categories.iter().position(|c| c.to_lowercase() == llm_category.to_lowercase())
.map(|i| format!("category_{}", i))
.unwrap_or_else(|| "category_autre".to_string())
};
let cat_filled = filled_counts.get(&llm_category).copied().unwrap_or(0);
if cat_filled >= max_items_per_category && llm_category.to_lowercase() != "autre" {
let autre_filled = filled_counts.get("Divers").copied().unwrap_or(0);
if autre_filled >= max_items_per_category {
return None;
}
Some(("category_autre".to_string(), "Divers".to_string(), llm_title, llm_summary))
} else {
Some((cat_key, llm_category, llm_title, llm_summary))
}
}
/// Check if a Phase 2 URL passes all filters.
/// Returns the filter reason if rejected, None if accepted.
async fn filter_phase2_url(
pool: &sqlx::PgPool,
user_id: Uuid,
url: &str,
seen_urls: &std::collections::HashSet<String>,
source_counts: &HashMap<String, usize>,
article_history_days: i32,
max_articles_per_source: usize,
) -> Option<&'static str> {
if let Ok(parsed_url) = url::Url::parse(url) {
let path = parsed_url.path();
if path.is_empty() || path == "/" {
return Some("filtered_homepage");
}
}
if seen_urls.contains(&url.to_lowercase()) {
return Some("filtered_cross_phase_dedup");
}
if article_history_days > 0 {
let hash = hash_article_url(url);
let exists = db::article_history::check_urls_exist(pool, user_id, std::slice::from_ref(&hash)).await.unwrap_or_default();
if exists.contains(&hash) {
return Some("filtered_history");
}
}
if let Some(domain) = extract_domain(url) {
let count = source_counts.get(&domain).copied().unwrap_or(0);
if count >= max_articles_per_source {
return Some("filtered_diversity");
}
}
None
}
/// Normalize an article URL for consistent history hashing.
///
/// Strips fragments, trailing slashes, and known tracking query parameters
/// so that the same article with different UTM tags is recognized as a duplicate.
fn normalize_article_url(url_str: &str) -> String {
let Ok(mut parsed) = url::Url::parse(url_str) else {
return url_str.to_lowercase();
};
// Strip fragment
parsed.set_fragment(None);
// Strip known tracking query parameters
let tracking_params: &[&str] = &[
"utm_source", "utm_medium", "utm_campaign", "utm_term", "utm_content",
"ref", "source", "fbclid", "gclid",
];
let filtered_pairs: Vec<(String, String)> = parsed
.query_pairs()
.filter(|(key, _)| !tracking_params.contains(&key.as_ref()))
.map(|(k, v)| (k.into_owned(), v.into_owned()))
.collect();
if filtered_pairs.is_empty() {
parsed.set_query(None);
} else {
let query_string = filtered_pairs
.iter()
.map(|(k, v)| format!("{}={}", k, v))
.collect::<Vec<_>>()
.join("&");
parsed.set_query(Some(&query_string));
}
// Strip trailing slash (unless path is just "/")
let path = parsed.path().to_string();
if path.len() > 1 && path.ends_with('/') {
parsed.set_path(&path[..path.len() - 1]);
}
parsed.to_string().to_lowercase()
}
/// Compute the hash of a normalized article URL for history lookup.
fn hash_article_url(url: &str) -> String {
let normalized = normalize_article_url(url);
crate::util::token::hash_token(&normalized)
}
/// Decrypt the Brave Search API key for a user.
async fn resolve_brave_key(
state: &AppState,
@ -1408,82 +1248,6 @@ mod tests {
assert_eq!(sanitized, json);
}
// ── normalize_article_url tests ─────────────────────────────
#[test]
fn normalize_strips_fragment() {
assert_eq!(
normalize_article_url("https://example.com/article#section"),
"https://example.com/article"
);
}
#[test]
fn normalize_strips_utm_params() {
assert_eq!(
normalize_article_url("https://example.com/article?utm_source=twitter&utm_medium=social"),
"https://example.com/article"
);
}
#[test]
fn normalize_keeps_non_tracking_params() {
let result = normalize_article_url("https://example.com/search?q=test&utm_source=twitter");
assert!(result.contains("q=test"));
assert!(!result.contains("utm_source"));
}
#[test]
fn normalize_strips_trailing_slash() {
assert_eq!(
normalize_article_url("https://example.com/article/"),
"https://example.com/article"
);
}
#[test]
fn normalize_keeps_root_slash() {
assert_eq!(
normalize_article_url("https://example.com/"),
"https://example.com/"
);
}
#[test]
fn normalize_lowercases() {
assert_eq!(
normalize_article_url("https://Example.COM/Article"),
"https://example.com/article"
);
}
#[test]
fn normalize_strips_fbclid() {
let result = normalize_article_url("https://example.com/post?fbclid=abc123");
assert!(!result.contains("fbclid"));
assert!(!result.contains("?"));
}
#[test]
fn normalize_handles_invalid_url() {
let result = normalize_article_url("not a url at all");
assert_eq!(result, "not a url at all");
}
#[test]
fn hash_article_url_deterministic() {
let h1 = hash_article_url("https://example.com/article?utm_source=twitter");
let h2 = hash_article_url("https://example.com/article?utm_source=newsletter");
assert_eq!(h1, h2, "Same article with different UTM params should hash the same");
}
#[test]
fn hash_article_url_different_articles() {
let h1 = hash_article_url("https://example.com/article-1");
let h2 = hash_article_url("https://example.com/article-2");
assert_ne!(h1, h2);
}
// ── rotate_sources tests ──────────────────────────────────
#[test]
@ -1526,91 +1290,4 @@ mod tests {
assert!(result.ends_with("..."));
}
// ── assign_category tests ───────────────────────────────────
#[test]
fn assign_category_maps_to_correct_category() {
let response = serde_json::json!({
"title": "Test Article",
"summary": "Test summary",
"category": "AI News",
"date": "2026-03-25",
"is_article": true
});
let user_cats = vec!["AI News".to_string(), "Research".to_string()];
let class_cats = vec![
"AI News".to_string(),
"Research".to_string(),
"Divers".to_string(),
];
let filled = std::collections::HashMap::new();
let result =
assign_category(&response, "Fallback Title", &user_cats, &class_cats, &filled, 4);
assert!(result.is_some());
let (cat_key, cat_name, title, _summary) = result.unwrap();
assert_eq!(cat_key, "category_0");
assert_eq!(cat_name, "AI News");
assert_eq!(title, "Test Article");
}
#[test]
fn assign_category_overflows_to_divers() {
let response = serde_json::json!({
"title": "Overflow Article",
"summary": "...",
"category": "AI News",
"date": "",
"is_article": true
});
let user_cats = vec!["AI News".to_string()];
let class_cats = vec!["AI News".to_string(), "Divers".to_string()];
let mut filled = std::collections::HashMap::new();
filled.insert("AI News".to_string(), 4usize); // already full
let result = assign_category(&response, "", &user_cats, &class_cats, &filled, 4);
assert!(result.is_some());
let (cat_key, cat_name, _, _) = result.unwrap();
assert_eq!(cat_key, "category_autre");
assert_eq!(cat_name, "Divers");
}
#[test]
fn assign_category_returns_none_when_all_full() {
let response = serde_json::json!({
"title": "No Room",
"summary": "...",
"category": "AI News",
"date": "",
"is_article": true
});
let user_cats = vec!["AI News".to_string()];
let class_cats = vec!["AI News".to_string(), "Divers".to_string()];
let mut filled = std::collections::HashMap::new();
filled.insert("AI News".to_string(), 4usize);
filled.insert("Divers".to_string(), 4usize);
let result = assign_category(&response, "", &user_cats, &class_cats, &filled, 4);
assert!(result.is_none());
}
#[test]
fn assign_category_unknown_category_maps_to_divers() {
let response = serde_json::json!({
"title": "Unknown Cat",
"summary": "...",
"category": "Nonexistent Category",
"date": "",
"is_article": true
});
let user_cats = vec!["AI News".to_string()];
let class_cats = vec!["AI News".to_string(), "Divers".to_string()];
let filled = std::collections::HashMap::new();
let result = assign_category(&response, "", &user_cats, &class_cats, &filled, 4);
assert!(result.is_some());
let (cat_key, cat_name, _, _) = result.unwrap();
assert_eq!(cat_key, "category_autre");
assert_eq!(cat_name, "Divers");
}
}
Loading…
Cancel
Save