refactor: extract synthesis helpers (assign_category, filter_phase2_url, tracing) into helpers.rs
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>master
parent
b60a55993c
commit
68b1956059
@ -0,0 +1,286 @@
|
||||
//! Helper functions for the synthesis pipeline.
|
||||
//!
|
||||
//! Contains article tracing, URL normalization/hashing, category assignment,
|
||||
//! and Phase 2 URL filtering logic.
|
||||
|
||||
use std::collections::HashMap;
|
||||
|
||||
use uuid::Uuid;
|
||||
|
||||
use crate::db;
|
||||
use crate::util::token::hash_token;
|
||||
|
||||
/// Structured parameters for article history tracing.
|
||||
pub(crate) struct ArticleTrace<'a> {
|
||||
pub url: &'a str,
|
||||
pub title: &'a str,
|
||||
pub source_type: &'a str,
|
||||
pub source_url: Option<&'a str>,
|
||||
pub category: Option<&'a str>,
|
||||
pub synthesis_id: Option<Uuid>,
|
||||
pub status: &'a str,
|
||||
pub scraped_ok: bool,
|
||||
pub published_date: Option<&'a str>,
|
||||
}
|
||||
|
||||
/// Build an article history entry from trace parameters (no DB call).
|
||||
pub(crate) fn build_trace_entry(
|
||||
user_id: Uuid,
|
||||
job_id: Uuid,
|
||||
trace: &ArticleTrace<'_>,
|
||||
) -> db::article_history::ArticleHistoryEntry {
|
||||
db::article_history::ArticleHistoryEntry {
|
||||
user_id,
|
||||
url: trace.url.to_string(),
|
||||
url_hash: hash_article_url(trace.url),
|
||||
title: trace.title.to_string(),
|
||||
source_type: trace.source_type.to_string(),
|
||||
source_url: trace.source_url.map(|s| s.to_string()),
|
||||
category: trace.category.map(|s| s.to_string()),
|
||||
synthesis_id: trace.synthesis_id,
|
||||
status: trace.status.to_string(),
|
||||
scraped_ok: trace.scraped_ok,
|
||||
job_id,
|
||||
published_date: trace.published_date.map(|s| s.to_string()),
|
||||
}
|
||||
}
|
||||
|
||||
/// Extract the domain (host) from a URL, or None if unparseable.
|
||||
pub(crate) fn extract_domain(url: &str) -> Option<String> {
|
||||
url::Url::parse(url)
|
||||
.ok()
|
||||
.and_then(|u| u.host_str().map(|h| h.to_lowercase()))
|
||||
}
|
||||
|
||||
/// Assign an article to a category based on LLM classification response.
|
||||
/// Returns `Some((cat_key, cat_name, title, summary))` or `None` if all categories full.
|
||||
pub(crate) fn assign_category(
|
||||
llm_response: &serde_json::Value,
|
||||
page_title: &str,
|
||||
user_categories: &[String],
|
||||
classification_categories: &[String],
|
||||
filled_counts: &HashMap<String, usize>,
|
||||
max_items_per_category: usize,
|
||||
) -> Option<(String, String, String, String)> {
|
||||
let llm_title = llm_response.get("title").and_then(|t| t.as_str()).unwrap_or(page_title).to_string();
|
||||
let llm_summary = llm_response.get("summary").and_then(|s| s.as_str()).unwrap_or("").to_string();
|
||||
let mut llm_category = llm_response.get("category").and_then(|c| c.as_str()).unwrap_or("Divers").to_string();
|
||||
|
||||
if !classification_categories.iter().any(|c| c.to_lowercase() == llm_category.to_lowercase()) {
|
||||
llm_category = "Divers".to_string();
|
||||
}
|
||||
|
||||
let cat_key = if llm_category.to_lowercase() == "autre" {
|
||||
"category_autre".to_string()
|
||||
} else {
|
||||
user_categories.iter().position(|c| c.to_lowercase() == llm_category.to_lowercase())
|
||||
.map(|i| format!("category_{}", i))
|
||||
.unwrap_or_else(|| "category_autre".to_string())
|
||||
};
|
||||
|
||||
let cat_filled = filled_counts.get(&llm_category).copied().unwrap_or(0);
|
||||
if cat_filled >= max_items_per_category && llm_category.to_lowercase() != "autre" {
|
||||
let autre_filled = filled_counts.get("Divers").copied().unwrap_or(0);
|
||||
if autre_filled >= max_items_per_category {
|
||||
return None;
|
||||
}
|
||||
Some(("category_autre".to_string(), "Divers".to_string(), llm_title, llm_summary))
|
||||
} else {
|
||||
Some((cat_key, llm_category, llm_title, llm_summary))
|
||||
}
|
||||
}
|
||||
|
||||
/// Check if a Phase 2 URL passes all filters.
|
||||
/// Returns the filter reason if rejected, None if accepted.
|
||||
pub(crate) async fn filter_phase2_url(
|
||||
pool: &sqlx::PgPool,
|
||||
user_id: Uuid,
|
||||
url: &str,
|
||||
seen_urls: &std::collections::HashSet<String>,
|
||||
source_counts: &HashMap<String, usize>,
|
||||
article_history_days: i32,
|
||||
max_articles_per_source: usize,
|
||||
) -> Option<&'static str> {
|
||||
if let Ok(parsed_url) = url::Url::parse(url) {
|
||||
let path = parsed_url.path();
|
||||
if path.is_empty() || path == "/" {
|
||||
return Some("filtered_homepage");
|
||||
}
|
||||
}
|
||||
if seen_urls.contains(&url.to_lowercase()) {
|
||||
return Some("filtered_cross_phase_dedup");
|
||||
}
|
||||
if article_history_days > 0 {
|
||||
let hash = hash_article_url(url);
|
||||
let exists = db::article_history::check_urls_exist(pool, user_id, std::slice::from_ref(&hash)).await.unwrap_or_default();
|
||||
if exists.contains(&hash) {
|
||||
return Some("filtered_history");
|
||||
}
|
||||
}
|
||||
if let Some(domain) = extract_domain(url) {
|
||||
let count = source_counts.get(&domain).copied().unwrap_or(0);
|
||||
if count >= max_articles_per_source {
|
||||
return Some("filtered_diversity");
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// Normalize an article URL for consistent history hashing.
|
||||
///
|
||||
/// Strips fragments, trailing slashes, and known tracking query parameters
|
||||
/// so that the same article with different UTM tags is recognized as a duplicate.
|
||||
pub(crate) fn normalize_article_url(url_str: &str) -> String {
|
||||
let Ok(mut parsed) = url::Url::parse(url_str) else {
|
||||
return url_str.to_lowercase();
|
||||
};
|
||||
|
||||
// Strip fragment
|
||||
parsed.set_fragment(None);
|
||||
|
||||
// Strip known tracking query parameters
|
||||
let tracking_params: &[&str] = &[
|
||||
"utm_source", "utm_medium", "utm_campaign", "utm_term", "utm_content",
|
||||
"ref", "source", "fbclid", "gclid",
|
||||
];
|
||||
|
||||
let filtered_pairs: Vec<(String, String)> = parsed
|
||||
.query_pairs()
|
||||
.filter(|(key, _)| !tracking_params.contains(&key.as_ref()))
|
||||
.map(|(k, v)| (k.into_owned(), v.into_owned()))
|
||||
.collect();
|
||||
|
||||
if filtered_pairs.is_empty() {
|
||||
parsed.set_query(None);
|
||||
} else {
|
||||
let query_string = filtered_pairs
|
||||
.iter()
|
||||
.map(|(k, v)| format!("{}={}", k, v))
|
||||
.collect::<Vec<_>>()
|
||||
.join("&");
|
||||
parsed.set_query(Some(&query_string));
|
||||
}
|
||||
|
||||
// Strip trailing slash (unless path is just "/")
|
||||
let path = parsed.path().to_string();
|
||||
if path.len() > 1 && path.ends_with('/') {
|
||||
parsed.set_path(&path[..path.len() - 1]);
|
||||
}
|
||||
|
||||
parsed.to_string().to_lowercase()
|
||||
}
|
||||
|
||||
/// Compute the hash of a normalized article URL for history lookup.
|
||||
pub(crate) fn hash_article_url(url: &str) -> String {
|
||||
let normalized = normalize_article_url(url);
|
||||
hash_token(&normalized)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
// ── hash_article_url tests ─────────────────────────────────────
|
||||
|
||||
#[test]
|
||||
fn hash_article_url_deterministic() {
|
||||
let h1 = hash_article_url("https://example.com/article?utm_source=twitter");
|
||||
let h2 = hash_article_url("https://example.com/article?utm_source=newsletter");
|
||||
assert_eq!(h1, h2, "Same article with different UTM params should hash the same");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn hash_article_url_different_articles() {
|
||||
let h1 = hash_article_url("https://example.com/article-1");
|
||||
let h2 = hash_article_url("https://example.com/article-2");
|
||||
assert_ne!(h1, h2);
|
||||
}
|
||||
|
||||
// ── assign_category tests ───────────────────────────────────
|
||||
|
||||
#[test]
|
||||
fn assign_category_maps_to_correct_category() {
|
||||
let response = serde_json::json!({
|
||||
"title": "Test Article",
|
||||
"summary": "Test summary",
|
||||
"category": "AI News",
|
||||
"date": "2026-03-25",
|
||||
"is_article": true
|
||||
});
|
||||
let user_cats = vec!["AI News".to_string(), "Research".to_string()];
|
||||
let class_cats = vec![
|
||||
"AI News".to_string(),
|
||||
"Research".to_string(),
|
||||
"Divers".to_string(),
|
||||
];
|
||||
let filled = std::collections::HashMap::new();
|
||||
|
||||
let result =
|
||||
assign_category(&response, "Fallback Title", &user_cats, &class_cats, &filled, 4);
|
||||
assert!(result.is_some());
|
||||
let (cat_key, cat_name, title, _summary) = result.unwrap();
|
||||
assert_eq!(cat_key, "category_0");
|
||||
assert_eq!(cat_name, "AI News");
|
||||
assert_eq!(title, "Test Article");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn assign_category_overflows_to_divers() {
|
||||
let response = serde_json::json!({
|
||||
"title": "Overflow Article",
|
||||
"summary": "...",
|
||||
"category": "AI News",
|
||||
"date": "",
|
||||
"is_article": true
|
||||
});
|
||||
let user_cats = vec!["AI News".to_string()];
|
||||
let class_cats = vec!["AI News".to_string(), "Divers".to_string()];
|
||||
let mut filled = std::collections::HashMap::new();
|
||||
filled.insert("AI News".to_string(), 4usize); // already full
|
||||
|
||||
let result = assign_category(&response, "", &user_cats, &class_cats, &filled, 4);
|
||||
assert!(result.is_some());
|
||||
let (cat_key, cat_name, _, _) = result.unwrap();
|
||||
assert_eq!(cat_key, "category_autre");
|
||||
assert_eq!(cat_name, "Divers");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn assign_category_returns_none_when_all_full() {
|
||||
let response = serde_json::json!({
|
||||
"title": "No Room",
|
||||
"summary": "...",
|
||||
"category": "AI News",
|
||||
"date": "",
|
||||
"is_article": true
|
||||
});
|
||||
let user_cats = vec!["AI News".to_string()];
|
||||
let class_cats = vec!["AI News".to_string(), "Divers".to_string()];
|
||||
let mut filled = std::collections::HashMap::new();
|
||||
filled.insert("AI News".to_string(), 4usize);
|
||||
filled.insert("Divers".to_string(), 4usize);
|
||||
|
||||
let result = assign_category(&response, "", &user_cats, &class_cats, &filled, 4);
|
||||
assert!(result.is_none());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn assign_category_unknown_category_maps_to_divers() {
|
||||
let response = serde_json::json!({
|
||||
"title": "Unknown Cat",
|
||||
"summary": "...",
|
||||
"category": "Nonexistent Category",
|
||||
"date": "",
|
||||
"is_article": true
|
||||
});
|
||||
let user_cats = vec!["AI News".to_string()];
|
||||
let class_cats = vec!["AI News".to_string(), "Divers".to_string()];
|
||||
let filled = std::collections::HashMap::new();
|
||||
|
||||
let result = assign_category(&response, "", &user_cats, &class_cats, &filled, 4);
|
||||
assert!(result.is_some());
|
||||
let (cat_key, cat_name, _, _) = result.unwrap();
|
||||
assert_eq!(cat_key, "category_autre");
|
||||
assert_eq!(cat_name, "Divers");
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue