refactor: extract synthesis helpers (assign_category, filter_phase2_url, tracing) into helpers.rs
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>master
parent
b60a55993c
commit
68b1956059
@ -0,0 +1,286 @@
|
|||||||
|
//! Helper functions for the synthesis pipeline.
|
||||||
|
//!
|
||||||
|
//! Contains article tracing, URL normalization/hashing, category assignment,
|
||||||
|
//! and Phase 2 URL filtering logic.
|
||||||
|
|
||||||
|
use std::collections::HashMap;
|
||||||
|
|
||||||
|
use uuid::Uuid;
|
||||||
|
|
||||||
|
use crate::db;
|
||||||
|
use crate::util::token::hash_token;
|
||||||
|
|
||||||
|
/// Structured parameters for article history tracing.
|
||||||
|
pub(crate) struct ArticleTrace<'a> {
|
||||||
|
pub url: &'a str,
|
||||||
|
pub title: &'a str,
|
||||||
|
pub source_type: &'a str,
|
||||||
|
pub source_url: Option<&'a str>,
|
||||||
|
pub category: Option<&'a str>,
|
||||||
|
pub synthesis_id: Option<Uuid>,
|
||||||
|
pub status: &'a str,
|
||||||
|
pub scraped_ok: bool,
|
||||||
|
pub published_date: Option<&'a str>,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Build an article history entry from trace parameters (no DB call).
|
||||||
|
pub(crate) fn build_trace_entry(
|
||||||
|
user_id: Uuid,
|
||||||
|
job_id: Uuid,
|
||||||
|
trace: &ArticleTrace<'_>,
|
||||||
|
) -> db::article_history::ArticleHistoryEntry {
|
||||||
|
db::article_history::ArticleHistoryEntry {
|
||||||
|
user_id,
|
||||||
|
url: trace.url.to_string(),
|
||||||
|
url_hash: hash_article_url(trace.url),
|
||||||
|
title: trace.title.to_string(),
|
||||||
|
source_type: trace.source_type.to_string(),
|
||||||
|
source_url: trace.source_url.map(|s| s.to_string()),
|
||||||
|
category: trace.category.map(|s| s.to_string()),
|
||||||
|
synthesis_id: trace.synthesis_id,
|
||||||
|
status: trace.status.to_string(),
|
||||||
|
scraped_ok: trace.scraped_ok,
|
||||||
|
job_id,
|
||||||
|
published_date: trace.published_date.map(|s| s.to_string()),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Extract the domain (host) from a URL, or None if unparseable.
|
||||||
|
pub(crate) fn extract_domain(url: &str) -> Option<String> {
|
||||||
|
url::Url::parse(url)
|
||||||
|
.ok()
|
||||||
|
.and_then(|u| u.host_str().map(|h| h.to_lowercase()))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Assign an article to a category based on LLM classification response.
|
||||||
|
/// Returns `Some((cat_key, cat_name, title, summary))` or `None` if all categories full.
|
||||||
|
pub(crate) fn assign_category(
|
||||||
|
llm_response: &serde_json::Value,
|
||||||
|
page_title: &str,
|
||||||
|
user_categories: &[String],
|
||||||
|
classification_categories: &[String],
|
||||||
|
filled_counts: &HashMap<String, usize>,
|
||||||
|
max_items_per_category: usize,
|
||||||
|
) -> Option<(String, String, String, String)> {
|
||||||
|
let llm_title = llm_response.get("title").and_then(|t| t.as_str()).unwrap_or(page_title).to_string();
|
||||||
|
let llm_summary = llm_response.get("summary").and_then(|s| s.as_str()).unwrap_or("").to_string();
|
||||||
|
let mut llm_category = llm_response.get("category").and_then(|c| c.as_str()).unwrap_or("Divers").to_string();
|
||||||
|
|
||||||
|
if !classification_categories.iter().any(|c| c.to_lowercase() == llm_category.to_lowercase()) {
|
||||||
|
llm_category = "Divers".to_string();
|
||||||
|
}
|
||||||
|
|
||||||
|
let cat_key = if llm_category.to_lowercase() == "autre" {
|
||||||
|
"category_autre".to_string()
|
||||||
|
} else {
|
||||||
|
user_categories.iter().position(|c| c.to_lowercase() == llm_category.to_lowercase())
|
||||||
|
.map(|i| format!("category_{}", i))
|
||||||
|
.unwrap_or_else(|| "category_autre".to_string())
|
||||||
|
};
|
||||||
|
|
||||||
|
let cat_filled = filled_counts.get(&llm_category).copied().unwrap_or(0);
|
||||||
|
if cat_filled >= max_items_per_category && llm_category.to_lowercase() != "autre" {
|
||||||
|
let autre_filled = filled_counts.get("Divers").copied().unwrap_or(0);
|
||||||
|
if autre_filled >= max_items_per_category {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
Some(("category_autre".to_string(), "Divers".to_string(), llm_title, llm_summary))
|
||||||
|
} else {
|
||||||
|
Some((cat_key, llm_category, llm_title, llm_summary))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Check if a Phase 2 URL passes all filters.
|
||||||
|
/// Returns the filter reason if rejected, None if accepted.
|
||||||
|
pub(crate) async fn filter_phase2_url(
|
||||||
|
pool: &sqlx::PgPool,
|
||||||
|
user_id: Uuid,
|
||||||
|
url: &str,
|
||||||
|
seen_urls: &std::collections::HashSet<String>,
|
||||||
|
source_counts: &HashMap<String, usize>,
|
||||||
|
article_history_days: i32,
|
||||||
|
max_articles_per_source: usize,
|
||||||
|
) -> Option<&'static str> {
|
||||||
|
if let Ok(parsed_url) = url::Url::parse(url) {
|
||||||
|
let path = parsed_url.path();
|
||||||
|
if path.is_empty() || path == "/" {
|
||||||
|
return Some("filtered_homepage");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if seen_urls.contains(&url.to_lowercase()) {
|
||||||
|
return Some("filtered_cross_phase_dedup");
|
||||||
|
}
|
||||||
|
if article_history_days > 0 {
|
||||||
|
let hash = hash_article_url(url);
|
||||||
|
let exists = db::article_history::check_urls_exist(pool, user_id, std::slice::from_ref(&hash)).await.unwrap_or_default();
|
||||||
|
if exists.contains(&hash) {
|
||||||
|
return Some("filtered_history");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if let Some(domain) = extract_domain(url) {
|
||||||
|
let count = source_counts.get(&domain).copied().unwrap_or(0);
|
||||||
|
if count >= max_articles_per_source {
|
||||||
|
return Some("filtered_diversity");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
None
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Normalize an article URL for consistent history hashing.
|
||||||
|
///
|
||||||
|
/// Strips fragments, trailing slashes, and known tracking query parameters
|
||||||
|
/// so that the same article with different UTM tags is recognized as a duplicate.
|
||||||
|
pub(crate) fn normalize_article_url(url_str: &str) -> String {
|
||||||
|
let Ok(mut parsed) = url::Url::parse(url_str) else {
|
||||||
|
return url_str.to_lowercase();
|
||||||
|
};
|
||||||
|
|
||||||
|
// Strip fragment
|
||||||
|
parsed.set_fragment(None);
|
||||||
|
|
||||||
|
// Strip known tracking query parameters
|
||||||
|
let tracking_params: &[&str] = &[
|
||||||
|
"utm_source", "utm_medium", "utm_campaign", "utm_term", "utm_content",
|
||||||
|
"ref", "source", "fbclid", "gclid",
|
||||||
|
];
|
||||||
|
|
||||||
|
let filtered_pairs: Vec<(String, String)> = parsed
|
||||||
|
.query_pairs()
|
||||||
|
.filter(|(key, _)| !tracking_params.contains(&key.as_ref()))
|
||||||
|
.map(|(k, v)| (k.into_owned(), v.into_owned()))
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
if filtered_pairs.is_empty() {
|
||||||
|
parsed.set_query(None);
|
||||||
|
} else {
|
||||||
|
let query_string = filtered_pairs
|
||||||
|
.iter()
|
||||||
|
.map(|(k, v)| format!("{}={}", k, v))
|
||||||
|
.collect::<Vec<_>>()
|
||||||
|
.join("&");
|
||||||
|
parsed.set_query(Some(&query_string));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Strip trailing slash (unless path is just "/")
|
||||||
|
let path = parsed.path().to_string();
|
||||||
|
if path.len() > 1 && path.ends_with('/') {
|
||||||
|
parsed.set_path(&path[..path.len() - 1]);
|
||||||
|
}
|
||||||
|
|
||||||
|
parsed.to_string().to_lowercase()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Compute the hash of a normalized article URL for history lookup.
|
||||||
|
pub(crate) fn hash_article_url(url: &str) -> String {
|
||||||
|
let normalized = normalize_article_url(url);
|
||||||
|
hash_token(&normalized)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds an owned category list from string literals.
    fn categories(names: &[&str]) -> Vec<String> {
        names.iter().map(|name| name.to_string()).collect()
    }

    /// Builds a `filled_counts` map from `(category, count)` pairs.
    fn filled_with(entries: &[(&str, usize)]) -> std::collections::HashMap<String, usize> {
        entries
            .iter()
            .map(|(name, count)| (name.to_string(), *count))
            .collect()
    }

    /// Builds an LLM classification response with the given fields.
    fn llm_response(title: &str, summary: &str, category: &str, date: &str) -> serde_json::Value {
        serde_json::json!({
            "title": title,
            "summary": summary,
            "category": category,
            "date": date,
            "is_article": true
        })
    }

    // ── hash_article_url tests ─────────────────────────────────────

    #[test]
    fn hash_article_url_deterministic() {
        let from_twitter = hash_article_url("https://example.com/article?utm_source=twitter");
        let from_newsletter =
            hash_article_url("https://example.com/article?utm_source=newsletter");
        assert_eq!(
            from_twitter, from_newsletter,
            "Same article with different UTM params should hash the same"
        );
    }

    #[test]
    fn hash_article_url_different_articles() {
        let first = hash_article_url("https://example.com/article-1");
        let second = hash_article_url("https://example.com/article-2");
        assert_ne!(first, second);
    }

    // ── assign_category tests ───────────────────────────────────

    #[test]
    fn assign_category_maps_to_correct_category() {
        let response = llm_response("Test Article", "Test summary", "AI News", "2026-03-25");
        let user_cats = categories(&["AI News", "Research"]);
        let class_cats = categories(&["AI News", "Research", "Divers"]);
        let filled = filled_with(&[]);

        let result =
            assign_category(&response, "Fallback Title", &user_cats, &class_cats, &filled, 4);

        assert!(result.is_some());
        let (cat_key, cat_name, title, _summary) = result.unwrap();
        assert_eq!(cat_key, "category_0");
        assert_eq!(cat_name, "AI News");
        assert_eq!(title, "Test Article");
    }

    #[test]
    fn assign_category_overflows_to_divers() {
        let response = llm_response("Overflow Article", "...", "AI News", "");
        let user_cats = categories(&["AI News"]);
        let class_cats = categories(&["AI News", "Divers"]);
        // "AI News" is already at the cap of 4.
        let filled = filled_with(&[("AI News", 4)]);

        let result = assign_category(&response, "", &user_cats, &class_cats, &filled, 4);

        assert!(result.is_some());
        let (cat_key, cat_name, _, _) = result.unwrap();
        assert_eq!(cat_key, "category_autre");
        assert_eq!(cat_name, "Divers");
    }

    #[test]
    fn assign_category_returns_none_when_all_full() {
        let response = llm_response("No Room", "...", "AI News", "");
        let user_cats = categories(&["AI News"]);
        let class_cats = categories(&["AI News", "Divers"]);
        let filled = filled_with(&[("AI News", 4), ("Divers", 4)]);

        let result = assign_category(&response, "", &user_cats, &class_cats, &filled, 4);

        assert!(result.is_none());
    }

    #[test]
    fn assign_category_unknown_category_maps_to_divers() {
        let response = llm_response("Unknown Cat", "...", "Nonexistent Category", "");
        let user_cats = categories(&["AI News"]);
        let class_cats = categories(&["AI News", "Divers"]);
        let filled = filled_with(&[]);

        let result = assign_category(&response, "", &user_cats, &class_cats, &filled, 4);

        assert!(result.is_some());
        let (cat_key, cat_name, _, _) = result.unwrap();
        assert_eq!(cat_key, "category_autre");
        assert_eq!(cat_name, "Divers");
    }
}
|
||||||
Loading…
Reference in New Issue