From 0a87b7ed8f89137c86b4476419a0ac750791a330 Mon Sep 17 00:00:00 2001 From: oabrivard Date: Tue, 24 Mar 2026 12:00:03 +0100 Subject: [PATCH] feat: add normalize_article_url and hash_article_url utilities Co-Authored-By: Claude Sonnet 4.6 --- backend/src/services/synthesis.rs | 126 ++++++++++++++++++++++++++++++ 1 file changed, 126 insertions(+) diff --git a/backend/src/services/synthesis.rs b/backend/src/services/synthesis.rs index a0c6992..a09201d 100644 --- a/backend/src/services/synthesis.rs +++ b/backend/src/services/synthesis.rs @@ -963,6 +963,56 @@ fn extract_domain(url: &str) -> Option { .and_then(|u| u.host_str().map(|h| h.to_lowercase())) } +/// Normalize an article URL for consistent history hashing. +/// +/// Strips fragments, trailing slashes, and known tracking query parameters +/// so that the same article with different UTM tags is recognized as a duplicate. +fn normalize_article_url(url_str: &str) -> String { + let Ok(mut parsed) = url::Url::parse(url_str) else { + return url_str.to_lowercase(); + }; + + // Strip fragment + parsed.set_fragment(None); + + // Strip known tracking query parameters + let tracking_params: &[&str] = &[ + "utm_source", "utm_medium", "utm_campaign", "utm_term", "utm_content", + "ref", "source", "fbclid", "gclid", + ]; + + let filtered_pairs: Vec<(String, String)> = parsed + .query_pairs() + .filter(|(key, _)| !tracking_params.contains(&key.as_ref())) + .map(|(k, v)| (k.into_owned(), v.into_owned())) + .collect(); + + if filtered_pairs.is_empty() { + parsed.set_query(None); + } else { + let query_string = filtered_pairs + .iter() + .map(|(k, v)| format!("{}={}", k, v)) + .collect::>() + .join("&"); + parsed.set_query(Some(&query_string)); + } + + // Strip trailing slash (unless path is just "/") + let path = parsed.path().to_string(); + if path.len() > 1 && path.ends_with('/') { + parsed.set_path(&path[..path.len() - 1]); + } + + parsed.to_string().to_lowercase() +} + +/// Compute the hash of a normalized article URL for history lookup. +fn hash_article_url(url: &str) -> String { + let normalized = normalize_article_url(url); + crate::util::token::hash_token(&normalized) +} + /// Resolve the LLM provider and decrypt the user's API key. /// /// If the user has a preferred provider in settings, looks for a key matching @@ -2306,4 +2356,80 @@ mod tests { let result = parse_classification_response(&response, &articles, &categories, 4, &mut filled); assert_eq!(result.get("category_0").map(|v| v.len()), Some(1)); } + + // ── normalize_article_url tests ───────────────────────────── + + #[test] + fn normalize_strips_fragment() { + assert_eq!( + normalize_article_url("https://example.com/article#section"), + "https://example.com/article" + ); + } + + #[test] + fn normalize_strips_utm_params() { + assert_eq!( + normalize_article_url("https://example.com/article?utm_source=twitter&utm_medium=social"), + "https://example.com/article" + ); + } + + #[test] + fn normalize_keeps_non_tracking_params() { + let result = normalize_article_url("https://example.com/search?q=test&utm_source=twitter"); + assert!(result.contains("q=test")); + assert!(!result.contains("utm_source")); + } + + #[test] + fn normalize_strips_trailing_slash() { + assert_eq!( + normalize_article_url("https://example.com/article/"), + "https://example.com/article" + ); + } + + #[test] + fn normalize_keeps_root_slash() { + assert_eq!( + normalize_article_url("https://example.com/"), + "https://example.com/" + ); + } + + #[test] + fn normalize_lowercases() { + assert_eq!( + normalize_article_url("https://Example.COM/Article"), + "https://example.com/article" + ); + } + + #[test] + fn normalize_strips_fbclid() { + let result = normalize_article_url("https://example.com/post?fbclid=abc123"); + assert!(!result.contains("fbclid")); + assert!(!result.contains("?")); + } + + #[test] + fn normalize_handles_invalid_url() { + let result = normalize_article_url("not a url at all"); + assert_eq!(result, "not a url at all"); + } + + #[test] + fn hash_article_url_deterministic() { + let h1 = hash_article_url("https://example.com/article?utm_source=twitter"); + let h2 = hash_article_url("https://example.com/article?utm_source=newsletter"); + assert_eq!(h1, h2, "Same article with different UTM params should hash the same"); + } + + #[test] + fn hash_article_url_different_articles() { + let h1 = hash_article_url("https://example.com/article-1"); + let h2 = hash_article_url("https://example.com/article-2"); + assert_ne!(h1, h2); + } }