feat: add normalize_article_url and hash_article_url utilities

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
master
oabrivard 3 months ago
parent 5a928aa990
commit 0a87b7ed8f

@ -963,6 +963,56 @@ fn extract_domain(url: &str) -> Option<String> {
.and_then(|u| u.host_str().map(|h| h.to_lowercase())) .and_then(|u| u.host_str().map(|h| h.to_lowercase()))
} }
/// Normalize an article URL for consistent history hashing.
///
/// Strips the fragment, a single trailing slash (except on the root path),
/// and known tracking query parameters so that the same article shared with
/// different UTM tags is recognized as a duplicate. The result is lowercased
/// as a whole, so URLs that differ only by letter case compare equal.
///
/// Tracking parameter names are matched ASCII case-insensitively. The
/// remaining parameters are re-serialized with form-urlencoding, so reserved
/// characters inside a key or value (`&`, `=`, `+`, ...) stay escaped instead
/// of being re-emitted as separators.
///
/// Input that cannot be parsed as a URL is returned lowercased and otherwise
/// unchanged.
fn normalize_article_url(url_str: &str) -> String {
    // Query parameters that only carry campaign / referrer tracking data.
    const TRACKING_PARAMS: &[&str] = &[
        "utm_source", "utm_medium", "utm_campaign", "utm_term", "utm_content",
        "ref", "source", "fbclid", "gclid",
    ];

    let Ok(mut parsed) = url::Url::parse(url_str) else {
        return url_str.to_lowercase();
    };

    parsed.set_fragment(None);

    // Collect owned pairs first: `query_pairs` borrows `parsed`, and that
    // borrow has to end before the query can be rewritten below.
    let kept_pairs: Vec<(String, String)> = parsed
        .query_pairs()
        .filter(|(key, _)| {
            // Case-insensitive so `UTM_Source=...` is dropped too; otherwise it
            // would survive and be lowercased into `utm_source=...` at the end.
            !TRACKING_PARAMS
                .iter()
                .any(|&name| key.eq_ignore_ascii_case(name))
        })
        .map(|(k, v)| (k.into_owned(), v.into_owned()))
        .collect();

    if kept_pairs.is_empty() {
        // Drop the query entirely so no dangling `?` is left behind.
        parsed.set_query(None);
    } else {
        // `query_pairs` yields percent-decoded text. Writing it back through the
        // form-urlencoded serializer re-escapes it; a plain `k=v` join would let
        // a decoded `&` or `=` split one parameter into several.
        parsed.query_pairs_mut().clear().extend_pairs(&kept_pairs);
    }

    // Strip one trailing slash, but keep the root path "/" intact.
    let path = parsed.path().to_string();
    if path.len() > 1 {
        if let Some(trimmed) = path.strip_suffix('/') {
            parsed.set_path(trimmed);
        }
    }

    parsed.to_string().to_lowercase()
}
/// Hash an article URL for history lookup.
///
/// The URL is first passed through `normalize_article_url`, and the
/// normalized string is then handed to `crate::util::token::hash_token`.
fn hash_article_url(url: &str) -> String {
    crate::util::token::hash_token(&normalize_article_url(url))
}
/// Resolve the LLM provider and decrypt the user's API key. /// Resolve the LLM provider and decrypt the user's API key.
/// ///
/// If the user has a preferred provider in settings, looks for a key matching /// If the user has a preferred provider in settings, looks for a key matching
@ -2306,4 +2356,80 @@ mod tests {
let result = parse_classification_response(&response, &articles, &categories, 4, &mut filled); let result = parse_classification_response(&response, &articles, &categories, 4, &mut filled);
assert_eq!(result.get("category_0").map(|v| v.len()), Some(1)); assert_eq!(result.get("category_0").map(|v| v.len()), Some(1));
} }
// ── normalize_article_url tests ─────────────────────────────
/// A `#fragment` suffix must not survive normalization.
#[test]
fn normalize_strips_fragment() {
    let normalized = normalize_article_url("https://example.com/article#section");
    assert_eq!(normalized, "https://example.com/article");
}
/// A query made up solely of UTM parameters is removed completely.
#[test]
fn normalize_strips_utm_params() {
    let tagged = "https://example.com/article?utm_source=twitter&utm_medium=social";
    let normalized = normalize_article_url(tagged);
    assert_eq!(normalized, "https://example.com/article");
}
/// Ordinary query parameters are kept while tracking ones are dropped.
#[test]
fn normalize_keeps_non_tracking_params() {
    let normalized =
        normalize_article_url("https://example.com/search?q=test&utm_source=twitter");
    assert!(normalized.contains("q=test"));
    assert!(!normalized.contains("utm_source"));
}
/// A trailing slash on a non-root path is removed.
#[test]
fn normalize_strips_trailing_slash() {
    let normalized = normalize_article_url("https://example.com/article/");
    assert_eq!(normalized, "https://example.com/article");
}
/// The root path "/" is left as-is rather than being stripped to nothing.
#[test]
fn normalize_keeps_root_slash() {
    let normalized = normalize_article_url("https://example.com/");
    assert_eq!(normalized, "https://example.com/");
}
/// Both the host and the path come back lowercased.
#[test]
fn normalize_lowercases() {
    let normalized = normalize_article_url("https://Example.COM/Article");
    assert_eq!(normalized, "https://example.com/article");
}
/// `fbclid` is removed, and with it the now-empty query separator.
#[test]
fn normalize_strips_fbclid() {
    let normalized = normalize_article_url("https://example.com/post?fbclid=abc123");
    assert!(!normalized.contains("fbclid"));
    assert!(!normalized.contains("?"));
}
/// Unparseable input falls back to the lowercased input string.
#[test]
fn normalize_handles_invalid_url() {
    let fallback = normalize_article_url("not a url at all");
    assert_eq!(fallback, "not a url at all");
}
/// URLs that differ only in their UTM tag must produce the same hash.
#[test]
fn hash_article_url_deterministic() {
    let via_twitter = hash_article_url("https://example.com/article?utm_source=twitter");
    let via_newsletter = hash_article_url("https://example.com/article?utm_source=newsletter");
    assert_eq!(
        via_twitter, via_newsletter,
        "Same article with different UTM params should hash the same"
    );
}
/// Distinct article paths must not hash to the same value.
#[test]
fn hash_article_url_different_articles() {
    let first = hash_article_url("https://example.com/article-1");
    let second = hash_article_url("https://example.com/article-2");
    assert_ne!(first, second);
}
} }

Loading…
Cancel
Save