|
|
|
@ -963,6 +963,56 @@ fn extract_domain(url: &str) -> Option<String> {
|
|
|
|
.and_then(|u| u.host_str().map(|h| h.to_lowercase()))
|
|
|
|
.and_then(|u| u.host_str().map(|h| h.to_lowercase()))
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/// Normalize an article URL for consistent history hashing.
|
|
|
|
|
|
|
|
///
|
|
|
|
|
|
|
|
/// Strips fragments, trailing slashes, and known tracking query parameters
|
|
|
|
|
|
|
|
/// so that the same article with different UTM tags is recognized as a duplicate.
|
|
|
|
|
|
|
|
fn normalize_article_url(url_str: &str) -> String {
|
|
|
|
|
|
|
|
let Ok(mut parsed) = url::Url::parse(url_str) else {
|
|
|
|
|
|
|
|
return url_str.to_lowercase();
|
|
|
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Strip fragment
|
|
|
|
|
|
|
|
parsed.set_fragment(None);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Strip known tracking query parameters
|
|
|
|
|
|
|
|
let tracking_params: &[&str] = &[
|
|
|
|
|
|
|
|
"utm_source", "utm_medium", "utm_campaign", "utm_term", "utm_content",
|
|
|
|
|
|
|
|
"ref", "source", "fbclid", "gclid",
|
|
|
|
|
|
|
|
];
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
let filtered_pairs: Vec<(String, String)> = parsed
|
|
|
|
|
|
|
|
.query_pairs()
|
|
|
|
|
|
|
|
.filter(|(key, _)| !tracking_params.contains(&key.as_ref()))
|
|
|
|
|
|
|
|
.map(|(k, v)| (k.into_owned(), v.into_owned()))
|
|
|
|
|
|
|
|
.collect();
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if filtered_pairs.is_empty() {
|
|
|
|
|
|
|
|
parsed.set_query(None);
|
|
|
|
|
|
|
|
} else {
|
|
|
|
|
|
|
|
let query_string = filtered_pairs
|
|
|
|
|
|
|
|
.iter()
|
|
|
|
|
|
|
|
.map(|(k, v)| format!("{}={}", k, v))
|
|
|
|
|
|
|
|
.collect::<Vec<_>>()
|
|
|
|
|
|
|
|
.join("&");
|
|
|
|
|
|
|
|
parsed.set_query(Some(&query_string));
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Strip trailing slash (unless path is just "/")
|
|
|
|
|
|
|
|
let path = parsed.path().to_string();
|
|
|
|
|
|
|
|
if path.len() > 1 && path.ends_with('/') {
|
|
|
|
|
|
|
|
parsed.set_path(&path[..path.len() - 1]);
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
parsed.to_string().to_lowercase()
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/// Compute the hash of a normalized article URL for history lookup.
|
|
|
|
|
|
|
|
fn hash_article_url(url: &str) -> String {
|
|
|
|
|
|
|
|
let normalized = normalize_article_url(url);
|
|
|
|
|
|
|
|
crate::util::token::hash_token(&normalized)
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/// Resolve the LLM provider and decrypt the user's API key.
|
|
|
|
/// Resolve the LLM provider and decrypt the user's API key.
|
|
|
|
///
|
|
|
|
///
|
|
|
|
/// If the user has a preferred provider in settings, looks for a key matching
|
|
|
|
/// If the user has a preferred provider in settings, looks for a key matching
|
|
|
|
@ -2306,4 +2356,80 @@ mod tests {
|
|
|
|
let result = parse_classification_response(&response, &articles, &categories, 4, &mut filled);
|
|
|
|
let result = parse_classification_response(&response, &articles, &categories, 4, &mut filled);
|
|
|
|
assert_eq!(result.get("category_0").map(|v| v.len()), Some(1));
|
|
|
|
assert_eq!(result.get("category_0").map(|v| v.len()), Some(1));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// ── normalize_article_url tests ─────────────────────────────
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#[test]
fn normalize_strips_fragment() {
    // The `#section` fragment must not survive normalization.
    let normalized = normalize_article_url("https://example.com/article#section");
    assert_eq!(normalized, "https://example.com/article");
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#[test]
fn normalize_strips_utm_params() {
    // A query made up solely of UTM parameters is removed entirely.
    let tagged = "https://example.com/article?utm_source=twitter&utm_medium=social";
    let normalized = normalize_article_url(tagged);
    assert_eq!(normalized, "https://example.com/article");
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#[test]
fn normalize_keeps_non_tracking_params() {
    // Only the tracking parameter is dropped; the real query parameter stays.
    let normalized = normalize_article_url("https://example.com/search?q=test&utm_source=twitter");
    let keeps_query = normalized.contains("q=test");
    let leaks_tracking = normalized.contains("utm_source");
    assert!(keeps_query);
    assert!(!leaks_tracking);
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#[test]
fn normalize_strips_trailing_slash() {
    // A trailing slash on a non-root path is removed.
    let normalized = normalize_article_url("https://example.com/article/");
    assert_eq!(normalized, "https://example.com/article");
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#[test]
fn normalize_keeps_root_slash() {
    // The root path "/" is left alone.
    let root = "https://example.com/";
    let normalized = normalize_article_url(root);
    assert_eq!(normalized, "https://example.com/");
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#[test]
fn normalize_lowercases() {
    // Both the host and the path come back lowercased.
    let normalized = normalize_article_url("https://Example.COM/Article");
    assert_eq!(normalized, "https://example.com/article");
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#[test]
fn normalize_strips_fbclid() {
    // Dropping the only parameter must also drop the `?` separator.
    let normalized = normalize_article_url("https://example.com/post?fbclid=abc123");
    let has_fbclid = normalized.contains("fbclid");
    let has_query_marker = normalized.contains('?');
    assert!(!has_fbclid);
    assert!(!has_query_marker);
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#[test]
fn normalize_handles_invalid_url() {
    // Unparseable input takes the fallback path instead of panicking.
    let input = "not a url at all";
    assert_eq!(normalize_article_url(input), "not a url at all");
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#[test]
fn hash_article_url_deterministic() {
    // Two links to one article that differ only in `utm_source` share a hash.
    let [from_twitter, from_newsletter] = [
        "https://example.com/article?utm_source=twitter",
        "https://example.com/article?utm_source=newsletter",
    ]
    .map(hash_article_url);
    assert_eq!(
        from_twitter, from_newsletter,
        "Same article with different UTM params should hash the same"
    );
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#[test]
fn hash_article_url_different_articles() {
    // Distinct article paths must not share a hash.
    let [first, second] = [
        "https://example.com/article-1",
        "https://example.com/article-2",
    ]
    .map(hash_article_url);
    assert_ne!(first, second);
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|