diff --git a/backend/src/services/scraper.rs b/backend/src/services/scraper.rs index e268dcf..7cd90b2 100644 --- a/backend/src/services/scraper.rs +++ b/backend/src/services/scraper.rs @@ -104,6 +104,9 @@ pub async fn scrape_url( }); } + // Capture final URL BEFORE consuming the response body (follows redirects) + let final_url = response.url().clone(); + // Read body with size limit let bytes = response .bytes() @@ -122,15 +125,19 @@ pub async fn scrape_url( // Extract page title let title = extract_page_title(&document); - // Detect soft-404 - let is_soft_404 = detect_soft_404(&document); + // Extract body text + let body_text = extract_body_text(&document); + + // Combine all soft-404 detection checks + let is_soft_404 = detect_soft_404(&document) + || detect_canonical_error(&document) + || detect_noindex(&document) + || detect_short_page_error(&body_text) + || is_error_path(final_url.path()); // Extract publication date let published_date = extract_publication_date(&document); - // Extract body text - let body_text = extract_body_text(&document); - Ok(ScrapedContent { ok: !is_soft_404, status, @@ -253,13 +260,46 @@ fn is_private_ip(ip: IpAddr) -> bool { // HTML Parsing // ──────────────────────────────────────────────────────────────────────────── -/// Extract the page title from the `` element. +/// Extract the page title using a priority chain: `<title>` -> `og:title` -> `<h1>` -> None. fn extract_page_title(doc: &Html) -> Option<String> { - let sel = Selector::parse("title").ok()?; - doc.select(&sel) - .next() - .map(|el| el.text().collect::<String>().trim().to_string()) - .filter(|t| !t.is_empty()) + // 1. Try <title> element + if let Some(sel) = Selector::parse("title").ok() { + if let Some(title) = doc + .select(&sel) + .next() + .map(|el| el.text().collect::<String>().trim().to_string()) + .filter(|t| !t.is_empty()) + { + return Some(title); + } + } + + // 2. Try <meta property="og:title"> + if let Some(sel) = Selector::parse(r#"meta[property="og:title"]"#).ok() { + if let Some(content) = doc + .select(&sel) + .next() + .and_then(|el| el.value().attr("content")) + .map(|c| c.trim().to_string()) + .filter(|t| !t.is_empty()) + { + return Some(content); + } + } + + // 3. Try first <h1> + if let Some(sel) = Selector::parse("h1").ok() { + if let Some(h1) = doc + .select(&sel) + .next() + .map(|el| el.text().collect::<String>().trim().to_string()) + .filter(|t| !t.is_empty()) + { + return Some(h1); + } + } + + None } /// Detect whether a page is a soft-404 by checking the page title @@ -282,6 +322,112 @@ fn detect_soft_404(doc: &Html) -> bool { .any(|kw| title_text.contains(kw) || h1_text.contains(kw)) } +/// Detect whether canonical or og:url points to an error path. +/// +/// Extracts `<link rel="canonical" href>` and `<meta property="og:url" content>`, +/// and returns `true` if either URL's path indicates an error page. +fn detect_canonical_error(doc: &Html) -> bool { + // Check <link rel="canonical" href="..."> + if let Ok(sel) = Selector::parse(r#"link[rel="canonical"]"#) { + if let Some(el) = doc.select(&sel).next() { + if let Some(href) = el.value().attr("href") { + if is_error_path(href) { + return true; + } + } + } + } + + // Check <meta property="og:url" content="..."> + if let Ok(sel) = Selector::parse(r#"meta[property="og:url"]"#) { + if let Some(el) = doc.select(&sel).next() { + if let Some(content) = el.value().attr("content") { + if is_error_path(content) { + return true; + } + } + } + } + + false +} + +/// Detect error pages with short body text by scanning for error phrases. +/// +/// Only triggers on pages shorter than 1500 characters to avoid false positives +/// on real articles that happen to mention "404" or "not found". +fn detect_short_page_error(body_text: &str) -> bool { + if body_text.len() >= 1500 { + return false; + } + + let lower = body_text.to_lowercase(); + + // English error phrases + let english_phrases = [ + "page not found", + "could not be found", + "the requested page", + "does not exist", + ]; + + // French error phrases + let french_phrases = [ + "page introuvable", + "page non trouvee", + "n'existe pas", + "n'a pas ete trouvee", + ]; + + for phrase in english_phrases.iter().chain(french_phrases.iter()) { + if lower.contains(phrase) { + return true; + } + } + + // Check for "404" within 50 chars of "not found", "error", or "introuvable" + let proximity_keywords = ["not found", "error", "introuvable"]; + for (idx, _) in lower.match_indices("404") { + for kw in &proximity_keywords { + if let Some(kw_idx) = lower.find(kw) { + let distance = if idx > kw_idx { + idx - kw_idx + } else { + kw_idx - idx + }; + if distance <= 50 { + return true; + } + } + } + } + + false +} + +/// Detect whether the page has a `<meta name="robots">` tag containing "noindex". +fn detect_noindex(doc: &Html) -> bool { + if let Ok(sel) = Selector::parse(r#"meta[name="robots"]"#) { + if let Some(el) = doc.select(&sel).next() { + if let Some(content) = el.value().attr("content") { + return content.to_lowercase().contains("noindex"); + } + } + } + false +} + +/// Check if a URL path contains error-related segments. +/// +/// Returns `true` if the path contains "/404", "/404.html", "/error", or "/not-found". +fn is_error_path(path: &str) -> bool { + let lower = path.to_lowercase(); + lower.contains("/404") + || lower.contains("/404.html") + || lower.contains("/error") + || lower.contains("/not-found") +} + /// Extract the publication date from structured data and meta tags. /// /// Tries sources in priority order: @@ -916,4 +1062,126 @@ mod tests { let url = url::Url::parse("file:///etc/passwd").unwrap(); assert!(validate_scheme(&url).is_err()); } + + // ── Enhanced Title Extraction (priority chain) ───────────────── + + #[test] + fn test_title_priority_title_element_first() { + let html = r#"<html><head> + <title>Title Element + +

H1 Title

"#; + let doc = Html::parse_document(html); + assert_eq!(extract_page_title(&doc), Some("Title Element".into())); + } + + #[test] + fn test_title_fallback_to_og_title() { + let html = r#" + + +

H1 Title

"#; + let doc = Html::parse_document(html); + assert_eq!(extract_page_title(&doc), Some("OG Title".into())); + } + + #[test] + fn test_title_fallback_to_h1() { + let html = r#" + +

H1 Title

"#; + let doc = Html::parse_document(html); + assert_eq!(extract_page_title(&doc), Some("H1 Title".into())); + } + + // ── Canonical / OG URL Error Detection ───────────────────────── + + #[test] + fn test_canonical_404_detected() { + let html = r#" + +

Sorry

"#; + let doc = Html::parse_document(html); + assert!(detect_canonical_error(&doc)); + } + + #[test] + fn test_og_url_error_path_detected() { + let html = r#" + +

Oops

"#; + let doc = Html::parse_document(html); + assert!(detect_canonical_error(&doc)); + } + + #[test] + fn test_canonical_normal_url_not_flagged() { + let html = r#" + + +

Content

"#; + let doc = Html::parse_document(html); + assert!(!detect_canonical_error(&doc)); + } + + // ── Short Page Error Detection ───────────────────────────────── + + #[test] + fn test_short_page_with_error_phrases_detected() { + let body = "Sorry, the page you are looking for could not be found."; + assert!(detect_short_page_error(body)); + } + + #[test] + fn test_short_page_french_error_detected() { + let body = "Désolé, cette page introuvable. Veuillez retourner à l'accueil."; + assert!(detect_short_page_error(body)); + } + + #[test] + fn test_long_page_not_flagged() { + let body = "page not found ".repeat(200); // well over 1500 chars + assert!(!detect_short_page_error(&body)); + } + + #[test] + fn test_short_page_404_near_error() { + let body = "Error 404 - the page you requested is unavailable."; + assert!(detect_short_page_error(body)); + } + + // ── Noindex Detection ────────────────────────────────────────── + + #[test] + fn test_noindex_detected() { + let html = r#" + +

Hidden page

"#; + let doc = Html::parse_document(html); + assert!(detect_noindex(&doc)); + } + + #[test] + fn test_noindex_not_present() { + let html = r#" + +

Normal page

"#; + let doc = Html::parse_document(html); + assert!(!detect_noindex(&doc)); + } + + // ── Error Path Detection ─────────────────────────────────────── + + #[test] + fn test_error_path_detection() { + assert!(is_error_path("/404")); + assert!(is_error_path("/pages/404.html")); + assert!(is_error_path("/error")); + assert!(is_error_path("/error/something")); + assert!(is_error_path("/not-found")); + assert!(is_error_path("/en/not-found")); + assert!(!is_error_path("/articles/great-news")); + assert!(!is_error_path("/blog/2026/latest")); + assert!(!is_error_path("/")); + } }