v2: enhanced scraper - title priority chain, broken link detection, noindex

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
3 months ago · 191e1c716b
parent 9b994e0528
commit 191e1c716b
1 changed file with 279 additions and 11 deletions
--- a/backend/src/services/scraper.rs
+++ b/backend/src/services/scraper.rs
@ -104,6 +104,9 @@ pub async fn scrape_url(
        });
    }

+    // Capture final URL BEFORE consuming the response body (follows redirects)
+    let final_url = response.url().clone();
+
    // Read body with size limit
    let bytes = response
        .bytes()
@ -122,15 +125,19 @@ pub async fn scrape_url(
    // Extract page title
    let title = extract_page_title(&document);

-    // Detect soft-404
-    let is_soft_404 = detect_soft_404(&document);
+    // Extract body text
+    let body_text = extract_body_text(&document);
+
+    // Combine all soft-404 detection checks
+    let is_soft_404 = detect_soft_404(&document)
+        || detect_canonical_error(&document)
+        || detect_noindex(&document)
+        || detect_short_page_error(&body_text)
+        || is_error_path(final_url.path());

    // Extract publication date
    let published_date = extract_publication_date(&document);

-    // Extract body text
-    let body_text = extract_body_text(&document);
-
    Ok(ScrapedContent {
        ok: !is_soft_404,
        status,
@ -253,13 +260,46 @@ fn is_private_ip(ip: IpAddr) -> bool {
 // HTML Parsing
 // ────────────────────────────────────────────────────────────────────────────

-/// Extract the page title from the `<title>` element.
+/// Extract the page title using a priority chain: `<title>` -> `og:title` -> `<h1>` -> None.
+///
+/// Each source is read from the first matching element only; its text is
+/// trimmed, and an empty result falls through to the next source:
+///
+/// 1. text content of the first `<title>` element,
+/// 2. `content` attribute of the first `<meta property="og:title">`,
+/// 3. text content of the first `<h1>`.
+///
+/// Returns `None` when none of the sources yields non-empty text. A selector
+/// that fails to parse is skipped rather than aborting the whole chain.
 fn extract_page_title(doc: &Html) -> Option<String> {
-    let sel = Selector::parse("title").ok()?;
-    doc.select(&sel)
+    // 1. Try <title> element
+    if let Ok(sel) = Selector::parse("title") {
+        if let Some(title) = doc
+            .select(&sel)
            .next()
            .map(|el| el.text().collect::<String>().trim().to_string())
            .filter(|t| !t.is_empty())
+        {
+            return Some(title);
+        }
+    }
+
+    // 2. Try <meta property="og:title">
+    if let Ok(sel) = Selector::parse(r#"meta[property="og:title"]"#) {
+        if let Some(content) = doc
+            .select(&sel)
+            .next()
+            .and_then(|el| el.value().attr("content"))
+            .map(|c| c.trim().to_string())
+            .filter(|t| !t.is_empty())
+        {
+            return Some(content);
+        }
+    }
+
+    // 3. Try first <h1>
+    if let Ok(sel) = Selector::parse("h1") {
+        if let Some(h1) = doc
+            .select(&sel)
+            .next()
+            .map(|el| el.text().collect::<String>().trim().to_string())
+            .filter(|t| !t.is_empty())
+        {
+            return Some(h1);
+        }
+    }
+
+    // No source produced a non-empty title.
+    None
 }

 /// Detect whether a page is a soft-404 by checking the page title
@ -282,6 +322,112 @@ fn detect_soft_404(doc: &Html) -> bool {
        .any(|kw| title_text.contains(kw) || h1_text.contains(kw))
 }

+/// Detect whether canonical or og:url points to an error path.
+///
+/// Reads the `href` of the first `<link rel="canonical">` and the `content`
+/// of the first `<meta property="og:url">`, and returns `true` if the *path*
+/// of either value is an error path according to `is_error_path`.
+///
+/// Only the path component is inspected: scheme, host, query string and
+/// fragment are stripped first, so a host such as `404media.co` or a query
+/// like `?next=/error` does not flag an otherwise healthy page.
+fn detect_canonical_error(doc: &Html) -> bool {
+    // (CSS selector, attribute holding the URL)
+    let sources = [
+        (r#"link[rel="canonical"]"#, "href"),
+        (r#"meta[property="og:url"]"#, "content"),
+    ];
+
+    for &(css, attr) in sources.iter() {
+        if let Ok(sel) = Selector::parse(css) {
+            let declared = doc
+                .select(&sel)
+                .next()
+                .and_then(|el| el.value().attr(attr));
+            if let Some(raw) = declared {
+                if is_error_path(href_path_component(raw)) {
+                    return true;
+                }
+            }
+        }
+    }
+
+    false
+}
+
+/// Return the path component of `raw`.
+///
+/// `raw` may be an absolute URL (`https://host/path?q#f`), a protocol-relative
+/// URL (`//host/path`) or a relative reference (`/path`, `path`). The query
+/// string and fragment are always dropped. An absolute URL without a path
+/// yields an empty string.
+fn href_path_component(raw: &str) -> &str {
+    let trimmed = raw.trim();
+
+    // Drop the query string and fragment.
+    let end = trimmed
+        .find(|c| c == '?' || c == '#')
+        .unwrap_or(trimmed.len());
+    let without_query = &trimmed[..end];
+
+    // Skip "scheme://" or a leading "//" so the host is never inspected.
+    // "://" only counts as a scheme separator when no '/' precedes it.
+    let after_authority = match without_query.find("://") {
+        Some(pos) if !without_query[..pos].contains('/') => Some(&without_query[pos + 3..]),
+        _ => without_query.strip_prefix("//"),
+    };
+
+    match after_authority {
+        // Everything from the first '/' after the host is the path.
+        Some(rest) => rest.find('/').map_or("", |slash| &rest[slash..]),
+        // No authority: the reference is already a (relative) path.
+        None => without_query,
+    }
+}
+
+/// Detect error pages with short body text by scanning for error phrases.
+///
+/// Only pages shorter than 1500 characters are inspected, to avoid false
+/// positives on real articles that happen to mention "404" or "not found".
+///
+/// The text is lower-cased and lightly normalised before matching: common
+/// accented vowels are folded to ASCII and the typographic apostrophe `’`
+/// becomes `'`, so "page non trouvée" and "n’existe pas" match the phrase
+/// list below.
+///
+/// Returns `true` when the normalised text either
+/// * contains one of the fixed English/French error phrases, or
+/// * contains "404" starting within 50 bytes of the start of *any* occurrence
+///   of "not found", "error" or "introuvable".
+fn detect_short_page_error(body_text: &str) -> bool {
+    const MAX_SHORT_PAGE_CHARS: usize = 1500;
+    const PROXIMITY_WINDOW: usize = 50;
+
+    // English phrases first, then French (unaccented: the text is folded).
+    const ERROR_PHRASES: [&str; 8] = [
+        "page not found",
+        "could not be found",
+        "the requested page",
+        "does not exist",
+        "page introuvable",
+        "page non trouvee",
+        "n'existe pas",
+        "n'a pas ete trouvee",
+    ];
+    const PROXIMITY_KEYWORDS: [&str; 3] = ["not found", "error", "introuvable"];
+
+    // Count characters, not bytes; `nth` stops after MAX_SHORT_PAGE_CHARS
+    // characters so long documents are not scanned in full.
+    if body_text.chars().nth(MAX_SHORT_PAGE_CHARS - 1).is_some() {
+        return false;
+    }
+
+    let normalized: String = body_text
+        .to_lowercase()
+        .chars()
+        .map(|c| match c {
+            'à' | 'â' | 'ä' => 'a',
+            'é' | 'è' | 'ê' | 'ë' => 'e',
+            'î' | 'ï' => 'i',
+            'ô' | 'ö' => 'o',
+            'ù' | 'û' | 'ü' => 'u',
+            'ç' => 'c',
+            '\u{2019}' => '\'',
+            other => other,
+        })
+        .collect();
+
+    if ERROR_PHRASES
+        .iter()
+        .any(|phrase| normalized.contains(*phrase))
+    {
+        return true;
+    }
+
+    // "404" close to an error keyword. Every occurrence of the keyword is
+    // considered, not just the first one in the text.
+    normalized.match_indices("404").any(|(code_at, _)| {
+        PROXIMITY_KEYWORDS.iter().any(|kw| {
+            normalized.match_indices(*kw).any(|(kw_at, _)| {
+                let distance = code_at.max(kw_at) - code_at.min(kw_at);
+                distance <= PROXIMITY_WINDOW
+            })
+        })
+    })
+}
+
+/// Detect whether the page has a `<meta name="robots">` tag containing "noindex".
+///
+/// Every `<meta name="robots">` element is inspected, not only the first one,
+/// because a page may carry several robots tags. The `content` attribute is
+/// compared case-insensitively; tags without a `content` attribute are ignored.
+///
+/// Returns `false` when no robots tag mentions "noindex" or when the selector
+/// cannot be parsed.
+fn detect_noindex(doc: &Html) -> bool {
+    if let Ok(sel) = Selector::parse(r#"meta[name="robots"]"#) {
+        for el in doc.select(&sel) {
+            if let Some(content) = el.value().attr("content") {
+                if content.to_lowercase().contains("noindex") {
+                    return true;
+                }
+            }
+        }
+    }
+    false
+}
+
+/// Check if a URL path contains error-related segments.
+///
+/// Returns `true` if any `/`-separated segment of the path, compared
+/// case-insensitively and without its file extension, is exactly `404`,
+/// `error` or `not-found` (so `/404`, `/pages/404.html`, `/error/x` and
+/// `/en/not-found` all match).
+///
+/// Whole segments are compared rather than substrings, so ordinary slugs such
+/// as `/blog/error-handling` or `/products/4040-shelf` are not flagged.
+/// Anything after `?` or `#` is ignored in case the caller passes more than a
+/// bare path.
+fn is_error_path(path: &str) -> bool {
+    const ERROR_SEGMENTS: [&str; 3] = ["404", "error", "not-found"];
+
+    let lower = path.to_lowercase();
+    let path_only = lower
+        .split(|c| c == '?' || c == '#')
+        .next()
+        .unwrap_or("");
+
+    path_only.split('/').any(|segment| {
+        // Compare without the trailing extension: "404.html" -> "404".
+        let stem = segment.rfind('.').map_or(segment, |dot| &segment[..dot]);
+        ERROR_SEGMENTS.contains(&stem)
+    })
+}
+
 /// Extract the publication date from structured data and meta tags.
 ///
 /// Tries sources in priority order:
@ -916,4 +1062,126 @@ mod tests {
        let url = url::Url::parse("file:///etc/passwd").unwrap();
        assert!(validate_scheme(&url).is_err());
    }
+
+    // ── Enhanced Title Extraction (priority chain) ─────────────────
+
+    /// A non-empty `<title>` wins over both `og:title` and `<h1>`.
+    #[test]
+    fn test_title_priority_title_element_first() {
+        let parsed = Html::parse_document(
+            r#"<html><head>
+            <title>Title Element</title>
+            <meta property="og:title" content="OG Title">
+        </head><body><h1>H1 Title</h1></body></html>"#,
+        );
+        let extracted = extract_page_title(&parsed);
+        assert_eq!(extracted.as_deref(), Some("Title Element"));
+    }
+
+    /// An empty `<title>` falls through to the `og:title` meta tag.
+    #[test]
+    fn test_title_fallback_to_og_title() {
+        let parsed = Html::parse_document(
+            r#"<html><head>
+            <title></title>
+            <meta property="og:title" content="OG Title">
+        </head><body><h1>H1 Title</h1></body></html>"#,
+        );
+        let extracted = extract_page_title(&parsed);
+        assert_eq!(extracted.as_deref(), Some("OG Title"));
+    }
+
+    /// With an empty `<title>` and no `og:title`, the first `<h1>` is used.
+    #[test]
+    fn test_title_fallback_to_h1() {
+        let parsed = Html::parse_document(
+            r#"<html><head>
+            <title></title>
+        </head><body><h1>H1 Title</h1></body></html>"#,
+        );
+        let extracted = extract_page_title(&parsed);
+        assert_eq!(extracted.as_deref(), Some("H1 Title"));
+    }
+
+    // ── Canonical / OG URL Error Detection ─────────────────────────
+
+    /// A canonical link whose path is `/404` marks the page as an error page.
+    #[test]
+    fn test_canonical_404_detected() {
+        let parsed = Html::parse_document(
+            r#"<html><head>
+            <link rel="canonical" href="https://example.com/404">
+        </head><body><p>Sorry</p></body></html>"#,
+        );
+        let flagged = detect_canonical_error(&parsed);
+        assert!(flagged);
+    }
+
+    /// An `og:url` under `/error/` marks the page as an error page.
+    #[test]
+    fn test_og_url_error_path_detected() {
+        let parsed = Html::parse_document(
+            r#"<html><head>
+            <meta property="og:url" content="https://example.com/error/page">
+        </head><body><p>Oops</p></body></html>"#,
+        );
+        let flagged = detect_canonical_error(&parsed);
+        assert!(flagged);
+    }
+
+    /// Ordinary canonical and `og:url` values must not be flagged.
+    #[test]
+    fn test_canonical_normal_url_not_flagged() {
+        let parsed = Html::parse_document(
+            r#"<html><head>
+            <link rel="canonical" href="https://example.com/articles/great-news">
+            <meta property="og:url" content="https://example.com/articles/great-news">
+        </head><body><p>Content</p></body></html>"#,
+        );
+        let flagged = detect_canonical_error(&parsed);
+        assert!(!flagged);
+    }
+
+    // ── Short Page Error Detection ─────────────────────────────────
+
+    /// A short English body containing "could not be found" is an error page.
+    #[test]
+    fn test_short_page_with_error_phrases_detected() {
+        let flagged =
+            detect_short_page_error("Sorry, the page you are looking for could not be found.");
+        assert!(flagged);
+    }
+
+    /// A short French body containing "page introuvable" is an error page.
+    #[test]
+    fn test_short_page_french_error_detected() {
+        let flagged = detect_short_page_error(
+            "Désolé, cette page introuvable. Veuillez retourner à l'accueil.",
+        );
+        assert!(flagged);
+    }
+
+    /// Bodies past the short-page threshold are never flagged, even when they
+    /// repeat an error phrase.
+    #[test]
+    fn test_long_page_not_flagged() {
+        // 200 repetitions: well over 1500 chars
+        let oversized: String = std::iter::repeat("page not found ").take(200).collect();
+        let flagged = detect_short_page_error(&oversized);
+        assert!(!flagged);
+    }
+
+    /// "404" right next to "error" triggers the proximity rule.
+    #[test]
+    fn test_short_page_404_near_error() {
+        let flagged =
+            detect_short_page_error("Error 404 - the page you requested is unavailable.");
+        assert!(flagged);
+    }
+
+    // ── Noindex Detection ──────────────────────────────────────────
+
+    /// A robots meta tag listing `noindex` is detected.
+    #[test]
+    fn test_noindex_detected() {
+        let parsed = Html::parse_document(
+            r#"<html><head>
+            <meta name="robots" content="noindex, nofollow">
+        </head><body><p>Hidden page</p></body></html>"#,
+        );
+        let flagged = detect_noindex(&parsed);
+        assert!(flagged);
+    }
+
+    /// A robots meta tag without `noindex` is not flagged.
+    #[test]
+    fn test_noindex_not_present() {
+        let parsed = Html::parse_document(
+            r#"<html><head>
+            <meta name="robots" content="index, follow">
+        </head><body><p>Normal page</p></body></html>"#,
+        );
+        let flagged = detect_noindex(&parsed);
+        assert!(!flagged);
+    }
+
+    // ── Error Path Detection ───────────────────────────────────────
+
+    /// Table-driven check of paths that must and must not count as error paths.
+    #[test]
+    fn test_error_path_detection() {
+        let error_paths = [
+            "/404",
+            "/pages/404.html",
+            "/error",
+            "/error/something",
+            "/not-found",
+            "/en/not-found",
+        ];
+        for path in error_paths.iter() {
+            assert!(is_error_path(path));
+        }
+
+        let normal_paths = ["/articles/great-news", "/blog/2026/latest", "/"];
+        for path in normal_paths.iter() {
+            assert!(!is_error_path(path));
+        }
+    }
 }