v2: enhanced scraper - title priority chain, broken link detection, noindex

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
3 months ago · 191e1c716b
parent 9b994e0528
commit 191e1c716b
1 changed files with 279 additions and 11 deletions
--- a/backend/src/services/scraper.rs
+++ b/backend/src/services/scraper.rs
@ -104,6 +104,9 @@ pub async fn scrape_url(
        });
    }
    // Capture final URL BEFORE consuming the response body (follows redirects)
    let final_url = response.url().clone();
    // Read body with size limit
    let bytes = response
        .bytes()
@ -122,15 +125,19 @@ pub async fn scrape_url(
    // Extract page title
    let title = extract_page_title(&document);
-    // Detect soft-404
+    // Extract body text
-    let is_soft_404 = detect_soft_404(&document);
+    let body_text = extract_body_text(&document);
    // Combine all soft-404 detection checks
    let is_soft_404 = detect_soft_404(&document)
        || detect_canonical_error(&document)
        || detect_noindex(&document)
        || detect_short_page_error(&body_text)
        || is_error_path(final_url.path());
    // Extract publication date
    let published_date = extract_publication_date(&document);
    // Extract body text
    let body_text = extract_body_text(&document);
    Ok(ScrapedContent {
        ok: !is_soft_404,
        status,
@ -253,13 +260,46 @@ fn is_private_ip(ip: IpAddr) -> bool {
 // HTML Parsing
 // ────────────────────────────────────────────────────────────────────────────
-/// Extract the page title from the `<title>` element.
+/// Extract the page title using a priority chain: `<title>` -> `og:title` -> `<h1>` -> None.
 fn extract_page_title(doc: &Html) -> Option<String> {
-    let sel = Selector::parse("title").ok()?;
+    // 1. Try <title> element
-    doc.select(&sel)
+    if let Some(sel) = Selector::parse("title").ok() {
        if let Some(title) = doc
            .select(&sel)
            .next()
            .map(|el| el.text().collect::<String>().trim().to_string())
            .filter(|t| !t.is_empty())
        {
            return Some(title);
        }
    }
    // 2. Try <meta property="og:title">
    if let Some(sel) = Selector::parse(r#"meta[property="og:title"]"#).ok() {
        if let Some(content) = doc
            .select(&sel)
            .next()
            .and_then(|el| el.value().attr("content"))
            .map(|c| c.trim().to_string())
            .filter(|t| !t.is_empty())
        {
            return Some(content);
        }
    }
    // 3. Try first <h1>
    if let Some(sel) = Selector::parse("h1").ok() {
        if let Some(h1) = doc
            .select(&sel)
            .next()
            .map(|el| el.text().collect::<String>().trim().to_string())
            .filter(|t| !t.is_empty())
        {
            return Some(h1);
        }
    }
    None
 }
 /// Detect whether a page is a soft-404 by checking the page title
@ -282,6 +322,112 @@ fn detect_soft_404(doc: &Html) -> bool {
        .any(|kw| title_text.contains(kw) || h1_text.contains(kw))
 }
 /// Detect whether canonical or og:url points to an error path.
 ///
 /// Extracts `<link rel="canonical" href>` and `<meta property="og:url" content>`,
 /// and returns `true` if either URL's path indicates an error page.
 fn detect_canonical_error(doc: &Html) -> bool {
    // Check <link rel="canonical" href="...">
    if let Ok(sel) = Selector::parse(r#"link[rel="canonical"]"#) {
        if let Some(el) = doc.select(&sel).next() {
            if let Some(href) = el.value().attr("href") {
                if is_error_path(href) {
                    return true;
                }
            }
        }
    }
    // Check <meta property="og:url" content="...">
    if let Ok(sel) = Selector::parse(r#"meta[property="og:url"]"#) {
        if let Some(el) = doc.select(&sel).next() {
            if let Some(content) = el.value().attr("content") {
                if is_error_path(content) {
                    return true;
                }
            }
        }
    }
    false
 }
 /// Detect error pages with short body text by scanning for error phrases.
 ///
 /// Only pages shorter than 1500 characters are examined, to avoid false
 /// positives on real articles that happen to mention "404" or "not found".
 /// The limit is counted in Unicode scalar values, not bytes, so accented text
 /// is not penalised for its UTF-8 encoding.
 ///
 /// A page is flagged when its lowercased text either:
 /// * contains one of the known English/French error phrases (French phrases
 ///   are matched with and without accents, and typographic apostrophes are
 ///   treated like `'`), or
 /// * contains "404" whose start lies within 50 bytes of the start of any
 ///   occurrence of "not found", "error", or "introuvable".
 fn detect_short_page_error(body_text: &str) -> bool {
    // Pages with at least this many characters are never flagged.
    const MAX_ERROR_PAGE_CHARS: usize = 1500;
    // Maximum start-to-start distance (in bytes) between "404" and a keyword.
    const PROXIMITY_WINDOW: usize = 50;
    const ERROR_PHRASES: [&str; 10] = [
        // English
        "page not found",
        "could not be found",
        "the requested page",
        "does not exist",
        // French, unaccented spellings
        "page introuvable",
        "page non trouvee",
        "n'existe pas",
        "n'a pas ete trouvee",
        // French, accented spellings (what real pages usually contain)
        "page non trouvée",
        "n'a pas été trouvé",
    ];
    const PROXIMITY_KEYWORDS: [&str; 3] = ["not found", "error", "introuvable"];

    // `take` stops counting as soon as the limit is reached.
    if body_text.chars().take(MAX_ERROR_PAGE_CHARS).count() >= MAX_ERROR_PAGE_CHARS {
        return false;
    }

    // Normalise the typographic apostrophe so "n’existe pas" matches "n'existe pas".
    let lower = body_text.to_lowercase().replace('\u{2019}', "'");

    if ERROR_PHRASES.iter().any(|phrase| lower.contains(*phrase)) {
        return true;
    }

    let code_positions: Vec<usize> = lower.match_indices("404").map(|(idx, _)| idx).collect();
    if code_positions.is_empty() {
        return false;
    }

    // Compare every keyword occurrence (not just the first one) with every "404".
    PROXIMITY_KEYWORDS.iter().any(|keyword| {
        lower.match_indices(*keyword).any(|(kw_idx, _)| {
            code_positions
                .iter()
                .any(|&code_idx| code_idx.abs_diff(kw_idx) <= PROXIMITY_WINDOW)
        })
    })
 }
 /// Detect whether the page carries a robots meta tag that forbids indexing.
 ///
 /// Every `<meta name="robots" content="...">` element is examined (pages may
 /// declare more than one), and the `name` attribute is compared
 /// case-insensitively so `<meta name="ROBOTS" ...>` is recognised too.
 ///
 /// Returns `true` when any such tag's `content` contains "noindex" or lists
 /// the `none` directive (shorthand for `noindex, nofollow`); `false` when no
 /// robots meta tag is present or none of them forbids indexing.
 fn detect_noindex(doc: &Html) -> bool {
    // Select on attribute presence only; the name is compared in Rust so the
    // match does not depend on the attribute value's letter case.
    let selector = match Selector::parse("meta[name][content]") {
        Ok(parsed) => parsed,
        Err(_) => return false,
    };

    let forbids_indexing = doc.select(&selector).any(|el| {
        let meta = el.value();
        let is_robots = meta
            .attr("name")
            .map_or(false, |name| name.trim().eq_ignore_ascii_case("robots"));
        if !is_robots {
            return false;
        }
        meta.attr("content").map_or(false, |content| {
            let directives = content.to_lowercase();
            directives.contains("noindex")
                || directives.split(',').any(|directive| directive.trim() == "none")
        })
    });
    forbids_indexing
 }
 /// Check if a URL path contains an error-related segment.
 ///
 /// Accepts either a bare path (`/pages/404.html`) or a full URL
 /// (`https://example.com/404`). Query string and fragment are ignored, and
 /// for full URLs the scheme and host are skipped, so only the path component
 /// is inspected.
 ///
 /// Returns `true` when some path segment, compared case-insensitively and
 /// ignoring any file extension, is exactly `404`, `error`, or `not-found`
 /// (e.g. "/404", "/404.html", "/error/page", "/en/not-found").
 ///
 /// Matching whole segments rather than substrings keeps ordinary slugs such
 /// as "/blog/error-handling" or "/products/4040-widget" from being flagged.
 fn is_error_path(path: &str) -> bool {
    const ERROR_STEMS: [&str; 3] = ["404", "error", "not-found"];

    let lower = path.to_lowercase();

    // Drop "?query" and "#fragment": they are not part of the path.
    let without_query = lower
        .split(|c: char| c == '?' || c == '#')
        .next()
        .unwrap_or("");

    // For absolute URLs, skip "scheme://host" so the host name is never matched.
    let path_part = match without_query.find("://") {
        Some(scheme_end) => {
            let after_scheme = &without_query[scheme_end + 3..];
            after_scheme
                .find('/')
                .map_or("", |slash| &after_scheme[slash..])
        }
        None => without_query,
    };

    path_part.split('/').any(|segment| {
        // "404.html" -> "404", "error.aspx" -> "error"
        let stem = segment.split('.').next().unwrap_or(segment);
        ERROR_STEMS.contains(&stem)
    })
 }
 /// Extract the publication date from structured data and meta tags.
 ///
 /// Tries sources in priority order:
@ -916,4 +1062,126 @@ mod tests {
        let url = url::Url::parse("file:///etc/passwd").unwrap();
        assert!(validate_scheme(&url).is_err());
    }
    // ── Enhanced Title Extraction (priority chain) ─────────────────
    #[test]
    fn test_title_priority_title_element_first() {
        // `<title>`, `og:title` and `<h1>` are all present; `<title>` must win.
        let doc = Html::parse_document(
            r#"<html><head>
            <title>Title Element</title>
            <meta property="og:title" content="OG Title">
        </head><body><h1>H1 Title</h1></body></html>"#,
        );
        let extracted = extract_page_title(&doc);
        assert_eq!(extracted.as_deref(), Some("Title Element"));
    }
    #[test]
    fn test_title_fallback_to_og_title() {
        // The `<title>` element is empty, so `og:title` is the next candidate.
        let doc = Html::parse_document(
            r#"<html><head>
            <title></title>
            <meta property="og:title" content="OG Title">
        </head><body><h1>H1 Title</h1></body></html>"#,
        );
        let extracted = extract_page_title(&doc);
        assert_eq!(extracted.as_deref(), Some("OG Title"));
    }
    #[test]
    fn test_title_fallback_to_h1() {
        // Empty `<title>` and no `og:title`: the first `<h1>` is the last resort.
        let doc = Html::parse_document(
            r#"<html><head>
            <title></title>
        </head><body><h1>H1 Title</h1></body></html>"#,
        );
        let extracted = extract_page_title(&doc);
        assert_eq!(extracted.as_deref(), Some("H1 Title"));
    }
    // ── Canonical / OG URL Error Detection ─────────────────────────
    #[test]
    fn test_canonical_404_detected() {
        // The canonical link points at a `/404` path.
        let doc = Html::parse_document(
            r#"<html><head>
            <link rel="canonical" href="https://example.com/404">
        </head><body><p>Sorry</p></body></html>"#,
        );
        let flagged = detect_canonical_error(&doc);
        assert!(flagged);
    }
    #[test]
    fn test_og_url_error_path_detected() {
        // No canonical link; `og:url` alone carries the `/error/...` path.
        let doc = Html::parse_document(
            r#"<html><head>
            <meta property="og:url" content="https://example.com/error/page">
        </head><body><p>Oops</p></body></html>"#,
        );
        let flagged = detect_canonical_error(&doc);
        assert!(flagged);
    }
    #[test]
    fn test_canonical_normal_url_not_flagged() {
        // Both declarations point at a regular article URL.
        let doc = Html::parse_document(
            r#"<html><head>
            <link rel="canonical" href="https://example.com/articles/great-news">
            <meta property="og:url" content="https://example.com/articles/great-news">
        </head><body><p>Content</p></body></html>"#,
        );
        let flagged = detect_canonical_error(&doc);
        assert!(!flagged);
    }
    // ── Short Page Error Detection ─────────────────────────────────
    #[test]
    fn test_short_page_with_error_phrases_detected() {
        // A short body containing the English phrase "could not be found".
        let flagged =
            detect_short_page_error("Sorry, the page you are looking for could not be found.");
        assert!(flagged);
    }
    #[test]
    fn test_short_page_french_error_detected() {
        // A short body containing the French phrase "page introuvable".
        let flagged = detect_short_page_error(
            "Désolé, cette page introuvable. Veuillez retourner à l'accueil.",
        );
        assert!(flagged);
    }
    #[test]
    fn test_long_page_not_flagged() {
        // 200 repetitions of a 15-byte phrase: far beyond the 1500 limit, so the
        // error phrase must be ignored.
        let oversized: String = std::iter::repeat("page not found ").take(200).collect();
        let flagged = detect_short_page_error(&oversized);
        assert!(!flagged);
    }
    #[test]
    fn test_short_page_404_near_error() {
        // No fixed phrase matches here; only "404" sitting next to "error" does.
        let flagged =
            detect_short_page_error("Error 404 - the page you requested is unavailable.");
        assert!(flagged);
    }
    // ── Noindex Detection ──────────────────────────────────────────
    #[test]
    fn test_noindex_detected() {
        // The robots meta tag lists `noindex` among its directives.
        let doc = Html::parse_document(
            r#"<html><head>
            <meta name="robots" content="noindex, nofollow">
        </head><body><p>Hidden page</p></body></html>"#,
        );
        let flagged = detect_noindex(&doc);
        assert!(flagged);
    }
    #[test]
    fn test_noindex_not_present() {
        // A robots meta tag exists but allows indexing.
        let doc = Html::parse_document(
            r#"<html><head>
            <meta name="robots" content="index, follow">
        </head><body><p>Normal page</p></body></html>"#,
        );
        let flagged = detect_noindex(&doc);
        assert!(!flagged);
    }
    // ── Error Path Detection ───────────────────────────────────────
    #[test]
    fn test_error_path_detection() {
        // Paths that must be classified as error pages.
        let error_paths = [
            "/404",
            "/pages/404.html",
            "/error",
            "/error/something",
            "/not-found",
            "/en/not-found",
        ];
        for path in error_paths.iter() {
            assert!(is_error_path(path), "expected an error path: {}", path);
        }

        // Ordinary content paths that must be left alone.
        let content_paths = ["/articles/great-news", "/blog/2026/latest", "/"];
        for path in content_paths.iter() {
            assert!(!is_error_path(path), "expected a normal path: {}", path);
        }
    }
 }