v2: enhanced scraper - title priority chain, broken link detection, noindex

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
master
oabrivard 3 months ago
parent 9b994e0528
commit 191e1c716b

@ -104,6 +104,9 @@ pub async fn scrape_url(
}); });
} }
// Capture final URL BEFORE consuming the response body (follows redirects)
let final_url = response.url().clone();
// Read body with size limit // Read body with size limit
let bytes = response let bytes = response
.bytes() .bytes()
@ -122,15 +125,19 @@ pub async fn scrape_url(
// Extract page title // Extract page title
let title = extract_page_title(&document); let title = extract_page_title(&document);
// Detect soft-404 // Extract body text
let is_soft_404 = detect_soft_404(&document); let body_text = extract_body_text(&document);
// Combine all soft-404 detection checks
let is_soft_404 = detect_soft_404(&document)
|| detect_canonical_error(&document)
|| detect_noindex(&document)
|| detect_short_page_error(&body_text)
|| is_error_path(final_url.path());
// Extract publication date // Extract publication date
let published_date = extract_publication_date(&document); let published_date = extract_publication_date(&document);
// Extract body text
let body_text = extract_body_text(&document);
Ok(ScrapedContent { Ok(ScrapedContent {
ok: !is_soft_404, ok: !is_soft_404,
status, status,
@ -253,13 +260,46 @@ fn is_private_ip(ip: IpAddr) -> bool {
// HTML Parsing // HTML Parsing
// ──────────────────────────────────────────────────────────────────────────── // ────────────────────────────────────────────────────────────────────────────
/// Extract the page title from the `<title>` element. /// Extract the page title using a priority chain: `<title>` -> `og:title` -> `<h1>` -> None.
fn extract_page_title(doc: &Html) -> Option<String> { fn extract_page_title(doc: &Html) -> Option<String> {
let sel = Selector::parse("title").ok()?; // 1. Try <title> element
doc.select(&sel) if let Some(sel) = Selector::parse("title").ok() {
if let Some(title) = doc
.select(&sel)
.next() .next()
.map(|el| el.text().collect::<String>().trim().to_string()) .map(|el| el.text().collect::<String>().trim().to_string())
.filter(|t| !t.is_empty()) .filter(|t| !t.is_empty())
{
return Some(title);
}
}
// 2. Try <meta property="og:title">
if let Some(sel) = Selector::parse(r#"meta[property="og:title"]"#).ok() {
if let Some(content) = doc
.select(&sel)
.next()
.and_then(|el| el.value().attr("content"))
.map(|c| c.trim().to_string())
.filter(|t| !t.is_empty())
{
return Some(content);
}
}
// 3. Try first <h1>
if let Some(sel) = Selector::parse("h1").ok() {
if let Some(h1) = doc
.select(&sel)
.next()
.map(|el| el.text().collect::<String>().trim().to_string())
.filter(|t| !t.is_empty())
{
return Some(h1);
}
}
None
} }
/// Detect whether a page is a soft-404 by checking the page title /// Detect whether a page is a soft-404 by checking the page title
@ -282,6 +322,112 @@ fn detect_soft_404(doc: &Html) -> bool {
.any(|kw| title_text.contains(kw) || h1_text.contains(kw)) .any(|kw| title_text.contains(kw) || h1_text.contains(kw))
} }
/// Detect whether canonical or og:url points to an error path.
///
/// Reads the first `<link rel="canonical" href>` and the first
/// `<meta property="og:url" content>` and returns `true` if the *path* of
/// either URL is classified as an error path by `is_error_path`.
///
/// Only the path component is tested: scheme, host, query string, and fragment
/// are stripped first, so a host such as `error.example.com` or a query such as
/// `?next=/error` cannot produce a false positive.
fn detect_canonical_error(doc: &Html) -> bool {
    // (CSS selector, attribute that carries the URL)
    const URL_SOURCES: [(&str, &str); 2] = [
        (r#"link[rel="canonical"]"#, "href"),
        (r#"meta[property="og:url"]"#, "content"),
    ];

    for &(css, attr) in URL_SOURCES.iter() {
        if let Ok(sel) = Selector::parse(css) {
            let declared = doc
                .select(&sel)
                .next()
                .and_then(|el| el.value().attr(attr));
            if let Some(target) = declared {
                if is_error_path(url_path_component(target)) {
                    return true;
                }
            }
        }
    }
    false
}

/// Return the path component of `raw`, dropping any scheme, authority, query
/// string, and fragment.
///
/// Accepts absolute URLs (`https://host/path`), protocol-relative URLs
/// (`//host/path`), and bare paths (returned unchanged apart from the
/// query/fragment removal). A URL with a host but no path yields `""`.
fn url_path_component(raw: &str) -> &str {
    let trimmed = raw.trim();

    // Neither the query nor the fragment belongs to the path.
    let end = trimmed
        .find(|c: char| c == '?' || c == '#')
        .unwrap_or(trimmed.len());
    let head = &trimmed[..end];

    // Find the text that follows the authority, if the URL has one.
    let after_authority = match head.split_once("://") {
        // Absolute URL; the guard rejects a "://" that occurs inside a path.
        Some((scheme, rest)) if !scheme.contains('/') => Some(rest),
        // Protocol-relative URL ("//host/path"), or no authority at all.
        _ => head.strip_prefix("//"),
    };

    match after_authority {
        // The path begins at the first '/' after the host.
        Some(rest) => rest.find('/').map_or("", |slash| &rest[slash..]),
        None => head,
    }
}
/// Detect error pages with short body text by scanning for error phrases.
///
/// Only triggers on pages shorter than 1500 characters (Unicode scalar values,
/// not bytes) to avoid false positives on real articles that happen to mention
/// "404" or "not found".
///
/// Returns `true` when the lower-cased text either
/// * contains one of the known English/French error phrases, or
/// * contains "404" whose start lies within 50 bytes of the start of any
///   occurrence of "not found", "error", or "introuvable".
///
/// French phrases are matched both with and without accents, and the
/// typographic apostrophe (U+2019) is treated like `'`.
fn detect_short_page_error(body_text: &str) -> bool {
    const MAX_CHARS: usize = 1500;
    const PROXIMITY: usize = 50;
    const ERROR_PHRASES: &[&str] = &[
        // English
        "page not found",
        "could not be found",
        "the requested page",
        "does not exist",
        // French (unaccented and accented spellings)
        "page introuvable",
        "page non trouvee",
        "page non trouvée",
        "n'existe pas",
        "n'a pas ete trouvee",
        "n'a pas été trouvée",
    ];
    const PROXIMITY_KEYWORDS: [&str; 3] = ["not found", "error", "introuvable"];

    // `take` bounds the scan so long documents are rejected without a full pass.
    if body_text.chars().take(MAX_CHARS).count() >= MAX_CHARS {
        return false;
    }

    // Normalise case and the typographic apostrophe so "n’existe pas" matches.
    let lower = body_text.to_lowercase().replace('\u{2019}', "'");

    if ERROR_PHRASES.iter().any(|phrase| lower.contains(*phrase)) {
        return true;
    }

    // Compare every "404" against every occurrence of each keyword; looking only
    // at the first occurrence would miss "... error ... <far away> ... 404 error".
    lower.match_indices("404").any(|(code_idx, _)| {
        PROXIMITY_KEYWORDS.iter().any(|kw| {
            lower
                .match_indices(*kw)
                .any(|(kw_idx, _)| code_idx.abs_diff(kw_idx) <= PROXIMITY)
        })
    })
}
/// Detect whether the page has a `<meta name="robots">` tag containing "noindex".
///
/// Every `<meta name="robots">` element is inspected, not just the first one:
/// a page may carry several robots tags and a `noindex` in any of them counts.
/// The `content` attribute is compared case-insensitively; tags without a
/// `content` attribute are ignored.
fn detect_noindex(doc: &Html) -> bool {
    if let Ok(sel) = Selector::parse(r#"meta[name="robots"]"#) {
        return doc
            .select(&sel)
            .filter_map(|el| el.value().attr("content"))
            .any(|content| content.to_lowercase().contains("noindex"));
    }
    false
}
/// Check if a URL path contains error-related segments.
///
/// Returns `true` if any `/`-separated segment of the path, ignoring case and
/// any file extension, is exactly `404`, `error`, or `not-found` — for example
/// `/404`, `/pages/404.html`, `/error/something`, or `/en/not-found`.
///
/// Matching whole segments (rather than substrings) keeps legitimate pages such
/// as `/blog/error-handling-in-rust` or `/products/4040-widget` from being
/// flagged. Anything from the first `?` or `#` onwards is ignored.
fn is_error_path(path: &str) -> bool {
    const ERROR_SEGMENTS: [&str; 3] = ["404", "error", "not-found"];

    let lower = path.to_lowercase();
    // A query string or fragment is not part of the path.
    let path_only = lower
        .split(|c: char| c == '?' || c == '#')
        .next()
        .unwrap_or("");

    path_only.split('/').any(|segment| {
        // Drop a file extension so "404.html" is treated like "404".
        let stem = segment.split('.').next().unwrap_or(segment);
        ERROR_SEGMENTS.contains(&stem)
    })
}
/// Extract the publication date from structured data and meta tags. /// Extract the publication date from structured data and meta tags.
/// ///
/// Tries sources in priority order: /// Tries sources in priority order:
@ -916,4 +1062,126 @@ mod tests {
let url = url::Url::parse("file:///etc/passwd").unwrap(); let url = url::Url::parse("file:///etc/passwd").unwrap();
assert!(validate_scheme(&url).is_err()); assert!(validate_scheme(&url).is_err());
} }
// ── Enhanced Title Extraction (priority chain) ─────────────────
/// `<title>` takes precedence when `og:title` and `<h1>` are also present.
#[test]
fn test_title_priority_title_element_first() {
    let parsed = Html::parse_document(
        r#"<html><head>
<title>Title Element</title>
<meta property="og:title" content="OG Title">
</head><body><h1>H1 Title</h1></body></html>"#,
    );
    let extracted = extract_page_title(&parsed);
    assert_eq!(extracted.as_deref(), Some("Title Element"));
}
/// An empty `<title>` falls through to the `og:title` meta tag.
#[test]
fn test_title_fallback_to_og_title() {
    let parsed = Html::parse_document(
        r#"<html><head>
<title></title>
<meta property="og:title" content="OG Title">
</head><body><h1>H1 Title</h1></body></html>"#,
    );
    let extracted = extract_page_title(&parsed);
    assert_eq!(extracted.as_deref(), Some("OG Title"));
}
/// With an empty `<title>` and no `og:title`, the first `<h1>` is used.
#[test]
fn test_title_fallback_to_h1() {
    let parsed = Html::parse_document(
        r#"<html><head>
<title></title>
</head><body><h1>H1 Title</h1></body></html>"#,
    );
    let extracted = extract_page_title(&parsed);
    assert_eq!(extracted.as_deref(), Some("H1 Title"));
}
// ── Canonical / OG URL Error Detection ─────────────────────────
/// A canonical link whose URL ends in `/404` is reported as an error.
#[test]
fn test_canonical_404_detected() {
    let parsed = Html::parse_document(
        r#"<html><head>
<link rel="canonical" href="https://example.com/404">
</head><body><p>Sorry</p></body></html>"#,
    );
    let flagged = detect_canonical_error(&parsed);
    assert!(flagged);
}
/// An `og:url` under `/error/` is reported as an error.
#[test]
fn test_og_url_error_path_detected() {
    let parsed = Html::parse_document(
        r#"<html><head>
<meta property="og:url" content="https://example.com/error/page">
</head><body><p>Oops</p></body></html>"#,
    );
    let flagged = detect_canonical_error(&parsed);
    assert!(flagged);
}
/// Ordinary article URLs in canonical/og:url must not be reported.
#[test]
fn test_canonical_normal_url_not_flagged() {
    let parsed = Html::parse_document(
        r#"<html><head>
<link rel="canonical" href="https://example.com/articles/great-news">
<meta property="og:url" content="https://example.com/articles/great-news">
</head><body><p>Content</p></body></html>"#,
    );
    let flagged = detect_canonical_error(&parsed);
    assert!(!flagged);
}
// ── Short Page Error Detection ─────────────────────────────────
/// A short English "could not be found" message is detected.
#[test]
fn test_short_page_with_error_phrases_detected() {
    let flagged =
        detect_short_page_error("Sorry, the page you are looking for could not be found.");
    assert!(flagged);
}
/// A short French "page introuvable" message is detected.
#[test]
fn test_short_page_french_error_detected() {
    let flagged = detect_short_page_error(
        "Désolé, cette page introuvable. Veuillez retourner à l'accueil.",
    );
    assert!(flagged);
}
/// Text well beyond the 1500-character limit is never flagged, even when it
/// consists solely of an error phrase.
#[test]
fn test_long_page_not_flagged() {
    let oversized: String = std::iter::repeat("page not found ").take(200).collect();
    let flagged = detect_short_page_error(oversized.as_str());
    assert!(!flagged);
}
/// "404" appearing right next to "error" triggers the proximity rule.
#[test]
fn test_short_page_404_near_error() {
    let flagged = detect_short_page_error("Error 404 - the page you requested is unavailable.");
    assert!(flagged);
}
// ── Noindex Detection ──────────────────────────────────────────
/// A robots meta tag listing `noindex` is detected.
#[test]
fn test_noindex_detected() {
    let parsed = Html::parse_document(
        r#"<html><head>
<meta name="robots" content="noindex, nofollow">
</head><body><p>Hidden page</p></body></html>"#,
    );
    let flagged = detect_noindex(&parsed);
    assert!(flagged);
}
/// A robots meta tag without `noindex` is not reported.
#[test]
fn test_noindex_not_present() {
    let parsed = Html::parse_document(
        r#"<html><head>
<meta name="robots" content="index, follow">
</head><body><p>Normal page</p></body></html>"#,
    );
    let flagged = detect_noindex(&parsed);
    assert!(!flagged);
}
// ── Error Path Detection ───────────────────────────────────────
/// Table-driven check of paths that must and must not count as error paths.
#[test]
fn test_error_path_detection() {
    let error_paths = [
        "/404",
        "/pages/404.html",
        "/error",
        "/error/something",
        "/not-found",
        "/en/not-found",
    ];
    for candidate in error_paths.iter() {
        assert!(is_error_path(candidate));
    }

    let regular_paths = ["/articles/great-news", "/blog/2026/latest", "/"];
    for candidate in regular_paths.iter() {
        assert!(!is_error_path(candidate));
    }
}
} }

Loading…
Cancel
Save