|
|
|
|
@ -104,6 +104,9 @@ pub async fn scrape_url(
|
|
|
|
|
});
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Capture final URL BEFORE consuming the response body (follows redirects)
|
|
|
|
|
let final_url = response.url().clone();
|
|
|
|
|
|
|
|
|
|
// Read body with size limit
|
|
|
|
|
let bytes = response
|
|
|
|
|
.bytes()
|
|
|
|
|
@ -122,15 +125,19 @@ pub async fn scrape_url(
|
|
|
|
|
// Extract page title
|
|
|
|
|
let title = extract_page_title(&document);
|
|
|
|
|
|
|
|
|
|
// Detect soft-404
|
|
|
|
|
let is_soft_404 = detect_soft_404(&document);
|
|
|
|
|
// Extract body text
|
|
|
|
|
let body_text = extract_body_text(&document);
|
|
|
|
|
|
|
|
|
|
// Combine all soft-404 detection checks
|
|
|
|
|
let is_soft_404 = detect_soft_404(&document)
|
|
|
|
|
|| detect_canonical_error(&document)
|
|
|
|
|
|| detect_noindex(&document)
|
|
|
|
|
|| detect_short_page_error(&body_text)
|
|
|
|
|
|| is_error_path(final_url.path());
|
|
|
|
|
|
|
|
|
|
// Extract publication date
|
|
|
|
|
let published_date = extract_publication_date(&document);
|
|
|
|
|
|
|
|
|
|
// Extract body text
|
|
|
|
|
let body_text = extract_body_text(&document);
|
|
|
|
|
|
|
|
|
|
Ok(ScrapedContent {
|
|
|
|
|
ok: !is_soft_404,
|
|
|
|
|
status,
|
|
|
|
|
@ -253,13 +260,46 @@ fn is_private_ip(ip: IpAddr) -> bool {
|
|
|
|
|
// HTML Parsing
|
|
|
|
|
// ────────────────────────────────────────────────────────────────────────────
|
|
|
|
|
|
|
|
|
|
/// Extract the page title from the `<title>` element.
|
|
|
|
|
/// Extract the page title using a priority chain: `<title>` -> `og:title` -> `<h1>` -> None.
|
|
|
|
|
fn extract_page_title(doc: &Html) -> Option<String> {
|
|
|
|
|
let sel = Selector::parse("title").ok()?;
|
|
|
|
|
doc.select(&sel)
|
|
|
|
|
// 1. Try <title> element
|
|
|
|
|
if let Some(sel) = Selector::parse("title").ok() {
|
|
|
|
|
if let Some(title) = doc
|
|
|
|
|
.select(&sel)
|
|
|
|
|
.next()
|
|
|
|
|
.map(|el| el.text().collect::<String>().trim().to_string())
|
|
|
|
|
.filter(|t| !t.is_empty())
|
|
|
|
|
{
|
|
|
|
|
return Some(title);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// 2. Try <meta property="og:title">
|
|
|
|
|
if let Some(sel) = Selector::parse(r#"meta[property="og:title"]"#).ok() {
|
|
|
|
|
if let Some(content) = doc
|
|
|
|
|
.select(&sel)
|
|
|
|
|
.next()
|
|
|
|
|
.and_then(|el| el.value().attr("content"))
|
|
|
|
|
.map(|c| c.trim().to_string())
|
|
|
|
|
.filter(|t| !t.is_empty())
|
|
|
|
|
{
|
|
|
|
|
return Some(content);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// 3. Try first <h1>
|
|
|
|
|
if let Some(sel) = Selector::parse("h1").ok() {
|
|
|
|
|
if let Some(h1) = doc
|
|
|
|
|
.select(&sel)
|
|
|
|
|
.next()
|
|
|
|
|
.map(|el| el.text().collect::<String>().trim().to_string())
|
|
|
|
|
.filter(|t| !t.is_empty())
|
|
|
|
|
{
|
|
|
|
|
return Some(h1);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
None
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Detect whether a page is a soft-404 by checking the page title
|
|
|
|
|
@ -282,6 +322,112 @@ fn detect_soft_404(doc: &Html) -> bool {
|
|
|
|
|
.any(|kw| title_text.contains(kw) || h1_text.contains(kw))
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Detect whether canonical or og:url points to an error path.
|
|
|
|
|
///
|
|
|
|
|
/// Extracts `<link rel="canonical" href>` and `<meta property="og:url" content>`,
|
|
|
|
|
/// and returns `true` if either URL's path indicates an error page.
|
|
|
|
|
fn detect_canonical_error(doc: &Html) -> bool {
|
|
|
|
|
// Check <link rel="canonical" href="...">
|
|
|
|
|
if let Ok(sel) = Selector::parse(r#"link[rel="canonical"]"#) {
|
|
|
|
|
if let Some(el) = doc.select(&sel).next() {
|
|
|
|
|
if let Some(href) = el.value().attr("href") {
|
|
|
|
|
if is_error_path(href) {
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Check <meta property="og:url" content="...">
|
|
|
|
|
if let Ok(sel) = Selector::parse(r#"meta[property="og:url"]"#) {
|
|
|
|
|
if let Some(el) = doc.select(&sel).next() {
|
|
|
|
|
if let Some(content) = el.value().attr("content") {
|
|
|
|
|
if is_error_path(content) {
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
false
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Detect error pages with short body text by scanning for error phrases.
///
/// Only triggers on pages shorter than 1500 characters to avoid false positives
/// on real articles that happen to mention "404" or "not found".
///
/// Matching is done on a normalized copy of the text: lowercased, with the
/// accented forms of `e` folded to `e`, combining diacritics removed and the
/// typographic apostrophe replaced by `'`. This lets the unaccented French
/// phrases below match real-world text such as "page non trouvée".
///
/// Returns `true` when the short text either contains one of the known
/// English/French error phrases, or contains "404" whose start lies within
/// 50 bytes (in the normalized text) of the start of any occurrence of
/// "not found", "error" or "introuvable".
fn detect_short_page_error(body_text: &str) -> bool {
    const MAX_SHORT_PAGE_CHARS: usize = 1500;
    const PROXIMITY_WINDOW: usize = 50;

    const ERROR_PHRASES: [&str; 8] = [
        // English error phrases
        "page not found",
        "could not be found",
        "the requested page",
        "does not exist",
        // French error phrases (accents are folded before matching)
        "page introuvable",
        "page non trouvee",
        "n'existe pas",
        "n'a pas ete trouvee",
    ];
    const PROXIMITY_KEYWORDS: [&str; 3] = ["not found", "error", "introuvable"];

    // Count characters rather than bytes so accented text is not penalised;
    // `nth` stops as soon as the limit is reached.
    if body_text.chars().nth(MAX_SHORT_PAGE_CHARS - 1).is_some() {
        return false;
    }

    let normalized: String = body_text
        .to_lowercase()
        .chars()
        .filter_map(|c| match c {
            'é' | 'è' | 'ê' | 'ë' => Some('e'),
            // Typographic apostrophe, common in French copy.
            '\u{2019}' => Some('\''),
            // Combining diacritical marks (decomposed accents).
            '\u{0300}'..='\u{036f}' => None,
            other => Some(other),
        })
        .collect();

    if ERROR_PHRASES.iter().any(|phrase| normalized.contains(*phrase)) {
        return true;
    }

    // "404" close to any occurrence of a proximity keyword.
    normalized.match_indices("404").any(|(code_idx, _)| {
        PROXIMITY_KEYWORDS.iter().any(|kw| {
            normalized.match_indices(*kw).any(|(kw_idx, _)| {
                let distance = if code_idx > kw_idx {
                    code_idx - kw_idx
                } else {
                    kw_idx - code_idx
                };
                distance <= PROXIMITY_WINDOW
            })
        })
    })
}
|
|
|
|
|
|
|
|
|
|
/// Detect whether the page has a `<meta name="robots">` tag containing "noindex".
|
|
|
|
|
fn detect_noindex(doc: &Html) -> bool {
|
|
|
|
|
if let Ok(sel) = Selector::parse(r#"meta[name="robots"]"#) {
|
|
|
|
|
if let Some(el) = doc.select(&sel).next() {
|
|
|
|
|
if let Some(content) = el.value().attr("content") {
|
|
|
|
|
return content.to_lowercase().contains("noindex");
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
false
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Check if a URL path contains error-related segments.
///
/// Returns `true` if any `/`-separated segment of the path is `404`, `error`
/// or `not-found`, optionally followed by a file extension (for example
/// `404.html`). The comparison is case-insensitive, and anything from the
/// first `?` or `#` onwards is ignored.
///
/// Whole segments are compared rather than substrings, so ordinary slugs such
/// as `/blog/error-handling` or `/events/4040-main-street` are not flagged.
fn is_error_path(path: &str) -> bool {
    const ERROR_SEGMENTS: [&str; 3] = ["404", "error", "not-found"];

    let lower = path.to_lowercase();

    // Drop the query string and fragment; they are not part of the path.
    let end = lower.find(&['?', '#'][..]).unwrap_or(lower.len());

    lower[..end].split('/').any(|segment| {
        // Strip a trailing extension: "404.html" -> "404".
        let stem = segment.rsplit_once('.').map_or(segment, |(stem, _)| stem);
        ERROR_SEGMENTS.contains(&stem)
    })
}
|
|
|
|
|
|
|
|
|
|
/// Extract the publication date from structured data and meta tags.
|
|
|
|
|
///
|
|
|
|
|
/// Tries sources in priority order:
|
|
|
|
|
@ -916,4 +1062,126 @@ mod tests {
|
|
|
|
|
let url = url::Url::parse("file:///etc/passwd").unwrap();
|
|
|
|
|
assert!(validate_scheme(&url).is_err());
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// ── Enhanced Title Extraction (priority chain) ─────────────────
|
|
|
|
|
|
|
|
|
|
#[test]
fn test_title_priority_title_element_first() {
    // All three title sources are present; the `<title>` element must win.
    let markup = r#"<html><head>
<title>Title Element</title>
<meta property="og:title" content="OG Title">
</head><body><h1>H1 Title</h1></body></html>"#;
    let parsed = Html::parse_document(markup);
    let title = extract_page_title(&parsed);
    assert_eq!(title.as_deref(), Some("Title Element"));
}
|
|
|
|
|
|
|
|
|
|
#[test]
fn test_title_fallback_to_og_title() {
    // An empty `<title>` must be skipped in favour of `og:title`, ahead of `<h1>`.
    let markup = r#"<html><head>
<title></title>
<meta property="og:title" content="OG Title">
</head><body><h1>H1 Title</h1></body></html>"#;
    let parsed = Html::parse_document(markup);
    let title = extract_page_title(&parsed);
    assert_eq!(title.as_deref(), Some("OG Title"));
}
|
|
|
|
|
|
|
|
|
|
#[test]
fn test_title_fallback_to_h1() {
    // With an empty `<title>` and no `og:title`, the first `<h1>` is used.
    let markup = r#"<html><head>
<title></title>
</head><body><h1>H1 Title</h1></body></html>"#;
    let parsed = Html::parse_document(markup);
    let title = extract_page_title(&parsed);
    assert_eq!(title.as_deref(), Some("H1 Title"));
}
|
|
|
|
|
|
|
|
|
|
// ── Canonical / OG URL Error Detection ─────────────────────────
|
|
|
|
|
|
|
|
|
|
#[test]
fn test_canonical_404_detected() {
    // A canonical link whose path is `/404` marks the page as an error page.
    let markup = r#"<html><head>
<link rel="canonical" href="https://example.com/404">
</head><body><p>Sorry</p></body></html>"#;
    let parsed = Html::parse_document(markup);
    let flagged = detect_canonical_error(&parsed);
    assert!(flagged);
}
|
|
|
|
|
|
|
|
|
|
#[test]
fn test_og_url_error_path_detected() {
    // No canonical link here: the `og:url` meta alone must trigger detection.
    let markup = r#"<html><head>
<meta property="og:url" content="https://example.com/error/page">
</head><body><p>Oops</p></body></html>"#;
    let parsed = Html::parse_document(markup);
    let flagged = detect_canonical_error(&parsed);
    assert!(flagged);
}
|
|
|
|
|
|
|
|
|
|
#[test]
fn test_canonical_normal_url_not_flagged() {
    // Ordinary article URLs in both tags must not be reported as errors.
    let markup = r#"<html><head>
<link rel="canonical" href="https://example.com/articles/great-news">
<meta property="og:url" content="https://example.com/articles/great-news">
</head><body><p>Content</p></body></html>"#;
    let parsed = Html::parse_document(markup);
    let flagged = detect_canonical_error(&parsed);
    assert!(!flagged);
}
|
|
|
|
|
|
|
|
|
|
// ── Short Page Error Detection ─────────────────────────────────
|
|
|
|
|
|
|
|
|
|
#[test]
fn test_short_page_with_error_phrases_detected() {
    // Short text containing the English phrase "could not be found".
    let flagged =
        detect_short_page_error("Sorry, the page you are looking for could not be found.");
    assert!(flagged);
}
|
|
|
|
|
|
|
|
|
|
#[test]
fn test_short_page_french_error_detected() {
    // Short text containing the French phrase "page introuvable".
    let flagged = detect_short_page_error(
        "Désolé, cette page introuvable. Veuillez retourner à l'accueil.",
    );
    assert!(flagged);
}
|
|
|
|
|
|
|
|
|
|
#[test]
fn test_long_page_not_flagged() {
    // 200 repetitions is well over the 1500-character limit, so the error
    // phrase must be ignored.
    let long_text: String = "page not found ".repeat(200);
    let flagged = detect_short_page_error(long_text.as_str());
    assert!(!flagged);
}
|
|
|
|
|
|
|
|
|
|
#[test]
fn test_short_page_404_near_error() {
    // No fixed phrase matches here; detection relies on "404" sitting next
    // to the keyword "error".
    let flagged = detect_short_page_error("Error 404 - the page you requested is unavailable.");
    assert!(flagged);
}
|
|
|
|
|
|
|
|
|
|
// ── Noindex Detection ──────────────────────────────────────────
|
|
|
|
|
|
|
|
|
|
#[test]
fn test_noindex_detected() {
    // The robots meta lists `noindex` alongside another directive.
    let markup = r#"<html><head>
<meta name="robots" content="noindex, nofollow">
</head><body><p>Hidden page</p></body></html>"#;
    let parsed = Html::parse_document(markup);
    let flagged = detect_noindex(&parsed);
    assert!(flagged);
}
|
|
|
|
|
|
|
|
|
|
#[test]
fn test_noindex_not_present() {
    // A robots meta without `noindex` must not be flagged.
    let markup = r#"<html><head>
<meta name="robots" content="index, follow">
</head><body><p>Normal page</p></body></html>"#;
    let parsed = Html::parse_document(markup);
    let flagged = detect_noindex(&parsed);
    assert!(!flagged);
}
|
|
|
|
|
|
|
|
|
|
// ── Error Path Detection ───────────────────────────────────────
|
|
|
|
|
|
|
|
|
|
#[test]
fn test_error_path_detection() {
    // Paths that must be recognised as error pages.
    let error_paths = [
        "/404",
        "/pages/404.html",
        "/error",
        "/error/something",
        "/not-found",
        "/en/not-found",
    ];
    for candidate in error_paths.iter() {
        assert!(is_error_path(candidate));
    }

    // Ordinary content paths that must be left alone.
    let regular_paths = ["/articles/great-news", "/blog/2026/latest", "/"];
    for candidate in regular_paths.iter() {
        assert!(!is_error_path(candidate));
    }
}
|
|
|
|
|
}
|
|
|
|
|
|