diff --git a/backend/src/services/scraper.rs b/backend/src/services/scraper.rs
index e268dcf..7cd90b2 100644
--- a/backend/src/services/scraper.rs
+++ b/backend/src/services/scraper.rs
@@ -104,6 +104,9 @@ pub async fn scrape_url(
});
}
+ // Capture final URL BEFORE consuming the response body (follows redirects)
+ let final_url = response.url().clone();
+
// Read body with size limit
let bytes = response
.bytes()
@@ -122,15 +125,19 @@ pub async fn scrape_url(
// Extract page title
let title = extract_page_title(&document);
- // Detect soft-404
- let is_soft_404 = detect_soft_404(&document);
+ // Extract body text
+ let body_text = extract_body_text(&document);
+
+ // Combine all soft-404 detection checks
+ let is_soft_404 = detect_soft_404(&document)
+ || detect_canonical_error(&document)
+ || detect_noindex(&document)
+ || detect_short_page_error(&body_text)
+ || is_error_path(final_url.path());
// Extract publication date
let published_date = extract_publication_date(&document);
- // Extract body text
- let body_text = extract_body_text(&document);
-
Ok(ScrapedContent {
ok: !is_soft_404,
status,
@@ -253,13 +260,46 @@ fn is_private_ip(ip: IpAddr) -> bool {
// HTML Parsing
// ────────────────────────────────────────────────────────────────────────────
-/// Extract the page title from the `<title>` element.
+/// Extract the page title using a priority chain: `<title>` -> `og:title` -> `<h1>` -> None.
+///
+/// Each candidate is trimmed; a candidate that is missing or blank falls through
+/// to the next one. Only the first matching element of each kind is considered.
 fn extract_page_title(doc: &Html) -> Option<String> {
-    let sel = Selector::parse("title").ok()?;
-    doc.select(&sel)
-        .next()
-        .map(|el| el.text().collect::<String>().trim().to_string())
-        .filter(|t| !t.is_empty())
+    // Trimmed, owned copy of `raw`, or `None` when only whitespace remains.
+    let non_empty = |raw: &str| -> Option<String> {
+        let trimmed = raw.trim();
+        (!trimmed.is_empty()).then(|| trimmed.to_string())
+    };
+
+    // Text content of the first element matching `css`.
+    let first_text = |css: &str| -> Option<String> {
+        let sel = Selector::parse(css).ok()?;
+        let el = doc.select(&sel).next()?;
+        non_empty(&el.text().collect::<String>())
+    };
+
+    // `content` attribute of the first `<meta property="og:title">`.
+    let og_title = || -> Option<String> {
+        let sel = Selector::parse(r#"meta[property="og:title"]"#).ok()?;
+        let content = doc.select(&sel).next()?.value().attr("content")?;
+        non_empty(content)
+    };
+
+    // 1. `<title>`  2. `og:title`  3. first `<h1>`
+    first_text("title")
+        .or_else(og_title)
+        .or_else(|| first_text("h1"))
 }
/// Detect whether a page is a soft-404 by checking the page title
@@ -282,6 +322,112 @@ fn detect_soft_404(doc: &Html) -> bool {
.any(|kw| title_text.contains(kw) || h1_text.contains(kw))
}
+/// Detect whether the canonical URL or `og:url` points to an error page.
+///
+/// Looks at the first `<link rel="canonical" href="...">` and the first
+/// `<meta property="og:url" content="...">` in the document, in that order,
+/// and returns `true` if the path of either URL is an error path according to
+/// `is_error_path`.
+///
+/// Absolute URLs are parsed so that only their path component is inspected;
+/// otherwise a host such as `404media.co` or `error.example.com` would be
+/// reported as an error page. Values that do not parse as absolute URLs
+/// (relative references) are handed to `is_error_path` unchanged.
+fn detect_canonical_error(doc: &Html) -> bool {
+    // (CSS selector, attribute that carries the URL) for each signal.
+    const URL_SOURCES: [(&str, &str); 2] = [
+        (r#"link[rel="canonical"]"#, "href"),
+        (r#"meta[property="og:url"]"#, "content"),
+    ];
+
+    URL_SOURCES.iter().any(|&(css, attr)| {
+        let sel = match Selector::parse(css) {
+            Ok(sel) => sel,
+            Err(_) => return false,
+        };
+
+        doc.select(&sel)
+            .next()
+            .and_then(|el| el.value().attr(attr))
+            .map_or(false, |raw| {
+                let raw = raw.trim();
+                match url::Url::parse(raw) {
+                    // Absolute URL: look at the path only, never the host.
+                    Ok(parsed) => is_error_path(parsed.path()),
+                    // Relative reference: there is no host part to strip.
+                    Err(_) => is_error_path(raw),
+                }
+            })
+    })
+}
+
+/// Detect error pages with short body text by scanning for error phrases.
+///
+/// Only triggers on pages shorter than 1500 characters to avoid false positives
+/// on real articles that happen to mention "404" or "not found".
+///
+/// A short page is flagged when, case-insensitively, it either
+/// * contains one of the English or French error phrases (French phrases are
+///   matched with and without accents, and the typographic apostrophe `’` is
+///   treated like `'`), or
+/// * contains "404" whose start lies within 50 bytes of the start of any
+///   occurrence of "not found", "error", or "introuvable".
+fn detect_short_page_error(body_text: &str) -> bool {
+    const MAX_ERROR_PAGE_CHARS: usize = 1500;
+    const PROXIMITY_WINDOW: usize = 50;
+
+    const ERROR_PHRASES: [&str; 10] = [
+        // English
+        "page not found",
+        "could not be found",
+        "the requested page",
+        "does not exist",
+        // French, accented and unaccented spellings
+        "page introuvable",
+        "page non trouvee",
+        "page non trouvée",
+        "n'existe pas",
+        "n'a pas ete trouvee",
+        "n'a pas été trouvée",
+    ];
+    const PROXIMITY_KEYWORDS: [&str; 3] = ["not found", "error", "introuvable"];
+
+    // Count characters, not bytes, and stop counting once the limit is reached.
+    if body_text.chars().take(MAX_ERROR_PAGE_CHARS).count() >= MAX_ERROR_PAGE_CHARS {
+        return false;
+    }
+
+    // French pages commonly use the typographic apostrophe (U+2019).
+    let lower = body_text.to_lowercase().replace('\u{2019}', "'");
+
+    if ERROR_PHRASES.iter().any(|phrase| lower.contains(*phrase)) {
+        return true;
+    }
+
+    // Compare every "404" against every keyword occurrence, not only the first.
+    lower.match_indices("404").any(|(code_idx, _)| {
+        PROXIMITY_KEYWORDS.iter().any(|keyword| {
+            lower
+                .match_indices(*keyword)
+                .any(|(kw_idx, _)| code_idx.abs_diff(kw_idx) <= PROXIMITY_WINDOW)
+        })
+    })
+}
+
+/// Detect whether the page has a `<meta name="robots">` tag containing "noindex".
+///
+/// Every `<meta name="robots">` element is inspected, not only the first one,
+/// because a page may carry several such tags. Tags without a `content`
+/// attribute are ignored. The directive is matched case-insensitively.
+fn detect_noindex(doc: &Html) -> bool {
+    let sel = match Selector::parse(r#"meta[name="robots"]"#) {
+        Ok(sel) => sel,
+        Err(_) => return false,
+    };
+
+    doc.select(&sel)
+        .filter_map(|el| el.value().attr("content"))
+        .any(|content| content.to_lowercase().contains("noindex"))
+}
+
+/// Check if a URL path contains error-related segments.
+///
+/// Returns `true` if the path contains "/404" (which also covers "/404.html"),
+/// "/error", or "/not-found", ignoring case.
+///
+/// `path` is normally a bare path, but absolute (`scheme://host/...`) and
+/// scheme-relative (`//host/...`) URLs are tolerated: the scheme, authority,
+/// query string and fragment are dropped before matching. Without this, a host
+/// such as `404media.co` or a `?next=/error` parameter would count as an error
+/// path.
+fn is_error_path(path: &str) -> bool {
+    const ERROR_MARKERS: [&str; 3] = ["/404", "/error", "/not-found"];
+
+    let lower = path.trim().to_lowercase();
+
+    // The query string and fragment are not part of the path.
+    let end = lower
+        .find(|c: char| c == '?' || c == '#')
+        .unwrap_or(lower.len());
+    let without_query = &lower[..end];
+
+    // Only treat a leading `scheme://` as a scheme; "://" deeper inside a path
+    // (e.g. "/redirect/http://x") is left alone.
+    let is_scheme = |candidate: &str| {
+        !candidate.is_empty()
+            && candidate
+                .chars()
+                .all(|c| c.is_ascii_alphanumeric() || matches!(c, '+' | '-' | '.'))
+    };
+    let after_scheme = match without_query.split_once("://") {
+        Some((scheme, rest)) if is_scheme(scheme) => Some(rest),
+        _ => without_query.strip_prefix("//"),
+    };
+
+    // Skip the authority (host[:port]); what follows the first '/' is the path.
+    let path_only = match after_scheme {
+        Some(rest) => rest.find('/').map_or("", |slash| &rest[slash..]),
+        None => without_query,
+    };
+
+    ERROR_MARKERS
+        .iter()
+        .any(|marker| path_only.contains(*marker))
+}
+
/// Extract the publication date from structured data and meta tags.
///
/// Tries sources in priority order:
@@ -916,4 +1062,126 @@ mod tests {
let url = url::Url::parse("file:///etc/passwd").unwrap();
assert!(validate_scheme(&url).is_err());
}
+
+ // ── Enhanced Title Extraction (priority chain) ─────────────────
+
+    #[test]
+    fn test_title_priority_title_element_first() {
+        // NOTE(review): fixture markup reconstructed after the tags were
+        // stripped from this diff; confirm against the committed file.
+        // All three sources are present, so `<title>` must win.
+        let html = r#"<html><head>
+            <title>Title Element</title>
+            <meta property="og:title" content="OG Title">
+        </head><body><h1>H1 Title</h1></body></html>"#;
+        let doc = Html::parse_document(html);
+        assert_eq!(extract_page_title(&doc), Some("Title Element".into()));
+    }
+
+    #[test]
+    fn test_title_fallback_to_og_title() {
+        // NOTE(review): fixture markup reconstructed after the tags were
+        // stripped from this diff; confirm against the committed file.
+        // Blank `<title>` forces the fallback to `og:title`, ahead of `<h1>`.
+        let html = r#"<html><head>
+            <title></title>
+            <meta property="og:title" content="OG Title">
+        </head><body><h1>H1 Title</h1></body></html>"#;
+        let doc = Html::parse_document(html);
+        assert_eq!(extract_page_title(&doc), Some("OG Title".into()));
+    }
+
+    #[test]
+    fn test_title_fallback_to_h1() {
+        // NOTE(review): fixture markup reconstructed after the tags were
+        // stripped from this diff; confirm against the committed file.
+        // Blank `<title>` and no `og:title`: the first `<h1>` is used.
+        let html = r#"<html><head>
+            <title></title>
+        </head><body><h1>H1 Title</h1></body></html>"#;
+        let doc = Html::parse_document(html);
+        assert_eq!(extract_page_title(&doc), Some("H1 Title".into()));
+    }
+
+ // ── Canonical / OG URL Error Detection ─────────────────────────
+
+    #[test]
+    fn test_canonical_404_detected() {
+        // NOTE(review): fixture markup reconstructed after the tags were
+        // stripped from this diff; confirm against the committed file.
+        let html = r#"<html><head>
+            <link rel="canonical" href="https://example.com/404">
+        </head><body><p>Sorry</p></body></html>"#;
+        let doc = Html::parse_document(html);
+        assert!(detect_canonical_error(&doc));
+    }
+
+    #[test]
+    fn test_og_url_error_path_detected() {
+        // NOTE(review): fixture markup reconstructed after the tags were
+        // stripped from this diff; confirm against the committed file.
+        let html = r#"<html><head>
+            <meta property="og:url" content="https://example.com/error/page-missing">
+        </head><body><p>Oops</p></body></html>"#;
+        let doc = Html::parse_document(html);
+        assert!(detect_canonical_error(&doc));
+    }
+
+    #[test]
+    fn test_canonical_normal_url_not_flagged() {
+        // NOTE(review): fixture markup reconstructed after the tags were
+        // stripped from this diff; confirm against the committed file.
+        // Both signals are present and both point at an ordinary article.
+        let html = r#"<html><head>
+            <link rel="canonical" href="https://example.com/blog/my-post">
+            <meta property="og:url" content="https://example.com/blog/my-post">
+        </head><body><p>Content</p></body></html>"#;
+        let doc = Html::parse_document(html);
+        assert!(!detect_canonical_error(&doc));
+    }
+
+ // ── Short Page Error Detection ─────────────────────────────────
+
+    #[test]
+    fn test_short_page_with_error_phrases_detected() {
+        // "could not be found" is one of the English error phrases.
+        assert!(detect_short_page_error(
+            "Sorry, the page you are looking for could not be found."
+        ));
+    }
+
+    #[test]
+    fn test_short_page_french_error_detected() {
+        // "page introuvable" is one of the French error phrases.
+        assert!(detect_short_page_error(
+            "Désolé, cette page introuvable. Veuillez retourner à l'accueil."
+        ));
+    }
+
+    #[test]
+    fn test_long_page_not_flagged() {
+        // 15 bytes x 200 repetitions = 3000, well past the 1500 cut-off, so the
+        // phrase scan must not run even though the phrase is present.
+        let long_body = "page not found ".repeat(200);
+        assert!(!detect_short_page_error(&long_body));
+    }
+
+    #[test]
+    fn test_short_page_404_near_error() {
+        // No fixed phrase matches here; only the "404" near "error" rule applies.
+        assert!(detect_short_page_error(
+            "Error 404 - the page you requested is unavailable."
+        ));
+    }
+
+ // ── Noindex Detection ──────────────────────────────────────────
+
+    #[test]
+    fn test_noindex_detected() {
+        // NOTE(review): fixture markup reconstructed after the tags were
+        // stripped from this diff; confirm against the committed file.
+        let html = r#"<html><head>
+            <meta name="robots" content="noindex, nofollow">
+        </head><body><p>Hidden page</p></body></html>"#;
+        let doc = Html::parse_document(html);
+        assert!(detect_noindex(&doc));
+    }
+
+ #[test]
+ fn test_noindex_not_present() {
+ let html = r#"
+
+