From 3d790e7ce78ed8fd606bc9619196382a35a9439c Mon Sep 17 00:00:00 2001 From: oabrivard Date: Tue, 31 Mar 2026 18:21:53 +0200 Subject: [PATCH] feat: extract article URLs from JSON-LD structured data in source pages Many modern sites (Hugo, WordPress, Next.js) load articles via JavaScript but include full article URLs in JSON-LD schema.org markup in the <head>. The scraper now extracts these first (highest quality), then falls back to heuristic extraction. Supports ItemList, BlogPosting, NewsArticle, @graph arrays, and mainEntity wrappers. Co-Authored-By: Claude Opus 4.6 (1M context) --- backend/src/services/source_scraper.rs | 153 ++++++++++++++++++++++++- 1 file changed, 152 insertions(+), 1 deletion(-) diff --git a/backend/src/services/source_scraper.rs b/backend/src/services/source_scraper.rs index 71a2530..4288b0c 100644 --- a/backend/src/services/source_scraper.rs +++ b/backend/src/services/source_scraper.rs @@ -69,7 +69,11 @@ pub async fn extract_article_links( /// Extract and filter article links from HTML content. /// -/// This is a pure function (no I/O) for easy testing. +/// Combines two strategies: +/// 1. JSON-LD structured data (high quality — explicit article URLs from schema.org markup) +/// 2. HTML `<a>` links (fallback — heuristic filtering) +/// +/// JSON-LD links are placed first (most reliable), followed by HTML links not already found. 
pub fn extract_links_from_html( html: &str, base_url: &Url, @@ -79,6 +83,17 @@ pub fn extract_links_from_html( let mut seen = std::collections::HashSet::new(); let mut links = Vec::new(); + // Strategy 1: Extract URLs from JSON-LD structured data + if let Ok(sel) = scraper::Selector::parse(r#"script[type="application/ld+json"]"#) { + for element in document.select(&sel) { + let text = element.text().collect::<String>(); + if let Ok(json) = serde_json::from_str::<serde_json::Value>(&text) { + extract_urls_from_json_ld(&json, base_domain, &mut seen, &mut links); + } + } + } + + // Strategy 2: Extract URLs from <a> tags (existing heuristic) for element in document.select(&ANCHOR_SELECTOR) { if let Some(href) = element.value().attr("href") { let resolved = match base_url.join(href) { @@ -122,6 +137,71 @@ pub fn extract_links_from_html( links } +/// Extract article URLs from JSON-LD structured data. +/// +/// Supports common schema.org patterns: +/// - `ItemList` with `ListItem` entries (Hugo, many CMS) +/// - `BlogPosting` / `NewsArticle` / `Article` / `WebPage` objects with a string `url` field +/// - `@graph` arrays and `mainEntity` wrappers containing any of the above +fn extract_urls_from_json_ld( + json: &serde_json::Value, + base_domain: &str, + seen: &mut std::collections::HashSet<String>, + links: &mut Vec<String>, +) { + // Helper: keep a URL only when Url::parse accepts it, its lowercased host equals base_domain, its path is neither empty nor the root, and it was not already in seen + let mut try_add = |url_str: &str| { + if let Ok(parsed) = Url::parse(url_str) { + let domain = parsed.host_str().unwrap_or("").to_lowercase(); + if domain == base_domain { + let path = parsed.path(); + if !path.is_empty() && path != "/" { + let url = parsed.to_string(); + if seen.insert(url.clone()) { + links.push(url); + } + } + } + } + }; + + // Direct URL on the object; only a string-valued @type is matched (BlogPosting, NewsArticle, Article, WebPage) 
+ if let Some(url) = json.get("url").and_then(|v| v.as_str()) { + let obj_type = json.get("@type").and_then(|v| v.as_str()).unwrap_or(""); + if matches!(obj_type, "BlogPosting" | "NewsArticle" | "Article" | "WebPage") { + try_add(url); + } + } + + // ItemList → itemListElement[] + if let Some(items) = json.get("itemListElement").and_then(|v| v.as_array()) { + for item in items { + // ListItem with url + if let Some(url) = item.get("url").and_then(|v| v.as_str()) { + try_add(url); + } + // ListItem with nested item.url (item must be an object; a bare string item is not read) + if let Some(inner) = item.get("item") { + if let Some(url) = inner.get("url").and_then(|v| v.as_str()) { + try_add(url); + } + } + } + } + + // @graph array + if let Some(graph) = json.get("@graph").and_then(|v| v.as_array()) { + for node in graph { + extract_urls_from_json_ld(node, base_domain, seen, links); + } + } + + // Recurse into mainEntity (common wrapper in CollectionPage, WebPage) + if let Some(main) = json.get("mainEntity") { + extract_urls_from_json_ld(main, base_domain, seen, links); + } +} + #[cfg(test)] mod tests { use super::*; @@ -212,4 +292,75 @@ mod tests { let links = extract_links_from_html("", &base_url("https://example.com"), "example.com"); assert!(links.is_empty()); } + + #[test] + fn extracts_urls_from_json_ld_item_list() { + let html = r#" + + "#; + let links = extract_links_from_html(html, &base_url("https://example.com/news/"), "example.com"); + assert!(links.len() >= 2, "Should extract at least 2 URLs from JSON-LD, got {}", links.len()); + assert!(links.iter().any(|u| u.contains("article-1"))); + assert!(links.iter().any(|u| u.contains("article-2"))); + } + + #[test] + fn extracts_urls_from_json_ld_blog_posting() { + let html = r#" + + "#; + let links = extract_links_from_html(html, &base_url("https://example.com"), "example.com"); + assert_eq!(links.len(), 1); + assert!(links[0].contains("my-article")); + } + + #[test] + fn json_ld_urls_come_before_html_links() { + let html = r#" + + + HTML Article + "#; + let links = 
extract_links_from_html(html, &base_url("https://example.com"), "example.com"); + assert_eq!(links.len(), 2); + assert!(links[0].contains("jsonld-article"), "JSON-LD URLs should come first"); + assert!(links[1].contains("html-article"), "HTML links should come second"); + } + + #[test] + fn json_ld_deduplicates_with_html_links() { + let html = r#" + + + Same Article + "#; + let links = extract_links_from_html(html, &base_url("https://example.com"), "example.com"); + assert_eq!(links.len(), 1, "Should deduplicate across JSON-LD and HTML"); + } + + #[test] + fn json_ld_filters_external_domains() { + let html = r#" + + "#; + let links = extract_links_from_html(html, &base_url("https://example.com"), "example.com"); + assert!(links.is_empty(), "Should filter external domain URLs from JSON-LD"); + } }