feat: extract article URLs from JSON-LD structured data in source pages

Many modern sites (Hugo, WordPress, Next.js) load articles via JavaScript
but include full article URLs in JSON-LD schema.org markup in the <head>.
The scraper now extracts these first (highest quality), then appends any
<a href> links from the existing heuristic that were not already found.
Supports ItemList, BlogPosting, NewsArticle, Article, WebPage, @graph
arrays, and mainEntity wrappers.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
master
oabrivard 2 months ago
parent 9a310bbf19
commit 3d790e7ce7

@ -69,7 +69,11 @@ pub async fn extract_article_links(
/// Extract and filter article links from HTML content.
///
/// This is a pure function (no I/O) for easy testing.
/// Combines two strategies:
/// 1. JSON-LD structured data (high quality — explicit article URLs from schema.org markup)
/// 2. HTML `<a href>` links (fallback — heuristic filtering)
///
/// JSON-LD links are placed first (most reliable), followed by HTML links not already found.
pub fn extract_links_from_html(
html: &str,
base_url: &Url,
@ -79,6 +83,17 @@ pub fn extract_links_from_html(
let mut seen = std::collections::HashSet::new();
let mut links = Vec::new();
// Strategy 1: Extract URLs from JSON-LD structured data
if let Ok(sel) = scraper::Selector::parse(r#"script[type="application/ld+json"]"#) {
for element in document.select(&sel) {
let text = element.text().collect::<String>();
if let Ok(json) = serde_json::from_str::<serde_json::Value>(&text) {
extract_urls_from_json_ld(&json, base_domain, &mut seen, &mut links);
}
}
}
// Strategy 2: Extract URLs from <a href> tags (existing heuristic)
for element in document.select(&ANCHOR_SELECTOR) {
if let Some(href) = element.value().attr("href") {
let resolved = match base_url.join(href) {
@ -122,6 +137,71 @@ pub fn extract_links_from_html(
links
}
/// Extract article URLs from JSON-LD structured data.
///
/// Walks one parsed JSON-LD value and appends every accepted URL to `links`,
/// using `seen` to skip URLs that have already been collected.
///
/// Supports common schema.org patterns:
/// - `BlogPosting` / `NewsArticle` / `Article` / `WebPage` objects with a `url` field
///   (`@type` may be a single string or an array of type names)
/// - `ItemList` with `ListItem` entries carrying `url` or a nested `item.url`
///   (Hugo, many CMS)
/// - `@graph` arrays containing any of the above
/// - `mainEntity` wrappers (common in `CollectionPage`, `WebPage`)
/// - plain arrays of nodes, at the top level or under `mainEntity`
///
/// A URL is accepted only when it parses as an absolute URL, its lowercased
/// host equals `base_domain`, and its path is neither empty nor `/`.
fn extract_urls_from_json_ld(
    json: &serde_json::Value,
    base_domain: &str,
    seen: &mut std::collections::HashSet<String>,
    links: &mut Vec<String>,
) {
    // A JSON-LD script may hold a bare array of nodes instead of one object.
    // String-key lookups on an array find nothing, so visit each element.
    if let Some(nodes) = json.as_array() {
        for node in nodes {
            extract_urls_from_json_ld(node, base_domain, seen, links);
        }
        return;
    }

    // Helper to add a URL if it matches the domain
    let mut try_add = |url_str: &str| {
        let parsed = match Url::parse(url_str) {
            Ok(parsed) => parsed,
            // No base URL is available here, so relative or malformed values are skipped.
            Err(_) => return,
        };
        let domain = parsed.host_str().unwrap_or("").to_lowercase();
        if domain != base_domain {
            return;
        }
        // The site root is not an article.
        let path = parsed.path();
        if path.is_empty() || path == "/" {
            return;
        }
        let url = parsed.to_string();
        if seen.insert(url.clone()) {
            links.push(url);
        }
    };

    // Types whose own `url` is treated as an article link.
    let is_article_type =
        |name: &str| matches!(name, "BlogPosting" | "NewsArticle" | "Article" | "WebPage");

    // Direct URL on the object (BlogPosting, NewsArticle, etc.)
    if let Some(url) = json.get("url").and_then(|v| v.as_str()) {
        // JSON-LD allows `@type` to be one name or a list of names.
        let has_article_type = match json.get("@type") {
            Some(serde_json::Value::String(name)) => is_article_type(name),
            Some(serde_json::Value::Array(names)) => names
                .iter()
                .filter_map(|v| v.as_str())
                .any(|name| is_article_type(name)),
            _ => false,
        };
        if has_article_type {
            try_add(url);
        }
    }

    // ItemList → itemListElement[]
    if let Some(items) = json.get("itemListElement").and_then(|v| v.as_array()) {
        for item in items {
            // ListItem with url
            if let Some(url) = item.get("url").and_then(|v| v.as_str()) {
                try_add(url);
            }
            // ListItem with nested item.url
            if let Some(url) = item
                .get("item")
                .and_then(|inner| inner.get("url"))
                .and_then(|v| v.as_str())
            {
                try_add(url);
            }
        }
    }

    // @graph array
    if let Some(graph) = json.get("@graph").and_then(|v| v.as_array()) {
        for node in graph {
            extract_urls_from_json_ld(node, base_domain, seen, links);
        }
    }

    // Recurse into mainEntity (common wrapper in CollectionPage, WebPage);
    // an array-valued mainEntity is handled by the array branch at the top.
    if let Some(main) = json.get("mainEntity") {
        extract_urls_from_json_ld(main, base_domain, seen, links);
    }
}
#[cfg(test)]
mod tests {
use super::*;
@ -212,4 +292,75 @@ mod tests {
let links = extract_links_from_html("", &base_url("https://example.com"), "example.com");
assert!(links.is_empty());
}
#[test]
fn extracts_urls_from_json_ld_item_list() {
    // Each ListItem repeats its URL in both `url` and `item.url`, so the
    // result must contain exactly one entry per article after deduplication.
    let html = r#"<html><head>
<script type="application/ld+json">
{"@type":"CollectionPage","mainEntity":{"@type":"ItemList","itemListElement":[
{"@type":"ListItem","position":1,"url":"https://example.com/news/article-1/","item":{"@type":"BlogPosting","url":"https://example.com/news/article-1/"}},
{"@type":"ListItem","position":2,"url":"https://example.com/news/article-2/","item":{"@type":"BlogPosting","url":"https://example.com/news/article-2/"}}
]}}
</script>
</head><body></body></html>"#;
    let links = extract_links_from_html(html, &base_url("https://example.com/news/"), "example.com");
    assert_eq!(
        links.len(),
        2,
        "Should extract exactly 2 deduplicated URLs from JSON-LD, got {:?}",
        links
    );
    assert!(links.iter().any(|u| u.contains("article-1")));
    assert!(links.iter().any(|u| u.contains("article-2")));
}
#[test]
fn extracts_urls_from_json_ld_blog_posting() {
    // A lone BlogPosting node with a same-domain `url` yields exactly that link.
    let page = r#"<html><head>
<script type="application/ld+json">
{"@type":"BlogPosting","url":"https://example.com/post/my-article","headline":"Test"}
</script>
</head><body></body></html>"#;
    let origin = base_url("https://example.com");
    let found = extract_links_from_html(page, &origin, "example.com");
    assert_eq!(found.len(), 1);
    let only = &found[0];
    assert!(only.contains("my-article"));
}
#[test]
fn json_ld_urls_come_before_html_links() {
    // One link comes from JSON-LD and a different one from an anchor tag;
    // the JSON-LD link must be listed ahead of the anchor link.
    let page = r#"<html><head>
<script type="application/ld+json">
{"@type":"ItemList","itemListElement":[
{"@type":"ListItem","url":"https://example.com/jsonld-article/"}
]}
</script>
</head><body>
<a href="/html-article/">HTML Article</a>
</body></html>"#;
    let origin = base_url("https://example.com");
    let found = extract_links_from_html(page, &origin, "example.com");
    assert_eq!(found.len(), 2);
    let (first, second) = (&found[0], &found[1]);
    assert!(first.contains("jsonld-article"), "JSON-LD URLs should come first");
    assert!(second.contains("html-article"), "HTML links should come second");
}
#[test]
fn json_ld_deduplicates_with_html_links() {
    // The same article is referenced by JSON-LD (absolute URL) and by an
    // anchor (relative href); it must be reported only once.
    let page = r#"<html><head>
<script type="application/ld+json">
{"@type":"ItemList","itemListElement":[
{"@type":"ListItem","url":"https://example.com/same-article/"}
]}
</script>
</head><body>
<a href="/same-article/">Same Article</a>
</body></html>"#;
    let origin = base_url("https://example.com");
    let found = extract_links_from_html(page, &origin, "example.com");
    let count = found.len();
    assert_eq!(count, 1, "Should deduplicate across JSON-LD and HTML");
}
#[test]
fn json_ld_filters_external_domains() {
    // The only JSON-LD URL points at a different host, so nothing is returned.
    let page = r#"<html><head>
<script type="application/ld+json">
{"@type":"BlogPosting","url":"https://other-site.com/article"}
</script>
</head><body></body></html>"#;
    let origin = base_url("https://example.com");
    let found = extract_links_from_html(page, &origin, "example.com");
    let nothing_found = found.is_empty();
    assert!(nothing_found, "Should filter external domain URLs from JSON-LD");
}
}

Loading…
Cancel
Save