feat: extract article URLs from JSON-LD structured data in source pages

Many modern sites (Hugo, WordPress, Next.js) load articles via JavaScript but include full article URLs in JSON-LD schema.org markup in the <head>. The scraper now extracts these first (highest quality), then appends any links found by the existing <a href> heuristic extraction that were not already collected. Supports ItemList, BlogPosting, NewsArticle, Article, WebPage, @graph arrays, and mainEntity wrappers.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2 months ago · 3d790e7ce7
parent 9a310bbf19
commit 3d790e7ce7
1 changed file with 152 additions and 1 deletion
--- a/backend/src/services/source_scraper.rs
+++ b/backend/src/services/source_scraper.rs
@ -69,7 +69,11 @@ pub async fn extract_article_links(

 /// Extract and filter article links from HTML content.
 ///
-/// This is a pure function (no I/O) for easy testing.
+/// Combines two strategies:
+/// 1. JSON-LD structured data (high quality — explicit article URLs from schema.org markup)
+/// 2. HTML `<a href>` links (fallback — heuristic filtering)
+///
+/// JSON-LD links are placed first (most reliable), followed by HTML links not already found.
 pub fn extract_links_from_html(
    html: &str,
    base_url: &Url,
@ -79,6 +83,17 @@ pub fn extract_links_from_html(
    let mut seen = std::collections::HashSet::new();
    let mut links = Vec::new();

+    // Strategy 1: Extract URLs from JSON-LD structured data
+    if let Ok(sel) = scraper::Selector::parse(r#"script[type="application/ld+json"]"#) {
+        for element in document.select(&sel) {
+            let text = element.text().collect::<String>();
+            if let Ok(json) = serde_json::from_str::<serde_json::Value>(&text) {
+                extract_urls_from_json_ld(&json, base_domain, &mut seen, &mut links);
+            }
+        }
+    }
+
+    // Strategy 2: Extract URLs from <a href> tags (existing heuristic)
    for element in document.select(&ANCHOR_SELECTOR) {
        if let Some(href) = element.value().attr("href") {
            let resolved = match base_url.join(href) {
@ -122,6 +137,71 @@ pub fn extract_links_from_html(
    links
 }

+/// Extract article URLs from JSON-LD structured data.
+///
+/// Appends to `links` each absolute URL whose host equals `base_domain` and whose
+/// path is neither empty nor `/`; `seen` suppresses duplicates across calls.
+///
+/// Supports common schema.org patterns:
+/// - `ItemList` with `ListItem` entries (Hugo, many CMS)
+/// - `BlogPosting` / `NewsArticle` / `Article` / `WebPage` with `url` field,
+///   where `@type` is a string or an array of strings
+/// - `@graph` arrays and `mainEntity` wrappers containing any of the above
+/// - arrays of nodes (top-level, or as the value of `mainEntity`)
+fn extract_urls_from_json_ld(
+    json: &serde_json::Value,
+    base_domain: &str,
+    seen: &mut std::collections::HashSet<String>,
+    links: &mut Vec<String>,
+) {
+    // A JSON-LD script (or a `mainEntity` value) may hold an array of nodes
+    // rather than a single object; visit each element in order.
+    if let Some(nodes) = json.as_array() {
+        for node in nodes {
+            extract_urls_from_json_ld(node, base_domain, seen, links);
+        }
+        return;
+    }
+
+    // Helper to add a URL if it matches the domain
+    let mut try_add = |url_str: &str| {
+        // Strings that do not parse as absolute URLs are skipped.
+        if let Ok(parsed) = Url::parse(url_str) {
+            let domain = parsed.host_str().unwrap_or("").to_lowercase();
+            if domain == base_domain {
+                let path = parsed.path();
+                // Skip the bare site root.
+                if !path.is_empty() && path != "/" {
+                    let url = parsed.to_string();
+                    if seen.insert(url.clone()) {
+                        links.push(url);
+                    }
+                }
+            }
+        }
+    };
+
+    // Direct URL on the object (BlogPosting, NewsArticle, etc.)
+    if let Some(url) = json.get("url").and_then(|v| v.as_str()) {
+        let is_article =
+            |ty: &str| matches!(ty, "BlogPosting" | "NewsArticle" | "Article" | "WebPage");
+        // `@type` may be a single string or an array of type names.
+        let typed_as_article = match json.get("@type") {
+            Some(serde_json::Value::String(ty)) => is_article(ty.as_str()),
+            Some(serde_json::Value::Array(tys)) => {
+                tys.iter().filter_map(|t| t.as_str()).any(is_article)
+            }
+            _ => false,
+        };
+        if typed_as_article {
+            try_add(url);
+        }
+    }
+
+    // ItemList → itemListElement[]
+    if let Some(items) = json.get("itemListElement").and_then(|v| v.as_array()) {
+        for item in items {
+            // ListItem with url
+            if let Some(url) = item.get("url").and_then(|v| v.as_str()) {
+                try_add(url);
+            }
+            // ListItem with nested item.url
+            if let Some(inner) = item.get("item") {
+                if let Some(url) = inner.get("url").and_then(|v| v.as_str()) {
+                    try_add(url);
+                }
+            }
+        }
+    }
+
+    // @graph array
+    if let Some(graph) = json.get("@graph").and_then(|v| v.as_array()) {
+        for node in graph {
+            extract_urls_from_json_ld(node, base_domain, seen, links);
+        }
+    }
+
+    // Recurse into mainEntity (common wrapper in CollectionPage, WebPage);
+    // an array-valued mainEntity is handled by the array branch at the top.
+    if let Some(main) = json.get("mainEntity") {
+        extract_urls_from_json_ld(main, base_domain, seen, links);
+    }
+}
+
 #[cfg(test)]
 mod tests {
    use super::*;
@ -212,4 +292,75 @@ mod tests {
        let links = extract_links_from_html("", &base_url("https://example.com"), "example.com");
        assert!(links.is_empty());
    }
+
+    /// `ItemList` entries nested under `mainEntity` are extracted. Each entry
+    /// repeats its URL in both `url` and `item.url`, so exactly two links must
+    /// come back.
+    #[test]
+    fn extracts_urls_from_json_ld_item_list() {
+        let html = r#"<html><head>
+        <script type="application/ld+json">
+        {"@type":"CollectionPage","mainEntity":{"@type":"ItemList","itemListElement":[
+            {"@type":"ListItem","position":1,"url":"https://example.com/news/article-1/","item":{"@type":"BlogPosting","url":"https://example.com/news/article-1/"}},
+            {"@type":"ListItem","position":2,"url":"https://example.com/news/article-2/","item":{"@type":"BlogPosting","url":"https://example.com/news/article-2/"}}
+        ]}}
+        </script>
+        </head><body></body></html>"#;
+        let links = extract_links_from_html(html, &base_url("https://example.com/news/"), "example.com");
+        // The fixture names two distinct URLs, each twice; any other count means
+        // a URL was dropped or emitted more than once.
+        assert_eq!(links.len(), 2, "Should extract exactly 2 URLs from JSON-LD, got {}", links.len());
+        assert!(links.iter().any(|u| u.contains("article-1")));
+        assert!(links.iter().any(|u| u.contains("article-2")));
+    }
+
+    /// A lone `BlogPosting` node in JSON-LD yields exactly its `url`.
+    #[test]
+    fn extracts_urls_from_json_ld_blog_posting() {
+        let origin = base_url("https://example.com");
+        let page = r#"<html><head>
+        <script type="application/ld+json">
+        {"@type":"BlogPosting","url":"https://example.com/post/my-article","headline":"Test"}
+        </script>
+        </head><body></body></html>"#;
+
+        let found = extract_links_from_html(page, &origin, "example.com");
+
+        assert_eq!(found.len(), 1);
+        assert!(found[0].contains("my-article"));
+    }
+
+    /// Links taken from JSON-LD are listed ahead of links taken from `<a href>`.
+    #[test]
+    fn json_ld_urls_come_before_html_links() {
+        let page = r#"<html><head>
+        <script type="application/ld+json">
+        {"@type":"ItemList","itemListElement":[
+            {"@type":"ListItem","url":"https://example.com/jsonld-article/"}
+        ]}
+        </script>
+        </head><body>
+        <a href="/html-article/">HTML Article</a>
+        </body></html>"#;
+        let origin = base_url("https://example.com");
+
+        let found = extract_links_from_html(page, &origin, "example.com");
+
+        assert_eq!(found.len(), 2);
+        let (first, second) = (&found[0], &found[1]);
+        assert!(first.contains("jsonld-article"), "JSON-LD URLs should come first");
+        assert!(second.contains("html-article"), "HTML links should come second");
+    }
+
+    /// A URL present both in JSON-LD and as an anchor is returned only once.
+    #[test]
+    fn json_ld_deduplicates_with_html_links() {
+        let page = r#"<html><head>
+        <script type="application/ld+json">
+        {"@type":"ItemList","itemListElement":[
+            {"@type":"ListItem","url":"https://example.com/same-article/"}
+        ]}
+        </script>
+        </head><body>
+        <a href="/same-article/">Same Article</a>
+        </body></html>"#;
+        let origin = base_url("https://example.com");
+
+        let found = extract_links_from_html(page, &origin, "example.com");
+
+        assert_eq!(found.len(), 1, "Should deduplicate across JSON-LD and HTML");
+    }
+
+    /// A JSON-LD URL on a different host than the base domain is not returned.
+    #[test]
+    fn json_ld_filters_external_domains() {
+        let page = r#"<html><head>
+        <script type="application/ld+json">
+        {"@type":"BlogPosting","url":"https://other-site.com/article"}
+        </script>
+        </head><body></body></html>"#;
+        let origin = base_url("https://example.com");
+
+        let found = extract_links_from_html(page, &origin, "example.com");
+
+        assert!(found.is_empty(), "Should filter external domain URLs from JSON-LD");
+    }
 }