@ -69,7 +69,11 @@ pub async fn extract_article_links(
/// Extract and filter article links from HTML content.
///
/// This is a pure function (no I/O) for easy testing.
/// Combines two strategies:
/// 1. JSON-LD structured data (high quality — explicit article URLs from schema.org markup)
/// 2. HTML `<a href>` links (fallback — heuristic filtering)
///
/// JSON-LD links are placed first (most reliable), followed by HTML links not already found.
pub fn extract_links_from_html (
html : & str ,
base_url : & Url ,
@ -79,6 +83,17 @@ pub fn extract_links_from_html(
let mut seen = std ::collections ::HashSet ::new ( ) ;
let mut links = Vec ::new ( ) ;
// Strategy 1: Extract URLs from JSON-LD structured data
if let Ok ( sel ) = scraper ::Selector ::parse ( r#"script[type="application/ld+json"]"# ) {
for element in document . select ( & sel ) {
let text = element . text ( ) . collect ::< String > ( ) ;
if let Ok ( json ) = serde_json ::from_str ::< serde_json ::Value > ( & text ) {
extract_urls_from_json_ld ( & json , base_domain , & mut seen , & mut links ) ;
}
}
}
// Strategy 2: Extract URLs from <a href> tags (existing heuristic)
for element in document . select ( & ANCHOR_SELECTOR ) {
if let Some ( href ) = element . value ( ) . attr ( "href" ) {
let resolved = match base_url . join ( href ) {
@ -122,6 +137,71 @@ pub fn extract_links_from_html(
links
}
/// Extract article URLs from JSON-LD structured data.
///
/// Walks one parsed JSON-LD value and appends every accepted URL to `links`,
/// using `seen` to skip URLs that were already recorded.
///
/// Supports common schema.org patterns:
/// - `ItemList` with `ListItem` entries (Hugo, many CMS), reading both
///   `ListItem.url` and the nested `ListItem.item.url`
/// - `BlogPosting` / `NewsArticle` / `Article` / `WebPage` with a `url` field,
///   where `@type` is either a single string or an array of type names
/// - `@graph` arrays containing any of the above
/// - `mainEntity` wrappers (single node or array of nodes)
/// - a top-level array of nodes instead of a single object
///
/// A URL is accepted only when `Url::parse` succeeds on it, its lowercased host
/// equals `base_domain`, and its path is neither empty nor `/`. Anything that
/// `Url::parse` rejects is skipped silently.
///
/// # Arguments
/// - `json`: the JSON-LD value to inspect (object or array; other kinds yield nothing)
/// - `base_domain`: host that extracted URLs must match exactly
/// - `seen`: URLs already collected; updated with every URL pushed to `links`
/// - `links`: output list, appended to in discovery order
fn extract_urls_from_json_ld(
    json: &serde_json::Value,
    base_domain: &str,
    seen: &mut std::collections::HashSet<String>,
    links: &mut Vec<String>,
) {
    /// Schema.org types whose own `url` field is treated as an article link.
    fn is_article_type(name: &str) -> bool {
        matches!(name, "BlogPosting" | "NewsArticle" | "Article" | "WebPage")
    }

    // A JSON-LD value may be an array of nodes rather than one object (at the top
    // level of a script, or as the value of `mainEntity`). String-key lookups on an
    // array always miss, so without this branch such documents yield nothing.
    if let Some(nodes) = json.as_array() {
        for node in nodes {
            extract_urls_from_json_ld(node, base_domain, seen, links);
        }
        return;
    }

    // Helper to add a URL if it matches the domain and is not the site root.
    let mut try_add = |url_str: &str| {
        if let Ok(parsed) = Url::parse(url_str) {
            let domain = parsed.host_str().unwrap_or("").to_lowercase();
            let path = parsed.path();
            if domain == base_domain && !path.is_empty() && path != "/" {
                let url = parsed.to_string();
                // `insert` returns false for duplicates, keeping `links` unique.
                if seen.insert(url.clone()) {
                    links.push(url);
                }
            }
        }
    };

    // Direct URL on the object (BlogPosting, NewsArticle, etc.)
    if let Some(url) = json.get("url").and_then(|v| v.as_str()) {
        // `@type` may be a single name or an array of names; accept the node when
        // any of its declared types is an article type.
        let is_article = match json.get("@type") {
            Some(serde_json::Value::String(name)) => is_article_type(name),
            Some(serde_json::Value::Array(names)) => names
                .iter()
                .filter_map(|name| name.as_str())
                .any(is_article_type),
            _ => false,
        };
        if is_article {
            try_add(url);
        }
    }

    // ItemList → itemListElement[]
    if let Some(items) = json.get("itemListElement").and_then(|v| v.as_array()) {
        for item in items {
            // ListItem with url
            if let Some(url) = item.get("url").and_then(|v| v.as_str()) {
                try_add(url);
            }
            // ListItem with nested item.url
            if let Some(url) = item
                .get("item")
                .and_then(|inner| inner.get("url"))
                .and_then(|v| v.as_str())
            {
                try_add(url);
            }
        }
    }

    // @graph array
    if let Some(graph) = json.get("@graph").and_then(|v| v.as_array()) {
        for node in graph {
            extract_urls_from_json_ld(node, base_domain, seen, links);
        }
    }

    // Recurse into mainEntity (common wrapper in CollectionPage, WebPage)
    if let Some(main) = json.get("mainEntity") {
        extract_urls_from_json_ld(main, base_domain, seen, links);
    }
}
#[ cfg(test) ]
mod tests {
use super ::* ;
@ -212,4 +292,75 @@ mod tests {
let links = extract_links_from_html ( "" , & base_url ( "https://example.com" ) , "example.com" ) ;
assert! ( links . is_empty ( ) ) ;
}
/// JSON-LD `ItemList` wrapped in a `CollectionPage` through `mainEntity`.
///
/// Each `ListItem` repeats the same URL on the entry itself and on its nested
/// `item`. The fixture body contains no anchors, so the only link source in the
/// page is the JSON-LD script.
#[ test ]
fn extracts_urls_from_json_ld_item_list ( ) {
let html = r #" < html > < head >
< script type = "application/ld+json" >
{ "@type" :"CollectionPage" , "mainEntity" :{ "@type" :"ItemList" , "itemListElement" :[
{ "@type" :"ListItem" , "position" :1 , "url" :"https://example.com/news/article-1/" , "item" :{ "@type" :"BlogPosting" , "url" :"https://example.com/news/article-1/" } } ,
{ "@type" :"ListItem" , "position" :2 , "url" :"https://example.com/news/article-2/" , "item" :{ "@type" :"BlogPosting" , "url" :"https://example.com/news/article-2/" } }
] } }
< / script >
< / head > < body > < / body > < / html > " #;
let links = extract_links_from_html ( html , & base_url ( "https://example.com/news/" ) , "example.com" ) ;
// Lower bound only: the exact count is not pinned, just that both articles appear.
assert! ( links . len ( ) > = 2 , "Should extract at least 2 URLs from JSON-LD, got {}" , links . len ( ) ) ;
assert! ( links . iter ( ) . any ( | u | u . contains ( "article-1" ) ) ) ;
assert! ( links . iter ( ) . any ( | u | u . contains ( "article-2" ) ) ) ;
}
/// A standalone `BlogPosting` node with a `url` field yields exactly that one link.
#[ test ]
fn extracts_urls_from_json_ld_blog_posting ( ) {
let html = r #" < html > < head >
< script type = "application/ld+json" >
{ "@type" :"BlogPosting" , "url" :"https://example.com/post/my-article" , "headline" :"Test" }
< / script >
< / head > < body > < / body > < / html > " #;
let links = extract_links_from_html ( html , & base_url ( "https://example.com" ) , "example.com" ) ;
// The fixture has no anchors, so the single result must be the JSON-LD URL.
assert_eq! ( links . len ( ) , 1 ) ;
assert! ( links [ 0 ] . contains ( "my-article" ) ) ;
}
/// Pins the output order: URLs found in JSON-LD precede URLs found in `<a href>` tags.
///
/// The fixture carries one URL in an `ItemList` and a different one in an anchor.
#[ test ]
fn json_ld_urls_come_before_html_links ( ) {
let html = r #" < html > < head >
< script type = "application/ld+json" >
{ "@type" :"ItemList" , "itemListElement" :[
{ "@type" :"ListItem" , "url" :"https://example.com/jsonld-article/" }
] }
< / script >
< / head > < body >
< a href = "/html-article/" > HTML Article < / a >
< / body > < / html > " #;
let links = extract_links_from_html ( html , & base_url ( "https://example.com" ) , "example.com" ) ;
// Both sources contribute one link each; index 0 must be the JSON-LD one.
assert_eq! ( links . len ( ) , 2 ) ;
assert! ( links [ 0 ] . contains ( "jsonld-article" ) , "JSON-LD URLs should come first" ) ;
assert! ( links [ 1 ] . contains ( "html-article" ) , "HTML links should come second" ) ;
}
/// A page listed both in JSON-LD and in an anchor must be reported only once.
///
/// The JSON-LD entry uses the absolute URL while the anchor uses the relative
/// path `/same-article/` for the same page.
#[ test ]
fn json_ld_deduplicates_with_html_links ( ) {
let html = r #" < html > < head >
< script type = "application/ld+json" >
{ "@type" :"ItemList" , "itemListElement" :[
{ "@type" :"ListItem" , "url" :"https://example.com/same-article/" }
] }
< / script >
< / head > < body >
< a href = "/same-article/" > Same Article < / a >
< / body > < / html > " #;
let links = extract_links_from_html ( html , & base_url ( "https://example.com" ) , "example.com" ) ;
assert_eq! ( links . len ( ) , 1 , "Should deduplicate across JSON-LD and HTML" ) ;
}
/// A JSON-LD URL whose host differs from the base domain must not be returned.
#[ test ]
fn json_ld_filters_external_domains ( ) {
let html = r #" < html > < head >
< script type = "application/ld+json" >
{ "@type" :"BlogPosting" , "url" :"https://other-site.com/article" }
< / script >
< / head > < body > < / body > < / html > " #;
let links = extract_links_from_html ( html , & base_url ( "https://example.com" ) , "example.com" ) ;
// `other-site.com` does not match the `example.com` base domain passed above.
assert! ( links . is_empty ( ) , "Should filter external domain URLs from JSON-LD" ) ;
}
}