feat: add discover_feed function for RSS/Atom auto-discovery

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
master
oabrivard 2 months ago
parent fcdc7ca4a6
commit 96b39814bb

@ -125,6 +125,80 @@ pub async fn parse_feed(
Ok(entries)
}
/// Media types that, when found in a response's `Content-Type` header, make
/// `discover_feed` treat the requested URL as a feed in its own right.
const FEED_CONTENT_TYPES: &[&str] = &[
    // Dedicated feed media types.
    "application/rss+xml",
    "application/atom+xml",
    // Generic XML media types, also accepted as feeds.
    "application/xml",
    "text/xml",
];
/// Discover an RSS/Atom feed URL from a source URL.
///
/// The source URL is SSRF-checked, fetched with `http_client`, and then
/// inspected with two strategies:
/// 1. If the response's `Content-Type` media type (parameters such as
///    `charset` are ignored) is one of `FEED_CONTENT_TYPES`, the source URL
///    is a feed directly and `source_url` is returned unchanged.
/// 2. Otherwise the body is parsed as HTML and the first
///    `<link rel="alternate">` whose `type` is `application/rss+xml` or
///    `application/atom+xml` (compared case-insensitively) and whose `href`
///    resolves to a valid URL is returned. Relative `href`s are resolved
///    against the URL that actually served the page, i.e. after redirects.
///
/// Returns `Some(feed_url)` if a feed is found. Returns `None` when the URL
/// does not parse, fails the SSRF check, cannot be fetched, answers with a
/// non-success status, has an unreadable body, or advertises no usable feed.
pub async fn discover_feed(
    http_client: &reqwest::Client,
    source_url: &str,
) -> Option<String> {
    let parsed_url = Url::parse(source_url).ok()?;

    // NOTE(review): only the initial URL is checked here; whether redirect
    // targets are also checked depends on how `http_client` is configured —
    // confirm against the client setup.
    if let Err(e) = crate::services::scraper::check_ssrf(&parsed_url).await {
        tracing::warn!(url = source_url, error = %e, "Source URL failed SSRF check during feed discovery");
        return None;
    }

    let response = match http_client.get(source_url).send().await {
        Ok(response) => response,
        Err(e) => {
            tracing::warn!(url = source_url, error = %e, "Failed to fetch URL during feed discovery");
            return None;
        }
    };

    if !response.status().is_success() {
        return None;
    }

    // Compare only the media type itself ("type/subtype"), not the whole
    // header value: a substring match would also accept unrelated types such
    // as `application/xml-dtd`.
    let media_type = response
        .headers()
        .get(reqwest::header::CONTENT_TYPE)
        .and_then(|value| value.to_str().ok())
        .and_then(|value| value.split(';').next())
        .map(|value| value.trim().to_ascii_lowercase())
        .unwrap_or_default();

    if FEED_CONTENT_TYPES.iter().any(|ct| *ct == media_type) {
        return Some(source_url.to_string());
    }

    // Remember the final URL before the response is consumed: relative links
    // in the page are relative to the document that was actually served,
    // which differs from `source_url` when the request was redirected.
    let base_url = response.url().clone();

    // For anything else (HTML or unknown content-type), try HTML link discovery.
    let body = response.text().await.ok()?;
    let document = scraper::Html::parse_document(&body);

    // `~=` matches `alternate` as one whitespace-separated token, so
    // multi-valued `rel` attributes are recognised as well.
    let selector = scraper::Selector::parse(r#"link[rel~="alternate"]"#)
        .expect("hardcoded CSS selector is always valid");

    document
        .select(&selector)
        .filter(|element| {
            let link_type = element.value().attr("type").unwrap_or("").trim();
            link_type.eq_ignore_ascii_case("application/rss+xml")
                || link_type.eq_ignore_ascii_case("application/atom+xml")
        })
        .filter_map(|element| element.value().attr("href"))
        // A malformed `href` only disqualifies that one link; keep looking.
        .find_map(|href| base_url.join(href).ok())
        .map(|resolved| resolved.to_string())
}
#[cfg(test)]
mod tests {
use super::*;
@ -275,4 +349,105 @@ mod tests {
let result = parse_feed(&client, &server.uri(), 10).await;
assert!(result.is_err());
}
/// An HTML page advertising an RSS feed through `<link rel="alternate">`
/// with an absolute `href` yields exactly that feed URL.
#[tokio::test]
async fn discover_feed_from_link_rss() {
    let server = MockServer::start().await;
    let expected_feed = format!("{}/feed.xml", server.uri());
    let html = format!(
        r#"<html><head>
<link rel="alternate" type="application/rss+xml" href="{}/feed.xml">
</head><body></body></html>"#,
        server.uri()
    );
    Mock::given(method("GET"))
        .respond_with(ResponseTemplate::new(200).set_body_string(html))
        .mount(&server)
        .await;

    let client = reqwest::Client::new();
    let result = discover_feed(&client, &server.uri()).await;

    // Pin the exact URL; a substring check would also accept a mangled result.
    assert_eq!(result, Some(expected_feed));
}
/// An HTML page advertising an Atom feed through `<link rel="alternate">`
/// with an absolute `href` yields exactly that feed URL.
#[tokio::test]
async fn discover_feed_from_link_atom() {
    let server = MockServer::start().await;
    let expected_feed = format!("{}/atom.xml", server.uri());
    let html = format!(
        r#"<html><head>
<link rel="alternate" type="application/atom+xml" href="{}/atom.xml">
</head><body></body></html>"#,
        server.uri()
    );
    Mock::given(method("GET"))
        .respond_with(ResponseTemplate::new(200).set_body_string(html))
        .mount(&server)
        .await;

    let client = reqwest::Client::new();
    let result = discover_feed(&client, &server.uri()).await;

    // Pin the exact URL; a substring check would also accept a mangled result.
    assert_eq!(result, Some(expected_feed));
}
/// When the source URL itself is served as `application/rss+xml`, the source
/// URL is reported as the feed.
#[tokio::test]
async fn discover_feed_direct_rss_url() {
    const RSS_BODY: &str = r#"<?xml version="1.0"?><rss version="2.0"><channel><title>T</title></channel></rss>"#;

    let mock_server = MockServer::start().await;
    let feed_response = ResponseTemplate::new(200).set_body_raw(RSS_BODY, "application/rss+xml");
    Mock::given(method("GET"))
        .respond_with(feed_response)
        .mount(&mock_server)
        .await;

    let source = mock_server.uri();
    let discovered = discover_feed(&reqwest::Client::new(), &source).await;

    assert_eq!(discovered.as_deref(), Some(source.as_str()));
}
/// A page with no `<link rel="alternate">` feed element yields no feed.
#[tokio::test]
async fn discover_feed_no_feed_found() {
    let mock_server = MockServer::start().await;
    let plain_page = ResponseTemplate::new(200)
        .set_body_string("<html><head><title>No feed</title></head><body></body></html>");
    Mock::given(method("GET"))
        .respond_with(plain_page)
        .mount(&mock_server)
        .await;

    let http = reqwest::Client::new();
    let discovered = discover_feed(&http, &mock_server.uri()).await;

    assert_eq!(discovered, None);
}
/// A root-relative `href` is resolved against the page's own origin.
#[tokio::test]
async fn discover_feed_resolves_relative_href() {
    let server = MockServer::start().await;
    let html = r#"<html><head>
<link rel="alternate" type="application/rss+xml" href="/feed.xml">
</head><body></body></html>"#;
    Mock::given(method("GET"))
        .respond_with(ResponseTemplate::new(200).set_body_string(html))
        .mount(&server)
        .await;

    let client = reqwest::Client::new();
    let result = discover_feed(&client, &server.uri()).await;

    // Separate prefix/suffix checks would accept a wrongly resolved path in
    // between; assert the full resolved URL instead.
    assert_eq!(result, Some(format!("{}/feed.xml", server.uri())));
}
}

Loading…
Cancel
Save