|
|
|
|
@ -125,6 +125,80 @@ pub async fn parse_feed(
|
|
|
|
|
Ok(entries)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// MIME types whose presence in a response's `Content-Type` header marks the
/// requested URL as a feed document in its own right.
///
/// `discover_feed` lowercases the header value and does a substring match
/// against each entry, so parameters such as `; charset=utf-8` do not prevent
/// a match.
const FEED_CONTENT_TYPES: &[&str] = &[
    // Feed-specific types.
    "application/rss+xml",
    "application/atom+xml",
    // Generic XML types.
    "application/xml",
    "text/xml",
];
|
|
|
|
|
|
|
|
|
|
/// Discover an RSS/Atom feed URL from a source URL.
|
|
|
|
|
///
|
|
|
|
|
/// Two detection strategies:
|
|
|
|
|
/// 1. If the URL itself returns an RSS/Atom Content-Type, it is a feed directly.
|
|
|
|
|
/// 2. For any other response, attempt `<link rel="alternate">` HTML discovery,
|
|
|
|
|
/// looking for `type="application/rss+xml"` or `type="application/atom+xml"`.
|
|
|
|
|
///
|
|
|
|
|
/// Returns `Some(feed_url)` if a feed is found, `None` otherwise.
|
|
|
|
|
pub async fn discover_feed(
|
|
|
|
|
http_client: &reqwest::Client,
|
|
|
|
|
source_url: &str,
|
|
|
|
|
) -> Option<String> {
|
|
|
|
|
let parsed_url = Url::parse(source_url).ok()?;
|
|
|
|
|
|
|
|
|
|
if let Err(e) = crate::services::scraper::check_ssrf(&parsed_url).await {
|
|
|
|
|
tracing::warn!(url = source_url, error = %e, "Source URL failed SSRF check during feed discovery");
|
|
|
|
|
return None;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
let response = http_client
|
|
|
|
|
.get(source_url)
|
|
|
|
|
.send()
|
|
|
|
|
.await
|
|
|
|
|
.map_err(|e| {
|
|
|
|
|
tracing::warn!(url = source_url, error = %e, "Failed to fetch URL during feed discovery");
|
|
|
|
|
e
|
|
|
|
|
})
|
|
|
|
|
.ok()?;
|
|
|
|
|
|
|
|
|
|
if !response.status().is_success() {
|
|
|
|
|
return None;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Check Content-Type for direct feed
|
|
|
|
|
let content_type = response
|
|
|
|
|
.headers()
|
|
|
|
|
.get(reqwest::header::CONTENT_TYPE)
|
|
|
|
|
.and_then(|v| v.to_str().ok())
|
|
|
|
|
.unwrap_or("")
|
|
|
|
|
.to_lowercase();
|
|
|
|
|
|
|
|
|
|
if FEED_CONTENT_TYPES.iter().any(|ct| content_type.contains(ct)) {
|
|
|
|
|
return Some(source_url.to_string());
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// For anything else (HTML or unknown content-type), try HTML link discovery
|
|
|
|
|
let body = response.text().await.ok()?;
|
|
|
|
|
let document = scraper::Html::parse_document(&body);
|
|
|
|
|
|
|
|
|
|
let selector = scraper::Selector::parse(r#"link[rel="alternate"]"#)
|
|
|
|
|
.expect("hardcoded CSS selector is always valid");
|
|
|
|
|
|
|
|
|
|
for element in document.select(&selector) {
|
|
|
|
|
let link_type = element.value().attr("type").unwrap_or("");
|
|
|
|
|
if link_type == "application/rss+xml" || link_type == "application/atom+xml" {
|
|
|
|
|
if let Some(href) = element.value().attr("href") {
|
|
|
|
|
// Resolve relative URLs against the source URL
|
|
|
|
|
let resolved = parsed_url.join(href).ok()?;
|
|
|
|
|
return Some(resolved.to_string());
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
None
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#[cfg(test)]
|
|
|
|
|
mod tests {
|
|
|
|
|
use super::*;
|
|
|
|
|
@ -275,4 +349,105 @@ mod tests {
|
|
|
|
|
let result = parse_feed(&client, &server.uri(), 10).await;
|
|
|
|
|
assert!(result.is_err());
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#[tokio::test]
|
|
|
|
|
async fn discover_feed_from_link_rss() {
|
|
|
|
|
let server = MockServer::start().await;
|
|
|
|
|
let html = format!(
|
|
|
|
|
r#"<html><head>
|
|
|
|
|
<link rel="alternate" type="application/rss+xml" href="{}/feed.xml">
|
|
|
|
|
</head><body></body></html>"#,
|
|
|
|
|
server.uri()
|
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
Mock::given(method("GET"))
|
|
|
|
|
.respond_with(ResponseTemplate::new(200).set_body_string(html))
|
|
|
|
|
.mount(&server)
|
|
|
|
|
.await;
|
|
|
|
|
|
|
|
|
|
let client = reqwest::Client::new();
|
|
|
|
|
let result = discover_feed(&client, &server.uri()).await;
|
|
|
|
|
|
|
|
|
|
assert!(result.is_some());
|
|
|
|
|
assert!(result.unwrap().contains("/feed.xml"));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#[tokio::test]
|
|
|
|
|
async fn discover_feed_from_link_atom() {
|
|
|
|
|
let server = MockServer::start().await;
|
|
|
|
|
let html = format!(
|
|
|
|
|
r#"<html><head>
|
|
|
|
|
<link rel="alternate" type="application/atom+xml" href="{}/atom.xml">
|
|
|
|
|
</head><body></body></html>"#,
|
|
|
|
|
server.uri()
|
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
Mock::given(method("GET"))
|
|
|
|
|
.respond_with(ResponseTemplate::new(200).set_body_string(html))
|
|
|
|
|
.mount(&server)
|
|
|
|
|
.await;
|
|
|
|
|
|
|
|
|
|
let client = reqwest::Client::new();
|
|
|
|
|
let result = discover_feed(&client, &server.uri()).await;
|
|
|
|
|
|
|
|
|
|
assert!(result.is_some());
|
|
|
|
|
assert!(result.unwrap().contains("/atom.xml"));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#[tokio::test]
|
|
|
|
|
async fn discover_feed_direct_rss_url() {
|
|
|
|
|
let server = MockServer::start().await;
|
|
|
|
|
let rss_body = r#"<?xml version="1.0"?><rss version="2.0"><channel><title>T</title></channel></rss>"#;
|
|
|
|
|
|
|
|
|
|
Mock::given(method("GET"))
|
|
|
|
|
.respond_with(
|
|
|
|
|
ResponseTemplate::new(200)
|
|
|
|
|
.set_body_raw(rss_body, "application/rss+xml")
|
|
|
|
|
)
|
|
|
|
|
.mount(&server)
|
|
|
|
|
.await;
|
|
|
|
|
|
|
|
|
|
let client = reqwest::Client::new();
|
|
|
|
|
let result = discover_feed(&client, &server.uri()).await;
|
|
|
|
|
|
|
|
|
|
assert!(result.is_some());
|
|
|
|
|
assert_eq!(result.unwrap(), server.uri());
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#[tokio::test]
|
|
|
|
|
async fn discover_feed_no_feed_found() {
|
|
|
|
|
let server = MockServer::start().await;
|
|
|
|
|
let html = "<html><head><title>No feed</title></head><body></body></html>";
|
|
|
|
|
|
|
|
|
|
Mock::given(method("GET"))
|
|
|
|
|
.respond_with(ResponseTemplate::new(200).set_body_string(html))
|
|
|
|
|
.mount(&server)
|
|
|
|
|
.await;
|
|
|
|
|
|
|
|
|
|
let client = reqwest::Client::new();
|
|
|
|
|
let result = discover_feed(&client, &server.uri()).await;
|
|
|
|
|
|
|
|
|
|
assert!(result.is_none());
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#[tokio::test]
|
|
|
|
|
async fn discover_feed_resolves_relative_href() {
|
|
|
|
|
let server = MockServer::start().await;
|
|
|
|
|
let html = r#"<html><head>
|
|
|
|
|
<link rel="alternate" type="application/rss+xml" href="/feed.xml">
|
|
|
|
|
</head><body></body></html>"#;
|
|
|
|
|
|
|
|
|
|
Mock::given(method("GET"))
|
|
|
|
|
.respond_with(ResponseTemplate::new(200).set_body_string(html))
|
|
|
|
|
.mount(&server)
|
|
|
|
|
.await;
|
|
|
|
|
|
|
|
|
|
let client = reqwest::Client::new();
|
|
|
|
|
let result = discover_feed(&client, &server.uri()).await;
|
|
|
|
|
|
|
|
|
|
assert!(result.is_some());
|
|
|
|
|
let feed_url = result.unwrap();
|
|
|
|
|
assert!(feed_url.starts_with(&server.uri()));
|
|
|
|
|
assert!(feed_url.ends_with("/feed.xml"));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|