diff --git a/backend/src/services/feed_parser.rs b/backend/src/services/feed_parser.rs
index dd94e4a..0ea1fb1 100644
--- a/backend/src/services/feed_parser.rs
+++ b/backend/src/services/feed_parser.rs
@@ -125,6 +125,80 @@ pub async fn parse_feed(
Ok(entries)
}
+/// `Content-Type` media types that mark a response as a feed served directly.
+const FEED_CONTENT_TYPES: &[&str] = &[
+    "application/atom+xml", // Atom
+    "application/rss+xml",  // RSS
+    "application/xml",      // generic XML
+    "text/xml",             // generic XML
+];
+
+/// Discover an RSS/Atom feed URL from a source URL.
+///
+/// Two detection strategies:
+/// 1. If the response `Content-Type` is one of `FEED_CONTENT_TYPES`, the URL is a feed directly.
+/// 2. For any other response, attempt `<link rel="alternate">` HTML discovery,
+///    looking for `type="application/rss+xml"` or `type="application/atom+xml"`.
+///
+/// Relative `href`s are resolved against the final (post-redirect) response URL; links whose
+/// `href` does not resolve to an http(s) URL are skipped.
+///
+/// Returns `Some(feed_url)` if a feed is found, `None` otherwise (also on any URL, SSRF or fetch failure).
+pub async fn discover_feed(http_client: &reqwest::Client, source_url: &str) -> Option<String> {
+    let parsed_url = Url::parse(source_url).ok()?;
+
+    if let Err(e) = crate::services::scraper::check_ssrf(&parsed_url).await {
+        tracing::warn!(url = source_url, error = %e, "Source URL failed SSRF check during feed discovery");
+        return None;
+    }
+
+    let response = match http_client.get(source_url).send().await {
+        Ok(response) => response,
+        Err(e) => {
+            tracing::warn!(url = source_url, error = %e, "Failed to fetch URL during feed discovery");
+            return None;
+        }
+    };
+
+    if !response.status().is_success() {
+        return None;
+    }
+
+    // Compare only the media type itself, ignoring parameters such as `; charset=utf-8`.
+    let is_direct_feed = response
+        .headers()
+        .get(reqwest::header::CONTENT_TYPE)
+        .and_then(|v| v.to_str().ok())
+        .and_then(|v| v.split(';').next())
+        .is_some_and(|ty| FEED_CONTENT_TYPES.iter().any(|ct| ty.trim().eq_ignore_ascii_case(ct)));
+    if is_direct_feed {
+        return Some(source_url.to_string());
+    }
+
+    // For anything else (HTML or unknown content-type), try HTML link discovery. Relative hrefs
+    // are resolved against the post-redirect URL, captured before `text()` consumes `response`.
+    let base_url = response.url().clone();
+    let body = response.text().await.ok()?;
+    let document = scraper::Html::parse_document(&body);
+    let selector = scraper::Selector::parse("link[href]")
+        .expect("hardcoded CSS selector is always valid");
+
+    document.select(&selector).find_map(|element| {
+        let link = element.value();
+        // `rel` is a whitespace-separated token list, e.g. `rel="alternate feed"`.
+        let mut rel_tokens = link.attr("rel")?.split_ascii_whitespace();
+        let link_type = link.attr("type")?.trim();
+        let is_feed_link = rel_tokens.any(|token| token.eq_ignore_ascii_case("alternate"))
+            && (link_type.eq_ignore_ascii_case("application/rss+xml")
+                || link_type.eq_ignore_ascii_case("application/atom+xml"));
+        if !is_feed_link {
+            return None;
+        }
+        let resolved = base_url.join(link.attr("href")?).ok()?;
+        matches!(resolved.scheme(), "http" | "https").then(|| resolved.to_string())
+    })
+}
+
#[cfg(test)]
mod tests {
use super::*;
@@ -275,4 +349,105 @@ mod tests {
let result = parse_feed(&client, &server.uri(), 10).await;
assert!(result.is_err());
}
+
+    #[tokio::test]
+    async fn discover_feed_from_link_rss() {
+        let server = MockServer::start().await;
+        let html = format!(
+            r#"<html><head>
+            <link rel="alternate" type="application/rss+xml" href="{}/feed.xml">
+            </head><body></body></html>"#,
+            server.uri()
+        );
+
+        Mock::given(method("GET"))
+            .respond_with(ResponseTemplate::new(200).set_body_string(html))
+            .mount(&server)
+            .await;
+
+        let client = reqwest::Client::new();
+        let result = discover_feed(&client, &server.uri()).await;
+
+        // The page advertises an absolute RSS href; it must come back unchanged.
+        assert_eq!(result, Some(format!("{}/feed.xml", server.uri())));
+    }
+
+    #[tokio::test]
+    async fn discover_feed_from_link_atom() {
+        let server = MockServer::start().await;
+        let html = format!(
+            r#"<html><head>
+            <link rel="alternate" type="application/atom+xml" href="{}/atom.xml">
+            </head><body></body></html>"#,
+            server.uri()
+        );
+
+        Mock::given(method("GET"))
+            .respond_with(ResponseTemplate::new(200).set_body_string(html))
+            .mount(&server)
+            .await;
+
+        let client = reqwest::Client::new();
+        let result = discover_feed(&client, &server.uri()).await;
+
+        // The page advertises an absolute Atom href; it must come back unchanged.
+        assert_eq!(result, Some(format!("{}/atom.xml", server.uri())));
+    }
+
+    #[tokio::test]
+    async fn discover_feed_direct_rss_url() {
+        let server = MockServer::start().await;
+        let rss_body = r#"<rss version="2.0"><channel><title>T</title></channel></rss>"#;
+
+        Mock::given(method("GET"))
+            .respond_with(
+                ResponseTemplate::new(200)
+                    .set_body_raw(rss_body, "application/rss+xml"),
+            )
+            .mount(&server)
+            .await;
+
+        let client = reqwest::Client::new();
+        let result = discover_feed(&client, &server.uri()).await;
+
+        // A feed Content-Type means the source URL itself is reported, verbatim.
+        assert_eq!(result, Some(server.uri()));
+    }
+
+    #[tokio::test]
+    async fn discover_feed_no_feed_found() {
+        let server = MockServer::start().await;
+        let html = "<html><head></head><body>No feed</body></html>";
+
+        Mock::given(method("GET"))
+            .respond_with(ResponseTemplate::new(200).set_body_string(html))
+            .mount(&server)
+            .await;
+
+        let client = reqwest::Client::new();
+        let result = discover_feed(&client, &server.uri()).await;
+
+        assert_eq!(result, None);
+    }
+
+    #[tokio::test]
+    async fn discover_feed_resolves_relative_href() {
+        let server = MockServer::start().await;
+        let html = r#"<html><head>
+            <link rel="alternate" type="application/rss+xml" href="/feed.xml">
+            </head><body></body></html>"#;
+
+        Mock::given(method("GET"))
+            .respond_with(ResponseTemplate::new(200).set_body_string(html))
+            .mount(&server)
+            .await;
+
+        let client = reqwest::Client::new();
+        let result = discover_feed(&client, &server.uri()).await;
+
+        // A root-relative href is resolved against the origin the page was served from,
+        // so the result is an absolute URL on the mock server.
+        let expected = format!("{}/feed.xml", server.uri());
+        assert_eq!(result, Some(expected));
+    }
}