diff --git a/backend/src/services/feed_parser.rs b/backend/src/services/feed_parser.rs
index dd94e4a..0ea1fb1 100644
--- a/backend/src/services/feed_parser.rs
+++ b/backend/src/services/feed_parser.rs
@@ -125,6 +125,110 @@ pub async fn parse_feed(
     Ok(entries)
 }
 
+/// Media types that mark an HTTP response as a direct feed URL.
+///
+/// The generic XML types are listed because feeds are often served as plain
+/// XML rather than with a dedicated RSS/Atom media type.
+const FEED_CONTENT_TYPES: &[&str] = &[
+    "application/rss+xml",
+    "application/atom+xml",
+    "application/xml",
+    "text/xml",
+];
+
+/// Media types accepted in the `type` attribute of a `<link rel="alternate">`
+/// element during HTML feed discovery.
+const FEED_LINK_TYPES: &[&str] = &["application/rss+xml", "application/atom+xml"];
+
+/// Discover an RSS/Atom feed URL from a source URL.
+///
+/// Two detection strategies:
+/// 1. If the URL itself returns one of the `FEED_CONTENT_TYPES` as its
+///    Content-Type, it is a feed directly.
+/// 2. For any other response, attempt `<link rel="alternate">` HTML discovery,
+///    looking for `type="application/rss+xml"` or `type="application/atom+xml"`.
+///    Relative `href` values are resolved against `source_url`.
+///
+/// Returns `Some(feed_url)` if a feed is found, `None` otherwise. An unparsable
+/// URL, a failed SSRF check, a request error, a non-success status and an
+/// unreadable body all yield `None`.
+pub async fn discover_feed(
+    http_client: &reqwest::Client,
+    source_url: &str,
+) -> Option<String> {
+    let parsed_url = Url::parse(source_url).ok()?;
+
+    if let Err(e) = crate::services::scraper::check_ssrf(&parsed_url).await {
+        tracing::warn!(url = source_url, error = %e, "Source URL failed SSRF check during feed discovery");
+        return None;
+    }
+
+    let response = match http_client.get(source_url).send().await {
+        Ok(response) => response,
+        Err(e) => {
+            tracing::warn!(url = source_url, error = %e, "Failed to fetch URL during feed discovery");
+            return None;
+        }
+    };
+
+    if !response.status().is_success() {
+        return None;
+    }
+
+    // A feed media type in Content-Type means the source URL is itself the feed.
+    let is_direct_feed = response
+        .headers()
+        .get(reqwest::header::CONTENT_TYPE)
+        .and_then(|v| v.to_str().ok())
+        .map_or(false, |value| is_media_type_in(value, FEED_CONTENT_TYPES));
+    if is_direct_feed {
+        return Some(source_url.to_string());
+    }
+
+    // For anything else (HTML or unknown content-type), try HTML link discovery.
+    let body = response.text().await.ok()?;
+    find_feed_link(&body, &parsed_url)
+}
+
+/// Returns `true` when `value` (a `Content-Type` header or a `type` attribute)
+/// names one of the `allowed` media types.
+///
+/// Parameters such as `; charset=utf-8` are ignored and the comparison is
+/// ASCII case-insensitive.
+fn is_media_type_in(value: &str, allowed: &[&str]) -> bool {
+    let essence = value.split(';').next().unwrap_or("").trim();
+    allowed.iter().any(|ct| essence.eq_ignore_ascii_case(ct))
+}
+
+/// Returns the first feed URL advertised by a `<link rel="alternate">` element
+/// in `html`, resolved against `base_url`.
+///
+/// Links whose `href` is empty or cannot be resolved are skipped so that a
+/// single malformed element does not hide a valid one later in the document.
+fn find_feed_link(html: &str, base_url: &Url) -> Option<String> {
+    let document = scraper::Html::parse_document(html);
+    let selector = scraper::Selector::parse("link[rel][type][href]")
+        .expect("hardcoded CSS selector is always valid");
+
+    document.select(&selector).find_map(|element| {
+        let link = element.value();
+        // `rel` is a space-separated, case-insensitive token list (e.g. "alternate feed").
+        let is_alternate = link
+            .attr("rel")?
+            .split_ascii_whitespace()
+            .any(|token| token.eq_ignore_ascii_case("alternate"));
+        if !is_alternate || !is_media_type_in(link.attr("type")?, FEED_LINK_TYPES) {
+            return None;
+        }
+        let href = link.attr("href")?.trim();
+        if href.is_empty() {
+            return None;
+        }
+        // `None` here only skips this element; `find_map` moves on to the next link.
+        base_url.join(href).ok().map(|resolved| resolved.to_string())
+    })
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -275,4 +379,124 @@ mod tests {
         let result = parse_feed(&client, &server.uri(), 10).await;
         assert!(result.is_err());
     }
+
+    #[tokio::test]
+    async fn discover_feed_from_link_rss() {
+        let server = MockServer::start().await;
+        let html = format!(
+            r#"<html><head>
+            <link rel="alternate" type="application/rss+xml" href="{}/feed.xml">
+            </head></html>"#,
+            server.uri()
+        );
+
+        Mock::given(method("GET"))
+            .respond_with(ResponseTemplate::new(200).set_body_string(html))
+            .mount(&server)
+            .await;
+
+        let client = reqwest::Client::new();
+        let result = discover_feed(&client, &server.uri()).await;
+
+        assert!(result.is_some());
+        assert!(result.unwrap().contains("/feed.xml"));
+    }
+
+    #[tokio::test]
+    async fn discover_feed_from_link_atom() {
+        let server = MockServer::start().await;
+        let html = format!(
+            r#"<html><head>
+            <link rel="alternate" type="application/atom+xml" href="{}/atom.xml">
+            </head></html>"#,
+            server.uri()
+        );
+
+        Mock::given(method("GET"))
+            .respond_with(ResponseTemplate::new(200).set_body_string(html))
+            .mount(&server)
+            .await;
+
+        let client = reqwest::Client::new();
+        let result = discover_feed(&client, &server.uri()).await;
+
+        assert!(result.is_some());
+        assert!(result.unwrap().contains("/atom.xml"));
+    }
+
+    #[tokio::test]
+    async fn discover_feed_direct_rss_url() {
+        let server = MockServer::start().await;
+        let rss_body = r#"<rss version="2.0"><channel><title>T</title></channel></rss>"#;
+
+        Mock::given(method("GET"))
+            .respond_with(
+                ResponseTemplate::new(200)
+                    .set_body_raw(rss_body, "application/rss+xml")
+            )
+            .mount(&server)
+            .await;
+
+        let client = reqwest::Client::new();
+        let result = discover_feed(&client, &server.uri()).await;
+
+        assert!(result.is_some());
+        assert_eq!(result.unwrap(), server.uri());
+    }
+
+    #[tokio::test]
+    async fn discover_feed_no_feed_found() {
+        let server = MockServer::start().await;
+        let html = "<html><head><title>No feed</title></head></html>";
+
+        Mock::given(method("GET"))
+            .respond_with(ResponseTemplate::new(200).set_body_string(html))
+            .mount(&server)
+            .await;
+
+        let client = reqwest::Client::new();
+        let result = discover_feed(&client, &server.uri()).await;
+
+        assert!(result.is_none());
+    }
+
+    #[tokio::test]
+    async fn discover_feed_resolves_relative_href() {
+        let server = MockServer::start().await;
+        let html = r#"<html><head>
+            <link rel="alternate" type="application/rss+xml" href="/feed.xml">
+            </head></html>"#;
+
+        Mock::given(method("GET"))
+            .respond_with(ResponseTemplate::new(200).set_body_string(html))
+            .mount(&server)
+            .await;
+
+        let client = reqwest::Client::new();
+        let result = discover_feed(&client, &server.uri()).await;
+
+        assert!(result.is_some());
+        let feed_url = result.unwrap();
+        assert!(feed_url.starts_with(&server.uri()));
+        assert!(feed_url.ends_with("/feed.xml"));
+    }
+
+    #[tokio::test]
+    async fn discover_feed_skips_unresolvable_href() {
+        let server = MockServer::start().await;
+        let html = r#"<html><head>
+            <link rel="alternate" type="application/rss+xml" href="http://[broken">
+            <link rel="Alternate feed" type="Application/Atom+XML; charset=utf-8" href="/atom.xml">
+            </head></html>"#;
+
+        Mock::given(method("GET"))
+            .respond_with(ResponseTemplate::new(200).set_body_string(html))
+            .mount(&server)
+            .await;
+
+        let client = reqwest::Client::new();
+        let result = discover_feed(&client, &server.uri()).await;
+
+        assert_eq!(result, Some(format!("{}/atom.xml", server.uri())));
+    }
 }