diff --git a/backend/src/services/feed_parser.rs b/backend/src/services/feed_parser.rs
new file mode 100644
index 0000000..dd94e4a
--- /dev/null
+++ b/backend/src/services/feed_parser.rs
@@ -0,0 +1,319 @@
+//! RSS/Atom feed parser service.
+//!
+//! Discovers and parses RSS/Atom feeds from source URLs.
+//! Used in Phase 1 of the generation pipeline to extract article links
+//! sorted by publication date (newest first), before falling back
+//! to the HTML-based source_scraper.
+
+use chrono::{DateTime, Utc};
+use url::Url;
+
+use crate::errors::AppError;
+
+/// A single entry extracted from an RSS/Atom feed.
+#[derive(Debug, Clone)]
+pub struct FeedEntry {
+    /// Article URL: the entry's link, or its id when that is an `http(s)` URL.
+    pub url: String,
+    /// Entry title; empty when the feed does not provide one.
+    pub title: String,
+    /// Publication timestamp, or the entry's `updated` timestamp when absent.
+    pub published_date: Option<DateTime<Utc>>,
+}
+
+/// Result of attempting to detect and parse a feed for a source.
+#[derive(Debug, Clone)]
+pub enum FeedResult {
+    /// Feed found and parsed successfully.
+    Found {
+        feed_url: String,
+        entries: Vec<FeedEntry>,
+    },
+    /// No feed discovered or feed invalid.
+    NotFound,
+}
+
+/// Minimum number of feed entries to consider the feed useful.
+/// Below this threshold, the pipeline falls back to HTML extraction.
+pub const MIN_FEED_ENTRIES: usize = 3;
+
+/// Number of days before a cached feed URL is re-verified.
+pub const REDISCOVERY_DAYS: i64 = 30;
+
+/// Parse an RSS/Atom feed URL and return entries sorted by date (newest first).
+///
+/// Uses the `feed-rs` crate which handles RSS 1.0, RSS 2.0, Atom, and JSON Feed.
+/// Entries without a published date are placed last, and at most `max_links`
+/// entries are returned.
+///
+/// The URL of an entry is its `rel="alternate"` link (or a link without `rel`),
+/// otherwise its first link, otherwise its `id` when that is an `http(s)` URL.
+/// Entries with none of these are skipped.
+///
+/// # Errors
+///
+/// Returns [`AppError::BadRequest`] when `feed_url` is not a valid URL, and
+/// [`AppError::Internal`] when the request fails, the body cannot be read, or
+/// the body cannot be parsed as a feed. A failed SSRF check or a non-success
+/// HTTP status is logged and yields `Ok` with an empty list instead.
+pub async fn parse_feed(
+    http_client: &reqwest::Client,
+    feed_url: &str,
+    max_links: usize,
+) -> Result<Vec<FeedEntry>, AppError> {
+    let parsed_url = Url::parse(feed_url)
+        .map_err(|e| AppError::BadRequest(format!("Invalid feed URL: {}", e)))?;
+
+    if let Err(e) = crate::services::scraper::check_ssrf(&parsed_url).await {
+        tracing::warn!(url = feed_url, error = %e, "Feed URL failed SSRF check");
+        return Ok(Vec::new());
+    }
+
+    let response = http_client
+        .get(feed_url)
+        .send()
+        .await
+        .map_err(|e| {
+            tracing::warn!(url = feed_url, error = %e, "Failed to fetch feed");
+            AppError::Internal(anyhow::anyhow!("Failed to fetch feed"))
+        })?;
+
+    if !response.status().is_success() {
+        tracing::warn!(url = feed_url, status = %response.status(), "Feed returned non-200");
+        return Ok(Vec::new());
+    }
+
+    // NOTE(review): the whole body is buffered with no size limit here; confirm
+    // that the shared client (or a proxy) bounds response size for untrusted feeds.
+    let body = response.bytes().await.map_err(|e| {
+        AppError::Internal(anyhow::anyhow!("Failed to read feed body: {}", e))
+    })?;
+
+    let feed = feed_rs::parser::parse(&body[..]).map_err(|e| {
+        tracing::warn!(url = feed_url, error = %e, "Failed to parse feed");
+        AppError::Internal(anyhow::anyhow!("Failed to parse feed: {}", e))
+    })?;
+
+    let mut entries: Vec<FeedEntry> = feed
+        .entries
+        .into_iter()
+        .filter_map(|entry| {
+            // Prefer the link that points at the article itself: Atom entries may
+            // list `replies`/`edit`/`self` links before the `alternate` one, so the
+            // first link is only used when no alternate (or rel-less) link exists.
+            // The id is a last resort, and only if it looks like a URL.
+            let url = entry
+                .links
+                .iter()
+                .find(|l| matches!(l.rel.as_deref(), None | Some("alternate")))
+                .or_else(|| entry.links.first())
+                .map(|l| l.href.clone())
+                .or_else(|| {
+                    if entry.id.starts_with("http://") || entry.id.starts_with("https://") {
+                        Some(entry.id.clone())
+                    } else {
+                        None
+                    }
+                })?;
+
+            let title = entry.title.map(|t| t.content).unwrap_or_default();
+            let published_date = entry.published.or(entry.updated);
+
+            Some(FeedEntry {
+                url,
+                title,
+                published_date,
+            })
+        })
+        .collect();
+
+    // Newest first, undated entries last. `Option` orders `None` below every
+    // `Some`, so a reversed comparison yields exactly that; the sort is stable,
+    // so entries with equal (or no) dates keep their order from the feed.
+    entries.sort_by(|a, b| b.published_date.cmp(&a.published_date));
+
+    entries.truncate(max_links);
+
+    Ok(entries)
+}
+
+#[cfg(test)]
+mod tests {
+    // NOTE(review): these tests fetch from a wiremock server on a loopback
+    // address, so they presumably rely on `check_ssrf` allowing that in test
+    // builds -- confirm.
+    use super::*;
+    use wiremock::matchers::method;
+    use wiremock::{Mock, MockServer, ResponseTemplate};
+
+    #[tokio::test]
+    async fn parse_feed_rss2() {
+        let server = MockServer::start().await;
+        let rss_body = r#"<?xml version="1.0" encoding="UTF-8"?>
+<rss version="2.0">
+  <channel>
+    <title>Test Blog</title>
+    <item>
+      <title>Article 1</title>
+      <link>https://example.com/article-1</link>
+      <pubDate>Fri, 03 Apr 2026 10:00:00 GMT</pubDate>
+    </item>
+    <item>
+      <title>Article 2</title>
+      <link>https://example.com/article-2</link>
+      <pubDate>Thu, 02 Apr 2026 10:00:00 GMT</pubDate>
+    </item>
+    <item>
+      <title>Article 3</title>
+      <link>https://example.com/article-3</link>
+      <pubDate>Wed, 01 Apr 2026 10:00:00 GMT</pubDate>
+    </item>
+  </channel>
+</rss>"#;
+
+        Mock::given(method("GET"))
+            .respond_with(ResponseTemplate::new(200).set_body_raw(rss_body, "application/rss+xml"))
+            .mount(&server)
+            .await;
+
+        let client = reqwest::Client::new();
+        let entries = parse_feed(&client, &server.uri(), 10).await.unwrap();
+
+        assert_eq!(entries.len(), 3);
+        assert_eq!(entries[0].title, "Article 1");
+        assert_eq!(entries[0].url, "https://example.com/article-1");
+        assert!(entries[0].published_date > entries[1].published_date);
+        assert!(entries[1].published_date > entries[2].published_date);
+    }
+
+    #[tokio::test]
+    async fn parse_feed_atom() {
+        let server = MockServer::start().await;
+        let atom_body = r#"<?xml version="1.0" encoding="UTF-8"?>
+<feed xmlns="http://www.w3.org/2005/Atom">
+  <title>Test Feed</title>
+  <entry>
+    <title>Atom Article</title>
+    <link href="https://example.com/atom-1"/>
+    <updated>2026-04-03T12:00:00Z</updated>
+  </entry>
+</feed>"#;
+
+        Mock::given(method("GET"))
+            .respond_with(ResponseTemplate::new(200).set_body_raw(atom_body, "application/atom+xml"))
+            .mount(&server)
+            .await;
+
+        let client = reqwest::Client::new();
+        let entries = parse_feed(&client, &server.uri(), 10).await.unwrap();
+
+        assert_eq!(entries.len(), 1);
+        assert_eq!(entries[0].title, "Atom Article");
+        assert_eq!(entries[0].url, "https://example.com/atom-1");
+        assert!(entries[0].published_date.is_some());
+    }
+
+    #[tokio::test]
+    async fn parse_feed_atom_prefers_alternate_link() {
+        let server = MockServer::start().await;
+        let atom_body = r#"<?xml version="1.0" encoding="UTF-8"?>
+<feed xmlns="http://www.w3.org/2005/Atom">
+  <title>Test Feed</title>
+  <entry>
+    <title>Atom Article</title>
+    <link rel="replies" href="https://example.com/atom-1/comments"/>
+    <link rel="alternate" href="https://example.com/atom-1"/>
+    <updated>2026-04-03T12:00:00Z</updated>
+  </entry>
+</feed>"#;
+
+        Mock::given(method("GET"))
+            .respond_with(ResponseTemplate::new(200).set_body_raw(atom_body, "application/atom+xml"))
+            .mount(&server)
+            .await;
+
+        let client = reqwest::Client::new();
+        let entries = parse_feed(&client, &server.uri(), 10).await.unwrap();
+
+        assert_eq!(entries.len(), 1);
+        assert_eq!(entries[0].url, "https://example.com/atom-1");
+    }
+
+    #[tokio::test]
+    async fn parse_feed_respects_max_links() {
+        let server = MockServer::start().await;
+        let rss_body = r#"<?xml version="1.0" encoding="UTF-8"?>
+<rss version="2.0">
+  <channel>
+    <title>Test</title>
+    <item><title>A1</title><link>https://example.com/1</link><pubDate>Fri, 03 Apr 2026 10:00:00 GMT</pubDate></item>
+    <item><title>A2</title><link>https://example.com/2</link><pubDate>Thu, 02 Apr 2026 10:00:00 GMT</pubDate></item>
+    <item><title>A3</title><link>https://example.com/3</link><pubDate>Wed, 01 Apr 2026 10:00:00 GMT</pubDate></item>
+  </channel>
+</rss>"#;
+
+        Mock::given(method("GET"))
+            .respond_with(ResponseTemplate::new(200).set_body_raw(rss_body, "application/rss+xml"))
+            .mount(&server)
+            .await;
+
+        let client = reqwest::Client::new();
+        let entries = parse_feed(&client, &server.uri(), 2).await.unwrap();
+
+        assert_eq!(entries.len(), 2);
+        assert_eq!(entries[0].url, "https://example.com/1"); // newest first
+    }
+
+    #[tokio::test]
+    async fn parse_feed_entries_without_dates_come_last() {
+        let server = MockServer::start().await;
+        let rss_body = r#"<?xml version="1.0" encoding="UTF-8"?>
+<rss version="2.0">
+  <channel>
+    <title>Test</title>
+    <item><title>No date</title><link>https://example.com/no-date</link></item>
+    <item><title>Has date</title><link>https://example.com/has-date</link><pubDate>Fri, 03 Apr 2026 10:00:00 GMT</pubDate></item>
+  </channel>
+</rss>"#;
+
+        Mock::given(method("GET"))
+            .respond_with(ResponseTemplate::new(200).set_body_raw(rss_body, "application/rss+xml"))
+            .mount(&server)
+            .await;
+
+        let client = reqwest::Client::new();
+        let entries = parse_feed(&client, &server.uri(), 10).await.unwrap();
+
+        assert_eq!(entries.len(), 2);
+        assert_eq!(entries[0].url, "https://example.com/has-date");
+        assert_eq!(entries[1].url, "https://example.com/no-date");
+    }
+
+    #[tokio::test]
+    async fn parse_feed_404_returns_empty() {
+        let server = MockServer::start().await;
+
+        Mock::given(method("GET"))
+            .respond_with(ResponseTemplate::new(404))
+            .mount(&server)
+            .await;
+
+        let client = reqwest::Client::new();
+        let entries = parse_feed(&client, &server.uri(), 10).await.unwrap();
+        assert!(entries.is_empty());
+    }
+
+    #[tokio::test]
+    async fn parse_feed_invalid_xml_returns_error() {
+        let server = MockServer::start().await;
+
+        Mock::given(method("GET"))
+            .respond_with(ResponseTemplate::new(200).set_body_string("not xml at all"))
+            .mount(&server)
+            .await;
+
+        let client = reqwest::Client::new();
+        let result = parse_feed(&client, &server.uri(), 10).await;
+        assert!(result.is_err());
+    }
+}
diff --git a/backend/src/services/mod.rs b/backend/src/services/mod.rs
index a0484e8..1956d4d 100644
--- a/backend/src/services/mod.rs
+++ b/backend/src/services/mod.rs
@@ -4,6 +4,7 @@ pub mod csv;
 pub mod email;
 pub mod encryption;
 pub mod export;
+pub mod feed_parser;
 pub mod job_store;
 pub mod llm;
 pub mod prompts;