feat: add feed_parser service with parse_feed function and tests

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
master
oabrivard 2 months ago
parent cd5f5434b2
commit fcdc7ca4a6

@@ -0,0 +1,278 @@
//! RSS/Atom feed parser service.
//!
//! Discovers and parses RSS/Atom feeds from source URLs.
//! Used in Phase 1 of the generation pipeline to extract article links
//! sorted by publication date (newest first), before falling back
//! to the HTML-based source_scraper.
use chrono::{DateTime, Utc};
use url::Url;
use crate::errors::AppError;
/// One article reference pulled out of an RSS/Atom feed.
#[derive(Clone, Debug)]
pub struct FeedEntry {
    /// Article URL, taken from the entry's links or, failing that, from the
    /// entry id when the id is an `http://` / `https://` URL.
    pub url: String,
    /// Entry title; an empty string when the feed provides none.
    pub title: String,
    /// Publication timestamp, falling back to the entry's "updated" timestamp.
    /// `None` when the feed supplies neither.
    pub published_date: Option<DateTime<Utc>>,
}
/// Result of attempting to detect and parse a feed for a source.
///
/// Derives `Debug` and `Clone` so that values can be logged and copied,
/// mirroring the derives on [`FeedEntry`].
#[derive(Debug, Clone)]
pub enum FeedResult {
    /// Feed found and parsed successfully.
    Found {
        /// URL of the feed that was parsed.
        feed_url: String,
        /// Entries extracted from that feed.
        entries: Vec<FeedEntry>,
    },
    /// No feed discovered or feed invalid.
    NotFound,
}
/// Minimum number of feed entries to consider the feed useful.
///
/// Below this threshold, the pipeline falls back to HTML extraction.
/// NOTE(review): the comparison itself is not performed in this module's
/// `parse_feed`; it is presumably applied by the caller — confirm.
pub const MIN_FEED_ENTRIES: usize = 3;
/// Number of days before a cached feed URL is re-verified.
///
/// NOTE(review): the caching / re-verification logic is not visible in this
/// module; this constant is presumably consumed elsewhere — confirm.
pub const REDISCOVERY_DAYS: i64 = 30;
/// Fetch an RSS/Atom feed and return its entries sorted by date (newest first).
///
/// Parsing is delegated to the `feed-rs` crate, which handles RSS 1.0, RSS 2.0,
/// Atom, and JSON Feed.
///
/// The article URL of each entry is chosen in this order:
/// 1. the first link whose `rel` is `alternate` or absent,
/// 2. otherwise the first link of any kind,
/// 3. otherwise the entry id, if it starts with `http://` or `https://`.
///
/// Entries for which no URL can be determined are dropped. Entries without a
/// published (or updated) date are placed last. At most `max_links` entries
/// are returned.
///
/// # Arguments
/// * `http_client` - client used to perform the GET request.
/// * `feed_url` - absolute URL of the feed.
/// * `max_links` - upper bound on the number of returned entries.
///
/// # Errors
/// * [`AppError::BadRequest`] when `feed_url` is not a valid URL.
/// * [`AppError::Internal`] when the request fails, the body cannot be read,
///   or the payload cannot be parsed as a feed.
///
/// A failed SSRF check or a non-success HTTP status is *not* an error: it is
/// logged and an empty vector is returned.
pub async fn parse_feed(
    http_client: &reqwest::Client,
    feed_url: &str,
    max_links: usize,
) -> Result<Vec<FeedEntry>, AppError> {
    let parsed_url = Url::parse(feed_url)
        .map_err(|e| AppError::BadRequest(format!("Invalid feed URL: {}", e)))?;

    if let Err(e) = crate::services::scraper::check_ssrf(&parsed_url).await {
        tracing::warn!(url = feed_url, error = %e, "Feed URL failed SSRF check");
        return Ok(Vec::new());
    }

    // Request exactly the URL that passed the SSRF check above.
    let response = http_client
        .get(parsed_url.as_str())
        .send()
        .await
        .map_err(|e| {
            tracing::warn!(url = feed_url, error = %e, "Failed to fetch feed");
            AppError::Internal(anyhow::anyhow!("Failed to fetch feed"))
        })?;

    if !response.status().is_success() {
        tracing::warn!(url = feed_url, status = %response.status(), "Feed returned non-200");
        return Ok(Vec::new());
    }

    let body = response.bytes().await.map_err(|e| {
        AppError::Internal(anyhow::anyhow!("Failed to read feed body: {}", e))
    })?;

    let feed = feed_rs::parser::parse(&body[..]).map_err(|e| {
        tracing::warn!(url = feed_url, error = %e, "Failed to parse feed");
        AppError::Internal(anyhow::anyhow!("Failed to parse feed: {}", e))
    })?;

    let mut entries: Vec<FeedEntry> = feed
        .entries
        .into_iter()
        .filter_map(|entry| {
            // An Atom entry may list several links (self, replies, edit,
            // enclosure, ...) in any order; the article itself is the
            // `alternate` one. A link without `rel` (e.g. a plain RSS
            // `<link>`) is treated as alternate too. If nothing matches, keep
            // the previous behaviour and use the first link, then the id.
            let url = entry
                .links
                .iter()
                .find(|link| link.rel.as_deref().map_or(true, |rel| rel == "alternate"))
                .or_else(|| entry.links.first())
                .map(|link| link.href.clone())
                .or_else(|| {
                    (entry.id.starts_with("http://") || entry.id.starts_with("https://"))
                        .then(|| entry.id.clone())
                })?;

            let title = entry.title.map(|t| t.content).unwrap_or_default();
            let published_date = entry.published.or(entry.updated);

            Some(FeedEntry {
                url,
                title,
                published_date,
            })
        })
        .collect();

    // `Option` orders `None` before `Some`, so comparing in reverse yields
    // newest-first with undated entries last. The sort is stable, so entries
    // with equal (or missing) dates keep their feed order.
    entries.sort_by(|a, b| b.published_date.cmp(&a.published_date));
    entries.truncate(max_links);

    Ok(entries)
}
#[cfg(test)]
mod tests {
    use super::*;
    use wiremock::matchers::method;
    use wiremock::{Mock, MockServer, ResponseTemplate};

    /// Starts a mock server that answers every GET request with `reply`.
    async fn serve(reply: ResponseTemplate) -> MockServer {
        let server = MockServer::start().await;
        Mock::given(method("GET"))
            .respond_with(reply)
            .mount(&server)
            .await;
        server
    }

    /// Builds a 200 response carrying `body` with the given MIME type.
    fn ok_with(body: &str, mime: &str) -> ResponseTemplate {
        ResponseTemplate::new(200).set_body_raw(body, mime)
    }

    /// RSS 2.0 items are returned newest first with title and link intact.
    #[tokio::test]
    async fn parse_feed_rss2() {
        let feed_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0">
<channel>
<title>Test Blog</title>
<item>
<title>Article 1</title>
<link>https://example.com/article-1</link>
<pubDate>Thu, 03 Apr 2026 10:00:00 GMT</pubDate>
</item>
<item>
<title>Article 2</title>
<link>https://example.com/article-2</link>
<pubDate>Wed, 02 Apr 2026 10:00:00 GMT</pubDate>
</item>
<item>
<title>Article 3</title>
<link>https://example.com/article-3</link>
<pubDate>Tue, 01 Apr 2026 10:00:00 GMT</pubDate>
</item>
</channel>
</rss>"#;
        let server = serve(ok_with(feed_xml, "application/rss+xml")).await;

        let parsed = parse_feed(&reqwest::Client::new(), &server.uri(), 10)
            .await
            .unwrap();

        assert_eq!(parsed.len(), 3);
        assert_eq!(parsed[0].title, "Article 1");
        assert_eq!(parsed[0].url, "https://example.com/article-1");
        assert!(parsed[0].published_date > parsed[1].published_date);
        assert!(parsed[1].published_date > parsed[2].published_date);
    }

    /// An Atom entry is parsed and its `<updated>` value is used as the date.
    #[tokio::test]
    async fn parse_feed_atom() {
        let feed_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<title>Test Feed</title>
<entry>
<title>Atom Article</title>
<link href="https://example.com/atom-1"/>
<updated>2026-04-03T12:00:00Z</updated>
</entry>
</feed>"#;
        let server = serve(ok_with(feed_xml, "application/atom+xml")).await;

        let parsed = parse_feed(&reqwest::Client::new(), &server.uri(), 10)
            .await
            .unwrap();

        assert_eq!(parsed.len(), 1);
        assert_eq!(parsed[0].title, "Atom Article");
        assert_eq!(parsed[0].url, "https://example.com/atom-1");
        assert!(parsed[0].published_date.is_some());
    }

    /// `max_links` caps the result after sorting.
    #[tokio::test]
    async fn parse_feed_respects_max_links() {
        let feed_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0">
<channel>
<title>Test</title>
<item><title>A1</title><link>https://example.com/1</link><pubDate>Thu, 03 Apr 2026 10:00:00 GMT</pubDate></item>
<item><title>A2</title><link>https://example.com/2</link><pubDate>Wed, 02 Apr 2026 10:00:00 GMT</pubDate></item>
<item><title>A3</title><link>https://example.com/3</link><pubDate>Tue, 01 Apr 2026 10:00:00 GMT</pubDate></item>
</channel>
</rss>"#;
        let server = serve(ok_with(feed_xml, "application/rss+xml")).await;

        let parsed = parse_feed(&reqwest::Client::new(), &server.uri(), 2)
            .await
            .unwrap();

        assert_eq!(parsed.len(), 2);
        assert_eq!(parsed[0].url, "https://example.com/1"); // newest first
    }

    /// Items lacking a date sort after dated ones regardless of feed order.
    #[tokio::test]
    async fn parse_feed_entries_without_dates_come_last() {
        let feed_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0">
<channel>
<title>Test</title>
<item><title>No date</title><link>https://example.com/no-date</link></item>
<item><title>Has date</title><link>https://example.com/has-date</link><pubDate>Thu, 03 Apr 2026 10:00:00 GMT</pubDate></item>
</channel>
</rss>"#;
        let server = serve(ok_with(feed_xml, "application/rss+xml")).await;

        let parsed = parse_feed(&reqwest::Client::new(), &server.uri(), 10)
            .await
            .unwrap();

        assert_eq!(parsed.len(), 2);
        assert_eq!(parsed[0].url, "https://example.com/has-date");
        assert_eq!(parsed[1].url, "https://example.com/no-date");
    }

    /// A 404 from the feed URL yields an empty list rather than an error.
    #[tokio::test]
    async fn parse_feed_404_returns_empty() {
        let server = serve(ResponseTemplate::new(404)).await;

        let parsed = parse_feed(&reqwest::Client::new(), &server.uri(), 10)
            .await
            .unwrap();

        assert!(parsed.is_empty());
    }

    /// A 200 response whose body is not a feed surfaces as an error.
    #[tokio::test]
    async fn parse_feed_invalid_xml_returns_error() {
        let server = serve(ResponseTemplate::new(200).set_body_string("not xml at all")).await;

        let outcome = parse_feed(&reqwest::Client::new(), &server.uri(), 10).await;

        assert!(outcome.is_err());
    }
}

@@ -4,6 +4,7 @@ pub mod csv;
pub mod email;
pub mod encryption;
pub mod export;
pub mod feed_parser;
pub mod job_store;
pub mod llm;
pub mod prompts;

Loading…
Cancel
Save