feat: add feed_parser service with parse_feed function and tests
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
master
parent
cd5f5434b2
commit
fcdc7ca4a6
@@ -0,0 +1,278 @@
|
||||
//! RSS/Atom feed parser service.
//!
//! Discovers and parses RSS/Atom feeds from source URLs.
//! Used in Phase 1 of the generation pipeline to extract article links
//! sorted by publication date (newest first), before falling back
//! to the HTML-based `source_scraper`.
|
||||
|
||||
use chrono::{DateTime, Utc};
|
||||
use url::Url;
|
||||
|
||||
use crate::errors::AppError;
|
||||
|
||||
/// A single entry extracted from an RSS/Atom feed.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct FeedEntry {
|
||||
pub url: String,
|
||||
pub title: String,
|
||||
pub published_date: Option<DateTime<Utc>>,
|
||||
}
|
||||
|
||||
/// Result of attempting to detect and parse a feed for a source.
|
||||
pub enum FeedResult {
|
||||
/// Feed found and parsed successfully.
|
||||
Found {
|
||||
feed_url: String,
|
||||
entries: Vec<FeedEntry>,
|
||||
},
|
||||
/// No feed discovered or feed invalid.
|
||||
NotFound,
|
||||
}
|
||||
|
||||
/// Minimum number of feed entries to consider the feed useful.
///
/// Below this threshold, the pipeline falls back to HTML extraction.
pub const MIN_FEED_ENTRIES: usize = 3;

/// Number of days before a cached feed URL is re-verified.
pub const REDISCOVERY_DAYS: i64 = 30;
|
||||
|
||||
/// Parse an RSS/Atom feed URL and return entries sorted by date (newest first).
|
||||
///
|
||||
/// Uses the `feed-rs` crate which handles RSS 1.0, RSS 2.0, Atom, and JSON Feed.
|
||||
/// Entries without a published date are placed last.
|
||||
pub async fn parse_feed(
|
||||
http_client: &reqwest::Client,
|
||||
feed_url: &str,
|
||||
max_links: usize,
|
||||
) -> Result<Vec<FeedEntry>, AppError> {
|
||||
let parsed_url = Url::parse(feed_url)
|
||||
.map_err(|e| AppError::BadRequest(format!("Invalid feed URL: {}", e)))?;
|
||||
|
||||
if let Err(e) = crate::services::scraper::check_ssrf(&parsed_url).await {
|
||||
tracing::warn!(url = feed_url, error = %e, "Feed URL failed SSRF check");
|
||||
return Ok(Vec::new());
|
||||
}
|
||||
|
||||
let response = http_client
|
||||
.get(feed_url)
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| {
|
||||
tracing::warn!(url = feed_url, error = %e, "Failed to fetch feed");
|
||||
AppError::Internal(anyhow::anyhow!("Failed to fetch feed"))
|
||||
})?;
|
||||
|
||||
if !response.status().is_success() {
|
||||
tracing::warn!(url = feed_url, status = %response.status(), "Feed returned non-200");
|
||||
return Ok(Vec::new());
|
||||
}
|
||||
|
||||
let body = response.bytes().await.map_err(|e| {
|
||||
AppError::Internal(anyhow::anyhow!("Failed to read feed body: {}", e))
|
||||
})?;
|
||||
|
||||
let feed = feed_rs::parser::parse(&body[..]).map_err(|e| {
|
||||
tracing::warn!(url = feed_url, error = %e, "Failed to parse feed");
|
||||
AppError::Internal(anyhow::anyhow!("Failed to parse feed: {}", e))
|
||||
})?;
|
||||
|
||||
let mut entries: Vec<FeedEntry> = feed
|
||||
.entries
|
||||
.into_iter()
|
||||
.filter_map(|entry| {
|
||||
// Get the article URL: prefer links, fall back to id if it looks like a URL
|
||||
let url = entry
|
||||
.links
|
||||
.first()
|
||||
.map(|l| l.href.clone())
|
||||
.or_else(|| {
|
||||
if entry.id.starts_with("http://") || entry.id.starts_with("https://") {
|
||||
Some(entry.id.clone())
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})?;
|
||||
|
||||
let title = entry
|
||||
.title
|
||||
.map(|t| t.content)
|
||||
.unwrap_or_default();
|
||||
|
||||
let published_date = entry
|
||||
.published
|
||||
.or(entry.updated);
|
||||
|
||||
Some(FeedEntry {
|
||||
url,
|
||||
title,
|
||||
published_date,
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
|
||||
// Sort by published_date descending (newest first), entries without dates last
|
||||
entries.sort_by(|a, b| {
|
||||
match (&b.published_date, &a.published_date) {
|
||||
(Some(db), Some(da)) => db.cmp(da),
|
||||
(Some(_), None) => std::cmp::Ordering::Greater,
|
||||
(None, Some(_)) => std::cmp::Ordering::Less,
|
||||
(None, None) => std::cmp::Ordering::Equal,
|
||||
}
|
||||
});
|
||||
|
||||
entries.truncate(max_links);
|
||||
|
||||
Ok(entries)
|
||||
}
|
||||
|
||||
/// Integration-style tests for `parse_feed` against a local mock HTTP server.
///
/// NOTE(review): every test passes the mock server's URI through `parse_feed`,
/// so these tests presumably rely on `check_ssrf` accepting that address in
/// test builds; confirm against `crate::services::scraper`.
#[cfg(test)]
mod tests {
    use super::*;
    use wiremock::matchers::method;
    use wiremock::{Mock, MockServer, ResponseTemplate};

    /// Starts a mock server that answers every GET request with `template`.
    async fn serve(template: ResponseTemplate) -> MockServer {
        let server = MockServer::start().await;
        Mock::given(method("GET"))
            .respond_with(template)
            .mount(&server)
            .await;
        server
    }

    /// Starts a mock server that serves `body` with status 200 and the given
    /// content type.
    async fn serve_feed(body: &str, content_type: &str) -> MockServer {
        serve(ResponseTemplate::new(200).set_body_raw(body, content_type)).await
    }

    // The weekday names in the RFC 2822 dates below match the 2026 calendar
    // (1 Apr 2026 is a Wednesday), so the fixtures stay valid even for a parser
    // that validates the weekday against the date.

    #[tokio::test]
    async fn parse_feed_rss2() {
        let rss_body = r#"<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0">
  <channel>
    <title>Test Blog</title>
    <item>
      <title>Article 1</title>
      <link>https://example.com/article-1</link>
      <pubDate>Fri, 03 Apr 2026 10:00:00 GMT</pubDate>
    </item>
    <item>
      <title>Article 2</title>
      <link>https://example.com/article-2</link>
      <pubDate>Thu, 02 Apr 2026 10:00:00 GMT</pubDate>
    </item>
    <item>
      <title>Article 3</title>
      <link>https://example.com/article-3</link>
      <pubDate>Wed, 01 Apr 2026 10:00:00 GMT</pubDate>
    </item>
  </channel>
</rss>"#;
        let server = serve_feed(rss_body, "application/rss+xml").await;

        let client = reqwest::Client::new();
        let entries = parse_feed(&client, &server.uri(), 10).await.unwrap();

        assert_eq!(entries.len(), 3);
        assert_eq!(entries[0].title, "Article 1");
        assert_eq!(entries[0].url, "https://example.com/article-1");
        assert!(entries[0].published_date > entries[1].published_date);
        assert!(entries[1].published_date > entries[2].published_date);
    }

    #[tokio::test]
    async fn parse_feed_atom() {
        let atom_body = r#"<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
  <title>Test Feed</title>
  <entry>
    <title>Atom Article</title>
    <link href="https://example.com/atom-1"/>
    <updated>2026-04-03T12:00:00Z</updated>
  </entry>
</feed>"#;
        let server = serve_feed(atom_body, "application/atom+xml").await;

        let client = reqwest::Client::new();
        let entries = parse_feed(&client, &server.uri(), 10).await.unwrap();

        assert_eq!(entries.len(), 1);
        assert_eq!(entries[0].title, "Atom Article");
        assert_eq!(entries[0].url, "https://example.com/atom-1");
        assert!(entries[0].published_date.is_some());
    }

    #[tokio::test]
    async fn parse_feed_respects_max_links() {
        let rss_body = r#"<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0">
  <channel>
    <title>Test</title>
    <item><title>A1</title><link>https://example.com/1</link><pubDate>Fri, 03 Apr 2026 10:00:00 GMT</pubDate></item>
    <item><title>A2</title><link>https://example.com/2</link><pubDate>Thu, 02 Apr 2026 10:00:00 GMT</pubDate></item>
    <item><title>A3</title><link>https://example.com/3</link><pubDate>Wed, 01 Apr 2026 10:00:00 GMT</pubDate></item>
  </channel>
</rss>"#;
        let server = serve_feed(rss_body, "application/rss+xml").await;

        let client = reqwest::Client::new();
        let entries = parse_feed(&client, &server.uri(), 2).await.unwrap();

        assert_eq!(entries.len(), 2);
        assert_eq!(entries[0].url, "https://example.com/1"); // newest first
    }

    #[tokio::test]
    async fn parse_feed_entries_without_dates_come_last() {
        let rss_body = r#"<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0">
  <channel>
    <title>Test</title>
    <item><title>No date</title><link>https://example.com/no-date</link></item>
    <item><title>Has date</title><link>https://example.com/has-date</link><pubDate>Fri, 03 Apr 2026 10:00:00 GMT</pubDate></item>
  </channel>
</rss>"#;
        let server = serve_feed(rss_body, "application/rss+xml").await;

        let client = reqwest::Client::new();
        let entries = parse_feed(&client, &server.uri(), 10).await.unwrap();

        assert_eq!(entries.len(), 2);
        assert_eq!(entries[0].url, "https://example.com/has-date");
        assert_eq!(entries[1].url, "https://example.com/no-date");
    }

    #[tokio::test]
    async fn parse_feed_404_returns_empty() {
        let server = serve(ResponseTemplate::new(404)).await;

        let client = reqwest::Client::new();
        let entries = parse_feed(&client, &server.uri(), 10).await.unwrap();

        assert!(entries.is_empty());
    }

    #[tokio::test]
    async fn parse_feed_invalid_xml_returns_error() {
        let server = serve(ResponseTemplate::new(200).set_body_string("not xml at all")).await;

        let client = reqwest::Client::new();
        let result = parse_feed(&client, &server.uri(), 10).await;

        assert!(result.is_err());
    }
}
|
||||
Loading…
Reference in New Issue