feat: add feed_parser service with parse_feed function and tests
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>master
parent
cd5f5434b2
commit
fcdc7ca4a6
@ -0,0 +1,278 @@
|
|||||||
|
//! RSS/Atom feed parser service.
|
||||||
|
//!
|
||||||
|
//! Discovers and parses RSS/Atom feeds from source URLs.
|
||||||
|
//! Used in Phase 1 of the generation pipeline to extract article links
|
||||||
|
//! sorted by publication date (newest first), before falling back
|
||||||
|
//! to the HTML-based source_scraper.
|
||||||
|
|
||||||
|
use chrono::{DateTime, Utc};
|
||||||
|
use url::Url;
|
||||||
|
|
||||||
|
use crate::errors::AppError;
|
||||||
|
|
||||||
|
/// A single entry extracted from an RSS/Atom feed.
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
|
pub struct FeedEntry {
|
||||||
|
pub url: String,
|
||||||
|
pub title: String,
|
||||||
|
pub published_date: Option<DateTime<Utc>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Result of attempting to detect and parse a feed for a source.
|
||||||
|
pub enum FeedResult {
|
||||||
|
/// Feed found and parsed successfully.
|
||||||
|
Found {
|
||||||
|
feed_url: String,
|
||||||
|
entries: Vec<FeedEntry>,
|
||||||
|
},
|
||||||
|
/// No feed discovered or feed invalid.
|
||||||
|
NotFound,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Minimum number of feed entries to consider the feed useful.
|
||||||
|
/// Below this threshold, the pipeline falls back to HTML extraction.
|
||||||
|
///
/// NOTE(review): nothing in this module's visible code reads this constant —
/// `parse_feed` returns however many entries it found. Presumably the caller
/// compares the returned entry count against it; confirm in the pipeline code.
pub const MIN_FEED_ENTRIES: usize = 3;
|
||||||
|
|
||||||
|
/// Number of days before a cached feed URL is re-verified.
|
||||||
|
///
/// NOTE(review): typed `i64`, presumably so it can be handed straight to
/// `chrono::Duration::days` — confirm against the caller.
pub const REDISCOVERY_DAYS: i64 = 30;
|
||||||
|
|
||||||
|
/// Parse an RSS/Atom feed URL and return entries sorted by date (newest first).
|
||||||
|
///
|
||||||
|
/// Uses the `feed-rs` crate which handles RSS 1.0, RSS 2.0, Atom, and JSON Feed.
|
||||||
|
/// Entries without a published date are placed last.
|
||||||
|
pub async fn parse_feed(
|
||||||
|
http_client: &reqwest::Client,
|
||||||
|
feed_url: &str,
|
||||||
|
max_links: usize,
|
||||||
|
) -> Result<Vec<FeedEntry>, AppError> {
|
||||||
|
let parsed_url = Url::parse(feed_url)
|
||||||
|
.map_err(|e| AppError::BadRequest(format!("Invalid feed URL: {}", e)))?;
|
||||||
|
|
||||||
|
if let Err(e) = crate::services::scraper::check_ssrf(&parsed_url).await {
|
||||||
|
tracing::warn!(url = feed_url, error = %e, "Feed URL failed SSRF check");
|
||||||
|
return Ok(Vec::new());
|
||||||
|
}
|
||||||
|
|
||||||
|
let response = http_client
|
||||||
|
.get(feed_url)
|
||||||
|
.send()
|
||||||
|
.await
|
||||||
|
.map_err(|e| {
|
||||||
|
tracing::warn!(url = feed_url, error = %e, "Failed to fetch feed");
|
||||||
|
AppError::Internal(anyhow::anyhow!("Failed to fetch feed"))
|
||||||
|
})?;
|
||||||
|
|
||||||
|
if !response.status().is_success() {
|
||||||
|
tracing::warn!(url = feed_url, status = %response.status(), "Feed returned non-200");
|
||||||
|
return Ok(Vec::new());
|
||||||
|
}
|
||||||
|
|
||||||
|
let body = response.bytes().await.map_err(|e| {
|
||||||
|
AppError::Internal(anyhow::anyhow!("Failed to read feed body: {}", e))
|
||||||
|
})?;
|
||||||
|
|
||||||
|
let feed = feed_rs::parser::parse(&body[..]).map_err(|e| {
|
||||||
|
tracing::warn!(url = feed_url, error = %e, "Failed to parse feed");
|
||||||
|
AppError::Internal(anyhow::anyhow!("Failed to parse feed: {}", e))
|
||||||
|
})?;
|
||||||
|
|
||||||
|
let mut entries: Vec<FeedEntry> = feed
|
||||||
|
.entries
|
||||||
|
.into_iter()
|
||||||
|
.filter_map(|entry| {
|
||||||
|
// Get the article URL: prefer links, fall back to id if it looks like a URL
|
||||||
|
let url = entry
|
||||||
|
.links
|
||||||
|
.first()
|
||||||
|
.map(|l| l.href.clone())
|
||||||
|
.or_else(|| {
|
||||||
|
if entry.id.starts_with("http://") || entry.id.starts_with("https://") {
|
||||||
|
Some(entry.id.clone())
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
})?;
|
||||||
|
|
||||||
|
let title = entry
|
||||||
|
.title
|
||||||
|
.map(|t| t.content)
|
||||||
|
.unwrap_or_default();
|
||||||
|
|
||||||
|
let published_date = entry
|
||||||
|
.published
|
||||||
|
.or(entry.updated);
|
||||||
|
|
||||||
|
Some(FeedEntry {
|
||||||
|
url,
|
||||||
|
title,
|
||||||
|
published_date,
|
||||||
|
})
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
// Sort by published_date descending (newest first), entries without dates last
|
||||||
|
entries.sort_by(|a, b| {
|
||||||
|
match (&b.published_date, &a.published_date) {
|
||||||
|
(Some(db), Some(da)) => db.cmp(da),
|
||||||
|
(Some(_), None) => std::cmp::Ordering::Greater,
|
||||||
|
(None, Some(_)) => std::cmp::Ordering::Less,
|
||||||
|
(None, None) => std::cmp::Ordering::Equal,
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
entries.truncate(max_links);
|
||||||
|
|
||||||
|
Ok(entries)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;
    use wiremock::matchers::method;
    use wiremock::{Mock, MockServer, ResponseTemplate};

    // NOTE(review): every test passes the mock server's URI to `parse_feed`, which
    // runs `check_ssrf` before fetching. The mock server presumably listens on a
    // local address, so the tests that expect entries or an error only pass if
    // `check_ssrf` lets that address through in test builds — confirm.

    /// Starts a mock server that answers every GET request with `template`.
    async fn serve(template: ResponseTemplate) -> MockServer {
        let server = MockServer::start().await;
        Mock::given(method("GET"))
            .respond_with(template)
            .mount(&server)
            .await;
        server
    }

    /// Starts a mock server that serves `body` with a 200 status and `content_type`.
    async fn serve_feed(body: &str, content_type: &str) -> MockServer {
        serve(ResponseTemplate::new(200).set_body_raw(body, content_type)).await
    }

    /// Runs `parse_feed` against the mock server with a fresh HTTP client.
    async fn fetch(server: &MockServer, max_links: usize) -> Result<Vec<FeedEntry>, AppError> {
        let client = reqwest::Client::new();
        parse_feed(&client, &server.uri(), max_links).await
    }

    // The weekday names in the RFC 2822 fixtures below match the 2026 calendar
    // (1 Apr 2026 is a Wednesday), so the dates are valid even for a strict parser.

    #[tokio::test]
    async fn parse_feed_rss2() {
        let rss_body = r#"<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0">
<channel>
<title>Test Blog</title>
<item>
<title>Article 1</title>
<link>https://example.com/article-1</link>
<pubDate>Fri, 03 Apr 2026 10:00:00 GMT</pubDate>
</item>
<item>
<title>Article 2</title>
<link>https://example.com/article-2</link>
<pubDate>Thu, 02 Apr 2026 10:00:00 GMT</pubDate>
</item>
<item>
<title>Article 3</title>
<link>https://example.com/article-3</link>
<pubDate>Wed, 01 Apr 2026 10:00:00 GMT</pubDate>
</item>
</channel>
</rss>"#;

        let server = serve_feed(rss_body, "application/rss+xml").await;
        let entries = fetch(&server, 10).await.unwrap();

        assert_eq!(entries.len(), 3);
        assert_eq!(entries[0].title, "Article 1");
        assert_eq!(entries[0].url, "https://example.com/article-1");
        assert!(entries[0].published_date > entries[1].published_date);
        assert!(entries[1].published_date > entries[2].published_date);
    }

    #[tokio::test]
    async fn parse_feed_atom() {
        let atom_body = r#"<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<title>Test Feed</title>
<entry>
<title>Atom Article</title>
<link href="https://example.com/atom-1"/>
<updated>2026-04-03T12:00:00Z</updated>
</entry>
</feed>"#;

        let server = serve_feed(atom_body, "application/atom+xml").await;
        let entries = fetch(&server, 10).await.unwrap();

        assert_eq!(entries.len(), 1);
        assert_eq!(entries[0].title, "Atom Article");
        assert_eq!(entries[0].url, "https://example.com/atom-1");
        // No <published> element: the date must come from <updated>.
        assert!(entries[0].published_date.is_some());
    }

    #[tokio::test]
    async fn parse_feed_respects_max_links() {
        let rss_body = r#"<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0">
<channel>
<title>Test</title>
<item><title>A1</title><link>https://example.com/1</link><pubDate>Fri, 03 Apr 2026 10:00:00 GMT</pubDate></item>
<item><title>A2</title><link>https://example.com/2</link><pubDate>Thu, 02 Apr 2026 10:00:00 GMT</pubDate></item>
<item><title>A3</title><link>https://example.com/3</link><pubDate>Wed, 01 Apr 2026 10:00:00 GMT</pubDate></item>
</channel>
</rss>"#;

        let server = serve_feed(rss_body, "application/rss+xml").await;
        let entries = fetch(&server, 2).await.unwrap();

        assert_eq!(entries.len(), 2);
        assert_eq!(entries[0].url, "https://example.com/1"); // newest first
    }

    #[tokio::test]
    async fn parse_feed_entries_without_dates_come_last() {
        let rss_body = r#"<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0">
<channel>
<title>Test</title>
<item><title>No date</title><link>https://example.com/no-date</link></item>
<item><title>Has date</title><link>https://example.com/has-date</link><pubDate>Fri, 03 Apr 2026 10:00:00 GMT</pubDate></item>
</channel>
</rss>"#;

        let server = serve_feed(rss_body, "application/rss+xml").await;
        let entries = fetch(&server, 10).await.unwrap();

        assert_eq!(entries.len(), 2);
        assert_eq!(entries[0].url, "https://example.com/has-date");
        assert_eq!(entries[1].url, "https://example.com/no-date");
    }

    #[tokio::test]
    async fn parse_feed_404_returns_empty() {
        let server = serve(ResponseTemplate::new(404)).await;

        let entries = fetch(&server, 10).await.unwrap();
        assert!(entries.is_empty());
    }

    #[tokio::test]
    async fn parse_feed_invalid_xml_returns_error() {
        let server = serve(ResponseTemplate::new(200).set_body_string("not xml at all")).await;

        let result = fetch(&server, 10).await;
        assert!(result.is_err());
    }
}
|
||||||
Loading…
Reference in New Issue