feat: add detect_and_parse_feed orchestration function

Adds detect_and_parse_feed which orchestrates feed caching/freshness logic:
uses cached feed URL directly if fresh (< 30 days), otherwise re-discovers
from source URL via discover_feed. Returns FeedResult::Found or NotFound.
Includes 4 new tests covering fresh cache, no cache, no feed, and stale cache cases.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
master
oabrivard 2 months ago
parent 96b39814bb
commit d4cbcf47ae

@ -199,6 +199,66 @@ pub async fn discover_feed(
None None
} }
/// Detect and parse an RSS/Atom feed for a source URL.
///
/// Orchestrates the discovery and parsing logic:
/// - If `rss_url` is cached and fresh (< 30 days), parse it directly.
/// - If `rss_url` is cached but stale (>= 30 days), re-discover from `source_url`.
/// - If no `rss_url` cached, attempt discovery from `source_url`.
///
/// Returns `FeedResult::Found` with the feed URL and sorted entries,
/// or `FeedResult::NotFound` if no feed could be found/parsed.
pub async fn detect_and_parse_feed(
http_client: &reqwest::Client,
source_url: &str,
rss_url: Option<&str>,
rss_discovered_at: Option<DateTime<Utc>>,
max_links: usize,
) -> FeedResult {
// Case 1: Cached and fresh — use directly
if let Some(cached_url) = rss_url {
let is_fresh = rss_discovered_at
.map(|d| Utc::now().signed_duration_since(d).num_days() < REDISCOVERY_DAYS)
.unwrap_or(false);
if is_fresh {
match parse_feed(http_client, cached_url, max_links).await {
Ok(entries) if !entries.is_empty() => {
return FeedResult::Found {
feed_url: cached_url.to_string(),
entries,
};
}
_ => {
tracing::warn!(url = cached_url, "Cached feed failed to parse, attempting re-discovery");
}
}
}
}
// Case 2: No cache or stale — discover
let discovered = discover_feed(http_client, source_url).await;
if let Some(feed_url) = discovered {
match parse_feed(http_client, &feed_url, max_links).await {
Ok(entries) if !entries.is_empty() => {
return FeedResult::Found {
feed_url,
entries,
};
}
Ok(_) => {
tracing::info!(url = feed_url, "Discovered feed is empty");
}
Err(e) => {
tracing::warn!(url = feed_url, error = %e, "Discovered feed failed to parse");
}
}
}
FeedResult::NotFound
}
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;
@ -450,4 +510,158 @@ mod tests {
assert!(feed_url.starts_with(&server.uri())); assert!(feed_url.starts_with(&server.uri()));
assert!(feed_url.ends_with("/feed.xml")); assert!(feed_url.ends_with("/feed.xml"));
} }
/// A cached feed URL with a current discovery timestamp is parsed and its
/// three items are returned.
#[tokio::test]
async fn detect_and_parse_cached_fresh_feed() {
    // Three-item RSS fixture.
    // NOTE(review): the weekday names in the pubDate values do not match the
    // 2026 calendar (2026-04-01 is a Wednesday); this relies on the feed parser
    // tolerating or ignoring the weekday — confirm.
    let rss_body = r#"<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"><channel><title>T</title>
<item><title>A1</title><link>https://example.com/1</link><pubDate>Thu, 03 Apr 2026 10:00:00 GMT</pubDate></item>
<item><title>A2</title><link>https://example.com/2</link><pubDate>Wed, 02 Apr 2026 10:00:00 GMT</pubDate></item>
<item><title>A3</title><link>https://example.com/3</link><pubDate>Tue, 01 Apr 2026 10:00:00 GMT</pubDate></item>
</channel></rss>"#;

    // Every GET against the mock server answers with the RSS fixture.
    let server = MockServer::start().await;
    let feed_response = ResponseTemplate::new(200).set_body_raw(rss_body, "application/rss+xml");
    Mock::given(method("GET"))
        .respond_with(feed_response)
        .mount(&server)
        .await;

    // The mock server's base URL stands in for the cached feed URL, and the
    // discovery timestamp is "now", i.e. fresh.
    let cached_feed_url = server.uri();
    let discovered_at = Utc::now();
    let client = reqwest::Client::new();
    let result = detect_and_parse_feed(
        &client,
        "https://example.com",
        Some(cached_feed_url.as_str()),
        Some(discovered_at),
        10,
    )
    .await;

    if let FeedResult::Found { entries, .. } = result {
        assert_eq!(entries.len(), 3);
    } else {
        panic!("Expected Found");
    }
}
/// With no cached feed URL, the feed advertised by the source page's
/// `<link rel="alternate">` tag is discovered and parsed.
#[tokio::test]
async fn detect_and_parse_no_cache_discovers_feed() {
    // Three-item RSS fixture served at /feed.xml.
    // NOTE(review): the weekday names in the pubDate values do not match the
    // 2026 calendar (2026-04-01 is a Wednesday); this relies on the feed parser
    // tolerating or ignoring the weekday — confirm.
    let rss_body = r#"<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"><channel><title>T</title>
<item><title>A1</title><link>https://example.com/1</link><pubDate>Thu, 03 Apr 2026 10:00:00 GMT</pubDate></item>
<item><title>A2</title><link>https://example.com/2</link><pubDate>Wed, 02 Apr 2026 10:00:00 GMT</pubDate></item>
<item><title>A3</title><link>https://example.com/3</link><pubDate>Tue, 01 Apr 2026 10:00:00 GMT</pubDate></item>
</channel></rss>"#;

    let server = MockServer::start().await;
    let base_url = server.uri();

    // Source page whose <head> points at the absolute feed URL on the mock server.
    let feed_path = format!("{}/feed.xml", base_url);
    let html = format!(
        r#"<html><head>
<link rel="alternate" type="application/rss+xml" href="{}">
</head><body></body></html>"#,
        feed_path
    );

    // GET / -> HTML source page.
    let page_response = ResponseTemplate::new(200).set_body_string(html);
    Mock::given(method("GET"))
        .and(wiremock::matchers::path("/"))
        .respond_with(page_response)
        .mount(&server)
        .await;

    // GET /feed.xml -> RSS fixture.
    let feed_response = ResponseTemplate::new(200).set_body_raw(rss_body, "application/rss+xml");
    Mock::given(method("GET"))
        .and(wiremock::matchers::path("/feed.xml"))
        .respond_with(feed_response)
        .mount(&server)
        .await;

    // No cached URL and no discovery timestamp.
    let client = reqwest::Client::new();
    let result = detect_and_parse_feed(&client, &base_url, None, None, 10).await;

    if let FeedResult::Found { feed_url, entries } = result {
        assert!(feed_url.contains("/feed.xml"));
        assert_eq!(entries.len(), 3);
    } else {
        panic!("Expected Found");
    }
}
/// A source page that advertises no feed, with nothing cached, yields
/// `FeedResult::NotFound`.
#[tokio::test]
async fn detect_and_parse_no_feed_returns_not_found() {
    // Every GET on the mock server answers with a page that has no feed link.
    let html = "<html><head><title>No feed</title></head><body></body></html>";
    let server = MockServer::start().await;
    let page_response = ResponseTemplate::new(200).set_body_string(html);
    Mock::given(method("GET"))
        .respond_with(page_response)
        .mount(&server)
        .await;

    // No cached URL and no discovery timestamp.
    let base_url = server.uri();
    let client = reqwest::Client::new();
    let result = detect_and_parse_feed(&client, &base_url, None, None, 10).await;

    assert!(matches!(result, FeedResult::NotFound));
}
/// A cached feed URL discovered 31 days ago is not used; the feed is
/// re-discovered from the source page instead.
#[tokio::test]
async fn detect_and_parse_stale_cache_rediscovers() {
    // Three-item RSS fixture served at /feed.xml.
    // NOTE(review): the weekday names in the pubDate values do not match the
    // 2026 calendar (2026-04-01 is a Wednesday); this relies on the feed parser
    // tolerating or ignoring the weekday — confirm.
    let rss_body = r#"<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"><channel><title>T</title>
<item><title>A1</title><link>https://example.com/1</link><pubDate>Thu, 03 Apr 2026 10:00:00 GMT</pubDate></item>
<item><title>A2</title><link>https://example.com/2</link><pubDate>Wed, 02 Apr 2026 10:00:00 GMT</pubDate></item>
<item><title>A3</title><link>https://example.com/3</link><pubDate>Tue, 01 Apr 2026 10:00:00 GMT</pubDate></item>
</channel></rss>"#;

    let server = MockServer::start().await;
    let base_url = server.uri();

    // Source page whose <head> points at the absolute feed URL on the mock server.
    let feed_path = format!("{}/feed.xml", base_url);
    let html = format!(
        r#"<html><head>
<link rel="alternate" type="application/rss+xml" href="{}">
</head><body></body></html>"#,
        feed_path
    );

    // GET / -> HTML source page.
    let page_response = ResponseTemplate::new(200).set_body_string(html);
    Mock::given(method("GET"))
        .and(wiremock::matchers::path("/"))
        .respond_with(page_response)
        .mount(&server)
        .await;

    // GET /feed.xml -> RSS fixture.
    let feed_response = ResponseTemplate::new(200).set_body_raw(rss_body, "application/rss+xml");
    Mock::given(method("GET"))
        .and(wiremock::matchers::path("/feed.xml"))
        .respond_with(feed_response)
        .mount(&server)
        .await;

    // Cached URL points at a different host and was discovered 31 days ago.
    let client = reqwest::Client::new();
    let stale_date = Utc::now() - chrono::Duration::days(31);
    let stale_cached_url = "https://old-feed.example.com/rss";
    let result = detect_and_parse_feed(
        &client,
        &base_url,
        Some(stale_cached_url),
        Some(stale_date),
        10,
    )
    .await;

    if let FeedResult::Found { feed_url, entries } = result {
        assert!(feed_url.contains("/feed.xml"), "Should discover new feed URL");
        assert_eq!(entries.len(), 3);
    } else {
        panic!("Expected Found after re-discovery");
    }
}
} }

Loading…
Cancel
Save