From d4cbcf47ae80c1f1b8fb3e0d1137ce91e30f5b11 Mon Sep 17 00:00:00 2001
From: oabrivard
Date: Fri, 3 Apr 2026 14:08:50 +0200
Subject: [PATCH] feat: add detect_and_parse_feed orchestration function

Adds detect_and_parse_feed which orchestrates feed caching/freshness
logic: uses cached feed URL directly if fresh (< 30 days), otherwise
re-discovers from source URL via discover_feed. Returns
FeedResult::Found or NotFound.

Includes 4 new tests covering fresh cache, no cache, no feed, and
stale cache cases.

Co-Authored-By: Claude Sonnet 4.6
---
Review notes (this section is ignored by `git am`):

- NOTE(review): this copy of the patch appears to have lost every
  angle-bracket span in transit: the author / co-author e-mail
  addresses and the XML/HTML markup inside the test fixtures. The
  fixtures are reproduced below exactly as received; restore them from
  the original commit before applying. As received, the two
  `format!(r#"..."#, feed_path)` calls have no `{}` placeholder and the
  `rss_body` strings contain no RSS markup, so the tests cannot pass.
- The `rss_discovered_at` parameter type is restored to
  `Option<DateTime<Utc>>` (received as `Option>`), inferred from its
  use with `Utc::now().signed_duration_since(d)`.
- The cached-feed path now distinguishes "parsed but empty" from
  "failed to parse" and logs the underlying error, matching the
  discovered-feed path. Hunk line counts and the diffstat are updated
  accordingly; the `index` blob ids are those of the original commit.

 backend/src/services/feed_parser.rs | 228 ++++++++++++++++++++++++++++
 1 file changed, 228 insertions(+)

diff --git a/backend/src/services/feed_parser.rs b/backend/src/services/feed_parser.rs
index 0ea1fb1..06d09bf 100644
--- a/backend/src/services/feed_parser.rs
+++ b/backend/src/services/feed_parser.rs
@@ -199,6 +199,80 @@ pub async fn discover_feed(
     None
 }
 
+/// Detect and parse an RSS/Atom feed for a source URL.
+///
+/// Orchestrates the discovery and parsing logic:
+/// - If `rss_url` is cached and fresh (younger than `REDISCOVERY_DAYS` days), parse it
+///   directly; if it fails to parse or yields no entries, fall through to discovery.
+/// - If `rss_url` is cached but stale, or has no `rss_discovered_at` timestamp,
+///   re-discover from `source_url`.
+/// - If no `rss_url` is cached, attempt discovery from `source_url`.
+///
+/// Returns `FeedResult::Found` with the feed URL and the entries produced by `parse_feed`,
+/// or `FeedResult::NotFound` if no feed could be discovered or it parsed to zero entries.
+pub async fn detect_and_parse_feed(
+    http_client: &reqwest::Client,
+    source_url: &str,
+    rss_url: Option<&str>,
+    rss_discovered_at: Option<DateTime<Utc>>,
+    max_links: usize,
+) -> FeedResult {
+    // Case 1: Cached and fresh — use directly
+    if let Some(cached_url) = rss_url {
+        // A cached URL without a discovery timestamp is treated as stale.
+        let is_fresh = rss_discovered_at
+            .map(|d| Utc::now().signed_duration_since(d).num_days() < REDISCOVERY_DAYS)
+            .unwrap_or(false);
+
+        if is_fresh {
+            match parse_feed(http_client, cached_url, max_links).await {
+                Ok(entries) if !entries.is_empty() => {
+                    return FeedResult::Found {
+                        feed_url: cached_url.to_string(),
+                        entries,
+                    };
+                }
+                // Parsed fine but empty: not a parse failure, so log it as such.
+                Ok(_) => {
+                    tracing::info!(
+                        url = cached_url,
+                        "Cached feed is empty, attempting re-discovery"
+                    );
+                }
+                Err(e) => {
+                    tracing::warn!(
+                        url = cached_url,
+                        error = %e,
+                        "Cached feed failed to parse, attempting re-discovery"
+                    );
+                }
+            }
+        }
+    }
+
+    // Case 2: No cache, stale cache, or unusable cached feed — discover
+    let discovered = discover_feed(http_client, source_url).await;
+
+    if let Some(feed_url) = discovered {
+        match parse_feed(http_client, &feed_url, max_links).await {
+            Ok(entries) if !entries.is_empty() => {
+                return FeedResult::Found {
+                    feed_url,
+                    entries,
+                };
+            }
+            Ok(_) => {
+                tracing::info!(url = feed_url, "Discovered feed is empty");
+            }
+            Err(e) => {
+                tracing::warn!(url = feed_url, error = %e, "Discovered feed failed to parse");
+            }
+        }
+    }
+
+    FeedResult::NotFound
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -450,4 +524,158 @@ mod tests {
         assert!(feed_url.starts_with(&server.uri()));
         assert!(feed_url.ends_with("/feed.xml"));
     }
+
+    #[tokio::test]
+    async fn detect_and_parse_cached_fresh_feed() {
+        let server = MockServer::start().await;
+        let rss_body = r#"
+T
+  A1https://example.com/1Thu, 03 Apr 2026 10:00:00 GMT
+  A2https://example.com/2Wed, 02 Apr 2026 10:00:00 GMT
+  A3https://example.com/3Tue, 01 Apr 2026 10:00:00 GMT
+"#;
+
+        Mock::given(method("GET"))
+            .respond_with(ResponseTemplate::new(200).set_body_raw(rss_body, "application/rss+xml"))
+            .mount(&server)
+            .await;
+
+        let client = reqwest::Client::new();
+        let result = detect_and_parse_feed(
+            &client,
+            "https://example.com",
+            Some(&server.uri()),
+            Some(Utc::now()), // fresh
+            10,
+        ).await;
+
+        match result {
+            FeedResult::Found { entries, .. } => assert_eq!(entries.len(), 3),
+            FeedResult::NotFound => panic!("Expected Found"),
+        }
+    }
+
+    #[tokio::test]
+    async fn detect_and_parse_no_cache_discovers_feed() {
+        let server = MockServer::start().await;
+
+        // First request: HTML page with feed link
+        let feed_path = format!("{}/feed.xml", server.uri());
+        let html = format!(
+            r#"
+
+            "#,
+            feed_path
+        );
+
+        let rss_body = r#"
+T
+  A1https://example.com/1Thu, 03 Apr 2026 10:00:00 GMT
+  A2https://example.com/2Wed, 02 Apr 2026 10:00:00 GMT
+  A3https://example.com/3Tue, 01 Apr 2026 10:00:00 GMT
+"#;
+
+        // Mock: source page returns HTML
+        Mock::given(method("GET"))
+            .and(wiremock::matchers::path("/"))
+            .respond_with(ResponseTemplate::new(200).set_body_string(html))
+            .mount(&server)
+            .await;
+
+        // Mock: feed URL returns RSS
+        Mock::given(method("GET"))
+            .and(wiremock::matchers::path("/feed.xml"))
+            .respond_with(ResponseTemplate::new(200).set_body_raw(rss_body, "application/rss+xml"))
+            .mount(&server)
+            .await;
+
+        let client = reqwest::Client::new();
+        let result = detect_and_parse_feed(
+            &client,
+            &server.uri(),
+            None, // no cache
+            None,
+            10,
+        ).await;
+
+        match result {
+            FeedResult::Found { feed_url, entries } => {
+                assert!(feed_url.contains("/feed.xml"));
+                assert_eq!(entries.len(), 3);
+            }
+            FeedResult::NotFound => panic!("Expected Found"),
+        }
+    }
+
+    #[tokio::test]
+    async fn detect_and_parse_no_feed_returns_not_found() {
+        let server = MockServer::start().await;
+        let html = "No feed";
+
+        Mock::given(method("GET"))
+            .respond_with(ResponseTemplate::new(200).set_body_string(html))
+            .mount(&server)
+            .await;
+
+        let client = reqwest::Client::new();
+        let result = detect_and_parse_feed(
+            &client,
+            &server.uri(),
+            None,
+            None,
+            10,
+        ).await;
+
+        assert!(matches!(result, FeedResult::NotFound));
+    }
+
+    #[tokio::test]
+    async fn detect_and_parse_stale_cache_rediscovers() {
+        let server = MockServer::start().await;
+
+        let feed_path = format!("{}/feed.xml", server.uri());
+        let html = format!(
+            r#"
+
+            "#,
+            feed_path
+        );
+
+        let rss_body = r#"
+T
+  A1https://example.com/1Thu, 03 Apr 2026 10:00:00 GMT
+  A2https://example.com/2Wed, 02 Apr 2026 10:00:00 GMT
+  A3https://example.com/3Tue, 01 Apr 2026 10:00:00 GMT
+"#;
+
+        Mock::given(method("GET"))
+            .and(wiremock::matchers::path("/"))
+            .respond_with(ResponseTemplate::new(200).set_body_string(html))
+            .mount(&server)
+            .await;
+
+        Mock::given(method("GET"))
+            .and(wiremock::matchers::path("/feed.xml"))
+            .respond_with(ResponseTemplate::new(200).set_body_raw(rss_body, "application/rss+xml"))
+            .mount(&server)
+            .await;
+
+        let client = reqwest::Client::new();
+        let stale_date = Utc::now() - chrono::Duration::days(31);
+        let result = detect_and_parse_feed(
+            &client,
+            &server.uri(),
+            Some("https://old-feed.example.com/rss"), // stale cached URL
+            Some(stale_date),
+            10,
+        ).await;
+
+        match result {
+            FeedResult::Found { feed_url, entries } => {
+                assert!(feed_url.contains("/feed.xml"), "Should discover new feed URL");
+                assert_eq!(entries.len(), 3);
+            }
+            FeedResult::NotFound => panic!("Expected Found after re-discovery"),
+        }
+    }
 }