diff --git a/backend/src/services/feed_parser.rs b/backend/src/services/feed_parser.rs
index 6234a26..27b5748 100644
--- a/backend/src/services/feed_parser.rs
+++ b/backend/src/services/feed_parser.rs
@@ -36,6 +36,9 @@ pub const MIN_FEED_ENTRIES: usize = 3;
 /// Number of days before a cached feed URL is re-verified.
 pub const REDISCOVERY_DAYS: i64 = 30;
 
+/// Maximum response body size in bytes (5 MB), matching the scraper limit.
+const MAX_FEED_BODY_SIZE: usize = 5_000_000;
+
 /// Parse an RSS/Atom feed URL and return entries sorted by date (newest first).
 ///
 /// Uses the `feed-rs` crate which handles RSS 1.0, RSS 2.0, Atom, and JSON Feed.
@@ -53,7 +56,7 @@ pub async fn parse_feed(
         return Ok(Vec::new());
     }
 
-    let response = http_client
+    let mut response = http_client
         .get(feed_url)
         .send()
         .await
@@ -67,9 +70,29 @@ pub async fn parse_feed(
         return Ok(Vec::new());
     }
 
-    let body = response.bytes().await.map_err(|e| {
+    // Enforce body size limit (chunked reading, matching scraper pattern)
+    let content_length = response.content_length();
+    if let Some(len) = content_length {
+        // Compare as u64: `len as usize` would truncate on 32-bit targets.
+        if len > MAX_FEED_BODY_SIZE as u64 {
+            tracing::warn!(url = feed_url, size = len, "Feed body exceeds size limit");
+            return Ok(Vec::new());
+        }
+    }
+    let mut bytes = match content_length {
+        Some(len) => Vec::with_capacity(len as usize),
+        None => Vec::new(),
+    };
+    while let Some(chunk) = response.chunk().await.map_err(|e| {
         AppError::Internal(anyhow::anyhow!("Failed to read feed body: {}", e))
-    })?;
+    })? {
+        if bytes.len() + chunk.len() > MAX_FEED_BODY_SIZE {
+            tracing::warn!(url = feed_url, "Feed body exceeds size limit during download");
+            return Ok(Vec::new());
+        }
+        bytes.extend_from_slice(&chunk);
+    }
+    let body = bytes;
 
     let feed = feed_rs::parser::parse(&body[..]).map_err(|e| {
         tracing::warn!(url = feed_url, error = %e, "Failed to parse feed");
@@ -152,7 +175,7 @@ pub async fn discover_feed(
         return None;
     }
 
-    let response = http_client
+    let mut response = http_client
         .get(source_url)
         .send()
         .await
@@ -179,7 +202,28 @@ pub async fn discover_feed(
     }
 
     // For anything else (HTML or unknown content-type), try HTML link discovery
-    let body = response.text().await.ok()?;
+    // Enforce body size limit
+    let content_length = response.content_length();
+    if let Some(len) = content_length {
+        // Compare as u64: `len as usize` would truncate on 32-bit targets.
+        if len > MAX_FEED_BODY_SIZE as u64 {
+            tracing::warn!(url = source_url, size = len, "Source page exceeds size limit during feed discovery");
+            return None;
+        }
+    }
+    let mut body_bytes = match content_length {
+        Some(len) => Vec::with_capacity(len as usize),
+        None => Vec::new(),
+    };
+    while let Some(chunk) = response.chunk().await.ok()? {
+        if body_bytes.len() + chunk.len() > MAX_FEED_BODY_SIZE {
+            tracing::warn!(url = source_url, "Source page exceeds size limit during feed discovery");
+            return None;
+        }
+        body_bytes.extend_from_slice(&chunk);
+    }
+    // NOTE(review): decoded as lossy UTF-8; a declared non-UTF-8 charset is not honored — confirm acceptable.
+    let body = String::from_utf8_lossy(&body_bytes).into_owned();
     let document = scraper::Html::parse_document(&body);
 
     let selector = scraper::Selector::parse(r#"link[rel="alternate"]"#)
@@ -751,4 +795,27 @@ mod tests {
         // No feed found — pipeline would fall back to source_scraper
         assert!(matches!(result, FeedResult::NotFound));
     }
+
+    #[tokio::test]
+    async fn parse_feed_rejects_oversized_body() {
+        skip_ssrf_for_test();
+        let server = MockServer::start().await;
+
+        // Create a response larger than MAX_FEED_BODY_SIZE (5 MB).
+        // The Content-Length header must match the actual body size so that
+        // hyper does not panic; we rely on the fast-reject path that checks
+        // content_length() before reading any bytes.
+        let big_body = vec![b'x'; MAX_FEED_BODY_SIZE + 1];
+        Mock::given(method("GET"))
+            .respond_with(
+                ResponseTemplate::new(200)
+                    .set_body_bytes(big_body)
+            )
+            .mount(&server)
+            .await;
+
+        let client = reqwest::Client::new();
+        let entries = parse_feed(&client, &server.uri(), 10).await.unwrap();
+        assert!(entries.is_empty(), "Should reject oversized feed");
+    }
 }