fix: add body size limit to feed_parser to prevent memory exhaustion

Adds chunked reading with a 5 MB cap (matching the scraper limit) to
both parse_feed and discover_feed, with fast rejection via Content-Length
header when available. Includes a unit test covering the oversize path.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
master
oabrivard 2 months ago
parent 7d3dfa37a9
commit e2ce401ea6

@ -36,6 +36,9 @@ pub const MIN_FEED_ENTRIES: usize = 3;
/// Number of days before a cached feed URL is re-verified. /// Number of days before a cached feed URL is re-verified.
pub const REDISCOVERY_DAYS: i64 = 30; pub const REDISCOVERY_DAYS: i64 = 30;
/// Maximum response body size in bytes (5 MB, i.e. 5,000,000 bytes — decimal,
/// not MiB), matching the scraper limit.
///
/// The cap is applied twice when downloading: first against the
/// `Content-Length` header when the server provides one, and again while
/// accumulating chunks, so a missing or understated header cannot bypass it.
const MAX_FEED_BODY_SIZE: usize = 5_000_000;
/// Parse an RSS/Atom feed URL and return entries sorted by date (newest first). /// Parse an RSS/Atom feed URL and return entries sorted by date (newest first).
/// ///
/// Uses the `feed-rs` crate which handles RSS 1.0, RSS 2.0, Atom, and JSON Feed. /// Uses the `feed-rs` crate which handles RSS 1.0, RSS 2.0, Atom, and JSON Feed.
@ -53,7 +56,7 @@ pub async fn parse_feed(
return Ok(Vec::new()); return Ok(Vec::new());
} }
let response = http_client let mut response = http_client
.get(feed_url) .get(feed_url)
.send() .send()
.await .await
@ -67,9 +70,28 @@ pub async fn parse_feed(
return Ok(Vec::new()); return Ok(Vec::new());
} }
let body = response.bytes().await.map_err(|e| { // Enforce body size limit (chunked reading, matching scraper pattern)
let content_length = response.content_length();
if let Some(len) = content_length {
if len as usize > MAX_FEED_BODY_SIZE {
tracing::warn!(url = feed_url, size = len, "Feed body exceeds size limit");
return Ok(Vec::new());
}
}
let mut bytes = match content_length {
Some(len) => Vec::with_capacity(len as usize),
None => Vec::new(),
};
while let Some(chunk) = response.chunk().await.map_err(|e| {
AppError::Internal(anyhow::anyhow!("Failed to read feed body: {}", e)) AppError::Internal(anyhow::anyhow!("Failed to read feed body: {}", e))
})?; })? {
if bytes.len() + chunk.len() > MAX_FEED_BODY_SIZE {
tracing::warn!(url = feed_url, "Feed body exceeds size limit during download");
return Ok(Vec::new());
}
bytes.extend_from_slice(&chunk);
}
let body = bytes;
let feed = feed_rs::parser::parse(&body[..]).map_err(|e| { let feed = feed_rs::parser::parse(&body[..]).map_err(|e| {
tracing::warn!(url = feed_url, error = %e, "Failed to parse feed"); tracing::warn!(url = feed_url, error = %e, "Failed to parse feed");
@ -152,7 +174,7 @@ pub async fn discover_feed(
return None; return None;
} }
let response = http_client let mut response = http_client
.get(source_url) .get(source_url)
.send() .send()
.await .await
@ -179,7 +201,26 @@ pub async fn discover_feed(
} }
// For anything else (HTML or unknown content-type), try HTML link discovery // For anything else (HTML or unknown content-type), try HTML link discovery
let body = response.text().await.ok()?; // Enforce body size limit
let content_length = response.content_length();
if let Some(len) = content_length {
if len as usize > MAX_FEED_BODY_SIZE {
tracing::warn!(url = source_url, size = len, "Source page exceeds size limit during feed discovery");
return None;
}
}
let mut body_bytes = match content_length {
Some(len) => Vec::with_capacity(len as usize),
None => Vec::new(),
};
while let Some(chunk) = response.chunk().await.ok()? {
if body_bytes.len() + chunk.len() > MAX_FEED_BODY_SIZE {
tracing::warn!(url = source_url, "Source page exceeds size limit during feed discovery");
return None;
}
body_bytes.extend_from_slice(&chunk);
}
let body = String::from_utf8_lossy(&body_bytes).to_string();
let document = scraper::Html::parse_document(&body); let document = scraper::Html::parse_document(&body);
let selector = scraper::Selector::parse(r#"link[rel="alternate"]"#) let selector = scraper::Selector::parse(r#"link[rel="alternate"]"#)
@ -751,4 +792,27 @@ mod tests {
// No feed found — pipeline would fall back to source_scraper // No feed found — pipeline would fall back to source_scraper
assert!(matches!(result, FeedResult::NotFound)); assert!(matches!(result, FeedResult::NotFound));
} }
#[tokio::test]
async fn parse_feed_rejects_oversized_body() {
    skip_ssrf_for_test();
    let mock_server = MockServer::start().await;

    // Serve a payload exactly one byte over MAX_FEED_BODY_SIZE (5 MB).
    // The body is really allocated at full size because the Content-Length
    // header has to agree with the bytes actually sent (a mismatch makes
    // hyper panic). The rejection is expected to come from the
    // content_length() fast path, before any chunk is read.
    // NOTE(review): assumes the mock server emits Content-Length for a
    // byte body — confirm if this test ever exercises the chunked guard instead.
    let oversized_payload = vec![b'x'; MAX_FEED_BODY_SIZE + 1];
    let oversized_response = ResponseTemplate::new(200).set_body_bytes(oversized_payload);

    Mock::given(method("GET"))
        .respond_with(oversized_response)
        .mount(&mock_server)
        .await;

    let http = reqwest::Client::new();
    let parsed = parse_feed(&http, &mock_server.uri(), 10).await.unwrap();

    // An oversized feed is treated as "no entries", not as an error.
    assert!(parsed.is_empty(), "Should reject oversized feed");
}
} }

Loading…
Cancel
Save