@ -36,6 +36,9 @@ pub const MIN_FEED_ENTRIES: usize = 3;
/// How many days a cached feed URL is kept before it gets re-verified.
pub const REDISCOVERY_DAYS: i64 = 30;
/// Upper bound on a response body, in bytes (5 MB); mirrors the scraper limit.
const MAX_FEED_BODY_SIZE: usize = 5_000_000;
/// Parse an RSS/Atom feed URL and return entries sorted by date (newest first).
///
/// Uses the `feed-rs` crate which handles RSS 1.0, RSS 2.0, Atom, and JSON Feed.
@ -53,7 +56,7 @@ pub async fn parse_feed(
return Ok ( Vec ::new ( ) ) ;
}
let response = http_client
let mut response = http_client
. get ( feed_url )
. send ( )
. await
@ -67,9 +70,28 @@ pub async fn parse_feed(
return Ok ( Vec ::new ( ) ) ;
}
let body = response . bytes ( ) . await . map_err ( | e | {
// Enforce body size limit (chunked reading, matching scraper pattern)
let content_length = response . content_length ( ) ;
if let Some ( len ) = content_length {
if len as usize > MAX_FEED_BODY_SIZE {
tracing ::warn ! ( url = feed_url , size = len , "Feed body exceeds size limit" ) ;
return Ok ( Vec ::new ( ) ) ;
}
}
let mut bytes = match content_length {
Some ( len ) = > Vec ::with_capacity ( len as usize ) ,
None = > Vec ::new ( ) ,
} ;
while let Some ( chunk ) = response . chunk ( ) . await . map_err ( | e | {
AppError ::Internal ( anyhow ::anyhow ! ( "Failed to read feed body: {}" , e ) )
} ) ? ;
} ) ? {
if bytes . len ( ) + chunk . len ( ) > MAX_FEED_BODY_SIZE {
tracing ::warn ! ( url = feed_url , "Feed body exceeds size limit during download" ) ;
return Ok ( Vec ::new ( ) ) ;
}
bytes . extend_from_slice ( & chunk ) ;
}
let body = bytes ;
let feed = feed_rs ::parser ::parse ( & body [ .. ] ) . map_err ( | e | {
tracing ::warn ! ( url = feed_url , error = % e , "Failed to parse feed" ) ;
@ -152,7 +174,7 @@ pub async fn discover_feed(
return None ;
}
let response = http_client
let mut response = http_client
. get ( source_url )
. send ( )
. await
@ -179,7 +201,26 @@ pub async fn discover_feed(
}
// For anything else (HTML or unknown content-type), try HTML link discovery
let body = response . text ( ) . await . ok ( ) ? ;
// Enforce body size limit
let content_length = response . content_length ( ) ;
if let Some ( len ) = content_length {
if len as usize > MAX_FEED_BODY_SIZE {
tracing ::warn ! ( url = source_url , size = len , "Source page exceeds size limit during feed discovery" ) ;
return None ;
}
}
let mut body_bytes = match content_length {
Some ( len ) = > Vec ::with_capacity ( len as usize ) ,
None = > Vec ::new ( ) ,
} ;
while let Some ( chunk ) = response . chunk ( ) . await . ok ( ) ? {
if body_bytes . len ( ) + chunk . len ( ) > MAX_FEED_BODY_SIZE {
tracing ::warn ! ( url = source_url , "Source page exceeds size limit during feed discovery" ) ;
return None ;
}
body_bytes . extend_from_slice ( & chunk ) ;
}
let body = String ::from_utf8_lossy ( & body_bytes ) . to_string ( ) ;
let document = scraper ::Html ::parse_document ( & body ) ;
let selector = scraper ::Selector ::parse ( r#"link[rel="alternate"]"# )
@ -751,4 +792,27 @@ mod tests {
// No feed found — pipeline would fall back to source_scraper
assert! ( matches! ( result , FeedResult ::NotFound ) ) ;
}
/// Checks that `parse_feed` yields no entries when the served body is one
/// byte larger than `MAX_FEED_BODY_SIZE`.
#[tokio::test]
async fn parse_feed_rejects_oversized_body() {
    skip_ssrf_for_test();
    let mock_server = MockServer::start().await;

    // Build a payload just past the 5 MB limit. The Content-Length header has
    // to agree with the real body length so that hyper does not panic; the
    // test leans on the early rejection that inspects content_length() before
    // any body bytes are read.
    let oversized_payload = vec![b'x'; MAX_FEED_BODY_SIZE + 1];
    let oversized_response = ResponseTemplate::new(200).set_body_bytes(oversized_payload);
    Mock::given(method("GET"))
        .respond_with(oversized_response)
        .mount(&mock_server)
        .await;

    let http_client = reqwest::Client::new();
    let feed_url = mock_server.uri();
    let entries = parse_feed(&http_client, &feed_url, 10).await.unwrap();
    assert!(entries.is_empty(), "Should reject oversized feed");
}
}