diff --git a/backend/src/services/scraper.rs b/backend/src/services/scraper.rs
index d485317..41716de 100644
--- a/backend/src/services/scraper.rs
+++ b/backend/src/services/scraper.rs
@@ -52,14 +52,73 @@ pub struct ScrapedContent {
 
 /// Build a `reqwest::Client` configured for scraping.
 ///
-/// Sets appropriate timeouts, redirect limits, and User-Agent.
-/// This client should be stored in `AppState` and reused across requests.
+/// Installs the `USER_AGENT` value, a 5 s connect timeout, a 15 s request timeout,
+/// and a custom redirect policy that vets every redirect hop (SSRF prevention).
+/// A hop is rejected when:
+///
+/// * `attempt.previous()` already holds 3 or more URLs,
+/// * the target scheme is not `http` or `https`,
+/// * the target URL has no host or no usable port (fail closed),
+/// * the host cannot be resolved via `std::net::ToSocketAddrs`, or
+/// * any resolved address satisfies `is_private_ip`.
+///
+/// DNS is resolved synchronously inside the redirect handler.
+///
+/// # Errors
+///
+/// Returns `AppError::Internal` if the `reqwest` client cannot be built.
+///
+/// **Residual risk**: There is a theoretical TOCTOU gap between the DNS check in
+/// the redirect policy and reqwest's actual TCP connection. DNS rebinding could
+/// bypass the check. This is accepted as a known limitation.
 pub fn build_scraper_client() -> Result {
+    use std::net::ToSocketAddrs;
+
+    let redirect_policy = reqwest::redirect::Policy::custom(|attempt| {
+        // NOTE(review): `previous()` presumably includes the initial request URL, in
+        // which case fewer than 3 redirects are actually followed — confirm.
+        if attempt.previous().len() >= 3 {
+            return attempt.error("Too many redirects");
+        }
+
+        let url = attempt.url();
+
+        if url.scheme() != "http" && url.scheme() != "https" {
+            return attempt.error("Blocked redirect to non-HTTP scheme");
+        }
+
+        // Fail closed: a hop whose host or port cannot be determined is never followed.
+        let host = match url.host_str() {
+            Some(host) => host,
+            None => return attempt.error("Blocked redirect to URL without a host"),
+        };
+        let port = match url.port_or_known_default() {
+            Some(port) => port,
+            None => return attempt.error("Blocked redirect to URL without a port"),
+        };
+
+        // Resolve as a "host:port" string: `host_str()` is documented to keep the
+        // brackets around IPv6 literals, which the string form accepts.
+        // NOTE(review): this lookup blocks the calling thread — confirm that is
+        // acceptable where the redirect policy runs.
+        let target = format!("{}:{}", host, port);
+        match target.to_socket_addrs() {
+            Ok(mut addrs) => {
+                if addrs.any(|addr| is_private_ip(addr.ip())) {
+                    return attempt.error("Blocked redirect to private/internal IP");
+                }
+            }
+            Err(_) => return attempt.error("DNS resolution failed for redirect target"),
+        }
+
+        attempt.follow()
+    });
+
     reqwest::Client::builder()
         .user_agent(USER_AGENT)
         .connect_timeout(std::time::Duration::from_secs(5))
         .timeout(std::time::Duration::from_secs(15))
-        .redirect(reqwest::redirect::Policy::limited(3))
+        .redirect(redirect_policy)
         .build()
         .map_err(|e| AppError::Internal(anyhow::anyhow!("Failed to build scraper client: {}", e)))
 }