|
|
|
|
@ -52,14 +52,52 @@ pub struct ScrapedContent {
|
|
|
|
|
|
|
|
|
|
/// Build a `reqwest::Client` configured for scraping.
|
|
|
|
|
///
|
|
|
|
|
/// Sets appropriate timeouts, redirect limits, and User-Agent.
|
|
|
|
|
/// This client should be stored in `AppState` and reused across requests.
|
|
|
|
|
/// Uses a custom redirect policy that validates each hop against private/internal
|
|
|
|
|
/// IP addresses (SSRF prevention). DNS is resolved synchronously in the redirect
|
|
|
|
|
/// handler via `std::net::ToSocketAddrs`. Max 3 redirects, only http/https schemes.
|
|
|
|
|
///
|
|
|
|
|
/// **Residual risk**: There is a theoretical TOCTOU gap between the DNS check in
|
|
|
|
|
/// the redirect policy and reqwest's actual TCP connection. DNS rebinding could
|
|
|
|
|
/// bypass the check. This is accepted as a known limitation.
|
|
|
|
|
pub fn build_scraper_client() -> Result<reqwest::Client, AppError> {
|
|
|
|
|
use std::net::ToSocketAddrs;
|
|
|
|
|
|
|
|
|
|
let redirect_policy = reqwest::redirect::Policy::custom(|attempt| {
|
|
|
|
|
if attempt.previous().len() >= 3 {
|
|
|
|
|
return attempt.error("Too many redirects");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
let url = attempt.url();
|
|
|
|
|
|
|
|
|
|
if url.scheme() != "http" && url.scheme() != "https" {
|
|
|
|
|
return attempt.error("Blocked redirect to non-HTTP scheme");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if let Some(host) = url.host_str() {
|
|
|
|
|
let port = url.port().unwrap_or(if url.scheme() == "https" { 443 } else { 80 });
|
|
|
|
|
let addr_str = format!("{}:{}", host, port);
|
|
|
|
|
match addr_str.to_socket_addrs() {
|
|
|
|
|
Ok(addrs) => {
|
|
|
|
|
for addr in addrs {
|
|
|
|
|
if is_private_ip(addr.ip()) {
|
|
|
|
|
return attempt.error("Blocked redirect to private/internal IP");
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
Err(_) => {
|
|
|
|
|
return attempt.error("DNS resolution failed for redirect target");
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
attempt.follow()
|
|
|
|
|
});
|
|
|
|
|
|
|
|
|
|
reqwest::Client::builder()
|
|
|
|
|
.user_agent(USER_AGENT)
|
|
|
|
|
.connect_timeout(std::time::Duration::from_secs(5))
|
|
|
|
|
.timeout(std::time::Duration::from_secs(15))
|
|
|
|
|
.redirect(reqwest::redirect::Policy::limited(3))
|
|
|
|
|
.redirect(redirect_policy)
|
|
|
|
|
.build()
|
|
|
|
|
.map_err(|e| AppError::Internal(anyhow::anyhow!("Failed to build scraper client: {}", e)))
|
|
|
|
|
}
|
|
|
|
|
|