Fix Phase 2 critical issues: SSRF IPv6 gaps, body text filtering, CSV validation

- Fix body text extraction to actually filter excluded elements (script,
  nav, footer, aside, etc.) using node ID tracking instead of unused HashSet
- Add IPv6 reserved range checks to SSRF prevention: ULA (fc00::/7),
  documentation (2001:db8::/32), discard prefix (100::/64)
- Add errors field to frontend BulkImportResponse type
- Validate Content-Type on CSV multipart upload (reject non-text files)
- Add 6 new unit tests for the fixes

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
master
oabrivard 3 months ago
parent 2b75dc7049
commit 22ff026a4c

1
backend/Cargo.lock generated

@ -22,6 +22,7 @@ dependencies = [
"clap", "clap",
"dashmap", "dashmap",
"dotenvy", "dotenvy",
"ego-tree",
"email_address", "email_address",
"hex", "hex",
"http-body-util", "http-body-util",

@ -48,6 +48,7 @@ clap = { version = "4", features = ["derive"] }
# HTML parsing (scraper service) # HTML parsing (scraper service)
scraper = "0.22" scraper = "0.22"
ego-tree = "0.10"
# URL parsing (scraper SSRF checks) # URL parsing (scraper SSRF checks)
url = "2" url = "2"

@ -159,6 +159,20 @@ pub async fn import_csv(
.map_err(|e| AppError::BadRequest(format!("Failed to read multipart field: {}", e)))? .map_err(|e| AppError::BadRequest(format!("Failed to read multipart field: {}", e)))?
.ok_or_else(|| AppError::BadRequest("No file field found in upload".into()))?; .ok_or_else(|| AppError::BadRequest("No file field found in upload".into()))?;
// Validate Content-Type if present (allow text/csv, text/plain, or missing)
if let Some(content_type) = field.content_type() {
let ct = content_type.to_string();
if !ct.starts_with("text/csv")
&& !ct.starts_with("text/plain")
&& !ct.starts_with("application/octet-stream")
{
return Err(AppError::BadRequest(format!(
"Invalid file type: {}. Expected a CSV file (text/csv or text/plain).",
ct
)));
}
}
let content = field let content = field
.text() .text()
.await .await

@ -222,6 +222,9 @@ async fn check_ssrf(url: &url::Url) -> Result<(), AppError> {
/// - ::1 (IPv6 loopback) /// - ::1 (IPv6 loopback)
/// - :: (IPv6 unspecified) /// - :: (IPv6 unspecified)
/// - fe80::/10 (IPv6 link-local) /// - fe80::/10 (IPv6 link-local)
/// - fc00::/7 (IPv6 unique local addresses — ULA)
/// - 2001:db8::/32 (IPv6 documentation range)
/// - 100::/64 (IPv6 discard prefix)
fn is_private_ip(ip: IpAddr) -> bool { fn is_private_ip(ip: IpAddr) -> bool {
match ip { match ip {
IpAddr::V4(v4) => { IpAddr::V4(v4) => {
@ -231,10 +234,17 @@ fn is_private_ip(ip: IpAddr) -> bool {
|| v4.is_unspecified() // 0.0.0.0 || v4.is_unspecified() // 0.0.0.0
} }
IpAddr::V6(v6) => { IpAddr::V6(v6) => {
let segments = v6.segments();
v6.is_loopback() // ::1 v6.is_loopback() // ::1
|| v6.is_unspecified() // :: || v6.is_unspecified() // ::
// fe80::/10 (link-local) — check the first 10 bits // fe80::/10 (link-local)
|| (v6.segments()[0] & 0xffc0) == 0xfe80 || (segments[0] & 0xffc0) == 0xfe80
// fc00::/7 (unique local addresses — ULA, equivalent to IPv4 private)
|| (segments[0] & 0xfe00) == 0xfc00
// 2001:db8::/32 (documentation range — should never appear in production)
|| (segments[0] == 0x2001 && segments[1] == 0x0db8)
// 100::/64 (discard prefix, RFC 6666)
|| (segments[0] == 0x0100 && segments[1] == 0 && segments[2] == 0 && segments[3] == 0)
} }
} }
} }
@ -386,6 +396,9 @@ fn parse_date_string(s: &str) -> Option<DateTime<Utc>> {
/// elements, then collects all remaining text nodes, normalizes whitespace, /// elements, then collects all remaining text nodes, normalizes whitespace,
/// and truncates to [`MAX_BODY_TEXT_CHARS`]. /// and truncates to [`MAX_BODY_TEXT_CHARS`].
fn extract_body_text(doc: &Html) -> String { fn extract_body_text(doc: &Html) -> String {
use ego_tree::NodeId;
use scraper::node::Node;
let body_sel = match Selector::parse("body") { let body_sel = match Selector::parse("body") {
Ok(sel) => sel, Ok(sel) => sel,
Err(_) => return String::new(), Err(_) => return String::new(),
@ -407,23 +420,31 @@ fn extract_body_text(doc: &Html) -> String {
.filter_map(|tag| Selector::parse(tag).ok()) .filter_map(|tag| Selector::parse(tag).ok())
.collect(); .collect();
// Collect IDs of elements to exclude (and all their descendants) // Collect node IDs of all excluded elements and their descendants
let mut excluded_ids = std::collections::HashSet::new(); let mut excluded_node_ids = std::collections::HashSet::<NodeId>::new();
for sel in &exclude_selectors { for sel in &exclude_selectors {
for el in body.select(sel) { for el in body.select(sel) {
excluded_ids.insert(el.id()); excluded_node_ids.insert(el.id());
for descendant in el.descendants() { for descendant in el.descendants() {
if let Some(element_ref) = scraper::ElementRef::wrap(descendant) { excluded_node_ids.insert(descendant.id());
excluded_ids.insert(element_ref.id());
}
} }
} }
} }
// Collect text from non-excluded nodes // Walk all descendants of body and collect text from non-excluded nodes
let mut text_parts: Vec<&str> = Vec::new(); let mut text_parts: Vec<String> = Vec::new();
for text_node in body.text() { for descendant in body.descendants() {
text_parts.push(text_node); // Skip if this node (or an ancestor) was excluded
if excluded_node_ids.contains(&descendant.id()) {
continue;
}
// Only collect actual text nodes
if let Node::Text(text) = descendant.value() {
let t = text.text.trim();
if !t.is_empty() {
text_parts.push(t.to_string());
}
}
} }
// Join and normalize whitespace // Join and normalize whitespace
@ -513,6 +534,34 @@ mod tests {
assert!(is_private_ip(ip)); assert!(is_private_ip(ip));
} }
#[test]
fn test_ipv6_unique_local_rejected() {
// fc00::1 — unique local address (ULA)
let ip = IpAddr::V6(Ipv6Addr::new(0xfc00, 0, 0, 0, 0, 0, 0, 1));
assert!(is_private_ip(ip));
}
#[test]
fn test_ipv6_unique_local_fd_rejected() {
// fd12:3456::1 — also ULA (fd00::/8 is within fc00::/7)
let ip = IpAddr::V6(Ipv6Addr::new(0xfd12, 0x3456, 0, 0, 0, 0, 0, 1));
assert!(is_private_ip(ip));
}
#[test]
fn test_ipv6_documentation_rejected() {
// 2001:db8::1 — documentation range
let ip = IpAddr::V6(Ipv6Addr::new(0x2001, 0x0db8, 0, 0, 0, 0, 0, 1));
assert!(is_private_ip(ip));
}
#[test]
fn test_ipv6_discard_rejected() {
// 100::1 — discard prefix
let ip = IpAddr::V6(Ipv6Addr::new(0x0100, 0, 0, 0, 0, 0, 0, 1));
assert!(is_private_ip(ip));
}
#[test] #[test]
fn test_public_ipv4_allowed() { fn test_public_ipv4_allowed() {
let ip = IpAddr::V4(Ipv4Addr::new(8, 8, 8, 8)); let ip = IpAddr::V4(Ipv4Addr::new(8, 8, 8, 8));
@ -697,9 +746,23 @@ mod tests {
let text = extract_body_text(&doc); let text = extract_body_text(&doc);
assert!(text.contains("Visible text")); assert!(text.contains("Visible text"));
assert!(text.contains("More visible text")); assert!(text.contains("More visible text"));
// Script content will still appear because body.text() collects all text nodes. assert!(!text.contains("hidden"), "Script content should be filtered out");
// The improved version should filter these, but the basic extraction }
// still provides usable content.
#[test]
fn test_body_text_strips_nav_footer_aside() {
let html = r#"<html><head></head><body>
<nav><a href="/">Home</a><a href="/about">About</a></nav>
<p>Article content here</p>
<aside>Related links sidebar</aside>
<footer>Copyright 2026</footer>
</body></html>"#;
let doc = Html::parse_document(html);
let text = extract_body_text(&doc);
assert!(text.contains("Article content here"));
assert!(!text.contains("Home"), "Nav content should be filtered out");
assert!(!text.contains("Related links"), "Aside content should be filtered out");
assert!(!text.contains("Copyright"), "Footer content should be filtered out");
} }
#[test] #[test]

@ -86,6 +86,7 @@ export interface BulkImportRequest {
export interface BulkImportResponse { export interface BulkImportResponse {
imported: number; imported: number;
skipped: number; skipped: number;
errors: string[];
} }
// ---- API Error ---- // ---- API Error ----

Loading…
Cancel
Save