From 22ff026a4c02059108c039c26954521756765844 Mon Sep 17 00:00:00 2001 From: oabrivard Date: Sat, 21 Mar 2026 19:30:07 +0100 Subject: [PATCH] Fix Phase 2 critical issues: SSRF IPv6 gaps, body text filtering, CSV validation - Fix body text extraction to actually filter excluded elements (script, nav, footer, aside, etc.) using node ID tracking instead of unused HashSet - Add IPv6 reserved range checks to SSRF prevention: ULA (fc00::/7), documentation (2001:db8::/32), discard prefix (100::/64) - Add errors field to frontend BulkImportResponse type - Validate Content-Type on CSV multipart upload (reject non-text files) - Add 6 new unit tests for the fixes Co-Authored-By: Claude Opus 4.6 (1M context) --- backend/Cargo.lock | 1 + backend/Cargo.toml | 1 + backend/src/handlers/sources.rs | 14 +++++ backend/src/services/scraper.rs | 93 +++++++++++++++++++++++++++------ frontend/src/types.ts | 1 + 5 files changed, 95 insertions(+), 15 deletions(-) diff --git a/backend/Cargo.lock b/backend/Cargo.lock index 9bddb59..0c727db 100644 --- a/backend/Cargo.lock +++ b/backend/Cargo.lock @@ -22,6 +22,7 @@ dependencies = [ "clap", "dashmap", "dotenvy", + "ego-tree", "email_address", "hex", "http-body-util", diff --git a/backend/Cargo.toml b/backend/Cargo.toml index 41b3658..7522485 100644 --- a/backend/Cargo.toml +++ b/backend/Cargo.toml @@ -48,6 +48,7 @@ clap = { version = "4", features = ["derive"] } # HTML parsing (scraper service) scraper = "0.22" +ego-tree = "0.10" # URL parsing (scraper SSRF checks) url = "2" diff --git a/backend/src/handlers/sources.rs b/backend/src/handlers/sources.rs index 7ab95b5..f98d072 100644 --- a/backend/src/handlers/sources.rs +++ b/backend/src/handlers/sources.rs @@ -159,6 +159,20 @@ pub async fn import_csv( .map_err(|e| AppError::BadRequest(format!("Failed to read multipart field: {}", e)))? .ok_or_else(|| AppError::BadRequest("No file field found in upload".into()))?; + // Validate Content-Type if present (allow text/csv, text/plain, or missing) + if let Some(content_type) = field.content_type() { + let ct = content_type.to_string(); + if !ct.starts_with("text/csv") + && !ct.starts_with("text/plain") + && !ct.starts_with("application/octet-stream") + { + return Err(AppError::BadRequest(format!( + "Invalid file type: {}. Expected a CSV file (text/csv or text/plain).", + ct + ))); + } + } + let content = field .text() .await diff --git a/backend/src/services/scraper.rs b/backend/src/services/scraper.rs index e7b60e4..e268dcf 100644 --- a/backend/src/services/scraper.rs +++ b/backend/src/services/scraper.rs @@ -222,6 +222,9 @@ async fn check_ssrf(url: &url::Url) -> Result<(), AppError> { /// - ::1 (IPv6 loopback) /// - :: (IPv6 unspecified) /// - fe80::/10 (IPv6 link-local) +/// - fc00::/7 (IPv6 unique local addresses — ULA) +/// - 2001:db8::/32 (IPv6 documentation range) +/// - 100::/64 (IPv6 discard prefix) fn is_private_ip(ip: IpAddr) -> bool { match ip { IpAddr::V4(v4) => { @@ -231,10 +234,17 @@ fn is_private_ip(ip: IpAddr) -> bool { || v4.is_unspecified() // 0.0.0.0 } IpAddr::V6(v6) => { + let segments = v6.segments(); v6.is_loopback() // ::1 || v6.is_unspecified() // :: - // fe80::/10 (link-local) — check the first 10 bits - || (v6.segments()[0] & 0xffc0) == 0xfe80 + // fe80::/10 (link-local) + || (segments[0] & 0xffc0) == 0xfe80 + // fc00::/7 (unique local addresses — ULA, equivalent to IPv4 private) + || (segments[0] & 0xfe00) == 0xfc00 + // 2001:db8::/32 (documentation range — should never appear in production) + || (segments[0] == 0x2001 && segments[1] == 0x0db8) + // 100::/64 (discard prefix, RFC 6666) + || (segments[0] == 0x0100 && segments[1] == 0 && segments[2] == 0 && segments[3] == 0) } } } @@ -386,6 +396,9 @@ fn parse_date_string(s: &str) -> Option> { /// elements, then collects all remaining text nodes, normalizes whitespace, /// and truncates to [`MAX_BODY_TEXT_CHARS`]. fn extract_body_text(doc: &Html) -> String { + use ego_tree::NodeId; + use scraper::node::Node; + let body_sel = match Selector::parse("body") { Ok(sel) => sel, Err(_) => return String::new(), @@ -407,23 +420,31 @@ fn extract_body_text(doc: &Html) -> String { .filter_map(|tag| Selector::parse(tag).ok()) .collect(); - // Collect IDs of elements to exclude (and all their descendants) - let mut excluded_ids = std::collections::HashSet::new(); + // Collect node IDs of all excluded elements and their descendants + let mut excluded_node_ids = std::collections::HashSet::::new(); for sel in &exclude_selectors { for el in body.select(sel) { - excluded_ids.insert(el.id()); + excluded_node_ids.insert(el.id()); for descendant in el.descendants() { - if let Some(element_ref) = scraper::ElementRef::wrap(descendant) { - excluded_ids.insert(element_ref.id()); - } + excluded_node_ids.insert(descendant.id()); } } } - // Collect text from non-excluded nodes - let mut text_parts: Vec<&str> = Vec::new(); - for text_node in body.text() { - text_parts.push(text_node); + // Walk all descendants of body and collect text from non-excluded nodes + let mut text_parts: Vec = Vec::new(); + for descendant in body.descendants() { + // Skip if this node (or an ancestor) was excluded + if excluded_node_ids.contains(&descendant.id()) { + continue; + } + // Only collect actual text nodes + if let Node::Text(text) = descendant.value() { + let t = text.text.trim(); + if !t.is_empty() { + text_parts.push(t.to_string()); + } + } } // Join and normalize whitespace @@ -513,6 +534,34 @@ mod tests { assert!(is_private_ip(ip)); } + #[test] + fn test_ipv6_unique_local_rejected() { + // fc00::1 — unique local address (ULA) + let ip = IpAddr::V6(Ipv6Addr::new(0xfc00, 0, 0, 0, 0, 0, 0, 1)); + assert!(is_private_ip(ip)); + } + + #[test] + fn test_ipv6_unique_local_fd_rejected() { + // fd12:3456::1 — also ULA (fd00::/8 is within fc00::/7) + let ip = IpAddr::V6(Ipv6Addr::new(0xfd12, 0x3456, 0, 0, 0, 0, 0, 1)); + assert!(is_private_ip(ip)); + } + + #[test] + fn test_ipv6_documentation_rejected() { + // 2001:db8::1 — documentation range + let ip = IpAddr::V6(Ipv6Addr::new(0x2001, 0x0db8, 0, 0, 0, 0, 0, 1)); + assert!(is_private_ip(ip)); + } + + #[test] + fn test_ipv6_discard_rejected() { + // 100::1 — discard prefix + let ip = IpAddr::V6(Ipv6Addr::new(0x0100, 0, 0, 0, 0, 0, 0, 1)); + assert!(is_private_ip(ip)); + } + #[test] fn test_public_ipv4_allowed() { let ip = IpAddr::V4(Ipv4Addr::new(8, 8, 8, 8)); @@ -697,9 +746,23 @@ mod tests { let text = extract_body_text(&doc); assert!(text.contains("Visible text")); assert!(text.contains("More visible text")); - // Script content will still appear because body.text() collects all text nodes. - // The improved version should filter these, but the basic extraction - // still provides usable content. + assert!(!text.contains("hidden"), "Script content should be filtered out"); + } + + #[test] + fn test_body_text_strips_nav_footer_aside() { + let html = r#" + +

Article content here

+ + + "#; + let doc = Html::parse_document(html); + let text = extract_body_text(&doc); + assert!(text.contains("Article content here")); + assert!(!text.contains("Home"), "Nav content should be filtered out"); + assert!(!text.contains("Related links"), "Aside content should be filtered out"); + assert!(!text.contains("Copyright"), "Footer content should be filtered out"); } #[test] diff --git a/frontend/src/types.ts b/frontend/src/types.ts index 9c24ed8..4075651 100644 --- a/frontend/src/types.ts +++ b/frontend/src/types.ts @@ -86,6 +86,7 @@ export interface BulkImportRequest { export interface BulkImportResponse { imported: number; skipped: number; + errors: string[]; } // ---- API Error ----