From 22ff026a4c02059108c039c26954521756765844 Mon Sep 17 00:00:00 2001
From: oabrivard <olivier@abrivard.fr>
Date: Sat, 21 Mar 2026 19:30:07 +0100
Subject: [PATCH] Fix Phase 2 critical issues: SSRF IPv6 gaps, body text
 filtering, CSV validation

- Fix body text extraction to actually filter excluded elements (script,
  nav, footer, aside, etc.) using node ID tracking instead of unused HashSet
- Add IPv6 reserved range checks to SSRF prevention: ULA (fc00::/7),
  documentation (2001:db8::/32), discard prefix (100::/64)
- Add errors field to frontend BulkImportResponse type
- Validate Content-Type on CSV multipart upload (reject non-text files)
- Add 6 new unit tests for the fixes

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 backend/Cargo.lock              |  1 +
 backend/Cargo.toml              |  1 +
 backend/src/handlers/sources.rs | 14 +++++
 backend/src/services/scraper.rs | 93 +++++++++++++++++++++++++++------
 frontend/src/types.ts           |  1 +
 5 files changed, 95 insertions(+), 15 deletions(-)

diff --git a/backend/Cargo.lock b/backend/Cargo.lock
index 9bddb59..0c727db 100644
--- a/backend/Cargo.lock
+++ b/backend/Cargo.lock
@@ -22,6 +22,7 @@ dependencies = [
  "clap",
  "dashmap",
  "dotenvy",
+ "ego-tree",
  "email_address",
  "hex",
  "http-body-util",
diff --git a/backend/Cargo.toml b/backend/Cargo.toml
index 41b3658..7522485 100644
--- a/backend/Cargo.toml
+++ b/backend/Cargo.toml
@@ -48,6 +48,7 @@ clap = { version = "4", features = ["derive"] }
 
 # HTML parsing (scraper service)
 scraper = "0.22"
+ego-tree = "0.10"
 
 # URL parsing (scraper SSRF checks)
 url = "2"
diff --git a/backend/src/handlers/sources.rs b/backend/src/handlers/sources.rs
index 7ab95b5..f98d072 100644
--- a/backend/src/handlers/sources.rs
+++ b/backend/src/handlers/sources.rs
@@ -159,6 +159,20 @@ pub async fn import_csv(
         .map_err(|e| AppError::BadRequest(format!("Failed to read multipart field: {}", e)))?
         .ok_or_else(|| AppError::BadRequest("No file field found in upload".into()))?;
 
+    // Validate Content-Type if present (allow text/csv, text/plain, or missing)
+    if let Some(content_type) = field.content_type() {
+        let ct = content_type.to_string();
+        if !ct.starts_with("text/csv")
+            && !ct.starts_with("text/plain")
+            && !ct.starts_with("application/octet-stream")
+        {
+            return Err(AppError::BadRequest(format!(
+                "Invalid file type: {}. Expected a CSV file (text/csv or text/plain).",
+                ct
+            )));
+        }
+    }
+
     let content = field
         .text()
         .await
diff --git a/backend/src/services/scraper.rs b/backend/src/services/scraper.rs
index e7b60e4..e268dcf 100644
--- a/backend/src/services/scraper.rs
+++ b/backend/src/services/scraper.rs
@@ -222,6 +222,9 @@ async fn check_ssrf(url: &url::Url) -> Result<(), AppError> {
 /// - ::1 (IPv6 loopback)
 /// - :: (IPv6 unspecified)
 /// - fe80::/10 (IPv6 link-local)
+/// - fc00::/7 (IPv6 unique local addresses — ULA)
+/// - 2001:db8::/32 (IPv6 documentation range)
+/// - 100::/64 (IPv6 discard prefix)
 fn is_private_ip(ip: IpAddr) -> bool {
     match ip {
         IpAddr::V4(v4) => {
@@ -231,10 +234,17 @@ fn is_private_ip(ip: IpAddr) -> bool {
             || v4.is_unspecified()  // 0.0.0.0
         }
         IpAddr::V6(v6) => {
+            let segments = v6.segments();
             v6.is_loopback()        // ::1
             || v6.is_unspecified()  // ::
-            // fe80::/10 (link-local) — check the first 10 bits
-            || (v6.segments()[0] & 0xffc0) == 0xfe80
+            // fe80::/10 (link-local)
+            || (segments[0] & 0xffc0) == 0xfe80
+            // fc00::/7 (unique local addresses — ULA, equivalent to IPv4 private)
+            || (segments[0] & 0xfe00) == 0xfc00
+            // 2001:db8::/32 (documentation range — should never appear in production)
+            || (segments[0] == 0x2001 && segments[1] == 0x0db8)
+            // 100::/64 (discard prefix, RFC 6666)
+            || (segments[0] == 0x0100 && segments[1] == 0 && segments[2] == 0 && segments[3] == 0)
         }
     }
 }
@@ -386,6 +396,9 @@ fn parse_date_string(s: &str) -> Option<DateTime<Utc>> {
 /// elements, then collects all remaining text nodes, normalizes whitespace,
 /// and truncates to [`MAX_BODY_TEXT_CHARS`].
 fn extract_body_text(doc: &Html) -> String {
+    use ego_tree::NodeId;
+    use scraper::node::Node;
+
     let body_sel = match Selector::parse("body") {
         Ok(sel) => sel,
         Err(_) => return String::new(),
@@ -407,23 +420,31 @@ fn extract_body_text(doc: &Html) -> String {
         .filter_map(|tag| Selector::parse(tag).ok())
         .collect();
 
-    // Collect IDs of elements to exclude (and all their descendants)
-    let mut excluded_ids = std::collections::HashSet::new();
+    // Collect node IDs of all excluded elements and their descendants
+    let mut excluded_node_ids = std::collections::HashSet::<NodeId>::new();
     for sel in &exclude_selectors {
         for el in body.select(sel) {
-            excluded_ids.insert(el.id());
+            excluded_node_ids.insert(el.id());
             for descendant in el.descendants() {
-                if let Some(element_ref) = scraper::ElementRef::wrap(descendant) {
-                    excluded_ids.insert(element_ref.id());
-                }
+                excluded_node_ids.insert(descendant.id());
             }
         }
     }
 
-    // Collect text from non-excluded nodes
-    let mut text_parts: Vec<&str> = Vec::new();
-    for text_node in body.text() {
-        text_parts.push(text_node);
+    // Walk all descendants of body and collect text from non-excluded nodes
+    let mut text_parts: Vec<String> = Vec::new();
+    for descendant in body.descendants() {
+        // Skip if this node (or an ancestor) was excluded
+        if excluded_node_ids.contains(&descendant.id()) {
+            continue;
+        }
+        // Only collect actual text nodes
+        if let Node::Text(text) = descendant.value() {
+            let t = text.text.trim();
+            if !t.is_empty() {
+                text_parts.push(t.to_string());
+            }
+        }
     }
 
     // Join and normalize whitespace
@@ -513,6 +534,34 @@ mod tests {
         assert!(is_private_ip(ip));
     }
 
+    #[test]
+    fn test_ipv6_unique_local_rejected() {
+        // fc00::1 — unique local address (ULA)
+        let ip = IpAddr::V6(Ipv6Addr::new(0xfc00, 0, 0, 0, 0, 0, 0, 1));
+        assert!(is_private_ip(ip));
+    }
+
+    #[test]
+    fn test_ipv6_unique_local_fd_rejected() {
+        // fd12:3456::1 — also ULA (fd00::/8 is within fc00::/7)
+        let ip = IpAddr::V6(Ipv6Addr::new(0xfd12, 0x3456, 0, 0, 0, 0, 0, 1));
+        assert!(is_private_ip(ip));
+    }
+
+    #[test]
+    fn test_ipv6_documentation_rejected() {
+        // 2001:db8::1 — documentation range
+        let ip = IpAddr::V6(Ipv6Addr::new(0x2001, 0x0db8, 0, 0, 0, 0, 0, 1));
+        assert!(is_private_ip(ip));
+    }
+
+    #[test]
+    fn test_ipv6_discard_rejected() {
+        // 100::1 — discard prefix
+        let ip = IpAddr::V6(Ipv6Addr::new(0x0100, 0, 0, 0, 0, 0, 0, 1));
+        assert!(is_private_ip(ip));
+    }
+
     #[test]
     fn test_public_ipv4_allowed() {
         let ip = IpAddr::V4(Ipv4Addr::new(8, 8, 8, 8));
@@ -697,9 +746,23 @@ mod tests {
         let text = extract_body_text(&doc);
         assert!(text.contains("Visible text"));
         assert!(text.contains("More visible text"));
-        // Script content will still appear because body.text() collects all text nodes.
-        // The improved version should filter these, but the basic extraction
-        // still provides usable content.
+        assert!(!text.contains("hidden"), "Script content should be filtered out");
+    }
+
+    #[test]
+    fn test_body_text_strips_nav_footer_aside() {
+        let html = r#"<html><head></head><body>
+            <nav><a href="/">Home</a><a href="/about">About</a></nav>
+            <p>Article content here</p>
+            <aside>Related links sidebar</aside>
+            <footer>Copyright 2026</footer>
+        </body></html>"#;
+        let doc = Html::parse_document(html);
+        let text = extract_body_text(&doc);
+        assert!(text.contains("Article content here"));
+        assert!(!text.contains("Home"), "Nav content should be filtered out");
+        assert!(!text.contains("Related links"), "Aside content should be filtered out");
+        assert!(!text.contains("Copyright"), "Footer content should be filtered out");
     }
 
     #[test]
diff --git a/frontend/src/types.ts b/frontend/src/types.ts
index 9c24ed8..4075651 100644
--- a/frontend/src/types.ts
+++ b/frontend/src/types.ts
@@ -86,6 +86,7 @@ export interface BulkImportRequest {
 export interface BulkImportResponse {
   imported: number;
   skipped: number;
+  errors: string[];
 }
 
 // ---- API Error ----