@ -222,6 +222,9 @@ async fn check_ssrf(url: &url::Url) -> Result<(), AppError> {
/// - ::1 (IPv6 loopback)
/// - ::1 (IPv6 loopback)
/// - :: (IPv6 unspecified)
/// - :: (IPv6 unspecified)
/// - fe80::/10 (IPv6 link-local)
/// - fe80::/10 (IPv6 link-local)
/// - fc00::/7 (IPv6 unique local addresses — ULA)
/// - 2001:db8::/32 (IPv6 documentation range)
/// - 100::/64 (IPv6 discard prefix)
fn is_private_ip ( ip : IpAddr ) -> bool {
fn is_private_ip ( ip : IpAddr ) -> bool {
match ip {
match ip {
IpAddr ::V4 ( v4 ) = > {
IpAddr ::V4 ( v4 ) = > {
@ -231,10 +234,17 @@ fn is_private_ip(ip: IpAddr) -> bool {
| | v4 . is_unspecified ( ) // 0.0.0.0
| | v4 . is_unspecified ( ) // 0.0.0.0
}
}
IpAddr ::V6 ( v6 ) = > {
IpAddr ::V6 ( v6 ) = > {
let segments = v6 . segments ( ) ;
v6 . is_loopback ( ) // ::1
v6 . is_loopback ( ) // ::1
| | v6 . is_unspecified ( ) // ::
| | v6 . is_unspecified ( ) // ::
// fe80::/10 (link-local) — check the first 10 bits
// fe80::/10 (link-local)
| | ( v6 . segments ( ) [ 0 ] & 0xffc0 ) = = 0xfe80
| | ( segments [ 0 ] & 0xffc0 ) = = 0xfe80
// fc00::/7 (unique local addresses — ULA, equivalent to IPv4 private)
| | ( segments [ 0 ] & 0xfe00 ) = = 0xfc00
// 2001:db8::/32 (documentation range — should never appear in production)
| | ( segments [ 0 ] = = 0x2001 & & segments [ 1 ] = = 0x0db8 )
// 100::/64 (discard prefix, RFC 6666)
| | ( segments [ 0 ] = = 0x0100 & & segments [ 1 ] = = 0 & & segments [ 2 ] = = 0 & & segments [ 3 ] = = 0 )
}
}
}
}
}
}
@ -386,6 +396,9 @@ fn parse_date_string(s: &str) -> Option<DateTime<Utc>> {
/// elements, then collects all remaining text nodes, normalizes whitespace,
/// elements, then collects all remaining text nodes, normalizes whitespace,
/// and truncates to [`MAX_BODY_TEXT_CHARS`].
/// and truncates to [`MAX_BODY_TEXT_CHARS`].
fn extract_body_text ( doc : & Html ) -> String {
fn extract_body_text ( doc : & Html ) -> String {
use ego_tree ::NodeId ;
use scraper ::node ::Node ;
let body_sel = match Selector ::parse ( "body" ) {
let body_sel = match Selector ::parse ( "body" ) {
Ok ( sel ) = > sel ,
Ok ( sel ) = > sel ,
Err ( _ ) = > return String ::new ( ) ,
Err ( _ ) = > return String ::new ( ) ,
@ -407,23 +420,31 @@ fn extract_body_text(doc: &Html) -> String {
. filter_map ( | tag | Selector ::parse ( tag ) . ok ( ) )
. filter_map ( | tag | Selector ::parse ( tag ) . ok ( ) )
. collect ( ) ;
. collect ( ) ;
// Collect IDs of elements to exclude (and all their descendants)
// Collect node IDs of all excluded elements and their descendants
let mut excluded_ ids = std ::collections ::HashSet ::new ( ) ;
let mut excluded_ node_ ids = std ::collections ::HashSet ::< NodeId > ::new ( ) ;
for sel in & exclude_selectors {
for sel in & exclude_selectors {
for el in body . select ( sel ) {
for el in body . select ( sel ) {
excluded_ ids. insert ( el . id ( ) ) ;
excluded_ node_ ids. insert ( el . id ( ) ) ;
for descendant in el . descendants ( ) {
for descendant in el . descendants ( ) {
if let Some ( element_ref ) = scraper ::ElementRef ::wrap ( descendant ) {
excluded_node_ids . insert ( descendant . id ( ) ) ;
excluded_ids . insert ( element_ref . id ( ) ) ;
}
}
}
}
}
}
}
// Collect text from non-excluded nodes
// Walk all descendants of body and collect text from non-excluded nodes
let mut text_parts : Vec < & str > = Vec ::new ( ) ;
let mut text_parts : Vec < String > = Vec ::new ( ) ;
for text_node in body . text ( ) {
for descendant in body . descendants ( ) {
text_parts . push ( text_node ) ;
// Skip if this node (or an ancestor) was excluded
if excluded_node_ids . contains ( & descendant . id ( ) ) {
continue ;
}
// Only collect actual text nodes
if let Node ::Text ( text ) = descendant . value ( ) {
let t = text . text . trim ( ) ;
if ! t . is_empty ( ) {
text_parts . push ( t . to_string ( ) ) ;
}
}
}
}
// Join and normalize whitespace
// Join and normalize whitespace
@ -513,6 +534,34 @@ mod tests {
assert! ( is_private_ip ( ip ) ) ;
assert! ( is_private_ip ( ip ) ) ;
}
}
#[ test ]
fn test_ipv6_unique_local_rejected ( ) {
// fc00::1 — unique local address (ULA)
let ip = IpAddr ::V6 ( Ipv6Addr ::new ( 0xfc00 , 0 , 0 , 0 , 0 , 0 , 0 , 1 ) ) ;
assert! ( is_private_ip ( ip ) ) ;
}
#[ test ]
fn test_ipv6_unique_local_fd_rejected ( ) {
// fd12:3456::1 — also ULA (fd00::/8 is within fc00::/7)
let ip = IpAddr ::V6 ( Ipv6Addr ::new ( 0xfd12 , 0x3456 , 0 , 0 , 0 , 0 , 0 , 1 ) ) ;
assert! ( is_private_ip ( ip ) ) ;
}
#[ test ]
fn test_ipv6_documentation_rejected ( ) {
// 2001:db8::1 — documentation range
let ip = IpAddr ::V6 ( Ipv6Addr ::new ( 0x2001 , 0x0db8 , 0 , 0 , 0 , 0 , 0 , 1 ) ) ;
assert! ( is_private_ip ( ip ) ) ;
}
#[ test ]
fn test_ipv6_discard_rejected ( ) {
// 100::1 — discard prefix
let ip = IpAddr ::V6 ( Ipv6Addr ::new ( 0x0100 , 0 , 0 , 0 , 0 , 0 , 0 , 1 ) ) ;
assert! ( is_private_ip ( ip ) ) ;
}
#[ test ]
#[ test ]
fn test_public_ipv4_allowed ( ) {
fn test_public_ipv4_allowed ( ) {
let ip = IpAddr ::V4 ( Ipv4Addr ::new ( 8 , 8 , 8 , 8 ) ) ;
let ip = IpAddr ::V4 ( Ipv4Addr ::new ( 8 , 8 , 8 , 8 ) ) ;
@ -697,9 +746,23 @@ mod tests {
let text = extract_body_text ( & doc ) ;
let text = extract_body_text ( & doc ) ;
assert! ( text . contains ( "Visible text" ) ) ;
assert! ( text . contains ( "Visible text" ) ) ;
assert! ( text . contains ( "More visible text" ) ) ;
assert! ( text . contains ( "More visible text" ) ) ;
// Script content will still appear because body.text() collects all text nodes.
assert! ( ! text . contains ( "hidden" ) , "Script content should be filtered out" ) ;
// The improved version should filter these, but the basic extraction
}
// still provides usable content.
#[ test ]
fn test_body_text_strips_nav_footer_aside ( ) {
let html = r #" < html > < head > < / head > < body >
< nav > < a href = "/" > Home < / a > < a href = "/about" > About < / a > < / nav >
< p > Article content here < / p >
< aside > Related links sidebar < / aside >
< footer > Copyright 2026 < / footer >
< / body > < / html > " #;
let doc = Html ::parse_document ( html ) ;
let text = extract_body_text ( & doc ) ;
assert! ( text . contains ( "Article content here" ) ) ;
assert! ( ! text . contains ( "Home" ) , "Nav content should be filtered out" ) ;
assert! ( ! text . contains ( "Related links" ) , "Aside content should be filtered out" ) ;
assert! ( ! text . contains ( "Copyright" ) , "Footer content should be filtered out" ) ;
}
}
#[ test ]
#[ test ]