@ -5,6 +5,7 @@
//! synthesis generation (Phase 5) to validate and enrich news articles.
//! synthesis generation (Phase 5) to validate and enrich news articles.
use std ::net ::IpAddr ;
use std ::net ::IpAddr ;
use std ::sync ::LazyLock ;
use chrono ::{ DateTime , NaiveDate , Utc } ;
use chrono ::{ DateTime , NaiveDate , Utc } ;
use scraper ::{ Html , Selector } ;
use scraper ::{ Html , Selector } ;
@ -12,6 +13,10 @@ use serde::Serialize;
use crate ::errors ::AppError ;
use crate ::errors ::AppError ;
/// Pre-parsed CSS selector matching the document `<title>` element.
///
/// Parsed lazily on first access and then reused, so callers do not
/// re-parse the selector string on every call.
static SEL_TITLE: LazyLock<Selector> =
    LazyLock::new(|| Selector::parse("title").expect("`title` is a valid CSS selector"));

/// Pre-parsed CSS selector matching `<h1>` elements.
///
/// Parsed lazily on first access and then reused.
static SEL_H1: LazyLock<Selector> =
    LazyLock::new(|| Selector::parse("h1").expect("`h1` is a valid CSS selector"));

/// Pre-parsed CSS selector matching the `<body>` element.
///
/// Parsed lazily on first access and then reused.
static SEL_BODY: LazyLock<Selector> =
    LazyLock::new(|| Selector::parse("body").expect("`body` is a valid CSS selector"));
/// Custom User-Agent used for all scraper requests.
/// Custom User-Agent used for all scraper requests.
const USER_AGENT : & str = "AISynth/1.0 (+https://github.com/ai-synth)" ;
const USER_AGENT : & str = "AISynth/1.0 (+https://github.com/ai-synth)" ;
@ -329,15 +334,13 @@ fn is_private_ip(ip: IpAddr) -> bool {
/// Extract the page title using a priority chain: `<title>` -> `og:title` -> `<h1>` -> None.
/// Extract the page title using a priority chain: `<title>` -> `og:title` -> `<h1>` -> None.
fn extract_page_title ( doc : & Html ) -> Option < String > {
fn extract_page_title ( doc : & Html ) -> Option < String > {
// 1. Try <title> element
// 1. Try <title> element
if let Ok ( sel ) = Selector ::parse ( "title" ) {
if let Some ( title ) = doc
if let Some ( title ) = doc
. select ( & SEL_TITLE )
. select ( & sel )
. next ( )
. next ( )
. map ( | el | el . text ( ) . collect ::< String > ( ) . trim ( ) . to_string ( ) )
. map ( | el | el . text ( ) . collect ::< String > ( ) . trim ( ) . to_string ( ) )
. filter ( | t | ! t . is_empty ( ) )
. filter ( | t | ! t . is_empty ( ) )
{
{
return Some ( title ) ;
return Some ( title ) ;
}
}
}
// 2. Try <meta property="og:title">
// 2. Try <meta property="og:title">
@ -354,15 +357,13 @@ fn extract_page_title(doc: &Html) -> Option<String> {
}
}
// 3. Try first <h1>
// 3. Try first <h1>
if let Ok ( sel ) = Selector ::parse ( "h1" ) {
if let Some ( h1 ) = doc
if let Some ( h1 ) = doc
. select ( & SEL_H1 )
. select ( & sel )
. next ( )
. next ( )
. map ( | el | el . text ( ) . collect ::< String > ( ) . trim ( ) . to_string ( ) )
. map ( | el | el . text ( ) . collect ::< String > ( ) . trim ( ) . to_string ( ) )
. filter ( | t | ! t . is_empty ( ) )
. filter ( | t | ! t . is_empty ( ) )
{
{
return Some ( h1 ) ;
return Some ( h1 ) ;
}
}
}
None
None
@ -371,15 +372,15 @@ fn extract_page_title(doc: &Html) -> Option<String> {
/// Detect whether a page is a soft-404 by checking the page title
/// Detect whether a page is a soft-404 by checking the page title
/// and first `<h1>` element for error keywords.
/// and first `<h1>` element for error keywords.
fn detect_soft_404 ( doc : & Html ) -> bool {
fn detect_soft_404 ( doc : & Html ) -> bool {
let title_text = Selector::parse ( "title" )
let title_text = doc
. ok( )
. select( & SEL_TITLE )
. and_then( | sel | doc . select ( & sel ) . next( ) )
. next( )
. map ( | el | el . text ( ) . collect ::< String > ( ) . to_lowercase ( ) )
. map ( | el | el . text ( ) . collect ::< String > ( ) . to_lowercase ( ) )
. unwrap_or_default ( ) ;
. unwrap_or_default ( ) ;
let h1_text = Selector::parse ( "h1" )
let h1_text = doc
. ok( )
. select( & SEL_H1 )
. and_then( | sel | doc . select ( & sel ) . next( ) )
. next( )
. map ( | el | el . text ( ) . collect ::< String > ( ) . to_lowercase ( ) )
. map ( | el | el . text ( ) . collect ::< String > ( ) . to_lowercase ( ) )
. unwrap_or_default ( ) ;
. unwrap_or_default ( ) ;
@ -607,12 +608,7 @@ fn extract_body_text(doc: &Html) -> String {
use ego_tree ::NodeId ;
use ego_tree ::NodeId ;
use scraper ::node ::Node ;
use scraper ::node ::Node ;
let body_sel = match Selector ::parse ( "body" ) {
let body = match doc . select ( & SEL_BODY ) . next ( ) {
Ok ( sel ) = > sel ,
Err ( _ ) = > return String ::new ( ) ,
} ;
let body = match doc . select ( & body_sel ) . next ( ) {
Some ( b ) = > b ,
Some ( b ) = > b ,
None = > return String ::new ( ) ,
None = > return String ::new ( ) ,
} ;
} ;