diff --git a/backend/src/services/scraper.rs b/backend/src/services/scraper.rs index 1ac6fe0..0c138bc 100644 --- a/backend/src/services/scraper.rs +++ b/backend/src/services/scraper.rs @@ -5,6 +5,7 @@ //! synthesis generation (Phase 5) to validate and enrich news articles. use std::net::IpAddr; +use std::sync::LazyLock; use chrono::{DateTime, NaiveDate, Utc}; use scraper::{Html, Selector}; @@ -12,6 +13,10 @@ use serde::Serialize; use crate::errors::AppError; +static SEL_TITLE: LazyLock<Selector> = LazyLock::new(|| Selector::parse("title").unwrap()); +static SEL_H1: LazyLock<Selector> = LazyLock::new(|| Selector::parse("h1").unwrap()); +static SEL_BODY: LazyLock<Selector> = LazyLock::new(|| Selector::parse("body").unwrap()); + /// Custom User-Agent used for all scraper requests. const USER_AGENT: &str = "AISynth/1.0 (+https://github.com/ai-synth)"; @@ -329,15 +334,13 @@ fn is_private_ip(ip: IpAddr) -> bool { /// Extract the page title using a priority chain: `<title>` -> `og:title` -> `<h1>` -> None. fn extract_page_title(doc: &Html) -> Option<String> { // 1. Try <title> element - if let Ok(sel) = Selector::parse("title") { - if let Some(title) = doc - .select(&sel) - .next() - .map(|el| el.text().collect::<String>().trim().to_string()) - .filter(|t| !t.is_empty()) - { - return Some(title); - } + if let Some(title) = doc + .select(&SEL_TITLE) + .next() + .map(|el| el.text().collect::<String>().trim().to_string()) + .filter(|t| !t.is_empty()) + { + return Some(title); } // 2. Try <meta property="og:title"> @@ -354,15 +357,13 @@ fn extract_page_title(doc: &Html) -> Option<String> { } // 3. 
Try first <h1> - if let Ok(sel) = Selector::parse("h1") { - if let Some(h1) = doc - .select(&sel) - .next() - .map(|el| el.text().collect::<String>().trim().to_string()) - .filter(|t| !t.is_empty()) - { - return Some(h1); - } + if let Some(h1) = doc + .select(&SEL_H1) + .next() + .map(|el| el.text().collect::<String>().trim().to_string()) + .filter(|t| !t.is_empty()) + { + return Some(h1); } None @@ -371,15 +372,15 @@ fn extract_page_title(doc: &Html) -> Option<String> { /// Detect whether a page is a soft-404 by checking the page title /// and first `<h1>` element for error keywords. fn detect_soft_404(doc: &Html) -> bool { - let title_text = Selector::parse("title") - .ok() - .and_then(|sel| doc.select(&sel).next()) + let title_text = doc + .select(&SEL_TITLE) + .next() .map(|el| el.text().collect::<String>().to_lowercase()) .unwrap_or_default(); - let h1_text = Selector::parse("h1") - .ok() - .and_then(|sel| doc.select(&sel).next()) + let h1_text = doc + .select(&SEL_H1) + .next() .map(|el| el.text().collect::<String>().to_lowercase()) .unwrap_or_default(); @@ -607,12 +608,7 @@ fn extract_body_text(doc: &Html) -> String { use ego_tree::NodeId; use scraper::node::Node; - let body_sel = match Selector::parse("body") { - Ok(sel) => sel, - Err(_) => return String::new(), - }; - - let body = match doc.select(&body_sel).next() { + let body = match doc.select(&SEL_BODY).next() { Some(b) => b, None => return String::new(), }; diff --git a/backend/src/services/source_scraper.rs b/backend/src/services/source_scraper.rs index 5caa0e7..883e554 100644 --- a/backend/src/services/source_scraper.rs +++ b/backend/src/services/source_scraper.rs @@ -3,7 +3,7 @@ //! Used in Phase 1 of the generation pipeline to discover articles //! from user-configured sources before falling back to LLM web search. 
-use std::sync::Arc; +use std::sync::{Arc, LazyLock}; use crate::errors::AppError; use crate::services::llm::LlmProvider; use crate::services::llm::schema::build_link_extraction_schema; @@ -11,6 +11,8 @@ use crate::services::prompts::build_link_extraction_prompt; use scraper::{Html, Selector}; use url::Url; +static ANCHOR_SELECTOR: LazyLock<Selector> = LazyLock::new(|| Selector::parse("a[href]").unwrap()); + /// Patterns in URL paths that indicate non-article pages. const EXCLUDED_PATH_PATTERNS: &[&str] = &[ "/tag", "/category", "/author", "/page", "/login", "/signup", @@ -77,11 +79,10 @@ pub fn extract_links_from_html( base_domain: &str, ) -> Vec<String> { let document = Html::parse_document(html); - let selector = Selector::parse("a[href]").unwrap(); let mut seen = std::collections::HashSet::new(); let mut links = Vec::new(); - for element in document.select(&selector) { + for element in document.select(&ANCHOR_SELECTOR) { if let Some(href) = element.value().attr("href") { let resolved = match base_url.join(href) { Ok(u) => u, @@ -134,10 +135,9 @@ pub fn extract_links_as_pairs( ) -> Vec<(String, String)> { let base_domain = base_url.host_str().unwrap_or("").to_lowercase(); let document = Html::parse_document(html); - let selector = Selector::parse("a[href]").unwrap(); let mut pairs = Vec::new(); - for element in document.select(&selector) { + for element in document.select(&ANCHOR_SELECTOR) { if let Some(href) = element.value().attr("href") { let resolved = match base_url.join(href) { Ok(u) => u,