perf: cache CSS selectors with LazyLock to avoid re-parsing

Replace runtime Selector::parse calls on static strings with module-level LazyLock statics in source_scraper.rs (ANCHOR_SELECTOR) and scraper.rs (SEL_TITLE, SEL_H1, SEL_BODY), so each selector is compiled once at first use instead of on every function call.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
3 months ago · 60494aeceb
parent 69c1688bc7
commit 60494aeceb
2 changed files with 31 additions and 35 deletions
--- a/backend/src/services/scraper.rs
+++ b/backend/src/services/scraper.rs
@ -5,6 +5,7 @@
 //! synthesis generation (Phase 5) to validate and enrich news articles.
 use std::net::IpAddr;
+use std::sync::LazyLock;
 use chrono::{DateTime, NaiveDate, Utc};
 use scraper::{Html, Selector};
@ -12,6 +13,10 @@ use serde::Serialize;
 use crate::errors::AppError;
+static SEL_TITLE: LazyLock<Selector> = LazyLock::new(|| Selector::parse("title").unwrap());
+static SEL_H1: LazyLock<Selector> = LazyLock::new(|| Selector::parse("h1").unwrap());
+static SEL_BODY: LazyLock<Selector> = LazyLock::new(|| Selector::parse("body").unwrap());
 /// Custom User-Agent used for all scraper requests.
 const USER_AGENT: &str = "AISynth/1.0 (+https://github.com/ai-synth)";
@ -329,16 +334,14 @@ fn is_private_ip(ip: IpAddr) -> bool {
 /// Extract the page title using a priority chain: `<title>` -> `og:title` -> `<h1>` -> None.
 fn extract_page_title(doc: &Html) -> Option<String> {
    // 1. Try <title> element
-    if let Ok(sel) = Selector::parse("title") {
    if let Some(title) = doc
-            .select(&sel)
+        .select(&SEL_TITLE)
        .next()
        .map(|el| el.text().collect::<String>().trim().to_string())
        .filter(|t| !t.is_empty())
    {
        return Some(title);
    }
-    }
    // 2. Try <meta property="og:title">
    if let Ok(sel) = Selector::parse(r#"meta[property="og:title"]"#) {
@ -354,16 +357,14 @@ fn extract_page_title(doc: &Html) -> Option<String> {
    }
    // 3. Try first <h1>
-    if let Ok(sel) = Selector::parse("h1") {
    if let Some(h1) = doc
-            .select(&sel)
+        .select(&SEL_H1)
        .next()
        .map(|el| el.text().collect::<String>().trim().to_string())
        .filter(|t| !t.is_empty())
    {
        return Some(h1);
    }
-    }
    None
 }
@ -371,15 +372,15 @@ fn extract_page_title(doc: &Html) -> Option<String> {
 /// Detect whether a page is a soft-404 by checking the page title
 /// and first `<h1>` element for error keywords.
 fn detect_soft_404(doc: &Html) -> bool {
-    let title_text = Selector::parse("title")
+    let title_text = doc
-        .ok()
+        .select(&SEL_TITLE)
-        .and_then(|sel| doc.select(&sel).next())
+        .next()
        .map(|el| el.text().collect::<String>().to_lowercase())
        .unwrap_or_default();
-    let h1_text = Selector::parse("h1")
+    let h1_text = doc
-        .ok()
+        .select(&SEL_H1)
-        .and_then(|sel| doc.select(&sel).next())
+        .next()
        .map(|el| el.text().collect::<String>().to_lowercase())
        .unwrap_or_default();
@ -607,12 +608,7 @@ fn extract_body_text(doc: &Html) -> String {
    use ego_tree::NodeId;
    use scraper::node::Node;
-    let body_sel = match Selector::parse("body") {
+    let body = match doc.select(&SEL_BODY).next() {
-        Ok(sel) => sel,
-        Err(_) => return String::new(),
-    };
-    let body = match doc.select(&body_sel).next() {
        Some(b) => b,
        None => return String::new(),
    };
--- a/backend/src/services/source_scraper.rs
+++ b/backend/src/services/source_scraper.rs
@ -3,7 +3,7 @@
 //! Used in Phase 1 of the generation pipeline to discover articles
 //! from user-configured sources before falling back to LLM web search.
-use std::sync::Arc;
+use std::sync::{Arc, LazyLock};
 use crate::errors::AppError;
 use crate::services::llm::LlmProvider;
 use crate::services::llm::schema::build_link_extraction_schema;
@ -11,6 +11,8 @@ use crate::services::prompts::build_link_extraction_prompt;
 use scraper::{Html, Selector};
 use url::Url;
+static ANCHOR_SELECTOR: LazyLock<Selector> = LazyLock::new(|| Selector::parse("a[href]").unwrap());
 /// Patterns in URL paths that indicate non-article pages.
 const EXCLUDED_PATH_PATTERNS: &[&str] = &[
    "/tag", "/category", "/author", "/page", "/login", "/signup",
@ -77,11 +79,10 @@ pub fn extract_links_from_html(
    base_domain: &str,
 ) -> Vec<String> {
    let document = Html::parse_document(html);
-    let selector = Selector::parse("a[href]").unwrap();
    let mut seen = std::collections::HashSet::new();
    let mut links = Vec::new();
-    for element in document.select(&selector) {
+    for element in document.select(&ANCHOR_SELECTOR) {
        if let Some(href) = element.value().attr("href") {
            let resolved = match base_url.join(href) {
                Ok(u) => u,
@ -134,10 +135,9 @@ pub fn extract_links_as_pairs(
 ) -> Vec<(String, String)> {
    let base_domain = base_url.host_str().unwrap_or("").to_lowercase();
    let document = Html::parse_document(html);
-    let selector = Selector::parse("a[href]").unwrap();
    let mut pairs = Vec::new();
-    for element in document.select(&selector) {
+    for element in document.select(&ANCHOR_SELECTOR) {
        if let Some(href) = element.value().attr("href") {
            let resolved = match base_url.join(href) {
                Ok(u) => u,