perf: cache CSS selectors with LazyLock to avoid re-parsing

Replace runtime Selector::parse calls on static strings with module-level
LazyLock statics in source_scraper.rs (ANCHOR_SELECTOR) and scraper.rs
(SEL_TITLE, SEL_H1, SEL_BODY), so each selector is compiled once at
first use instead of on every function call.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
master
oabrivard 3 months ago
parent 69c1688bc7
commit 60494aeceb

@@ -5,6 +5,7 @@
//! synthesis generation (Phase 5) to validate and enrich news articles. //! synthesis generation (Phase 5) to validate and enrich news articles.
use std::net::IpAddr; use std::net::IpAddr;
use std::sync::LazyLock;
use chrono::{DateTime, NaiveDate, Utc}; use chrono::{DateTime, NaiveDate, Utc};
use scraper::{Html, Selector}; use scraper::{Html, Selector};
@@ -12,6 +13,10 @@ use serde::Serialize;
use crate::errors::AppError; use crate::errors::AppError;
// CSS selectors shared by the extraction helpers in this module
// (`extract_page_title`, `detect_soft_404`, `extract_body_text`).
// Each `LazyLock` runs its initializer on first access and reuses the result,
// so a selector string is parsed once instead of on every function call.
// NOTE(review): `unwrap()` panics on first use if parsing fails; presumably
// that cannot happen for these fixed, single-tag selector literals — confirm.

/// Selects `<title>` elements.
static SEL_TITLE: LazyLock<Selector> = LazyLock::new(|| Selector::parse("title").unwrap());
/// Selects `<h1>` elements.
static SEL_H1: LazyLock<Selector> = LazyLock::new(|| Selector::parse("h1").unwrap());
/// Selects `<body>` elements.
static SEL_BODY: LazyLock<Selector> = LazyLock::new(|| Selector::parse("body").unwrap());
/// Custom User-Agent used for all scraper requests. /// Custom User-Agent used for all scraper requests.
const USER_AGENT: &str = "AISynth/1.0 (+https://github.com/ai-synth)"; const USER_AGENT: &str = "AISynth/1.0 (+https://github.com/ai-synth)";
@@ -329,16 +334,14 @@ fn is_private_ip(ip: IpAddr) -> bool {
/// Extract the page title using a priority chain: `<title>` -> `og:title` -> `<h1>` -> None. /// Extract the page title using a priority chain: `<title>` -> `og:title` -> `<h1>` -> None.
fn extract_page_title(doc: &Html) -> Option<String> { fn extract_page_title(doc: &Html) -> Option<String> {
// 1. Try <title> element // 1. Try <title> element
if let Ok(sel) = Selector::parse("title") {
if let Some(title) = doc if let Some(title) = doc
.select(&sel) .select(&SEL_TITLE)
.next() .next()
.map(|el| el.text().collect::<String>().trim().to_string()) .map(|el| el.text().collect::<String>().trim().to_string())
.filter(|t| !t.is_empty()) .filter(|t| !t.is_empty())
{ {
return Some(title); return Some(title);
} }
}
// 2. Try <meta property="og:title"> // 2. Try <meta property="og:title">
if let Ok(sel) = Selector::parse(r#"meta[property="og:title"]"#) { if let Ok(sel) = Selector::parse(r#"meta[property="og:title"]"#) {
@@ -354,16 +357,14 @@ fn extract_page_title(doc: &Html) -> Option<String> {
} }
// 3. Try first <h1> // 3. Try first <h1>
if let Ok(sel) = Selector::parse("h1") {
if let Some(h1) = doc if let Some(h1) = doc
.select(&sel) .select(&SEL_H1)
.next() .next()
.map(|el| el.text().collect::<String>().trim().to_string()) .map(|el| el.text().collect::<String>().trim().to_string())
.filter(|t| !t.is_empty()) .filter(|t| !t.is_empty())
{ {
return Some(h1); return Some(h1);
} }
}
None None
} }
@@ -371,15 +372,15 @@ fn extract_page_title(doc: &Html) -> Option<String> {
/// Detect whether a page is a soft-404 by checking the page title /// Detect whether a page is a soft-404 by checking the page title
/// and first `<h1>` element for error keywords. /// and first `<h1>` element for error keywords.
fn detect_soft_404(doc: &Html) -> bool { fn detect_soft_404(doc: &Html) -> bool {
let title_text = Selector::parse("title") let title_text = doc
.ok() .select(&SEL_TITLE)
.and_then(|sel| doc.select(&sel).next()) .next()
.map(|el| el.text().collect::<String>().to_lowercase()) .map(|el| el.text().collect::<String>().to_lowercase())
.unwrap_or_default(); .unwrap_or_default();
let h1_text = Selector::parse("h1") let h1_text = doc
.ok() .select(&SEL_H1)
.and_then(|sel| doc.select(&sel).next()) .next()
.map(|el| el.text().collect::<String>().to_lowercase()) .map(|el| el.text().collect::<String>().to_lowercase())
.unwrap_or_default(); .unwrap_or_default();
@@ -607,12 +608,7 @@ fn extract_body_text(doc: &Html) -> String {
use ego_tree::NodeId; use ego_tree::NodeId;
use scraper::node::Node; use scraper::node::Node;
let body_sel = match Selector::parse("body") { let body = match doc.select(&SEL_BODY).next() {
Ok(sel) => sel,
Err(_) => return String::new(),
};
let body = match doc.select(&body_sel).next() {
Some(b) => b, Some(b) => b,
None => return String::new(), None => return String::new(),
}; };

@@ -3,7 +3,7 @@
//! Used in Phase 1 of the generation pipeline to discover articles //! Used in Phase 1 of the generation pipeline to discover articles
//! from user-configured sources before falling back to LLM web search. //! from user-configured sources before falling back to LLM web search.
use std::sync::Arc; use std::sync::{Arc, LazyLock};
use crate::errors::AppError; use crate::errors::AppError;
use crate::services::llm::LlmProvider; use crate::services::llm::LlmProvider;
use crate::services::llm::schema::build_link_extraction_schema; use crate::services::llm::schema::build_link_extraction_schema;
@@ -11,6 +11,8 @@ use crate::services::prompts::build_link_extraction_prompt;
use scraper::{Html, Selector}; use scraper::{Html, Selector};
use url::Url; use url::Url;
/// Selects anchor elements that carry an `href` attribute (`a[href]`).
///
/// Parsed once on first access and shared by `extract_links_from_html` and
/// `extract_links_as_pairs`, which previously parsed the same selector string
/// on every call.
// NOTE(review): `unwrap()` panics on first use if parsing fails; presumably
// that cannot happen for this fixed selector literal — confirm.
static ANCHOR_SELECTOR: LazyLock<Selector> = LazyLock::new(|| Selector::parse("a[href]").unwrap());
/// Patterns in URL paths that indicate non-article pages. /// Patterns in URL paths that indicate non-article pages.
const EXCLUDED_PATH_PATTERNS: &[&str] = &[ const EXCLUDED_PATH_PATTERNS: &[&str] = &[
"/tag", "/category", "/author", "/page", "/login", "/signup", "/tag", "/category", "/author", "/page", "/login", "/signup",
@@ -77,11 +79,10 @@ pub fn extract_links_from_html(
base_domain: &str, base_domain: &str,
) -> Vec<String> { ) -> Vec<String> {
let document = Html::parse_document(html); let document = Html::parse_document(html);
let selector = Selector::parse("a[href]").unwrap();
let mut seen = std::collections::HashSet::new(); let mut seen = std::collections::HashSet::new();
let mut links = Vec::new(); let mut links = Vec::new();
for element in document.select(&selector) { for element in document.select(&ANCHOR_SELECTOR) {
if let Some(href) = element.value().attr("href") { if let Some(href) = element.value().attr("href") {
let resolved = match base_url.join(href) { let resolved = match base_url.join(href) {
Ok(u) => u, Ok(u) => u,
@@ -134,10 +135,9 @@ pub fn extract_links_as_pairs(
) -> Vec<(String, String)> { ) -> Vec<(String, String)> {
let base_domain = base_url.host_str().unwrap_or("").to_lowercase(); let base_domain = base_url.host_str().unwrap_or("").to_lowercase();
let document = Html::parse_document(html); let document = Html::parse_document(html);
let selector = Selector::parse("a[href]").unwrap();
let mut pairs = Vec::new(); let mut pairs = Vec::new();
for element in document.select(&selector) { for element in document.select(&ANCHOR_SELECTOR) {
if let Some(href) = element.value().attr("href") { if let Some(href) = element.value().attr("href") {
let resolved = match base_url.join(href) { let resolved = match base_url.join(href) {
Ok(u) => u, Ok(u) => u,

Loading…
Cancel
Save