perf: cache CSS selectors with LazyLock to avoid re-parsing

Replace runtime Selector::parse calls on static strings with module-level
LazyLock statics in source_scraper.rs (ANCHOR_SELECTOR) and scraper.rs
(SEL_TITLE, SEL_H1, SEL_BODY), so each selector is compiled once at
first use instead of on every function call. The og:title meta selector
in extract_page_title is not covered by this change and is still parsed
on each call.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
master
oabrivard 3 months ago
parent 69c1688bc7
commit 60494aeceb

@ -5,6 +5,7 @@
//! synthesis generation (Phase 5) to validate and enrich news articles.
use std::net::IpAddr;
use std::sync::LazyLock;
use chrono::{DateTime, NaiveDate, Utc};
use scraper::{Html, Selector};
@ -12,6 +13,10 @@ use serde::Serialize;
use crate::errors::AppError;
/// Selector for the `<title>` element, compiled on first use and then reused.
static SEL_TITLE: LazyLock<Selector> =
    LazyLock::new(|| Selector::parse("title").expect("`title` is a valid CSS selector"));
/// Selector for `<h1>` elements, compiled on first use and then reused.
static SEL_H1: LazyLock<Selector> =
    LazyLock::new(|| Selector::parse("h1").expect("`h1` is a valid CSS selector"));
/// Selector for the `<body>` element, compiled on first use and then reused.
static SEL_BODY: LazyLock<Selector> =
    LazyLock::new(|| Selector::parse("body").expect("`body` is a valid CSS selector"));
/// Custom User-Agent used for all scraper requests.
const USER_AGENT: &str = "AISynth/1.0 (+https://github.com/ai-synth)";
@ -329,16 +334,14 @@ fn is_private_ip(ip: IpAddr) -> bool {
/// Extract the page title using a priority chain: `<title>` -> `og:title` -> `<h1>` -> None.
fn extract_page_title(doc: &Html) -> Option<String> {
// 1. Try <title> element
if let Ok(sel) = Selector::parse("title") {
if let Some(title) = doc
.select(&sel)
.select(&SEL_TITLE)
.next()
.map(|el| el.text().collect::<String>().trim().to_string())
.filter(|t| !t.is_empty())
{
return Some(title);
}
}
// 2. Try <meta property="og:title">
if let Ok(sel) = Selector::parse(r#"meta[property="og:title"]"#) {
@ -354,16 +357,14 @@ fn extract_page_title(doc: &Html) -> Option<String> {
}
// 3. Try first <h1>
if let Ok(sel) = Selector::parse("h1") {
if let Some(h1) = doc
.select(&sel)
.select(&SEL_H1)
.next()
.map(|el| el.text().collect::<String>().trim().to_string())
.filter(|t| !t.is_empty())
{
return Some(h1);
}
}
None
}
@ -371,15 +372,15 @@ fn extract_page_title(doc: &Html) -> Option<String> {
/// Detect whether a page is a soft-404 by checking the page title
/// and first `<h1>` element for error keywords.
fn detect_soft_404(doc: &Html) -> bool {
let title_text = Selector::parse("title")
.ok()
.and_then(|sel| doc.select(&sel).next())
let title_text = doc
.select(&SEL_TITLE)
.next()
.map(|el| el.text().collect::<String>().to_lowercase())
.unwrap_or_default();
let h1_text = Selector::parse("h1")
.ok()
.and_then(|sel| doc.select(&sel).next())
let h1_text = doc
.select(&SEL_H1)
.next()
.map(|el| el.text().collect::<String>().to_lowercase())
.unwrap_or_default();
@ -607,12 +608,7 @@ fn extract_body_text(doc: &Html) -> String {
use ego_tree::NodeId;
use scraper::node::Node;
let body_sel = match Selector::parse("body") {
Ok(sel) => sel,
Err(_) => return String::new(),
};
let body = match doc.select(&body_sel).next() {
let body = match doc.select(&SEL_BODY).next() {
Some(b) => b,
None => return String::new(),
};

@ -3,7 +3,7 @@
//! Used in Phase 1 of the generation pipeline to discover articles
//! from user-configured sources before falling back to LLM web search.
use std::sync::Arc;
use std::sync::{Arc, LazyLock};
use crate::errors::AppError;
use crate::services::llm::LlmProvider;
use crate::services::llm::schema::build_link_extraction_schema;
@ -11,6 +11,8 @@ use crate::services::prompts::build_link_extraction_prompt;
use scraper::{Html, Selector};
use url::Url;
/// Selector for anchor elements that carry an `href` attribute, compiled on
/// first use and then reused.
static ANCHOR_SELECTOR: LazyLock<Selector> =
    LazyLock::new(|| Selector::parse("a[href]").expect("`a[href]` is a valid CSS selector"));
/// Patterns in URL paths that indicate non-article pages.
const EXCLUDED_PATH_PATTERNS: &[&str] = &[
"/tag", "/category", "/author", "/page", "/login", "/signup",
@ -77,11 +79,10 @@ pub fn extract_links_from_html(
base_domain: &str,
) -> Vec<String> {
let document = Html::parse_document(html);
let selector = Selector::parse("a[href]").unwrap();
let mut seen = std::collections::HashSet::new();
let mut links = Vec::new();
for element in document.select(&selector) {
for element in document.select(&ANCHOR_SELECTOR) {
if let Some(href) = element.value().attr("href") {
let resolved = match base_url.join(href) {
Ok(u) => u,
@ -134,10 +135,9 @@ pub fn extract_links_as_pairs(
) -> Vec<(String, String)> {
let base_domain = base_url.host_str().unwrap_or("").to_lowercase();
let document = Html::parse_document(html);
let selector = Selector::parse("a[href]").unwrap();
let mut pairs = Vec::new();
for element in document.select(&selector) {
for element in document.select(&ANCHOR_SELECTOR) {
if let Some(href) = element.value().attr("href") {
let resolved = match base_url.join(href) {
Ok(u) => u,

Loading…
Cancel
Save