diff --git a/backend/src/services/mod.rs b/backend/src/services/mod.rs index 1956d4d..603e5ab 100644 --- a/backend/src/services/mod.rs +++ b/backend/src/services/mod.rs @@ -11,6 +11,7 @@ pub mod prompts; pub mod rate_limiter; pub mod scheduler; pub mod scraper; +pub mod site_search; pub mod source_scraper; pub mod synthesis; pub mod turnstile; diff --git a/backend/src/services/site_search.rs b/backend/src/services/site_search.rs new file mode 100644 index 0000000..5ef4412 --- /dev/null +++ b/backend/src/services/site_search.rs @@ -0,0 +1,134 @@ +//! Site-scoped search fallback service. +//! +//! When a personalized source yields 0 links from RSS + HTML extraction, +//! this service searches `site:{domain} {theme}` via Brave Search API +//! or LLM websearch to discover articles from that source. + +use std::sync::Arc; + +use crate::services::llm::LlmProvider; + +/// Configuration for a site-scoped search. +pub struct SiteSearchConfig { + pub domain: String, + pub theme: String, + pub max_results: usize, + pub max_age_days: i32, +} + +/// Provider for executing the site-scoped search. +pub enum SiteSearchProvider { + /// Use the Brave Search API. + Brave { api_key: String }, + /// Use an LLM with websearch capabilities. + Llm { + provider: Arc, + model: String, + }, +} + +/// Execute a site-scoped search, returning article URLs. +/// +/// Searches `site:{domain} {theme}` via the configured provider. +/// Returns an empty Vec on failure (silent fallback — this is a last-resort strategy). +pub async fn search( + http_client: &reqwest::Client, + config: &SiteSearchConfig, + provider: &SiteSearchProvider, +) -> Vec { + match provider { + SiteSearchProvider::Brave { api_key } => { + search_brave(http_client, config, api_key).await + } + SiteSearchProvider::Llm { + provider: llm, + model, + } => search_llm(config, llm, model).await, + } +} + +/// Brave Search path: query `site:{domain} {theme}` via the Brave API. +async fn search_brave( + http_client: &reqwest::Client, + config: &SiteSearchConfig, + api_key: &str, +) -> Vec { + let query = format!("site:{} {}", config.domain, config.theme); + + let results = match crate::services::brave_search::search( + http_client, + api_key, + &query, + config.max_results as u32, + config.max_age_days, + ) + .await + { + Ok(results) => results, + Err(e) => { + tracing::warn!( + domain = %config.domain, + error = %e, + "Site search fallback (Brave) failed" + ); + return Vec::new(); + } + }; + + let urls: Vec = results + .into_iter() + .filter(|r| url_matches_domain(&r.url, &config.domain)) + .map(|r| r.url) + .collect(); + + tracing::info!( + domain = %config.domain, + results = urls.len(), + "Site search fallback (Brave) completed" + ); + + urls +} + +/// Check if a URL belongs to the expected domain. +fn url_matches_domain(url: &str, expected_domain: &str) -> bool { + url::Url::parse(url) + .ok() + .and_then(|u| u.host_str().map(|h| h.to_lowercase())) + .map(|host| host == expected_domain || host.ends_with(&format!(".{}", expected_domain))) + .unwrap_or(false) +} + +// Placeholder for LLM path (Task 2) +async fn search_llm( + _config: &SiteSearchConfig, + _provider: &Arc, + _model: &str, +) -> Vec { + Vec::new() +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn url_matches_domain_exact() { + assert!(url_matches_domain("https://korben.info/article", "korben.info")); + } + + #[test] + fn url_matches_domain_subdomain() { + assert!(url_matches_domain("https://www.korben.info/article", "korben.info")); + } + + #[test] + fn url_matches_domain_mismatch() { + assert!(!url_matches_domain("https://evil.com/korben.info", "korben.info")); + } + + #[test] + fn url_matches_domain_invalid_url() { + assert!(!url_matches_domain("not a url", "korben.info")); + } +}