feat: add site_search service with Brave path and domain filtering
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>master
parent
c45050ce3c
commit
a4f008bc42
@ -0,0 +1,134 @@
|
|||||||
|
//! Site-scoped search fallback service.
|
||||||
|
//!
|
||||||
|
//! When a personalized source yields 0 links from RSS + HTML extraction,
|
||||||
|
//! this service searches `site:{domain} {theme}` via Brave Search API
|
||||||
|
//! or LLM websearch to discover articles from that source.
|
||||||
|
|
||||||
|
use std::sync::Arc;
|
||||||
|
|
||||||
|
use crate::services::llm::LlmProvider;
|
||||||
|
|
||||||
|
/// Parameters describing a single site-scoped search.
///
/// The Brave path builds the query `site:{domain} {theme}` from these fields and
/// afterwards keeps only result URLs whose host belongs to `domain`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SiteSearchConfig {
    /// Domain the search is scoped to (e.g. `korben.info`); also used to filter results.
    pub domain: String,
    /// Free-text terms appended after the `site:` operator in the query.
    pub theme: String,
    /// Maximum number of results requested from the search backend.
    pub max_results: usize,
    /// Age limit forwarded to the search backend.
    // NOTE(review): presumably "only results newer than N days" — confirm against
    // `brave_search::search`.
    pub max_age_days: i32,
}
|
||||||
|
|
||||||
|
/// Provider for executing the site-scoped search.
|
||||||
|
pub enum SiteSearchProvider {
|
||||||
|
/// Use the Brave Search API.
|
||||||
|
Brave { api_key: String },
|
||||||
|
/// Use an LLM with websearch capabilities.
|
||||||
|
Llm {
|
||||||
|
provider: Arc<dyn LlmProvider>,
|
||||||
|
model: String,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Execute a site-scoped search, returning article URLs.
|
||||||
|
///
|
||||||
|
/// Searches `site:{domain} {theme}` via the configured provider.
|
||||||
|
/// Returns an empty Vec on failure (silent fallback — this is a last-resort strategy).
|
||||||
|
pub async fn search(
|
||||||
|
http_client: &reqwest::Client,
|
||||||
|
config: &SiteSearchConfig,
|
||||||
|
provider: &SiteSearchProvider,
|
||||||
|
) -> Vec<String> {
|
||||||
|
match provider {
|
||||||
|
SiteSearchProvider::Brave { api_key } => {
|
||||||
|
search_brave(http_client, config, api_key).await
|
||||||
|
}
|
||||||
|
SiteSearchProvider::Llm {
|
||||||
|
provider: llm,
|
||||||
|
model,
|
||||||
|
} => search_llm(config, llm, model).await,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Brave Search path: query `site:{domain} {theme}` via the Brave API.
|
||||||
|
async fn search_brave(
|
||||||
|
http_client: &reqwest::Client,
|
||||||
|
config: &SiteSearchConfig,
|
||||||
|
api_key: &str,
|
||||||
|
) -> Vec<String> {
|
||||||
|
let query = format!("site:{} {}", config.domain, config.theme);
|
||||||
|
|
||||||
|
let results = match crate::services::brave_search::search(
|
||||||
|
http_client,
|
||||||
|
api_key,
|
||||||
|
&query,
|
||||||
|
config.max_results as u32,
|
||||||
|
config.max_age_days,
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
{
|
||||||
|
Ok(results) => results,
|
||||||
|
Err(e) => {
|
||||||
|
tracing::warn!(
|
||||||
|
domain = %config.domain,
|
||||||
|
error = %e,
|
||||||
|
"Site search fallback (Brave) failed"
|
||||||
|
);
|
||||||
|
return Vec::new();
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
let urls: Vec<String> = results
|
||||||
|
.into_iter()
|
||||||
|
.filter(|r| url_matches_domain(&r.url, &config.domain))
|
||||||
|
.map(|r| r.url)
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
tracing::info!(
|
||||||
|
domain = %config.domain,
|
||||||
|
results = urls.len(),
|
||||||
|
"Site search fallback (Brave) completed"
|
||||||
|
);
|
||||||
|
|
||||||
|
urls
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Check if a URL belongs to the expected domain.
|
||||||
|
fn url_matches_domain(url: &str, expected_domain: &str) -> bool {
|
||||||
|
url::Url::parse(url)
|
||||||
|
.ok()
|
||||||
|
.and_then(|u| u.host_str().map(|h| h.to_lowercase()))
|
||||||
|
.map(|host| host == expected_domain || host.ends_with(&format!(".{}", expected_domain)))
|
||||||
|
.unwrap_or(false)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Placeholder for LLM path (Task 2)
|
||||||
|
async fn search_llm(
|
||||||
|
_config: &SiteSearchConfig,
|
||||||
|
_provider: &Arc<dyn LlmProvider>,
|
||||||
|
_model: &str,
|
||||||
|
) -> Vec<String> {
|
||||||
|
Vec::new()
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    //! Unit tests for the host-matching filter applied to search results.

    use super::*;

    #[test]
    fn url_matches_domain_exact() {
        assert!(url_matches_domain("https://korben.info/article", "korben.info"));
    }

    #[test]
    fn url_matches_domain_subdomain() {
        assert!(url_matches_domain("https://www.korben.info/article", "korben.info"));
    }

    #[test]
    fn url_matches_domain_nested_subdomain() {
        assert!(url_matches_domain(
            "https://blog.fr.korben.info/article",
            "korben.info"
        ));
    }

    #[test]
    fn url_matches_domain_uppercase_host() {
        assert!(url_matches_domain("https://WWW.Korben.INFO/article", "korben.info"));
    }

    #[test]
    fn url_matches_domain_mismatch() {
        // The domain appearing in the path must not count as a host match.
        assert!(!url_matches_domain("https://evil.com/korben.info", "korben.info"));
    }

    #[test]
    fn url_matches_domain_rejects_lookalike_suffix() {
        // Shares the textual suffix but is a different registrable domain.
        assert!(!url_matches_domain("https://notkorben.info/article", "korben.info"));
    }

    #[test]
    fn url_matches_domain_rejects_domain_used_as_subdomain() {
        assert!(!url_matches_domain(
            "https://korben.info.evil.com/article",
            "korben.info"
        ));
    }

    #[test]
    fn url_matches_domain_invalid_url() {
        assert!(!url_matches_domain("not a url", "korben.info"));
    }
}
|
||||||
Loading…
Reference in New Issue