//! Site-scoped search fallback service. //! //! When a personalized source yields 0 links from RSS + HTML extraction, //! this service searches `site:{domain} {theme}` via Brave Search API //! or LLM websearch to discover articles from that source. use std::sync::Arc; use crate::services::llm::LlmProvider; /// Configuration for a site-scoped search. pub struct SiteSearchConfig { pub domain: String, pub theme: String, pub max_results: usize, pub max_age_days: i32, } /// Provider for executing the site-scoped search. pub enum SiteSearchProvider { /// Use the Brave Search API. Brave { api_key: String }, /// Use an LLM with websearch capabilities. Llm { provider: Arc, model: String, }, } /// Execute a site-scoped search, returning article URLs. /// /// Searches `site:{domain} {theme}` via the configured provider. /// Returns an empty Vec on failure (silent fallback — this is a last-resort strategy). pub async fn search( http_client: &reqwest::Client, config: &SiteSearchConfig, provider: &SiteSearchProvider, ) -> Vec { match provider { SiteSearchProvider::Brave { api_key } => { search_brave(http_client, config, api_key).await } SiteSearchProvider::Llm { provider: llm, model, } => search_llm(config, llm, model).await, } } /// Brave Search path: query `site:{domain} {theme}` via the Brave API. 
async fn search_brave(
    http_client: &reqwest::Client,
    config: &SiteSearchConfig,
    api_key: &str,
) -> Vec<String> {
    let query = format!("site:{} {}", config.domain, config.theme);
    // Saturate rather than silently wrap if `max_results` exceeds `u32::MAX`
    // (a plain `as u32` cast truncates on 64-bit targets).
    let count = u32::try_from(config.max_results).unwrap_or(u32::MAX);

    let results = match crate::services::brave_search::search(
        http_client,
        api_key,
        &query,
        count,
        config.max_age_days,
    )
    .await
    {
        Ok(results) => results,
        Err(e) => {
            // Last-resort strategy: log and fall back to "no results".
            tracing::warn!(
                domain = %config.domain,
                error = %e,
                "Site search fallback (Brave) failed"
            );
            return Vec::new();
        }
    };

    // Keep only hits that really belong to the requested domain.
    let urls: Vec<String> = results
        .into_iter()
        .filter(|r| url_matches_domain(&r.url, &config.domain))
        .map(|r| r.url)
        .collect();

    tracing::info!(
        domain = %config.domain,
        results = urls.len(),
        "Site search fallback (Brave) completed"
    );

    urls
}

/// Check if a URL belongs to the expected domain.
///
/// Returns `true` when `url` parses as an absolute `http`/`https` URL whose
/// host is `expected_domain` itself or one of its subdomains. The comparison
/// is case-insensitive and ignores a trailing dot on either side. Unparsable
/// URLs, URLs without a host, other schemes, and an empty `expected_domain`
/// all yield `false`.
fn url_matches_domain(url: &str, expected_domain: &str) -> bool {
    // Normalize the configured domain: hosts are compared in lowercase, so a
    // domain configured as e.g. "Korben.info" must still match.
    let expected = expected_domain.trim().trim_matches('.').to_lowercase();
    if expected.is_empty() {
        return false;
    }

    let Ok(parsed) = url::Url::parse(url) else {
        return false;
    };
    // The candidates come from a search engine or an LLM; only web URLs make
    // sense as article links.
    if !matches!(parsed.scheme(), "http" | "https") {
        return false;
    }
    let Some(host) = parsed.host_str() else {
        return false;
    };
    let host = host.trim_end_matches('.').to_lowercase();

    // Exact match leaves an empty prefix; a subdomain leaves a prefix ending
    // in '.', which rules out look-alikes such as "notkorben.info".
    host.strip_suffix(expected.as_str())
        .is_some_and(|prefix| prefix.is_empty() || prefix.ends_with('.'))
}

/// Build the LLM prompt for site-scoped article discovery.
///
/// The prompt (in French) asks for `max_results` recent articles about
/// `theme` on `domain`, published within `max_age_days` days, returned as a
/// bare JSON array of URLs.
fn build_site_search_prompt(config: &SiteSearchConfig) -> String {
    format!(
        "Trouve les {} articles les plus récents publiés sur le site {} \
         à propos de \"{}\".\n\n\
         Retourne uniquement un tableau JSON d'URLs, sans explication :\n\
         [\"https://...\", \"https://...\", ...]\n\n\
         Critères :\n\
         - Articles publiés dans les {} derniers jours\n\
         - URLs complètes pointant vers des pages d'articles \
         (pas de pages catégorie, tag, ou accueil)\n\
         - Uniquement des URLs du domaine {}",
        config.max_results,
        config.domain,
        config.theme,
        config.max_age_days,
        config.domain,
    )
}

/// LLM websearch path: ask the LLM to find recent articles from a domain.
async fn search_llm( config: &SiteSearchConfig, provider: &Arc, model: &str, ) -> Vec { let prompt = build_site_search_prompt(config); let schema = serde_json::json!({ "type": "array", "items": { "type": "string" } }); let result = provider .call_llm(model, "Tu es un assistant de recherche web.", &prompt, &schema) .await; match result { Ok(response) => { let urls = parse_llm_url_response(&response, &config.domain); tracing::info!( domain = %config.domain, results = urls.len(), "Site search fallback (LLM) completed" ); urls } Err(e) => { tracing::warn!( domain = %config.domain, error = %e, "Site search fallback (LLM) failed" ); Vec::new() } } } /// Parse the LLM response as a JSON array of URL strings. /// /// Filters URLs to only keep those matching the target domain /// (protection against LLM hallucinations). fn parse_llm_url_response(response: &serde_json::Value, domain: &str) -> Vec { let Some(arr) = response.as_array() else { tracing::warn!("LLM site search response is not a JSON array"); return Vec::new(); }; arr.iter() .filter_map(|v| v.as_str()) .map(|s| s.to_string()) .filter(|url| url_matches_domain(url, domain)) .collect() } #[cfg(test)] mod tests { use super::*; /// Set SKIP_SSRF_CHECK for tests using wiremock (localhost). 
fn skip_ssrf_for_test() { unsafe { std::env::set_var("SKIP_SSRF_CHECK", "1"); } } #[tokio::test] async fn search_brave_returns_filtered_urls() { skip_ssrf_for_test(); let config = SiteSearchConfig { domain: "korben.info".to_string(), theme: "intelligence artificielle".to_string(), max_results: 10, max_age_days: 7, }; // Test error path: Brave with invalid key against real API → returns empty (no panic) let provider = SiteSearchProvider::Brave { api_key: "invalid-key".to_string(), }; let client = reqwest::Client::new(); let results = search(&client, &config, &provider).await; // Will fail against real Brave API but should return empty vec, not panic assert!(results.is_empty()); } #[tokio::test] async fn search_llm_returns_urls_from_mock() { let config = SiteSearchConfig { domain: "korben.info".to_string(), theme: "intelligence artificielle".to_string(), max_results: 5, max_age_days: 7, }; // MockLlmProvider doesn't have a site_search handler, so it will return // a classify response which won't parse as a URL array → empty vec let mock_provider = crate::services::llm::mock::MockLlmProvider::new(); let provider = SiteSearchProvider::Llm { provider: Arc::new(mock_provider), model: "mock-model".to_string(), }; let client = reqwest::Client::new(); let results = search(&client, &config, &provider).await; assert!(results.is_empty()); } #[test] fn build_site_search_prompt_contains_domain_and_theme() { let config = SiteSearchConfig { domain: "korben.info".to_string(), theme: "intelligence artificielle".to_string(), max_results: 10, max_age_days: 7, }; let prompt = build_site_search_prompt(&config); assert!(prompt.contains("korben.info")); assert!(prompt.contains("intelligence artificielle")); assert!(prompt.contains("10")); assert!(prompt.contains("7")); } #[test] fn url_matches_domain_exact() { assert!(url_matches_domain("https://korben.info/article", "korben.info")); } #[test] fn url_matches_domain_subdomain() { 
assert!(url_matches_domain("https://www.korben.info/article", "korben.info")); } #[test] fn url_matches_domain_mismatch() { assert!(!url_matches_domain("https://evil.com/korben.info", "korben.info")); } #[test] fn url_matches_domain_invalid_url() { assert!(!url_matches_domain("not a url", "korben.info")); } #[test] fn parse_llm_url_response_valid_json_array() { let response = serde_json::json!([ "https://korben.info/article-1", "https://korben.info/article-2", "https://other.com/article" ]); let urls = parse_llm_url_response(&response, "korben.info"); assert_eq!(urls.len(), 2); assert!(urls[0].contains("article-1")); assert!(urls[1].contains("article-2")); } #[test] fn parse_llm_url_response_non_array() { let response = serde_json::json!({"urls": ["https://korben.info/a"]}); let urls = parse_llm_url_response(&response, "korben.info"); assert!(urls.is_empty()); } #[test] fn parse_llm_url_response_mixed_types() { let response = serde_json::json!([ "https://korben.info/article-1", 42, null, "https://korben.info/article-2" ]); let urls = parse_llm_url_response(&response, "korben.info"); assert_eq!(urls.len(), 2); } #[test] fn parse_llm_url_response_filters_wrong_domain() { let response = serde_json::json!([ "https://evil.com/fake", "https://korben.info/real" ]); let urls = parse_llm_url_response(&response, "korben.info"); assert_eq!(urls.len(), 1); assert!(urls[0].contains("real")); } }