You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
308 lines
9.3 KiB
Rust
308 lines
9.3 KiB
Rust
//! Site-scoped search fallback service.
//!
//! When a personalized source yields no links from RSS + HTML extraction,
//! this service searches `site:{domain} {theme}` via the Brave Search API
//! or an LLM with web search to discover articles from that source.
|
|
|
|
use std::sync::Arc;
|
|
|
|
use crate::services::llm::LlmProvider;
|
|
|
|
/// Parameters describing one site-scoped search.
///
/// The same configuration is consumed by both search paths: it is turned
/// into a `site:{domain} {theme}` query for Brave and interpolated into the
/// prompt sent to the LLM.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SiteSearchConfig {
    /// Domain the search is restricted to (e.g. `korben.info`); returned URLs
    /// are also filtered against it.
    pub domain: String,
    /// Free-text topic appended after the `site:` filter.
    pub theme: String,
    /// Number of results requested from the provider.
    pub max_results: usize,
    /// Maximum article age, in days, requested from the provider.
    pub max_age_days: i32,
}
|
|
|
|
/// Provider for executing the site-scoped search.
|
|
pub enum SiteSearchProvider {
|
|
/// Use the Brave Search API.
|
|
Brave { api_key: String },
|
|
/// Use an LLM with websearch capabilities.
|
|
Llm {
|
|
provider: Arc<dyn LlmProvider>,
|
|
model: String,
|
|
},
|
|
}
|
|
|
|
/// Execute a site-scoped search, returning article URLs.
|
|
///
|
|
/// Searches `site:{domain} {theme}` via the configured provider.
|
|
/// Returns an empty Vec on failure (silent fallback — this is a last-resort strategy).
|
|
pub async fn search(
|
|
http_client: &reqwest::Client,
|
|
config: &SiteSearchConfig,
|
|
provider: &SiteSearchProvider,
|
|
) -> Vec<String> {
|
|
match provider {
|
|
SiteSearchProvider::Brave { api_key } => {
|
|
search_brave(http_client, config, api_key).await
|
|
}
|
|
SiteSearchProvider::Llm {
|
|
provider: llm,
|
|
model,
|
|
} => search_llm(config, llm, model).await,
|
|
}
|
|
}
|
|
|
|
/// Brave Search path: query `site:{domain} {theme}` via the Brave API.
|
|
async fn search_brave(
|
|
http_client: &reqwest::Client,
|
|
config: &SiteSearchConfig,
|
|
api_key: &str,
|
|
) -> Vec<String> {
|
|
let query = format!("site:{} {}", config.domain, config.theme);
|
|
|
|
let results = match crate::services::brave_search::search(
|
|
http_client,
|
|
api_key,
|
|
&query,
|
|
config.max_results as u32,
|
|
config.max_age_days,
|
|
)
|
|
.await
|
|
{
|
|
Ok(results) => results,
|
|
Err(e) => {
|
|
tracing::warn!(
|
|
domain = %config.domain,
|
|
error = %e,
|
|
"Site search fallback (Brave) failed"
|
|
);
|
|
return Vec::new();
|
|
}
|
|
};
|
|
|
|
let urls: Vec<String> = results
|
|
.into_iter()
|
|
.filter(|r| url_matches_domain(&r.url, &config.domain))
|
|
.map(|r| r.url)
|
|
.collect();
|
|
|
|
tracing::info!(
|
|
domain = %config.domain,
|
|
results = urls.len(),
|
|
"Site search fallback (Brave) completed"
|
|
);
|
|
|
|
urls
|
|
}
|
|
|
|
/// Check if a URL belongs to the expected domain.
|
|
fn url_matches_domain(url: &str, expected_domain: &str) -> bool {
|
|
url::Url::parse(url)
|
|
.ok()
|
|
.and_then(|u| u.host_str().map(|h| h.to_lowercase()))
|
|
.map(|host| host == expected_domain || host.ends_with(&format!(".{}", expected_domain)))
|
|
.unwrap_or(false)
|
|
}
|
|
|
|
/// Build the LLM prompt for site-scoped article discovery.
|
|
fn build_site_search_prompt(config: &SiteSearchConfig) -> String {
|
|
format!(
|
|
"Trouve les {} articles les plus récents publiés sur le site {} \
|
|
à propos de \"{}\".\n\n\
|
|
Retourne uniquement un tableau JSON d'URLs, sans explication :\n\
|
|
[\"https://...\", \"https://...\", ...]\n\n\
|
|
Critères :\n\
|
|
- Articles publiés dans les {} derniers jours\n\
|
|
- URLs complètes pointant vers des pages d'articles \
|
|
(pas de pages catégorie, tag, ou accueil)\n\
|
|
- Uniquement des URLs du domaine {}",
|
|
config.max_results,
|
|
config.domain,
|
|
config.theme,
|
|
config.max_age_days,
|
|
config.domain,
|
|
)
|
|
}
|
|
|
|
/// LLM websearch path: ask the LLM to find recent articles from a domain.
|
|
async fn search_llm(
|
|
config: &SiteSearchConfig,
|
|
provider: &Arc<dyn LlmProvider>,
|
|
model: &str,
|
|
) -> Vec<String> {
|
|
let prompt = build_site_search_prompt(config);
|
|
let schema = serde_json::json!({
|
|
"type": "array",
|
|
"items": { "type": "string" }
|
|
});
|
|
|
|
let result = provider
|
|
.call_llm(model, "Tu es un assistant de recherche web.", &prompt, &schema)
|
|
.await;
|
|
|
|
match result {
|
|
Ok(response) => {
|
|
let urls = parse_llm_url_response(&response, &config.domain);
|
|
tracing::info!(
|
|
domain = %config.domain,
|
|
results = urls.len(),
|
|
"Site search fallback (LLM) completed"
|
|
);
|
|
urls
|
|
}
|
|
Err(e) => {
|
|
tracing::warn!(
|
|
domain = %config.domain,
|
|
error = %e,
|
|
"Site search fallback (LLM) failed"
|
|
);
|
|
Vec::new()
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Parse the LLM response as a JSON array of URL strings.
|
|
///
|
|
/// Filters URLs to only keep those matching the target domain
|
|
/// (protection against LLM hallucinations).
|
|
fn parse_llm_url_response(response: &serde_json::Value, domain: &str) -> Vec<String> {
|
|
let Some(arr) = response.as_array() else {
|
|
tracing::warn!("LLM site search response is not a JSON array");
|
|
return Vec::new();
|
|
};
|
|
|
|
arr.iter()
|
|
.filter_map(|v| v.as_str())
|
|
.map(|s| s.to_string())
|
|
.filter(|url| url_matches_domain(url, domain))
|
|
.collect()
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Shared fixture: a search on `korben.info` limited to the last 7 days.
    fn korben_config(max_results: usize) -> SiteSearchConfig {
        SiteSearchConfig {
            domain: "korben.info".to_string(),
            theme: "intelligence artificielle".to_string(),
            max_results,
            max_age_days: 7,
        }
    }

    /// Set SKIP_SSRF_CHECK for tests using wiremock (localhost).
    fn skip_ssrf_for_test() {
        // SAFETY: `set_var` is only sound while no other thread reads or
        // writes the process environment. The test harness runs tests on
        // several threads, so this relies on no other test touching the
        // environment concurrently.
        // NOTE(review): confirm, or move this into a process-wide one-time setup.
        unsafe {
            std::env::set_var("SKIP_SSRF_CHECK", "1");
        }
    }

    /// Error path of the Brave provider: an invalid key must yield an empty
    /// result, not a panic.
    ///
    /// NOTE(review): this goes through the real Brave client rather than a
    /// mock, so it depends on the request failing (bad key or no network).
    #[tokio::test]
    async fn search_brave_returns_filtered_urls() {
        skip_ssrf_for_test();

        let config = korben_config(10);

        // Test error path: Brave with invalid key against real API → returns empty (no panic)
        let provider = SiteSearchProvider::Brave {
            api_key: "invalid-key".to_string(),
        };

        let client = reqwest::Client::new();
        let results = search(&client, &config, &provider).await;
        // Will fail against real Brave API but should return empty vec, not panic
        assert!(results.is_empty());
    }

    /// LLM path with the mock provider: the mock's answer is not a URL array,
    /// so the search must degrade to an empty result.
    #[tokio::test]
    async fn search_llm_returns_urls_from_mock() {
        let config = korben_config(5);

        // MockLlmProvider doesn't have a site_search handler, so it will return
        // a classify response which won't parse as a URL array → empty vec
        let mock_provider = crate::services::llm::mock::MockLlmProvider::new();

        let provider = SiteSearchProvider::Llm {
            provider: Arc::new(mock_provider),
            model: "mock-model".to_string(),
        };

        let client = reqwest::Client::new();
        let results = search(&client, &config, &provider).await;
        assert!(results.is_empty());
    }

    #[test]
    fn build_site_search_prompt_contains_domain_and_theme() {
        let config = korben_config(10);
        let prompt = build_site_search_prompt(&config);
        assert!(prompt.contains("korben.info"));
        assert!(prompt.contains("intelligence artificielle"));
        assert!(prompt.contains("10"));
        assert!(prompt.contains("7"));
    }

    #[test]
    fn url_matches_domain_exact() {
        assert!(url_matches_domain("https://korben.info/article", "korben.info"));
    }

    #[test]
    fn url_matches_domain_subdomain() {
        assert!(url_matches_domain("https://www.korben.info/article", "korben.info"));
    }

    #[test]
    fn url_matches_domain_mismatch() {
        assert!(!url_matches_domain("https://evil.com/korben.info", "korben.info"));
    }

    /// A host that merely ends with the domain text, without a label
    /// boundary, must not be accepted as a subdomain.
    #[test]
    fn url_matches_domain_rejects_lookalike_suffix() {
        assert!(!url_matches_domain("https://notkorben.info/article", "korben.info"));
    }

    #[test]
    fn url_matches_domain_invalid_url() {
        assert!(!url_matches_domain("not a url", "korben.info"));
    }

    #[test]
    fn parse_llm_url_response_valid_json_array() {
        let response = serde_json::json!([
            "https://korben.info/article-1",
            "https://korben.info/article-2",
            "https://other.com/article"
        ]);
        let urls = parse_llm_url_response(&response, "korben.info");
        assert_eq!(urls.len(), 2);
        assert!(urls[0].contains("article-1"));
        assert!(urls[1].contains("article-2"));
    }

    #[test]
    fn parse_llm_url_response_non_array() {
        let response = serde_json::json!({"urls": ["https://korben.info/a"]});
        let urls = parse_llm_url_response(&response, "korben.info");
        assert!(urls.is_empty());
    }

    #[test]
    fn parse_llm_url_response_mixed_types() {
        let response = serde_json::json!([
            "https://korben.info/article-1",
            42,
            null,
            "https://korben.info/article-2"
        ]);
        let urls = parse_llm_url_response(&response, "korben.info");
        assert_eq!(urls.len(), 2);
    }

    #[test]
    fn parse_llm_url_response_filters_wrong_domain() {
        let response = serde_json::json!([
            "https://evil.com/fake",
            "https://korben.info/real"
        ]);
        let urls = parse_llm_url_response(&response, "korben.info");
        assert_eq!(urls.len(), 1);
        assert!(urls[0].contains("real"));
    }
}
|