feat: implement LLM websearch path in site_search service

Replace placeholder search_llm with real implementation: builds a French
prompt asking the LLM for recent articles from a domain, calls call_llm
with a JSON-array schema, and filters results through url_matches_domain
to guard against hallucinated URLs. Add build_site_search_prompt and
parse_llm_url_response helpers with 4 unit tests (valid array, non-array,
mixed types, wrong-domain filtering).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
master
oabrivard 2 months ago
parent a4f008bc42
commit 71c791fec0

@ -99,13 +99,78 @@ fn url_matches_domain(url: &str, expected_domain: &str) -> bool {
.unwrap_or(false)
}
// LLM websearch path (Task 2): prompt construction, provider call, response parsing.
/// Compose the French-language prompt asking the LLM for recent articles on one site.
///
/// The prompt interpolates `config.max_results`, `config.domain` (used twice),
/// `config.theme` and `config.max_age_days`, and instructs the model to answer
/// with nothing but a JSON array of URLs.
fn build_site_search_prompt(config: &SiteSearchConfig) -> String {
    // Borrow the fields once so the template below can use named captures.
    let count = &config.max_results;
    let site = &config.domain;
    let topic = &config.theme;
    let window_days = &config.max_age_days;

    // Every line of the literal ends with a `\` continuation, so the source
    // indentation never leaks into the generated prompt.
    format!(
        "Trouve les {count} articles les plus récents publiés sur le site {site} \
         à propos de \"{topic}\".\n\n\
         Retourne uniquement un tableau JSON d'URLs, sans explication :\n\
         [\"https://...\", \"https://...\", ...]\n\n\
         Critères :\n\
         - Articles publiés dans les {window_days} derniers jours\n\
         - URLs complètes pointant vers des pages d'articles \
         (pas de pages catégorie, tag, ou accueil)\n\
         - Uniquement des URLs du domaine {site}"
    )
}
/// LLM websearch path: ask the LLM to find recent articles from a domain.
///
/// Builds the site-scoped prompt with `build_site_search_prompt`, calls
/// `provider.call_llm` with a JSON schema describing an array of strings, and
/// passes the response through `parse_llm_url_response` so that only URLs
/// matching `config.domain` are kept.
///
/// This path is best-effort: a provider error is logged at `warn` level and an
/// empty list is returned instead of propagating the failure.
async fn search_llm(
    config: &SiteSearchConfig,
    provider: &Arc<dyn LlmProvider>,
    model: &str,
) -> Vec<String> {
    let prompt = build_site_search_prompt(config);

    // Schema handed to the provider: a flat JSON array of URL strings.
    let schema = serde_json::json!({
        "type": "array",
        "items": { "type": "string" }
    });

    let result = provider
        .call_llm(model, "Tu es un assistant de recherche web.", &prompt, &schema)
        .await;

    match result {
        Ok(response) => {
            // Drop anything that is not a string or not on the expected domain.
            let urls = parse_llm_url_response(&response, &config.domain);
            tracing::info!(
                domain = %config.domain,
                results = urls.len(),
                "Site search fallback (LLM) completed"
            );
            urls
        }
        Err(e) => {
            tracing::warn!(
                domain = %config.domain,
                error = %e,
                "Site search fallback (LLM) failed"
            );
            Vec::new()
        }
    }
}
/// Parse the LLM response as a JSON array of URL strings.
///
/// Non-string elements are skipped, and only URLs for which
/// `url_matches_domain(url, domain)` holds are kept (protection against LLM
/// hallucinations). The relative order of the surviving URLs is preserved.
///
/// Returns an empty vector, after logging a warning, when `response` is not a
/// JSON array at the top level.
fn parse_llm_url_response(response: &serde_json::Value, domain: &str) -> Vec<String> {
    match response.as_array() {
        Some(items) => items
            .iter()
            .filter_map(serde_json::Value::as_str)
            // Check the domain on the borrowed slice first so that rejected
            // URLs are never allocated.
            .filter(|candidate| url_matches_domain(candidate, domain))
            .map(str::to_owned)
            .collect(),
        None => {
            tracing::warn!("LLM site search response is not a JSON array");
            Vec::new()
        }
    }
}
#[cfg(test)]
@ -131,4 +196,47 @@ mod tests {
fn url_matches_domain_invalid_url() {
assert!(!url_matches_domain("not a url", "korben.info"));
}
#[test]
fn parse_llm_url_response_valid_json_array() {
    // Two on-domain URLs followed by a foreign one that must be dropped.
    let response = serde_json::json!([
        "https://korben.info/article-1",
        "https://korben.info/article-2",
        "https://other.com/article"
    ]);
    let urls = parse_llm_url_response(&response, "korben.info");
    // Pin the exact values and their order, not just the count.
    assert_eq!(
        urls,
        [
            "https://korben.info/article-1",
            "https://korben.info/article-2"
        ]
    );
}
#[test]
fn parse_llm_url_response_non_array() {
    // An object wrapping the list is rejected: the top level must be an array.
    let wrapped = serde_json::json!({"urls": ["https://korben.info/a"]});
    assert!(parse_llm_url_response(&wrapped, "korben.info").is_empty());
}
#[test]
fn parse_llm_url_response_mixed_types() {
    // Numbers and nulls interleaved with valid URL strings must be skipped.
    let response = serde_json::json!([
        "https://korben.info/article-1",
        42,
        null,
        "https://korben.info/article-2"
    ]);
    let urls = parse_llm_url_response(&response, "korben.info");
    // Check which entries survived, not only how many.
    assert_eq!(
        urls,
        [
            "https://korben.info/article-1",
            "https://korben.info/article-2"
        ]
    );
}
#[test]
fn parse_llm_url_response_filters_wrong_domain() {
    // A URL on another domain (e.g. hallucinated by the LLM) must be removed.
    let response = serde_json::json!([
        "https://evil.com/fake",
        "https://korben.info/real"
    ]);
    let urls = parse_llm_url_response(&response, "korben.info");
    // Exact match: only the on-domain URL remains, unchanged.
    assert_eq!(urls, ["https://korben.info/real"]);
}
}

Loading…
Cancel
Save