diff --git a/backend/src/services/site_search.rs b/backend/src/services/site_search.rs
index 5ef4412..ce42f05 100644
--- a/backend/src/services/site_search.rs
+++ b/backend/src/services/site_search.rs
@@ -99,13 +99,93 @@ fn url_matches_domain(url: &str, expected_domain: &str) -> bool {
         .unwrap_or(false)
 }
 
-// Placeholder for LLM path (Task 2)
+/// Build the LLM prompt for site-scoped article discovery.
+///
+/// The prompt is written in French and asks for a bare JSON array holding the
+/// `config.max_results` most recent article URLs from `config.domain` about
+/// `config.theme`, published within the last `config.max_age_days` days.
+fn build_site_search_prompt(config: &SiteSearchConfig) -> String {
+    format!(
+        "Trouve les {} articles les plus récents publiés sur le site {} \
+         à propos de \"{}\".\n\n\
+         Retourne uniquement un tableau JSON d'URLs, sans explication :\n\
+         [\"https://...\", \"https://...\", ...]\n\n\
+         Critères :\n\
+         - Articles publiés dans les {} derniers jours\n\
+         - URLs complètes pointant vers des pages d'articles \
+         (pas de pages catégorie, tag, ou accueil)\n\
+         - Uniquement des URLs du domaine {}",
+        config.max_results,
+        config.domain,
+        config.theme,
+        config.max_age_days,
+        config.domain,
+    )
+}
+
+/// LLM websearch path: ask the LLM to find recent articles from a domain.
+///
+/// Returns the URLs from the LLM response that belong to `config.domain`.
+/// A provider error is logged as a warning and yields an empty list, so the
+/// caller always receives a (possibly empty) `Vec` rather than an error.
 async fn search_llm(
-    _config: &SiteSearchConfig,
-    _provider: &Arc,
-    _model: &str,
+    config: &SiteSearchConfig,
+    provider: &Arc,
+    model: &str,
 ) -> Vec {
-    Vec::new()
+    let prompt = build_site_search_prompt(config);
+    let schema = serde_json::json!({
+        "type": "array",
+        "items": { "type": "string" }
+    });
+
+    let result = provider
+        .call_llm(model, "Tu es un assistant de recherche web.", &prompt, &schema)
+        .await;
+
+    match result {
+        Ok(response) => {
+            let urls = parse_llm_url_response(&response, &config.domain);
+            tracing::info!(
+                domain = %config.domain,
+                results = urls.len(),
+                "Site search fallback (LLM) completed"
+            );
+            urls
+        }
+        Err(e) => {
+            tracing::warn!(
+                domain = %config.domain,
+                error = %e,
+                "Site search fallback (LLM) failed"
+            );
+            Vec::new()
+        }
+    }
+}
+
+/// Parse the LLM response as a JSON array of URL strings.
+///
+/// Non-string elements are skipped. URLs are filtered to only keep those
+/// matching the target domain (protection against LLM hallucinations), and
+/// repeated URLs are dropped: the first occurrence wins, so the order of the
+/// response is preserved. A response that is not a JSON array is logged as a
+/// warning and yields an empty list.
+fn parse_llm_url_response(response: &serde_json::Value, domain: &str) -> Vec<String> {
+    let Some(arr) = response.as_array() else {
+        tracing::warn!("LLM site search response is not a JSON array");
+        return Vec::new();
+    };
+
+    // The same URL may appear more than once in the response; remember what was kept.
+    let mut seen = std::collections::HashSet::new();
+
+    arr.iter()
+        .filter_map(|v| v.as_str())
+        // Filter on the borrowed &str first so rejected URLs never allocate.
+        .filter(|url| url_matches_domain(url, domain) && seen.insert(*url))
+        .map(str::to_owned)
+        .collect()
 }
 
 #[cfg(test)]
@@ -131,4 +211,60 @@ mod tests {
     fn url_matches_domain_invalid_url() {
         assert!(!url_matches_domain("not a url", "korben.info"));
     }
+
+    #[test]
+    fn parse_llm_url_response_valid_json_array() {
+        let response = serde_json::json!([
+            "https://korben.info/article-1",
+            "https://korben.info/article-2",
+            "https://other.com/article"
+        ]);
+        let urls = parse_llm_url_response(&response, "korben.info");
+        assert_eq!(urls.len(), 2);
+        assert!(urls[0].contains("article-1"));
+        assert!(urls[1].contains("article-2"));
+    }
+
+    #[test]
+    fn parse_llm_url_response_non_array() {
+        let response = serde_json::json!({"urls": ["https://korben.info/a"]});
+        let urls = parse_llm_url_response(&response, "korben.info");
+        assert!(urls.is_empty());
+    }
+
+    #[test]
+    fn parse_llm_url_response_mixed_types() {
+        let response = serde_json::json!([
+            "https://korben.info/article-1",
+            42,
+            null,
+            "https://korben.info/article-2"
+        ]);
+        let urls = parse_llm_url_response(&response, "korben.info");
+        assert_eq!(urls.len(), 2);
+    }
+
+    #[test]
+    fn parse_llm_url_response_filters_wrong_domain() {
+        let response = serde_json::json!([
+            "https://evil.com/fake",
+            "https://korben.info/real"
+        ]);
+        let urls = parse_llm_url_response(&response, "korben.info");
+        assert_eq!(urls.len(), 1);
+        assert!(urls[0].contains("real"));
+    }
+
+    #[test]
+    fn parse_llm_url_response_deduplicates() {
+        let response = serde_json::json!([
+            "https://korben.info/article-1",
+            "https://korben.info/article-2",
+            "https://korben.info/article-1"
+        ]);
+        let urls = parse_llm_url_response(&response, "korben.info");
+        assert_eq!(urls.len(), 2);
+        assert!(urls[0].contains("article-1"));
+        assert!(urls[1].contains("article-2"));
+    }
 }