@ -99,13 +99,78 @@ fn url_matches_domain(url: &str, expected_domain: &str) -> bool {
. unwrap_or ( false )
}
// LLM path (Task 2): prompt construction, provider call, and response parsing.
/// Build the LLM prompt for site-scoped article discovery.
fn build_site_search_prompt ( config : & SiteSearchConfig ) -> String {
format! (
" Trouve les { } articles les plus r é cents publi é s sur le site { } \
à propos de \ " { } \ " . \ n \ n \
Retourne uniquement un tableau JSON d ' URLs , sans explication :\ n \
[ \ " https ://...\", \"https://...\", ...]\n\n\
Crit è res :\ n \
- Articles publi é s dans les { } derniers jours \ n \
- URLs compl è tes pointant vers des pages d ' articles \
( pas de pages cat é gorie , tag , ou accueil ) \ n \
- Uniquement des URLs du domaine { } " ,
config . max_results ,
config . domain ,
config . theme ,
config . max_age_days ,
config . domain ,
)
}
/// LLM websearch path: ask the LLM to find recent articles from a domain.
async fn search_llm (
_config : & SiteSearchConfig ,
_provider : & Arc < dyn LlmProvider > ,
_model : & str ,
config: & SiteSearchConfig ,
provider: & Arc < dyn LlmProvider > ,
model: & str ,
) -> Vec < String > {
Vec ::new ( )
let prompt = build_site_search_prompt ( config ) ;
let schema = serde_json ::json ! ( {
"type" : "array" ,
"items" : { "type" : "string" }
} ) ;
let result = provider
. call_llm ( model , "Tu es un assistant de recherche web." , & prompt , & schema )
. await ;
match result {
Ok ( response ) = > {
let urls = parse_llm_url_response ( & response , & config . domain ) ;
tracing ::info ! (
domain = % config . domain ,
results = urls . len ( ) ,
"Site search fallback (LLM) completed"
) ;
urls
}
Err ( e ) = > {
tracing ::warn ! (
domain = % config . domain ,
error = % e ,
"Site search fallback (LLM) failed"
) ;
Vec ::new ( )
}
}
}
/// Parse the LLM response as a JSON array of URL strings.
///
/// Array elements that are not JSON strings are skipped. Of the remaining
/// strings, only those accepted by `url_matches_domain` for `domain` are kept
/// (protection against LLM hallucinations). Input order is preserved.
///
/// Returns an empty vector, after logging a warning, when `response` is not a
/// JSON array.
fn parse_llm_url_response(response: &serde_json::Value, domain: &str) -> Vec<String> {
    let Some(candidates) = response.as_array() else {
        tracing::warn!("LLM site search response is not a JSON array");
        return Vec::new();
    };

    candidates
        .iter()
        .filter_map(|value| value.as_str())
        // Check the domain on the borrowed string first so that rejected URLs
        // are never copied into an owned `String`.
        .filter(|url| url_matches_domain(url, domain))
        .map(str::to_owned)
        .collect()
}
#[ cfg(test) ]
@ -131,4 +196,47 @@ mod tests {
fn url_matches_domain_invalid_url() {
    // Input that is not a URL at all must be rejected.
    let matched = url_matches_domain("not a url", "korben.info");
    assert!(!matched);
}
#[test]
fn parse_llm_url_response_valid_json_array() {
    // Two URLs on the target domain plus one foreign URL that must be dropped.
    let llm_output = serde_json::json!([
        "https://korben.info/article-1",
        "https://korben.info/article-2",
        "https://other.com/article"
    ]);

    let kept = parse_llm_url_response(&llm_output, "korben.info");

    assert_eq!(kept.len(), 2);
    // The surviving URLs keep their original order.
    for (url, slug) in kept.iter().zip(["article-1", "article-2"]) {
        assert!(url.contains(slug));
    }
}
#[test]
fn parse_llm_url_response_non_array() {
    // An object wrapping the URLs is not the expected top-level array.
    let wrapped = serde_json::json!({ "urls": ["https://korben.info/a"] });
    assert!(parse_llm_url_response(&wrapped, "korben.info").is_empty());
}
#[test]
fn parse_llm_url_response_mixed_types() {
    // Non-string entries (a number and a null) are interleaved with valid URLs.
    let response = serde_json::json!([
        "https://korben.info/article-1",
        42,
        null,
        "https://korben.info/article-2"
    ]);

    let urls = parse_llm_url_response(&response, "korben.info");

    // Compare the full result rather than just its length, so the test also
    // fails if the wrong elements survive or their order changes.
    assert_eq!(
        urls,
        vec![
            "https://korben.info/article-1",
            "https://korben.info/article-2",
        ]
    );
}
#[test]
fn parse_llm_url_response_filters_wrong_domain() {
    // One URL from a foreign domain, one from the expected domain.
    let llm_output = serde_json::json!([
        "https://evil.com/fake",
        "https://korben.info/real"
    ]);

    let kept = parse_llm_url_response(&llm_output, "korben.info");

    assert_eq!(kept.len(), 1);
    assert!(kept.iter().all(|url| url.contains("real")));
}
}