diff --git a/backend/src/services/prompts.rs b/backend/src/services/prompts.rs index b51e8f8..ce40503 100644 --- a/backend/src/services/prompts.rs +++ b/backend/src/services/prompts.rs @@ -120,25 +120,23 @@ pub fn build_search_prompt( } /// Build a prompt for LLM-assisted link extraction from a source page. -pub fn build_link_extraction_prompt(head_html: &str, body_html: &str) -> (String, String) { +pub fn build_link_extraction_prompt(body_html: &str) -> (String, String) { let system_prompt = "Tu es un assistant qui analyse des pages web. \ Tu dois identifier les liens vers des articles d'actualite. \ Reponds uniquement au format JSON demande." .to_string(); - let body_truncated: String = body_html.chars().take(8000).collect(); - + let body_truncated: String = body_html.chars().take(12000).collect(); + let user_prompt = format!( - "Voici le contenu HTML d'une page de blog ou de site d'actualites.\n\n\ - \n{head}\n\n\n\ - \n{body}\n\n\n\ + "Voici le contenu HTML du body d'une page de blog ou de site d'actualites.\n\n\ + {body}\n\n\ Extrais UNIQUEMENT les URLs qui pointent vers des articles \ (pas les liens de navigation, tags, categories, login, pages statiques, topics, \ archive, companies, events, company, event, collections, etc.).\n\ Retourne les URLs completes, sans les modifier, dans le format JSON demande. \ Ne change jamais les URLs retournees, et ne les tronque jamais.", - head = head_html, body = body_truncated, ); @@ -354,18 +352,19 @@ mod tests { } #[test] - fn link_extraction_prompt_includes_html() { - let (sys, user) = build_link_extraction_prompt("Blog", "P"); - assert!(user.contains("Blog")); + fn link_extraction_prompt_includes_body() { + let (sys, user) = build_link_extraction_prompt("P"); + assert!(user.contains("")); assert!(user.contains("articles")); assert!(sys.contains("liens")); + assert!(!user.contains("")); } #[test] fn link_extraction_prompt_truncates_body() { let long_body = "x".repeat(20000); - let (_, user) = build_link_extraction_prompt("", &long_body); - assert!(user.len() < 15000); + let (_, user) = build_link_extraction_prompt(&long_body); + assert!(user.len() < 18000); // 12000 chars of body + prompt text } #[test] diff --git a/backend/src/services/source_scraper.rs b/backend/src/services/source_scraper.rs index e2e3d8f..7a5cc9c 100644 --- a/backend/src/services/source_scraper.rs +++ b/backend/src/services/source_scraper.rs @@ -117,16 +117,10 @@ pub fn extract_links_from_html( links } -/// Extract section and first 8000 chars of from HTML (UTF-8 safe). -pub fn extract_head_and_body(html: &str) -> (String, String) { - let head_start = html.find("").map(|i| i + 7).unwrap_or(head_start); - let head = &html[head_start..head_end]; - - let body_start = html.find(" section from HTML (UTF-8 safe, up to 12000 chars). +pub fn extract_body_html(html: &str) -> String { + let body_start = html.find("T")); + let body = extract_body_html(html); assert!(body.contains("

Content

")); + assert!(!body.contains("")); } #[test] - fn extract_head_and_body_truncates_body_safely() { + fn extract_body_html_truncates_safely() { let long_body = "x".repeat(20000); let html = format!("<head></head><body>{}</body>", long_body); - let (_, body) = extract_head_and_body(&html); - assert_eq!(body.chars().count(), 8000); + let body = extract_body_html(&html); + assert_eq!(body.chars().count(), 12000); } }