diff --git a/backend/src/services/prompts.rs b/backend/src/services/prompts.rs
index b51e8f8..ce40503 100644
--- a/backend/src/services/prompts.rs
+++ b/backend/src/services/prompts.rs
@@ -120,25 +120,23 @@ pub fn build_search_prompt(
}
/// Build a prompt for LLM-assisted link extraction from a source page.
-pub fn build_link_extraction_prompt(head_html: &str, body_html: &str) -> (String, String) {
+pub fn build_link_extraction_prompt(body_html: &str) -> (String, String) {
let system_prompt =
"Tu es un assistant qui analyse des pages web. \
Tu dois identifier les liens vers des articles d'actualite. \
Reponds uniquement au format JSON demande."
.to_string();
- let body_truncated: String = body_html.chars().take(8000).collect();
-
+ let body_truncated: String = body_html.chars().take(12000).collect();
+
let user_prompt = format!(
- "Voici le contenu HTML d'une page de blog ou de site d'actualites.\n\n\
-
\n{head}\n\n\n\
- \n{body}\n\n\n\
+ "Voici le contenu HTML du body d'une page de blog ou de site d'actualites.\n\n\
+ {body}\n\n\
Extrais UNIQUEMENT les URLs qui pointent vers des articles \
(pas les liens de navigation, tags, categories, login, pages statiques, topics, \
archive, companies, events, company, event, collections, etc.).\n\
Retourne les URLs completes, sans les modifier, dans le format JSON demande. \
Ne change jamais les URLs retournees, et ne les tronque jamais.",
- head = head_html,
body = body_truncated,
);
@@ -354,18 +352,19 @@ mod tests {
}
#[test]
- fn link_extraction_prompt_includes_html() {
- let (sys, user) = build_link_extraction_prompt("Blog", "P");
- assert!(user.contains("Blog"));
+ fn link_extraction_prompt_includes_body() {
+ let (sys, user) = build_link_extraction_prompt("P");
+ assert!(user.contains(""));
assert!(user.contains("articles"));
assert!(sys.contains("liens"));
+ assert!(!user.contains(""));
}
#[test]
fn link_extraction_prompt_truncates_body() {
let long_body = "x".repeat(20000);
- let (_, user) = build_link_extraction_prompt("", &long_body);
- assert!(user.len() < 15000);
+ let (_, user) = build_link_extraction_prompt(&long_body);
+ assert!(user.len() < 18000); // 12000 chars of body + prompt text
}
#[test]
diff --git a/backend/src/services/source_scraper.rs b/backend/src/services/source_scraper.rs
index e2e3d8f..7a5cc9c 100644
--- a/backend/src/services/source_scraper.rs
+++ b/backend/src/services/source_scraper.rs
@@ -117,16 +117,10 @@ pub fn extract_links_from_html(
links
}
-/// Extract section and first 8000 chars of from HTML (UTF-8 safe).
-pub fn extract_head_and_body(html: &str) -> (String, String) {
- let head_start = html.find("").map(|i| i + 7).unwrap_or(head_start);
- let head = &html[head_start..head_end];
-
- let body_start = html.find(" section from HTML (UTF-8 safe, up to 12000 chars).
+pub fn extract_body_html(html: &str) -> String {
+ let body_start = html.find("T"));
+ let body = extract_body_html(html);
assert!(body.contains("Content
"));
+ assert!(!body.contains(""));
}
#[test]
- fn extract_head_and_body_truncates_body_safely() {
+ fn extract_body_html_truncates_safely() {
let long_body = "x".repeat(20000);
let html = format!("{}", long_body);
- let (_, body) = extract_head_and_body(&html);
- assert_eq!(body.chars().count(), 8000);
+ let body = extract_body_html(&html);
+ assert_eq!(body.chars().count(), 12000);
}
}