refactor: LLM link extraction uses body only (no head), increased to 12000 chars

3 months ago · 14b0a0b7e8
parent 3353e5261f
commit 14b0a0b7e8
2 changed files with 23 additions and 30 deletions
--- a/backend/src/services/prompts.rs
+++ b/backend/src/services/prompts.rs
@ -120,25 +120,23 @@ pub fn build_search_prompt(
 }

 /// Build a prompt for LLM-assisted link extraction from a source page.
-pub fn build_link_extraction_prompt(head_html: &str, body_html: &str) -> (String, String) {
+pub fn build_link_extraction_prompt(body_html: &str) -> (String, String) {
    let system_prompt =
        "Tu es un assistant qui analyse des pages web. \
         Tu dois identifier les liens vers des articles d'actualite. \
         Reponds uniquement au format JSON demande."
            .to_string();

-    let body_truncated: String = body_html.chars().take(8000).collect();
-   
+    let body_truncated: String = body_html.chars().take(12000).collect();
+
    let user_prompt = format!(
-        "Voici le contenu HTML d'une page de blog ou de site d'actualites.\n\n\
-         <head>\n{head}\n</head>\n\n\
-         <body (extrait)>\n{body}\n</body>\n\n\
+        "Voici le contenu HTML du body d'une page de blog ou de site d'actualites.\n\n\
+         {body}\n\n\
         Extrais UNIQUEMENT les URLs qui pointent vers des articles \
         (pas les liens de navigation, tags, categories, login, pages statiques, topics, \
         archive, companies, events, company, event, collections, etc.).\n\
         Retourne les URLs completes, sans les modifier, dans le format JSON demande. \
         Ne change jamais les URLs retournees, et ne les tronque jamais.",
-        head = head_html,
        body = body_truncated,
    );

@ -354,18 +352,19 @@ mod tests {
    }

    #[test]
-    fn link_extraction_prompt_includes_html() {
-        let (sys, user) = build_link_extraction_prompt("<title>Blog</title>", "<a href='/post'>P</a>");
-        assert!(user.contains("<title>Blog</title>"));
+    fn link_extraction_prompt_includes_body() {
+        let (sys, user) = build_link_extraction_prompt("<a href='/post'>P</a>");
+        assert!(user.contains("<a href='/post'>"));
        assert!(user.contains("articles"));
        assert!(sys.contains("liens"));
+        assert!(!user.contains("<head>"));
    }

    #[test]
    fn link_extraction_prompt_truncates_body() {
        let long_body = "x".repeat(20000);
-        let (_, user) = build_link_extraction_prompt("", &long_body);
-        assert!(user.len() < 15000);
+        let (_, user) = build_link_extraction_prompt(&long_body);
+        assert!(user.len() < 18000); // 12000 chars of body + prompt text
    }

    #[test]
--- a/backend/src/services/source_scraper.rs
+++ b/backend/src/services/source_scraper.rs
@ -117,16 +117,10 @@ pub fn extract_links_from_html(
    links
 }

-/// Extract <head> section and first 8000 chars of <body> from HTML (UTF-8 safe).
-pub fn extract_head_and_body(html: &str) -> (String, String) {
-    let head_start = html.find("<head").unwrap_or(0);
-    let head_end = html.find("</head>").map(|i| i + 7).unwrap_or(head_start);
-    let head = &html[head_start..head_end];
-
-    let body_start = html.find("<body").unwrap_or(head_end);
-    let body: String = html[body_start..].chars().take(8000).collect();
-
-    (head.to_string(), body)
+/// Take up to 12000 chars from the first `<body` (or from the start if absent); UTF-8 safe.
+pub fn extract_body_html(html: &str) -> String {
+    let body_start = html.find("<body").unwrap_or(0);
+    html[body_start..].chars().take(12000).collect()
 }

 /// Extract article links using LLM analysis of the page HTML.
@ -160,8 +154,8 @@ pub async fn extract_article_links_with_llm(
        AppError::Internal(anyhow::anyhow!("Failed to read source page body: {}", e))
    })?;

-    let (head_html, body_html) = extract_head_and_body(&html_text);
-    let (system, user) = build_link_extraction_prompt(&head_html, &body_html);
+    let body_html = extract_body_html(&html_text);
+    let (system, user) = build_link_extraction_prompt(&body_html);
    let schema = build_link_extraction_schema();

    let llm_start = std::time::Instant::now();
@ -312,18 +306,18 @@ mod tests {
    }

    #[test]
-    fn extract_head_and_body_splits_correctly() {
+    fn extract_body_html_gets_body_content() {
        let html = "<html><head><title>T</title></head><body><p>Content</p></body></html>";
-        let (head, body) = extract_head_and_body(html);
-        assert!(head.contains("<title>T</title>"));
+        let body = extract_body_html(html);
        assert!(body.contains("<p>Content</p>"));
+        assert!(!body.contains("<title>"));
    }

    #[test]
-    fn extract_head_and_body_truncates_body_safely() {
+    fn extract_body_html_truncates_safely() {
        let long_body = "x".repeat(20000);
        let html = format!("<head></head><body>{}</body>", long_body);
-        let (_, body) = extract_head_and_body(&html);
-        assert_eq!(body.chars().count(), 8000);
+        let body = extract_body_html(&html);
+        assert_eq!(body.chars().count(), 12000);
    }
 }