From 41109b3d9325dfcef004c4a92d11b8586920598b Mon Sep 17 00:00:00 2001
From: oabrivard <olivier@abrivard.fr>
Date: Wed, 25 Mar 2026 10:53:56 +0100
Subject: [PATCH] feat: send structured link pairs to LLM instead of raw HTML
 body

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 backend/src/services/prompts.rs        |  33 +++---
 backend/src/services/source_scraper.rs | 135 ++++++++++++++++++++++---
 2 files changed, 135 insertions(+), 33 deletions(-)
diff --git a/backend/src/services/prompts.rs b/backend/src/services/prompts.rs
index 84577ef..9dbbe2d 100644
--- a/backend/src/services/prompts.rs
+++ b/backend/src/services/prompts.rs
@@ -120,24 +120,24 @@ pub fn build_search_prompt(
 }
 
 /// Build a prompt for LLM-assisted link extraction from a source page.
-pub fn build_link_extraction_prompt(body_html: &str) -> (String, String) {
+///
+/// Receives a pre-formatted list of (href, anchor_text) pairs, not raw HTML.
+pub fn build_link_extraction_prompt(links_text: &str) -> (String, String) {
     let system_prompt =
-        "Tu es un assistant qui analyse des pages web. \
+        "Tu es un assistant qui analyse des listes de liens. \
          Tu dois identifier les liens vers des articles d'actualite. \
          Reponds uniquement au format JSON demande."
             .to_string();
 
-    let body_truncated: String = body_html.chars().take(12000).collect();
-
     let user_prompt = format!(
-        "Voici le contenu HTML du body d'une page de blog ou de site d'actualites.\n\n\
-         {body}\n\n\
-         Extrais UNIQUEMENT les URLs qui pointent vers des articles \
+        "Voici une liste de liens extraits d'une page de blog ou de site d'actualites.\n\n\
+         {links}\n\n\
+         Selectionne UNIQUEMENT les URLs qui pointent vers des articles \
          (pas les liens de navigation, tags, categories, login, pages statiques, topics, \
          archive, companies, events, company, event, collections, etc.).\n\
          Retourne les URLs completes, sans les modifier, dans le format JSON demande. \
          Ne change jamais les URLs retournees, et ne les tronque jamais.",
-        body = body_truncated,
+        links = links_text,
     );
 
     (system_prompt, user_prompt)
@@ -353,19 +353,18 @@ mod tests {
     }
 
     #[test]
-    fn link_extraction_prompt_includes_body() {
-        let (sys, user) = build_link_extraction_prompt("<a href='/post'>P</a>");
-        assert!(user.contains("<a href='/post'>"));
-        assert!(user.contains("articles"));
+    fn link_extraction_prompt_includes_links() {
+        let links = "- https://example.com/post-1 | \"Breaking News\"\n- https://example.com/post-2 | \"Update\"";
+        let (sys, user) = build_link_extraction_prompt(links);
+        assert!(user.contains("https://example.com/post-1"));
+        assert!(user.contains("Breaking News"));
         assert!(sys.contains("liens"));
-        assert!(!user.contains("<head>"));
     }
 
     #[test]
-    fn link_extraction_prompt_truncates_body() {
-        let long_body = "x".repeat(20000);
-        let (_, user) = build_link_extraction_prompt(&long_body);
-        assert!(user.len() < 18000); // 12000 chars of body + prompt text
+    fn link_extraction_prompt_empty_links() {
+        // Even with no links, the instruction text must still be emitted.
+        assert!(build_link_extraction_prompt("").1.contains("articles"));
     }
 
     #[test]
diff --git a/backend/src/services/source_scraper.rs b/backend/src/services/source_scraper.rs
index 2b1656a..ae07afc 100644
--- a/backend/src/services/source_scraper.rs
+++ b/backend/src/services/source_scraper.rs
@@ -117,10 +117,65 @@ pub fn extract_links_from_html(
     links
 }
 
-/// Extract the <body> section from HTML (UTF-8 safe, up to 12000 chars).
-pub fn extract_body_html(html: &str) -> String {
-    let body_start = html.find("<body").unwrap_or(0);
-    html[body_start..].chars().take(12000).collect()
+/// Extract all links from HTML as (href, anchor_text) pairs for LLM analysis.
+///
+/// Keeps same-host http/https links with a non-root path; anchor text is
+/// collapsed onto a single line. The LLM decides which links are articles.
+pub fn extract_links_as_pairs(
+    html: &str,
+    base_url: &Url,
+) -> Vec<(String, String)> {
+    let base_domain = base_url.host_str().unwrap_or("").to_lowercase();
+    let document = Html::parse_document(html);
+    let selector = Selector::parse("a[href]").expect("static selector is valid");
+    let mut pairs = Vec::new();
+
+    for element in document.select(&selector) {
+        if let Some(href) = element.value().attr("href") {
+            let resolved = match base_url.join(href) {
+                Ok(u) => u,
+                Err(_) => continue,
+            };
+
+            if !matches!(resolved.scheme(), "http" | "https") {
+                continue;
+            }
+
+            let link_domain = resolved.host_str().unwrap_or("").to_lowercase();
+            if link_domain != base_domain {
+                continue;
+            }
+
+            // Skip links to the site root.
+            if matches!(resolved.path(), "" | "/") {
+                continue;
+            }
+
+            // Split every text node on whitespace so embedded newlines and
+            // indentation cannot spill one link over several prompt lines.
+            let words: Vec<&str> = element.text().flat_map(|t| t.split_whitespace()).collect();
+            pairs.push((resolved.to_string(), words.join(" ")));
+        }
+    }
+
+    pairs
+}
+
+/// Render link pairs as a bulleted text list for the LLM prompt.
+/// Only the first 200 pairs are emitted, to limit token usage.
+fn format_links_for_llm(pairs: &[(String, String)]) -> String {
+    let mut lines: Vec<String> = Vec::new();
+    for (url, label) in pairs.iter().take(200) {
+        // Links without anchor text are listed as the bare URL.
+        let line = match label.as_str() {
+            "" => format!("- {}", url),
+            _ => format!("- {} | \"{}\"", url, label),
+        };
+        lines.push(line);
+    }
+
+    // One entry per line, no trailing newline.
+    lines.join("\n")
 }
 
 /// Extract article links using LLM analysis of the page HTML.
@@ -155,8 +210,9 @@ pub async fn extract_article_links_with_llm(
         AppError::Internal(anyhow::anyhow!("Failed to read source page body: {}", e))
     })?;
 
-    let body_html = extract_body_html(&html_text);
-    let (system, user) = build_link_extraction_prompt(&body_html);
+    let pairs = extract_links_as_pairs(&html_text, &base_url);
+    let links_text = format_links_for_llm(&pairs);
+    let (system, user) = build_link_extraction_prompt(&links_text);
     let schema = build_link_extraction_schema();
 
     let llm_start = std::time::Instant::now();
@@ -308,18 +364,65 @@ mod tests {
     }
 
     #[test]
-    fn extract_body_html_gets_body_content() {
-        let html = "<html><head><title>T</title></head><body><p>Content</p></body></html>";
-        let body = extract_body_html(html);
-        assert!(body.contains("<p>Content</p>"));
-        assert!(!body.contains("<title>"));
+    fn extract_pairs_returns_href_and_text() {
+        // Two same-site anchors must both come back, in document order.
+        let page = r#"
+        <html><body>
+            <a href="/blog/article-1">Breaking AI News</a>
+            <a href="/blog/article-2">GPT-6 Released</a>
+        </body></html>"#;
+        let found = extract_links_as_pairs(page, &base_url("https://example.com/blog"));
+        assert_eq!(found.len(), 2);
+        assert!(found[0].0.contains("/blog/article-1"));
+        assert_eq!(found[0].1, "Breaking AI News");
+        assert!(found[1].0.contains("/blog/article-2"));
+        assert_eq!(found[1].1, "GPT-6 Released");
+    }
+
+    #[test]
+    fn extract_pairs_filters_external_links() {
+        // A link to a different host must be dropped.
+        let page = r#"<a href="https://other.com/article">External</a>"#;
+        let found = extract_links_as_pairs(page, &base_url("https://example.com"));
+        assert!(found.is_empty());
+    }
+
+    #[test]
+    fn extract_pairs_filters_root_path() {
+        // A link to the site root ("/") is not a candidate article.
+        let page = r#"<a href="/">Home</a>"#;
+        let found = extract_links_as_pairs(page, &base_url("https://example.com"));
+        assert!(found.is_empty());
+    }
+
+    #[test]
+    fn extract_pairs_handles_empty_anchor_text() {
+        // Image-only anchors are kept, with an empty text component.
+        let page = r#"<a href="/article"><img src="pic.jpg"/></a>"#;
+        let found = extract_links_as_pairs(page, &base_url("https://example.com"));
+        assert_eq!(found.len(), 1);
+        assert_eq!(found[0].1, "");
+    }
+
+    #[test]
+    fn format_links_for_llm_formats_correctly() {
+        // One entry with anchor text, one without.
+        let labelled = (String::from("https://example.com/a"), String::from("Article One"));
+        let bare = (String::from("https://example.com/b"), String::new());
+        let rendered = format_links_for_llm(&[labelled, bare]);
+        assert!(rendered.contains("- https://example.com/a | \"Article One\""));
+        assert!(rendered.contains("- https://example.com/b"));
+        // No dangling `| ""` suffix for the empty label.
+        assert!(!rendered.contains("| \"\""));
     }
 
     #[test]
-    fn extract_body_html_truncates_safely() {
-        let long_body = "x".repeat(20000);
-        let html = format!("<head></head><body>{}</body>", long_body);
-        let body = extract_body_html(&html);
-        assert_eq!(body.chars().count(), 12000);
+    fn format_links_for_llm_caps_at_200() {
+        // 300 inputs, but only the first 200 may reach the prompt.
+        let mut many: Vec<(String, String)> = Vec::with_capacity(300);
+        for n in 0..300 {
+            many.push((format!("https://example.com/{}", n), format!("Link {}", n)));
+        }
+        assert_eq!(format_links_for_llm(&many).lines().count(), 200);
     }
 }