From 357f06e40578670858262e3059c5b3fe2e79f53b Mon Sep 17 00:00:00 2001
From: oabrivard <olivier@abrivard.fr>
Date: Tue, 24 Mar 2026 10:45:14 +0100
Subject: [PATCH] feat: LLM-assisted source link extraction with heuristic
 fallback

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 backend/src/services/source_scraper.rs | 104 +++++++++++++++++++++++++
 1 file changed, 104 insertions(+)
diff --git a/backend/src/services/source_scraper.rs b/backend/src/services/source_scraper.rs
index 2d64482..c272156 100644
--- a/backend/src/services/source_scraper.rs
+++ b/backend/src/services/source_scraper.rs
@@ -3,7 +3,11 @@
 //! Used in Phase 1 of the generation pipeline to discover articles
 //! from user-configured sources before falling back to LLM web search.
 
+use std::sync::Arc;
 use crate::errors::AppError;
+use crate::services::llm::LlmProvider;
+use crate::services::llm::schema::build_link_extraction_schema;
+use crate::services::prompts::build_link_extraction_prompt;
 use scraper::{Html, Selector};
 use url::Url;
 
@@ -112,6 +116,90 @@ pub fn extract_links_from_html(
     links
 }
 
+/// Returns the `<head>...</head>` section and the first 8000 chars from `<body` onward (UTF-8 safe).
+/// The head is empty when no `</head>` follows the `<head` match; without `<body`, body starts there.
+pub fn extract_head_and_body(html: &str) -> (String, String) {
+    let head_start = html.find("<head").unwrap_or(0);
+    // Search for the close tag only past `head_start`: a `</head>` located before the first
+    // `<head` match (e.g. `<HEAD>...</head>...<header>`) used to invert the range and panic.
+    let head_end = head_start + html[head_start..].find("</head>").map_or(0, |i| i + 7);
+    let body_start = html.find("<body").unwrap_or(head_end);
+    let body: String = html[body_start..].chars().take(8000).collect();
+    (html[head_start..head_end].to_string(), body)
+}
+
+/// Discover article links on a source page by having the LLM analyse its HTML.
+///
+/// Only same-host http(s) URLs are kept, deduplicated and capped at `max_links`. When the LLM
+/// call errors or yields no usable URL, the heuristic `extract_links_from_html` runs instead.
+/// A non-success HTTP status gives an empty list; fetch and body-read failures return `Err`.
+pub async fn extract_article_links_with_llm(
+    http_client: &reqwest::Client,
+    source_url: &str,
+    max_links: usize,
+    provider: &Arc<dyn LlmProvider>,
+    model: &str,
+) -> Result<Vec<String>, AppError> {
+    let base_url = match Url::parse(source_url) {
+        Ok(parsed) => parsed,
+        Err(e) => return Err(AppError::BadRequest(format!("Invalid source URL: {}", e))),
+    };
+    let base_domain = base_url.host_str().unwrap_or("").to_lowercase();
+
+    let page = match http_client.get(source_url).send().await {
+        Ok(fetched) => fetched,
+        Err(e) => {
+            tracing::warn!(url = source_url, error = %e, "Failed to fetch source page");
+            return Err(AppError::Internal(anyhow::anyhow!("Failed to fetch source page")));
+        }
+    };
+    if !page.status().is_success() {
+        tracing::warn!(url = source_url, status = %page.status(), "Source page returned non-200");
+        return Ok(Vec::new());
+    }
+    let html_text = match page.text().await {
+        Ok(text) => text,
+        Err(e) => {
+            let cause = anyhow::anyhow!("Failed to read source page body: {}", e);
+            return Err(AppError::Internal(cause));
+        }
+    };
+
+    let (head_html, body_html) = extract_head_and_body(&html_text);
+    let (system, user) = build_link_extraction_prompt(&head_html, &body_html);
+    let schema = build_link_extraction_schema();
+    // An empty list here (LLM error or nothing usable) triggers the heuristic fallback below.
+    let llm_urls = match provider.generate_rewrite_pass(model, &system, &user, &schema).await {
+        Ok(llm_response) => {
+            let accepted: Vec<String> = llm_response
+                .get("urls")
+                .and_then(|u| u.as_array())
+                .into_iter()
+                .flat_map(|arr| arr.iter())
+                .filter_map(|v| v.as_str())
+                .filter_map(|href| base_url.join(href).ok())
+                .filter(|link| matches!(link.scheme(), "http" | "https"))
+                .filter(|link| link.host_str().map_or(false, |h| h.to_lowercase() == base_domain))
+                .map(|link| link.to_string())
+                .collect();
+            if accepted.is_empty() {
+                tracing::warn!(url = source_url, "LLM returned no links, falling back to heuristic");
+            }
+            accepted
+        }
+        Err(e) => {
+            tracing::warn!(url = source_url, error = %e, "LLM link extraction failed, falling back to heuristic");
+            Vec::new()
+        }
+    };
+    if llm_urls.is_empty() {
+        let fallback = extract_links_from_html(&html_text, &base_url, &base_domain);
+        return Ok(fallback.into_iter().take(max_links).collect());
+    }
+    let mut seen = std::collections::HashSet::new();
+    Ok(llm_urls.into_iter().filter(|u| seen.insert(u.clone())).take(max_links).collect())
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -202,4 +290,20 @@ mod tests {
         let links = extract_links_from_html("", &base_url("https://example.com"), "example.com");
         assert!(links.is_empty());
     }
+
+    #[test]
+    fn extract_head_and_body_splits_correctly() {
+        // Minimal well-formed page: the title sits in the head, the paragraph in the body.
+        let page = "<html><head><title>T</title></head><body><p>Content</p></body></html>";
+        let (head_part, body_part) = extract_head_and_body(page);
+        assert!(head_part.contains("<title>T</title>") && body_part.contains("<p>Content</p>"));
+    }
+
+    #[test]
+    fn extract_head_and_body_truncates_body_safely() {
+        // The 20000-char filler exceeds the 8000-char cap, so the body must come back truncated.
+        let page = format!("<head></head><body>{}</body>", "x".repeat(20000));
+        let (_, truncated) = extract_head_and_body(&page);
+        assert_eq!(truncated.chars().count(), 8000);
+    }
 }