From 41109b3d9325dfcef004c4a92d11b8586920598b Mon Sep 17 00:00:00 2001 From: oabrivard Date: Wed, 25 Mar 2026 10:53:56 +0100 Subject: [PATCH] feat: send structured link pairs to LLM instead of raw HTML body Co-Authored-By: Claude Sonnet 4.6 --- backend/src/services/prompts.rs | 33 +++--- backend/src/services/source_scraper.rs | 135 ++++++++++++++++++++++--- 2 files changed, 135 insertions(+), 33 deletions(-) diff --git a/backend/src/services/prompts.rs b/backend/src/services/prompts.rs index 84577ef..9dbbe2d 100644 --- a/backend/src/services/prompts.rs +++ b/backend/src/services/prompts.rs @@ -120,24 +120,24 @@ pub fn build_search_prompt( } /// Build a prompt for LLM-assisted link extraction from a source page. -pub fn build_link_extraction_prompt(body_html: &str) -> (String, String) { +/// +/// Receives a pre-formatted list of (href, anchor_text) pairs, not raw HTML. +pub fn build_link_extraction_prompt(links_text: &str) -> (String, String) { let system_prompt = - "Tu es un assistant qui analyse des pages web. \ + "Tu es un assistant qui analyse des listes de liens. \ Tu dois identifier les liens vers des articles d'actualite. \ Reponds uniquement au format JSON demande." .to_string(); - let body_truncated: String = body_html.chars().take(12000).collect(); - let user_prompt = format!( - "Voici le contenu HTML du body d'une page de blog ou de site d'actualites.\n\n\ - {body}\n\n\ - Extrais UNIQUEMENT les URLs qui pointent vers des articles \ + "Voici une liste de liens extraits d'une page de blog ou de site d'actualites.\n\n\ + {links}\n\n\ + Selectionne UNIQUEMENT les URLs qui pointent vers des articles \ (pas les liens de navigation, tags, categories, login, pages statiques, topics, \ archive, companies, events, company, event, collections, etc.).\n\ Retourne les URLs completes, sans les modifier, dans le format JSON demande. \ Ne change jamais les URLs retournees, et ne les tronque jamais.", - body = body_truncated, + links = links_text, ); (system_prompt, user_prompt) @@ -353,19 +353,18 @@ mod tests { } #[test] - fn link_extraction_prompt_includes_body() { - let (sys, user) = build_link_extraction_prompt("P"); - assert!(user.contains("")); - assert!(user.contains("articles")); + fn link_extraction_prompt_includes_links() { + let links = "- https://example.com/post-1 | \"Breaking News\"\n- https://example.com/post-2 | \"Update\""; + let (sys, user) = build_link_extraction_prompt(links); + assert!(user.contains("https://example.com/post-1")); + assert!(user.contains("Breaking News")); assert!(sys.contains("liens")); - assert!(!user.contains("")); } #[test] - fn link_extraction_prompt_truncates_body() { - let long_body = "x".repeat(20000); - let (_, user) = build_link_extraction_prompt(&long_body); - assert!(user.len() < 18000); // 12000 chars of body + prompt text + fn link_extraction_prompt_empty_links() { + let (_, user) = build_link_extraction_prompt(""); + assert!(user.contains("articles")); } #[test] diff --git a/backend/src/services/source_scraper.rs b/backend/src/services/source_scraper.rs index 2b1656a..ae07afc 100644 --- a/backend/src/services/source_scraper.rs +++ b/backend/src/services/source_scraper.rs @@ -117,10 +117,65 @@ pub fn extract_links_from_html( links } -/// Extract the section from HTML (UTF-8 safe, up to 12000 chars). -pub fn extract_body_html(html: &str) -> String { - let body_start = html.find(" Vec<(String, String)> { + let base_domain = base_url.host_str().unwrap_or("").to_lowercase(); + let document = Html::parse_document(html); + let selector = Selector::parse("a[href]").unwrap(); + let mut pairs = Vec::new(); + + for element in document.select(&selector) { + if let Some(href) = element.value().attr("href") { + let resolved = match base_url.join(href) { + Ok(u) => u, + Err(_) => continue, + }; + + if resolved.scheme() != "http" && resolved.scheme() != "https" { + continue; + } + + let link_domain = resolved.host_str().unwrap_or("").to_lowercase(); + if link_domain != base_domain { + continue; + } + + let path = resolved.path(); + if path.is_empty() || path == "/" { + continue; + } + + let anchor_text: String = element.text().collect::>().join(" "); + let anchor_text = anchor_text.trim().to_string(); + + pairs.push((resolved.to_string(), anchor_text)); + } + } + + pairs +} + +/// Format link pairs as a text list for the LLM prompt. +/// Caps at 200 links to limit token usage. +fn format_links_for_llm(pairs: &[(String, String)]) -> String { + pairs + .iter() + .take(200) + .map(|(href, text)| { + if text.is_empty() { + format!("- {}", href) + } else { + format!("- {} | \"{}\"", href, text) + } + }) + .collect::>() + .join("\n") } /// Extract article links using LLM analysis of the page HTML. @@ -155,8 +210,9 @@ pub async fn extract_article_links_with_llm( AppError::Internal(anyhow::anyhow!("Failed to read source page body: {}", e)) })?; - let body_html = extract_body_html(&html_text); - let (system, user) = build_link_extraction_prompt(&body_html); + let pairs = extract_links_as_pairs(&html_text, &base_url); + let links_text = format_links_for_llm(&pairs); + let (system, user) = build_link_extraction_prompt(&links_text); let schema = build_link_extraction_schema(); let llm_start = std::time::Instant::now(); @@ -308,18 +364,65 @@ mod tests { } #[test] - fn extract_body_html_gets_body_content() { - let html = "T

Content

"; - let body = extract_body_html(html); - assert!(body.contains("

Content

")); - assert!(!body.contains("")); + fn extract_pairs_returns_href_and_text() { + let html = r#" + <html><body> + <a href="/blog/article-1">Breaking AI News</a> + <a href="/blog/article-2">GPT-6 Released</a> + </body></html>"#; + let base = base_url("https://example.com/blog"); + let pairs = extract_links_as_pairs(html, &base); + assert_eq!(pairs.len(), 2); + assert!(pairs[0].0.contains("/blog/article-1")); + assert_eq!(pairs[0].1, "Breaking AI News"); + assert!(pairs[1].0.contains("/blog/article-2")); + assert_eq!(pairs[1].1, "GPT-6 Released"); + } + + #[test] + fn extract_pairs_filters_external_links() { + let html = r#"<a href="https://other.com/article">External</a>"#; + let base = base_url("https://example.com"); + let pairs = extract_links_as_pairs(html, &base); + assert!(pairs.is_empty()); + } + + #[test] + fn extract_pairs_filters_root_path() { + let html = r#"<a href="/">Home</a>"#; + let base = base_url("https://example.com"); + let pairs = extract_links_as_pairs(html, &base); + assert!(pairs.is_empty()); + } + + #[test] + fn extract_pairs_handles_empty_anchor_text() { + let html = r#"<a href="/article"><img src="pic.jpg"/></a>"#; + let base = base_url("https://example.com"); + let pairs = extract_links_as_pairs(html, &base); + assert_eq!(pairs.len(), 1); + assert_eq!(pairs[0].1, ""); + } + + #[test] + fn format_links_for_llm_formats_correctly() { + let pairs = vec![ + ("https://example.com/a".to_string(), "Article One".to_string()), + ("https://example.com/b".to_string(), "".to_string()), + ]; + let result = format_links_for_llm(&pairs); + assert!(result.contains("- https://example.com/a | \"Article One\"")); + assert!(result.contains("- https://example.com/b")); + assert!(!result.contains("| \"\"")); } #[test] - fn extract_body_html_truncates_safely() { - let long_body = "x".repeat(20000); - let html = format!("<head></head><body>{}</body>", long_body); - let body = extract_body_html(&html); - assert_eq!(body.chars().count(), 12000); + fn format_links_for_llm_caps_at_200() { + let pairs: Vec<(String, String)> = (0..300) + .map(|i| (format!("https://example.com/{}", i), format!("Link {}", i))) + .collect(); + let result = format_links_for_llm(&pairs); + let line_count = result.lines().count(); + assert_eq!(line_count, 200); } }