feat: send structured link pairs to LLM instead of raw HTML body

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
master
oabrivard 3 months ago
parent a5332f0996
commit 41109b3d93

@ -120,24 +120,24 @@ pub fn build_search_prompt(
} }
/// Build a prompt for LLM-assisted link extraction from a source page. /// Build a prompt for LLM-assisted link extraction from a source page.
pub fn build_link_extraction_prompt(body_html: &str) -> (String, String) { ///
/// Receives a pre-formatted list of (href, anchor_text) pairs, not raw HTML.
pub fn build_link_extraction_prompt(links_text: &str) -> (String, String) {
let system_prompt = let system_prompt =
"Tu es un assistant qui analyse des pages web. \ "Tu es un assistant qui analyse des listes de liens. \
Tu dois identifier les liens vers des articles d'actualite. \ Tu dois identifier les liens vers des articles d'actualite. \
Reponds uniquement au format JSON demande." Reponds uniquement au format JSON demande."
.to_string(); .to_string();
let body_truncated: String = body_html.chars().take(12000).collect();
let user_prompt = format!( let user_prompt = format!(
"Voici le contenu HTML du body d'une page de blog ou de site d'actualites.\n\n\ "Voici une liste de liens extraits d'une page de blog ou de site d'actualites.\n\n\
{body}\n\n\ {links}\n\n\
Extrais UNIQUEMENT les URLs qui pointent vers des articles \ Selectionne UNIQUEMENT les URLs qui pointent vers des articles \
(pas les liens de navigation, tags, categories, login, pages statiques, topics, \ (pas les liens de navigation, tags, categories, login, pages statiques, topics, \
archive, companies, events, company, event, collections, etc.).\n\ archive, companies, events, company, event, collections, etc.).\n\
Retourne les URLs completes, sans les modifier, dans le format JSON demande. \ Retourne les URLs completes, sans les modifier, dans le format JSON demande. \
Ne change jamais les URLs retournees, et ne les tronque jamais.", Ne change jamais les URLs retournees, et ne les tronque jamais.",
body = body_truncated, links = links_text,
); );
(system_prompt, user_prompt) (system_prompt, user_prompt)
@ -353,19 +353,18 @@ mod tests {
} }
#[test] #[test]
fn link_extraction_prompt_includes_body() { fn link_extraction_prompt_includes_links() {
let (sys, user) = build_link_extraction_prompt("<a href='/post'>P</a>"); let links = "- https://example.com/post-1 | \"Breaking News\"\n- https://example.com/post-2 | \"Update\"";
assert!(user.contains("<a href='/post'>")); let (sys, user) = build_link_extraction_prompt(links);
assert!(user.contains("articles")); assert!(user.contains("https://example.com/post-1"));
assert!(user.contains("Breaking News"));
assert!(sys.contains("liens")); assert!(sys.contains("liens"));
assert!(!user.contains("<head>"));
} }
#[test] #[test]
fn link_extraction_prompt_truncates_body() { fn link_extraction_prompt_empty_links() {
let long_body = "x".repeat(20000); let (_, user) = build_link_extraction_prompt("");
let (_, user) = build_link_extraction_prompt(&long_body); assert!(user.contains("articles"));
assert!(user.len() < 18000); // 12000 chars of body + prompt text
} }
#[test] #[test]

@ -117,10 +117,65 @@ pub fn extract_links_from_html(
links links
} }
/// Extract the <body> section from HTML (UTF-8 safe, up to 12000 chars). /// Extract all links from HTML as (href, anchor_text) pairs for LLM analysis.
pub fn extract_body_html(html: &str) -> String { ///
let body_start = html.find("<body").unwrap_or(0); /// Minimal filtering: same-domain, http/https, non-empty path.
html[body_start..].chars().take(12000).collect() /// No article-pattern filtering — the LLM decides which are articles.
///
/// Every `href` is resolved against `base_url`; hrefs that cannot be resolved
/// are skipped. The anchor text is the concatenation of the element's text
/// nodes with every run of whitespace (newlines and indentation included)
/// collapsed to a single space, so each pair can always be rendered on one
/// line. Links without any text (e.g. image-only links) yield an empty string.
pub fn extract_links_as_pairs(
    html: &str,
    base_url: &Url,
) -> Vec<(String, String)> {
    let base_domain = base_url.host_str().unwrap_or("").to_lowercase();
    let document = Html::parse_document(html);
    // The selector is a compile-time constant, so failing to parse it would be
    // a programming error rather than a runtime condition.
    let selector = Selector::parse("a[href]").expect("`a[href]` is a valid CSS selector");
    let mut pairs = Vec::new();

    for element in document.select(&selector) {
        // Resolve relative hrefs; ignore anchors whose href cannot be joined.
        let resolved = match element.value().attr("href").map(|href| base_url.join(href)) {
            Some(Ok(url)) => url,
            _ => continue,
        };

        // Drop mailto:, javascript:, tel:, ... links.
        if !matches!(resolved.scheme(), "http" | "https") {
            continue;
        }

        // Keep same-host links only.
        if resolved.host_str().unwrap_or("").to_lowercase() != base_domain {
            continue;
        }

        // A link to the site root carries no article information.
        let path = resolved.path();
        if path.is_empty() || path == "/" {
            continue;
        }

        // Collapse all whitespace so that markup such as
        // `<a>\n    Title\n</a>` does not leak newlines into the anchor text.
        let anchor_text = element
            .text()
            .flat_map(str::split_whitespace)
            .collect::<Vec<_>>()
            .join(" ");

        pairs.push((resolved.to_string(), anchor_text));
    }

    pairs
}
/// Format link pairs as a text list for the LLM prompt.
/// Caps at 200 links to limit token usage.
///
/// Each pair becomes exactly one line: `- <href>` when the anchor text is
/// blank, `- <href> | "<text>"` otherwise. Whitespace inside the text
/// (including embedded newlines) is collapsed to single spaces so that an
/// entry can never spill over several lines of the prompt.
fn format_links_for_llm(pairs: &[(String, String)]) -> String {
    /// Maximum number of links forwarded to the LLM.
    const MAX_LINKS: usize = 200;

    pairs
        .iter()
        .take(MAX_LINKS)
        .map(|(href, text)| {
            // Normalise first: whitespace-only text is treated as no text.
            let text = text.split_whitespace().collect::<Vec<_>>().join(" ");
            if text.is_empty() {
                format!("- {}", href)
            } else {
                format!("- {} | \"{}\"", href, text)
            }
        })
        .collect::<Vec<_>>()
        .join("\n")
}
/// Extract article links using LLM analysis of the page HTML. /// Extract article links using LLM analysis of the page HTML.
@ -155,8 +210,9 @@ pub async fn extract_article_links_with_llm(
AppError::Internal(anyhow::anyhow!("Failed to read source page body: {}", e)) AppError::Internal(anyhow::anyhow!("Failed to read source page body: {}", e))
})?; })?;
let body_html = extract_body_html(&html_text); let pairs = extract_links_as_pairs(&html_text, &base_url);
let (system, user) = build_link_extraction_prompt(&body_html); let links_text = format_links_for_llm(&pairs);
let (system, user) = build_link_extraction_prompt(&links_text);
let schema = build_link_extraction_schema(); let schema = build_link_extraction_schema();
let llm_start = std::time::Instant::now(); let llm_start = std::time::Instant::now();
@ -308,18 +364,65 @@ mod tests {
} }
#[test] #[test]
fn extract_body_html_gets_body_content() { fn extract_pairs_returns_href_and_text() {
let html = "<html><head><title>T</title></head><body><p>Content</p></body></html>"; let html = r#"
let body = extract_body_html(html); <html><body>
assert!(body.contains("<p>Content</p>")); <a href="/blog/article-1">Breaking AI News</a>
assert!(!body.contains("<title>")); <a href="/blog/article-2">GPT-6 Released</a>
</body></html>"#;
let base = base_url("https://example.com/blog");
let pairs = extract_links_as_pairs(html, &base);
assert_eq!(pairs.len(), 2);
assert!(pairs[0].0.contains("/blog/article-1"));
assert_eq!(pairs[0].1, "Breaking AI News");
assert!(pairs[1].0.contains("/blog/article-2"));
assert_eq!(pairs[1].1, "GPT-6 Released");
}
#[test]
fn extract_pairs_filters_external_links() {
    // A link whose host differs from the base URL's host must not be returned.
    let origin = base_url("https://example.com");
    let found = extract_links_as_pairs(
        r#"<a href="https://other.com/article">External</a>"#,
        &origin,
    );
    assert!(found.is_empty());
}
#[test]
fn extract_pairs_filters_root_path() {
    // A link that resolves to the site root ("/") must not be returned.
    let origin = base_url("https://example.com");
    let found = extract_links_as_pairs(r#"<a href="/">Home</a>"#, &origin);
    assert!(found.is_empty());
}
#[test]
fn extract_pairs_handles_empty_anchor_text() {
    // An image-only link has no text nodes: it is kept, with empty anchor text.
    let page = r#"<a href="/article"><img src="pic.jpg"/></a>"#;
    let origin = base_url("https://example.com");
    let found = extract_links_as_pairs(page, &origin);
    assert_eq!(found.len(), 1);
    assert_eq!(found[0].1, "");
}
#[test]
fn format_links_for_llm_formats_correctly() {
    // One link with anchor text, one without: only the former gets a `| "…"` suffix.
    let with_text = ("https://example.com/a".to_string(), "Article One".to_string());
    let without_text = ("https://example.com/b".to_string(), "".to_string());
    let rendered = format_links_for_llm(&[with_text, without_text]);

    assert!(rendered.contains("- https://example.com/a | \"Article One\""));
    assert!(rendered.contains("- https://example.com/b"));
    assert!(!rendered.contains("| \"\""));
}
#[test] #[test]
fn extract_body_html_truncates_safely() { fn format_links_for_llm_caps_at_200() {
let long_body = "x".repeat(20000); let pairs: Vec<(String, String)> = (0..300)
let html = format!("<head></head><body>{}</body>", long_body); .map(|i| (format!("https://example.com/{}", i), format!("Link {}", i)))
let body = extract_body_html(&html); .collect();
assert_eq!(body.chars().count(), 12000); let result = format_links_for_llm(&pairs);
let line_count = result.lines().count();
assert_eq!(line_count, 200);
} }
} }

Loading…
Cancel
Save