|
|
|
|
@ -117,10 +117,65 @@ pub fn extract_links_from_html(
|
|
|
|
|
links
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Extract the <body> section from HTML (UTF-8 safe, up to 12000 chars).
|
|
|
|
|
pub fn extract_body_html(html: &str) -> String {
|
|
|
|
|
let body_start = html.find("<body").unwrap_or(0);
|
|
|
|
|
html[body_start..].chars().take(12000).collect()
|
|
|
|
|
/// Extract all links from HTML as (href, anchor_text) pairs for LLM analysis.
|
|
|
|
|
///
|
|
|
|
|
/// Minimal filtering: same-domain, http/https, non-empty path.
|
|
|
|
|
/// No article-pattern filtering — the LLM decides which are articles.
|
|
|
|
|
pub fn extract_links_as_pairs(
|
|
|
|
|
html: &str,
|
|
|
|
|
base_url: &Url,
|
|
|
|
|
) -> Vec<(String, String)> {
|
|
|
|
|
let base_domain = base_url.host_str().unwrap_or("").to_lowercase();
|
|
|
|
|
let document = Html::parse_document(html);
|
|
|
|
|
let selector = Selector::parse("a[href]").unwrap();
|
|
|
|
|
let mut pairs = Vec::new();
|
|
|
|
|
|
|
|
|
|
for element in document.select(&selector) {
|
|
|
|
|
if let Some(href) = element.value().attr("href") {
|
|
|
|
|
let resolved = match base_url.join(href) {
|
|
|
|
|
Ok(u) => u,
|
|
|
|
|
Err(_) => continue,
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
if resolved.scheme() != "http" && resolved.scheme() != "https" {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
let link_domain = resolved.host_str().unwrap_or("").to_lowercase();
|
|
|
|
|
if link_domain != base_domain {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
let path = resolved.path();
|
|
|
|
|
if path.is_empty() || path == "/" {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
let anchor_text: String = element.text().collect::<Vec<_>>().join(" ");
|
|
|
|
|
let anchor_text = anchor_text.trim().to_string();
|
|
|
|
|
|
|
|
|
|
pairs.push((resolved.to_string(), anchor_text));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
pairs
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Format link pairs as a text list for the LLM prompt.
///
/// Each pair becomes exactly one line: `- <href>` when the anchor text is
/// empty, otherwise `- <href> | "<text>"`. Lines are joined with `\n`.
///
/// Runs of whitespace inside the anchor text (including newlines and tabs)
/// are collapsed to a single space, so multi-line anchor text cannot split an
/// entry across several lines. Text that is whitespace-only is treated as
/// empty.
///
/// Caps at 200 links to limit token usage; pairs beyond the cap are ignored.
fn format_links_for_llm(pairs: &[(String, String)]) -> String {
    const MAX_LINKS: usize = 200;

    pairs
        .iter()
        .take(MAX_LINKS)
        .map(|(href, text)| {
            // Normalise whitespace so the entry always stays on one line.
            let label = text.split_whitespace().collect::<Vec<_>>().join(" ");
            if label.is_empty() {
                format!("- {}", href)
            } else {
                format!("- {} | \"{}\"", href, label)
            }
        })
        .collect::<Vec<_>>()
        .join("\n")
}
|
|
|
|
|
|
|
|
|
|
/// Extract article links using LLM analysis of the page HTML.
|
|
|
|
|
@ -155,8 +210,9 @@ pub async fn extract_article_links_with_llm(
|
|
|
|
|
AppError::Internal(anyhow::anyhow!("Failed to read source page body: {}", e))
|
|
|
|
|
})?;
|
|
|
|
|
|
|
|
|
|
let body_html = extract_body_html(&html_text);
|
|
|
|
|
let (system, user) = build_link_extraction_prompt(&body_html);
|
|
|
|
|
let pairs = extract_links_as_pairs(&html_text, &base_url);
|
|
|
|
|
let links_text = format_links_for_llm(&pairs);
|
|
|
|
|
let (system, user) = build_link_extraction_prompt(&links_text);
|
|
|
|
|
let schema = build_link_extraction_schema();
|
|
|
|
|
|
|
|
|
|
let llm_start = std::time::Instant::now();
|
|
|
|
|
@ -308,18 +364,65 @@ mod tests {
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#[test]
|
|
|
|
|
fn extract_body_html_gets_body_content() {
|
|
|
|
|
let html = "<html><head><title>T</title></head><body><p>Content</p></body></html>";
|
|
|
|
|
let body = extract_body_html(html);
|
|
|
|
|
assert!(body.contains("<p>Content</p>"));
|
|
|
|
|
assert!(!body.contains("<title>"));
|
|
|
|
|
fn extract_pairs_returns_href_and_text() {
|
|
|
|
|
let html = r#"
|
|
|
|
|
<html><body>
|
|
|
|
|
<a href="/blog/article-1">Breaking AI News</a>
|
|
|
|
|
<a href="/blog/article-2">GPT-6 Released</a>
|
|
|
|
|
</body></html>"#;
|
|
|
|
|
let base = base_url("https://example.com/blog");
|
|
|
|
|
let pairs = extract_links_as_pairs(html, &base);
|
|
|
|
|
assert_eq!(pairs.len(), 2);
|
|
|
|
|
assert!(pairs[0].0.contains("/blog/article-1"));
|
|
|
|
|
assert_eq!(pairs[0].1, "Breaking AI News");
|
|
|
|
|
assert!(pairs[1].0.contains("/blog/article-2"));
|
|
|
|
|
assert_eq!(pairs[1].1, "GPT-6 Released");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#[test]
fn extract_pairs_filters_external_links() {
    // The anchor points at other.com while the base is example.com.
    let site = base_url("https://example.com");

    let found = extract_links_as_pairs(
        r#"<a href="https://other.com/article">External</a>"#,
        &site,
    );

    assert!(found.is_empty());
}
|
|
|
|
|
|
|
|
|
|
#[test]
fn extract_pairs_filters_root_path() {
    // A link to "/" resolves to the site root, which must not be reported.
    let site = base_url("https://example.com");

    let found = extract_links_as_pairs(r#"<a href="/">Home</a>"#, &site);

    assert!(found.is_empty());
}
|
|
|
|
|
|
|
|
|
|
#[test]
fn extract_pairs_handles_empty_anchor_text() {
    // An image-only anchor has no text nodes; the link is still returned,
    // paired with an empty string.
    let site = base_url("https://example.com");

    let found = extract_links_as_pairs(
        r#"<a href="/article"><img src="pic.jpg"/></a>"#,
        &site,
    );

    assert_eq!(found.len(), 1);
    assert_eq!(found[0].1, "");
}
|
|
|
|
|
|
|
|
|
|
#[test]
fn format_links_for_llm_formats_correctly() {
    let with_text = ("https://example.com/a".to_string(), "Article One".to_string());
    let without_text = ("https://example.com/b".to_string(), String::new());

    let rendered = format_links_for_llm(&[with_text, without_text]);

    // A link with text gets the quoted suffix.
    assert!(rendered.contains("- https://example.com/a | \"Article One\""));
    // A link without text is listed bare, with no empty quoted suffix.
    assert!(rendered.contains("- https://example.com/b"));
    assert!(!rendered.contains("| \"\""));
}
|
|
|
|
|
|
|
|
|
|
#[test]
|
|
|
|
|
fn extract_body_html_truncates_safely() {
|
|
|
|
|
let long_body = "x".repeat(20000);
|
|
|
|
|
let html = format!("<head></head><body>{}</body>", long_body);
|
|
|
|
|
let body = extract_body_html(&html);
|
|
|
|
|
assert_eq!(body.chars().count(), 12000);
|
|
|
|
|
fn format_links_for_llm_caps_at_200() {
|
|
|
|
|
let pairs: Vec<(String, String)> = (0..300)
|
|
|
|
|
.map(|i| (format!("https://example.com/{}", i), format!("Link {}", i)))
|
|
|
|
|
.collect();
|
|
|
|
|
let result = format_links_for_llm(&pairs);
|
|
|
|
|
let line_count = result.lines().count();
|
|
|
|
|
assert_eq!(line_count, 200);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|