You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
429 lines
15 KiB
Rust
429 lines
15 KiB
Rust
//! Source page scraper: fetches a source URL and extracts article links.
|
|
//!
|
|
//! Used in Phase 1 of the generation pipeline to discover articles
|
|
//! from user-configured sources before falling back to LLM web search.
|
|
|
|
use std::sync::Arc;
|
|
use crate::errors::AppError;
|
|
use crate::services::llm::LlmProvider;
|
|
use crate::services::llm::schema::build_link_extraction_schema;
|
|
use crate::services::prompts::build_link_extraction_prompt;
|
|
use scraper::{Html, Selector};
|
|
use url::Url;
|
|
|
|
/// URL path fragments that mark a link as navigation or boilerplate rather
/// than an article.
///
/// Entries are lowercase and are tested as plain substrings of the lowercased
/// URL path, so `/about` also rejects `/about-us` (and any slug that merely
/// begins with `about`).
const EXCLUDED_PATH_PATTERNS: &[&str] = &[
    // Taxonomy and listing pages.
    "/tag",
    "/category",
    "/author",
    "/page",
    // Account pages.
    "/login",
    "/signup",
    // Legal, utility and informational pages.
    "/privacy",
    "/terms",
    "/search",
    "/contact",
    "/about",
    "/topics",
    // Index-style sections.
    "/archive",
    "/companies",
    "/events",
    "/company",
    "/event",
    "/collections",
];
|
|
|
|
/// File-name suffixes that identify static assets and data files.
///
/// Each entry includes the leading dot and is compared against the end of the
/// lowercased URL path.
const EXCLUDED_EXTENSIONS: &[&str] = &[
    // Stylesheets and scripts.
    ".css",
    ".js",
    // Images.
    ".png",
    ".jpg",
    ".jpeg",
    ".gif",
    ".svg",
    // Documents, archives and data.
    ".pdf",
    ".zip",
    ".xml",
    ".json",
    // Icons and fonts.
    ".ico",
    ".woff",
    ".woff2",
];
|
|
|
|
/// Extract article links from a source page.
|
|
///
|
|
/// Fetches the HTML at `source_url`, extracts all `<a href>` links,
|
|
/// filters to same-domain article-like URLs, deduplicates, and returns
|
|
/// up to `max_links` candidate URLs.
|
|
pub async fn extract_article_links(
|
|
http_client: &reqwest::Client,
|
|
source_url: &str,
|
|
max_links: usize,
|
|
) -> Result<Vec<String>, AppError> {
|
|
let base_url = Url::parse(source_url)
|
|
.map_err(|e| AppError::BadRequest(format!("Invalid source URL: {}", e)))?;
|
|
let base_domain = base_url.host_str().unwrap_or("").to_lowercase();
|
|
|
|
let response = http_client
|
|
.get(source_url)
|
|
.send()
|
|
.await
|
|
.map_err(|e| {
|
|
tracing::warn!(url = source_url, error = %e, "Failed to fetch source page");
|
|
AppError::Internal(anyhow::anyhow!("Failed to fetch source page"))
|
|
})?;
|
|
|
|
if !response.status().is_success() {
|
|
tracing::warn!(url = source_url, status = %response.status(), "Source page returned non-200");
|
|
return Ok(Vec::new());
|
|
}
|
|
|
|
let html_text = response.text().await.map_err(|e| {
|
|
AppError::Internal(anyhow::anyhow!("Failed to read source page body: {}", e))
|
|
})?;
|
|
|
|
let links = extract_links_from_html(&html_text, &base_url, &base_domain);
|
|
|
|
Ok(links.into_iter().take(max_links).collect())
|
|
}
|
|
|
|
/// Extract and filter article links from HTML content.
|
|
///
|
|
/// This is a pure function (no I/O) for easy testing.
|
|
pub fn extract_links_from_html(
|
|
html: &str,
|
|
base_url: &Url,
|
|
base_domain: &str,
|
|
) -> Vec<String> {
|
|
let document = Html::parse_document(html);
|
|
let selector = Selector::parse("a[href]").unwrap();
|
|
let mut seen = std::collections::HashSet::new();
|
|
let mut links = Vec::new();
|
|
|
|
for element in document.select(&selector) {
|
|
if let Some(href) = element.value().attr("href") {
|
|
let resolved = match base_url.join(href) {
|
|
Ok(u) => u,
|
|
Err(_) => continue,
|
|
};
|
|
|
|
if resolved.scheme() != "http" && resolved.scheme() != "https" {
|
|
continue;
|
|
}
|
|
|
|
let link_domain = resolved.host_str().unwrap_or("").to_lowercase();
|
|
if link_domain != base_domain {
|
|
continue;
|
|
}
|
|
|
|
let path = resolved.path();
|
|
if path.is_empty() || path == "/" {
|
|
continue;
|
|
}
|
|
|
|
let path_lower = path.to_lowercase();
|
|
if EXCLUDED_PATH_PATTERNS.iter().any(|p| path_lower.contains(p)) {
|
|
continue;
|
|
}
|
|
|
|
if EXCLUDED_EXTENSIONS.iter().any(|ext| path_lower.ends_with(ext)) {
|
|
continue;
|
|
}
|
|
|
|
let mut normalized = resolved.clone();
|
|
normalized.set_fragment(None);
|
|
let url_str = normalized.to_string();
|
|
|
|
if seen.insert(url_str.clone()) {
|
|
links.push(url_str);
|
|
}
|
|
}
|
|
}
|
|
|
|
links
|
|
}
|
|
|
|
/// Extract all links from HTML as (href, anchor_text) pairs for LLM analysis.
|
|
///
|
|
/// Minimal filtering: same-domain, http/https, non-empty path.
|
|
/// No article-pattern filtering — the LLM decides which are articles.
|
|
pub fn extract_links_as_pairs(
|
|
html: &str,
|
|
base_url: &Url,
|
|
) -> Vec<(String, String)> {
|
|
let base_domain = base_url.host_str().unwrap_or("").to_lowercase();
|
|
let document = Html::parse_document(html);
|
|
let selector = Selector::parse("a[href]").unwrap();
|
|
let mut pairs = Vec::new();
|
|
|
|
for element in document.select(&selector) {
|
|
if let Some(href) = element.value().attr("href") {
|
|
let resolved = match base_url.join(href) {
|
|
Ok(u) => u,
|
|
Err(_) => continue,
|
|
};
|
|
|
|
if resolved.scheme() != "http" && resolved.scheme() != "https" {
|
|
continue;
|
|
}
|
|
|
|
let link_domain = resolved.host_str().unwrap_or("").to_lowercase();
|
|
if link_domain != base_domain {
|
|
continue;
|
|
}
|
|
|
|
let path = resolved.path();
|
|
if path.is_empty() || path == "/" {
|
|
continue;
|
|
}
|
|
|
|
let anchor_text: String = element.text().collect::<Vec<_>>().join(" ");
|
|
let anchor_text = anchor_text.trim().to_string();
|
|
|
|
pairs.push((resolved.to_string(), anchor_text));
|
|
}
|
|
}
|
|
|
|
pairs
|
|
}
|
|
|
|
/// Format link pairs as a text list for the LLM prompt.
///
/// Produces one line per link: `- <url> | "<anchor text>"`, or just
/// `- <url>` when there is no anchor text. At most 200 links are emitted to
/// limit token usage; any further pairs are ignored.
///
/// Anchor text is whitespace-normalised before formatting: runs of spaces,
/// tabs and newlines collapse to a single space, and text that is only
/// whitespace counts as empty. Anchor text taken from nested markup often
/// contains line breaks, which would otherwise split one entry across several
/// lines of the list.
fn format_links_for_llm(pairs: &[(String, String)]) -> String {
    /// Maximum number of links forwarded to the LLM in one prompt.
    const MAX_LINKS_FOR_LLM: usize = 200;

    pairs
        .iter()
        .take(MAX_LINKS_FOR_LLM)
        .map(|(href, text)| {
            let label = text.split_whitespace().collect::<Vec<_>>().join(" ");
            if label.is_empty() {
                format!("- {}", href)
            } else {
                format!("- {} | \"{}\"", href, label)
            }
        })
        .collect::<Vec<_>>()
        .join("\n")
}
|
|
|
|
/// Extract article links using LLM analysis of the page HTML.
|
|
///
|
|
/// Falls back to heuristic extraction if the LLM call fails or returns empty.
|
|
#[allow(clippy::too_many_arguments)]
|
|
pub async fn extract_article_links_with_llm(
|
|
http_client: &reqwest::Client,
|
|
source_url: &str,
|
|
max_links: usize,
|
|
provider: &Arc<dyn LlmProvider>,
|
|
model: &str,
|
|
pool: Option<&sqlx::PgPool>,
|
|
user_id: Option<uuid::Uuid>,
|
|
job_id: Option<uuid::Uuid>,
|
|
) -> Result<Vec<String>, AppError> {
|
|
let base_url = Url::parse(source_url)
|
|
.map_err(|e| AppError::BadRequest(format!("Invalid source URL: {}", e)))?;
|
|
let base_domain = base_url.host_str().unwrap_or("").to_lowercase();
|
|
|
|
let response = http_client.get(source_url).send().await.map_err(|e| {
|
|
tracing::warn!(url = source_url, error = %e, "Failed to fetch source page");
|
|
AppError::Internal(anyhow::anyhow!("Failed to fetch source page"))
|
|
})?;
|
|
|
|
if !response.status().is_success() {
|
|
tracing::warn!(url = source_url, status = %response.status(), "Source page returned non-200");
|
|
return Ok(Vec::new());
|
|
}
|
|
|
|
let html_text = response.text().await.map_err(|e| {
|
|
AppError::Internal(anyhow::anyhow!("Failed to read source page body: {}", e))
|
|
})?;
|
|
|
|
let pairs = extract_links_as_pairs(&html_text, &base_url);
|
|
let links_text = format_links_for_llm(&pairs);
|
|
let (system, user) = build_link_extraction_prompt(&links_text);
|
|
let schema = build_link_extraction_schema();
|
|
|
|
let llm_start = std::time::Instant::now();
|
|
let llm_result = provider.call_llm(model, &system, &user, &schema).await;
|
|
let llm_duration = llm_start.elapsed().as_millis() as u64;
|
|
|
|
// Log the LLM call if pool/user_id/job_id are provided
|
|
if let (Some(pool), Some(uid), Some(jid)) = (pool, user_id, job_id) {
|
|
let response_str = match &llm_result {
|
|
Ok(resp) => serde_json::to_string_pretty(resp).unwrap_or_default(),
|
|
Err(e) => format!("Error: {}", e),
|
|
};
|
|
crate::db::llm_call_log::insert(
|
|
pool, uid, jid, "link_extraction", model,
|
|
&system, &user, &response_str, llm_duration as i32,
|
|
None,
|
|
).await.ok();
|
|
}
|
|
|
|
match llm_result {
|
|
Ok(llm_response) => {
|
|
let urls: Vec<String> = llm_response
|
|
.get("urls")
|
|
.and_then(|u| u.as_array())
|
|
.map(|arr| {
|
|
arr.iter()
|
|
.filter_map(|v| v.as_str())
|
|
.filter_map(|href| {
|
|
let resolved = base_url.join(href).ok()?;
|
|
if resolved.scheme() != "http" && resolved.scheme() != "https" {
|
|
return None;
|
|
}
|
|
if resolved.host_str()?.to_lowercase() != base_domain {
|
|
return None;
|
|
}
|
|
Some(resolved.to_string())
|
|
})
|
|
.collect()
|
|
})
|
|
.unwrap_or_default();
|
|
|
|
if urls.is_empty() {
|
|
tracing::warn!(url = source_url, "LLM returned no links, falling back to heuristic");
|
|
let fallback = extract_links_from_html(&html_text, &base_url, &base_domain);
|
|
Ok(fallback.into_iter().take(max_links).collect())
|
|
} else {
|
|
let mut seen = std::collections::HashSet::new();
|
|
let deduped: Vec<String> = urls.into_iter().filter(|u| seen.insert(u.clone())).collect();
|
|
Ok(deduped.into_iter().take(max_links).collect())
|
|
}
|
|
}
|
|
Err(e) => {
|
|
tracing::warn!(url = source_url, error = %e, "LLM link extraction failed, falling back to heuristic");
|
|
let fallback = extract_links_from_html(&html_text, &base_url, &base_domain);
|
|
Ok(fallback.into_iter().take(max_links).collect())
|
|
}
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Host that every fixture page in this module is served from.
    const HOST: &str = "example.com";

    /// Parse a fixture URL; fixtures are hard-coded, so failure is a test bug.
    fn parse(url: &str) -> Url {
        Url::parse(url).unwrap()
    }

    /// Run the heuristic extractor on `html` as if served from `page`.
    fn heuristic_links(html: &str, page: &str) -> Vec<String> {
        extract_links_from_html(html, &parse(page), HOST)
    }

    /// Run the pair extractor on `html` as if served from `page`.
    fn link_pairs(html: &str, page: &str) -> Vec<(String, String)> {
        extract_links_as_pairs(html, &parse(page))
    }

    // ---- extract_links_from_html -------------------------------------

    #[test]
    fn extracts_article_links_from_html() {
        let page = r#"
            <html><body>
            <a href="/blog/article-1">Article 1</a>
            <a href="/blog/article-2">Article 2</a>
            <a href="/">Home</a>
            </body></html>"#;

        let found = heuristic_links(page, "https://example.com/blog");

        // The root link is dropped; the two articles stay in document order.
        assert_eq!(found.len(), 2);
        assert!(found[0].contains("/blog/article-1"));
        assert!(found[1].contains("/blog/article-2"));
    }

    #[test]
    fn filters_external_links() {
        let found = heuristic_links(
            r#"<a href="https://other.com/article">External</a>"#,
            "https://example.com",
        );
        assert!(found.is_empty());
    }

    #[test]
    fn filters_non_article_patterns() {
        let page = r#"
            <a href="/tag/ai">Tag</a>
            <a href="/category/tech">Category</a>
            <a href="/author/john">Author</a>
            <a href="/login">Login</a>
            "#;
        assert!(heuristic_links(page, "https://example.com").is_empty());
    }

    #[test]
    fn filters_static_assets() {
        let page = r#"
            <a href="/style.css">CSS</a>
            <a href="/script.js">JS</a>
            <a href="/logo.png">Image</a>
            "#;
        assert!(heuristic_links(page, "https://example.com").is_empty());
    }

    #[test]
    fn deduplicates_links() {
        // Same target three times; the last differs only by fragment.
        let page = r#"
            <a href="/article">Link 1</a>
            <a href="/article">Link 2</a>
            <a href="/article#section">Link 3</a>
            "#;
        let found = heuristic_links(page, "https://example.com");
        assert_eq!(found.len(), 1);
    }

    #[test]
    fn resolves_relative_urls() {
        let found = heuristic_links(
            r#"<a href="my-post">Relative</a>"#,
            "https://example.com/blog/",
        );
        assert_eq!(found.len(), 1);
        assert!(found[0].contains("/blog/my-post"));
    }

    #[test]
    fn allows_single_segment_paths() {
        let found = heuristic_links(
            r#"<a href="/my-great-article">Article</a>"#,
            "https://example.com",
        );
        assert_eq!(found.len(), 1);
    }

    #[test]
    fn empty_html_returns_empty() {
        assert!(heuristic_links("", "https://example.com").is_empty());
    }

    // ---- extract_links_as_pairs --------------------------------------

    #[test]
    fn extract_pairs_returns_href_and_text() {
        let page = r#"
            <html><body>
            <a href="/blog/article-1">Breaking AI News</a>
            <a href="/blog/article-2">GPT-6 Released</a>
            </body></html>"#;

        let found = link_pairs(page, "https://example.com/blog");
        assert_eq!(found.len(), 2);

        let (first_url, first_text) = &found[0];
        assert!(first_url.contains("/blog/article-1"));
        assert_eq!(first_text, "Breaking AI News");

        let (second_url, second_text) = &found[1];
        assert!(second_url.contains("/blog/article-2"));
        assert_eq!(second_text, "GPT-6 Released");
    }

    #[test]
    fn extract_pairs_filters_external_links() {
        let found = link_pairs(
            r#"<a href="https://other.com/article">External</a>"#,
            "https://example.com",
        );
        assert!(found.is_empty());
    }

    #[test]
    fn extract_pairs_filters_root_path() {
        let found = link_pairs(r#"<a href="/">Home</a>"#, "https://example.com");
        assert!(found.is_empty());
    }

    #[test]
    fn extract_pairs_handles_empty_anchor_text() {
        // An image-only link has no text nodes at all.
        let found = link_pairs(
            r#"<a href="/article"><img src="pic.jpg"/></a>"#,
            "https://example.com",
        );
        assert_eq!(found.len(), 1);
        assert_eq!(found[0].1, "");
    }

    // ---- format_links_for_llm ----------------------------------------

    #[test]
    fn format_links_for_llm_formats_correctly() {
        let rendered = format_links_for_llm(&[
            ("https://example.com/a".to_string(), "Article One".to_string()),
            ("https://example.com/b".to_string(), "".to_string()),
        ]);

        assert!(rendered.contains("- https://example.com/a | \"Article One\""));
        assert!(rendered.contains("- https://example.com/b"));
        // A link without text must not get an empty quoted label.
        assert!(!rendered.contains("| \"\""));
    }

    #[test]
    fn format_links_for_llm_caps_at_200() {
        let many: Vec<(String, String)> = (0..300)
            .map(|n| (format!("https://example.com/{}", n), format!("Link {}", n)))
            .collect();

        let rendered = format_links_for_llm(&many);
        assert_eq!(rendered.lines().count(), 200);
    }
}
|