//! Source page scraper: fetches a source URL and extracts article links. //! //! Used in Phase 1 of the generation pipeline to discover articles //! from user-configured sources before falling back to LLM web search. use std::sync::Arc; use crate::errors::AppError; use crate::services::llm::LlmProvider; use crate::services::llm::schema::build_link_extraction_schema; use crate::services::prompts::build_link_extraction_prompt; use scraper::{Html, Selector}; use url::Url; /// Patterns in URL paths that indicate non-article pages. const EXCLUDED_PATH_PATTERNS: &[&str] = &[ "/tag", "/category", "/author", "/page", "/login", "/signup", "/privacy", "/terms", "/search", "/contact", "/about", "/topics", "/archive", "/companies", "/events", "/company", "/event", "/collections", ]; /// File extensions that indicate static assets, not articles. const EXCLUDED_EXTENSIONS: &[&str] = &[ ".css", ".js", ".png", ".jpg", ".jpeg", ".gif", ".svg", ".pdf", ".zip", ".xml", ".json", ".ico", ".woff", ".woff2", ]; /// Extract article links from a source page. /// /// Fetches the HTML at `source_url`, extracts all `` links, /// filters to same-domain article-like URLs, deduplicates, and returns /// up to `max_links` candidate URLs. 
pub async fn extract_article_links( http_client: &reqwest::Client, source_url: &str, max_links: usize, ) -> Result, AppError> { let base_url = Url::parse(source_url) .map_err(|e| AppError::BadRequest(format!("Invalid source URL: {}", e)))?; let base_domain = base_url.host_str().unwrap_or("").to_lowercase(); let response = http_client .get(source_url) .send() .await .map_err(|e| { tracing::warn!(url = source_url, error = %e, "Failed to fetch source page"); AppError::Internal(anyhow::anyhow!("Failed to fetch source page")) })?; if !response.status().is_success() { tracing::warn!(url = source_url, status = %response.status(), "Source page returned non-200"); return Ok(Vec::new()); } let html_text = response.text().await.map_err(|e| { AppError::Internal(anyhow::anyhow!("Failed to read source page body: {}", e)) })?; let links = extract_links_from_html(&html_text, &base_url, &base_domain); Ok(links.into_iter().take(max_links).collect()) } /// Extract and filter article links from HTML content. /// /// This is a pure function (no I/O) for easy testing. 
pub fn extract_links_from_html(
    html: &str,
    base_url: &Url,
    base_domain: &str,
) -> Vec<String> {
    let document = Html::parse_document(html);
    // The selector is a constant, so failing to parse it is a programming error.
    let selector = Selector::parse("a[href]").expect("`a[href]` is a valid CSS selector");

    // `seen` provides O(1) duplicate detection; `links` preserves document order.
    let mut seen = std::collections::HashSet::new();
    let mut links = Vec::new();

    let hrefs = document
        .select(&selector)
        .filter_map(|anchor| anchor.value().attr("href"));

    for href in hrefs {
        // Resolves relative hrefs; unparseable ones are skipped.
        let mut resolved = match base_url.join(href) {
            Ok(url) => url,
            Err(_) => continue,
        };

        // Drops `mailto:`, `javascript:`, `tel:` and similar.
        if !matches!(resolved.scheme(), "http" | "https") {
            continue;
        }

        // Case-insensitive comparison: works even if the caller passes a
        // mixed-case `base_domain`, and avoids allocating a String per link.
        if !resolved
            .host_str()
            .unwrap_or("")
            .eq_ignore_ascii_case(base_domain)
        {
            continue;
        }

        let path_lower = resolved.path().to_lowercase();
        if path_lower.is_empty() || path_lower == "/" {
            continue;
        }
        if EXCLUDED_PATH_PATTERNS.iter().any(|p| path_lower.contains(p)) {
            continue;
        }
        if EXCLUDED_EXTENSIONS.iter().any(|ext| path_lower.ends_with(ext)) {
            continue;
        }

        // `/post` and `/post#comments` are the same article.
        resolved.set_fragment(None);
        let url_str = resolved.to_string();
        if seen.insert(url_str.clone()) {
            links.push(url_str);
        }
    }

    links
}

/// Extract all links from HTML as (href, anchor_text) pairs for LLM analysis.
///
/// Minimal filtering: same-domain, http/https, non-empty path.
/// No article-pattern filtering — the LLM decides which are articles.
pub fn extract_links_as_pairs( html: &str, base_url: &Url, ) -> Vec<(String, String)> { let base_domain = base_url.host_str().unwrap_or("").to_lowercase(); let document = Html::parse_document(html); let selector = Selector::parse("a[href]").unwrap(); let mut pairs = Vec::new(); for element in document.select(&selector) { if let Some(href) = element.value().attr("href") { let resolved = match base_url.join(href) { Ok(u) => u, Err(_) => continue, }; if resolved.scheme() != "http" && resolved.scheme() != "https" { continue; } let link_domain = resolved.host_str().unwrap_or("").to_lowercase(); if link_domain != base_domain { continue; } let path = resolved.path(); if path.is_empty() || path == "/" { continue; } let anchor_text: String = element.text().collect::>().join(" "); let anchor_text = anchor_text.trim().to_string(); pairs.push((resolved.to_string(), anchor_text)); } } pairs } /// Format link pairs as a text list for the LLM prompt. /// Caps at 200 links to limit token usage. fn format_links_for_llm(pairs: &[(String, String)]) -> String { pairs .iter() .take(200) .map(|(href, text)| { if text.is_empty() { format!("- {}", href) } else { format!("- {} | \"{}\"", href, text) } }) .collect::>() .join("\n") } /// Extract article links using LLM analysis of the page HTML. /// /// Falls back to heuristic extraction if the LLM call fails or returns empty. 
#[allow(clippy::too_many_arguments)] pub async fn extract_article_links_with_llm( http_client: &reqwest::Client, source_url: &str, max_links: usize, provider: &Arc, model: &str, pool: Option<&sqlx::PgPool>, user_id: Option, job_id: Option, ) -> Result, AppError> { let base_url = Url::parse(source_url) .map_err(|e| AppError::BadRequest(format!("Invalid source URL: {}", e)))?; let base_domain = base_url.host_str().unwrap_or("").to_lowercase(); let response = http_client.get(source_url).send().await.map_err(|e| { tracing::warn!(url = source_url, error = %e, "Failed to fetch source page"); AppError::Internal(anyhow::anyhow!("Failed to fetch source page")) })?; if !response.status().is_success() { tracing::warn!(url = source_url, status = %response.status(), "Source page returned non-200"); return Ok(Vec::new()); } let html_text = response.text().await.map_err(|e| { AppError::Internal(anyhow::anyhow!("Failed to read source page body: {}", e)) })?; let pairs = extract_links_as_pairs(&html_text, &base_url); let links_text = format_links_for_llm(&pairs); let (system, user) = build_link_extraction_prompt(&links_text); let schema = build_link_extraction_schema(); let llm_start = std::time::Instant::now(); let llm_result = provider.call_llm(model, &system, &user, &schema).await; let llm_duration = llm_start.elapsed().as_millis() as u64; // Log the LLM call if pool/user_id/job_id are provided if let (Some(pool), Some(uid), Some(jid)) = (pool, user_id, job_id) { let response_str = match &llm_result { Ok(resp) => serde_json::to_string_pretty(resp).unwrap_or_default(), Err(e) => format!("Error: {}", e), }; crate::db::llm_call_log::insert( pool, uid, jid, "link_extraction", model, &system, &user, &response_str, llm_duration as i32, None, ).await.ok(); } match llm_result { Ok(llm_response) => { let urls: Vec = llm_response .get("urls") .and_then(|u| u.as_array()) .map(|arr| { arr.iter() .filter_map(|v| v.as_str()) .filter_map(|href| { let resolved = base_url.join(href).ok()?; if 
resolved.scheme() != "http" && resolved.scheme() != "https" { return None; } if resolved.host_str()?.to_lowercase() != base_domain { return None; } Some(resolved.to_string()) }) .collect() }) .unwrap_or_default(); if urls.is_empty() { tracing::warn!(url = source_url, "LLM returned no links, falling back to heuristic"); let fallback = extract_links_from_html(&html_text, &base_url, &base_domain); Ok(fallback.into_iter().take(max_links).collect()) } else { let mut seen = std::collections::HashSet::new(); let deduped: Vec = urls.into_iter().filter(|u| seen.insert(u.clone())).collect(); Ok(deduped.into_iter().take(max_links).collect()) } } Err(e) => { tracing::warn!(url = source_url, error = %e, "LLM link extraction failed, falling back to heuristic"); let fallback = extract_links_from_html(&html_text, &base_url, &base_domain); Ok(fallback.into_iter().take(max_links).collect()) } } } #[cfg(test)] mod tests { use super::*; fn base_url(s: &str) -> Url { Url::parse(s).unwrap() } #[test] fn extracts_article_links_from_html() { let html = r#" Article 1 Article 2 Home "#; let base = base_url("https://example.com/blog"); let links = extract_links_from_html(html, &base, "example.com"); assert_eq!(links.len(), 2); assert!(links[0].contains("/blog/article-1")); assert!(links[1].contains("/blog/article-2")); } #[test] fn filters_external_links() { let html = r#"External"#; let base = base_url("https://example.com"); let links = extract_links_from_html(html, &base, "example.com"); assert!(links.is_empty()); } #[test] fn filters_non_article_patterns() { let html = r#" Tag Category Author Login "#; let base = base_url("https://example.com"); let links = extract_links_from_html(html, &base, "example.com"); assert!(links.is_empty()); } #[test] fn filters_static_assets() { let html = r#" CSS JS Image "#; let base = base_url("https://example.com"); let links = extract_links_from_html(html, &base, "example.com"); assert!(links.is_empty()); } #[test] fn deduplicates_links() { let html = 
r#" Link 1 Link 2 Link 3 "#; let base = base_url("https://example.com"); let links = extract_links_from_html(html, &base, "example.com"); assert_eq!(links.len(), 1); } #[test] fn resolves_relative_urls() { let html = r#"Relative"#; let base = base_url("https://example.com/blog/"); let links = extract_links_from_html(html, &base, "example.com"); assert_eq!(links.len(), 1); assert!(links[0].contains("/blog/my-post")); } #[test] fn allows_single_segment_paths() { let html = r#"Article"#; let base = base_url("https://example.com"); let links = extract_links_from_html(html, &base, "example.com"); assert_eq!(links.len(), 1); } #[test] fn empty_html_returns_empty() { let links = extract_links_from_html("", &base_url("https://example.com"), "example.com"); assert!(links.is_empty()); } #[test] fn extract_pairs_returns_href_and_text() { let html = r#" Breaking AI News GPT-6 Released "#; let base = base_url("https://example.com/blog"); let pairs = extract_links_as_pairs(html, &base); assert_eq!(pairs.len(), 2); assert!(pairs[0].0.contains("/blog/article-1")); assert_eq!(pairs[0].1, "Breaking AI News"); assert!(pairs[1].0.contains("/blog/article-2")); assert_eq!(pairs[1].1, "GPT-6 Released"); } #[test] fn extract_pairs_filters_external_links() { let html = r#"External"#; let base = base_url("https://example.com"); let pairs = extract_links_as_pairs(html, &base); assert!(pairs.is_empty()); } #[test] fn extract_pairs_filters_root_path() { let html = r#"Home"#; let base = base_url("https://example.com"); let pairs = extract_links_as_pairs(html, &base); assert!(pairs.is_empty()); } #[test] fn extract_pairs_handles_empty_anchor_text() { let html = r#""#; let base = base_url("https://example.com"); let pairs = extract_links_as_pairs(html, &base); assert_eq!(pairs.len(), 1); assert_eq!(pairs[0].1, ""); } #[test] fn format_links_for_llm_formats_correctly() { let pairs = vec![ ("https://example.com/a".to_string(), "Article One".to_string()), ("https://example.com/b".to_string(), 
"".to_string()), ]; let result = format_links_for_llm(&pairs); assert!(result.contains("- https://example.com/a | \"Article One\"")); assert!(result.contains("- https://example.com/b")); assert!(!result.contains("| \"\"")); } #[test] fn format_links_for_llm_caps_at_200() { let pairs: Vec<(String, String)> = (0..300) .map(|i| (format!("https://example.com/{}", i), format!("Link {}", i))) .collect(); let result = format_links_for_llm(&pairs); let line_count = result.lines().count(); assert_eq!(line_count, 200); } }