diff --git a/backend/src/services/mod.rs b/backend/src/services/mod.rs index ae7a96c..4432534 100644 --- a/backend/src/services/mod.rs +++ b/backend/src/services/mod.rs @@ -7,5 +7,6 @@ pub mod llm; pub mod prompts; pub mod rate_limiter; pub mod scraper; +pub mod source_scraper; pub mod synthesis; pub mod turnstile; diff --git a/backend/src/services/source_scraper.rs b/backend/src/services/source_scraper.rs new file mode 100644 index 0000000..2d64482 --- /dev/null +++ b/backend/src/services/source_scraper.rs @@ -0,0 +1,205 @@ +//! Source page scraper: fetches a source URL and extracts article links. +//! +//! Used in Phase 1 of the generation pipeline to discover articles +//! from user-configured sources before falling back to LLM web search. + +use crate::errors::AppError; +use scraper::{Html, Selector}; +use url::Url; + +/// Patterns in URL paths that indicate non-article pages. +const EXCLUDED_PATH_PATTERNS: &[&str] = &[ + "/tag/", "/category/", "/author/", "/page/", "/login", "/signup", + "/privacy", "/terms", "/search", "/contact", "/about", +]; + +/// File extensions that indicate static assets, not articles. +const EXCLUDED_EXTENSIONS: &[&str] = &[ + ".css", ".js", ".png", ".jpg", ".jpeg", ".gif", ".svg", + ".pdf", ".zip", ".xml", ".json", ".ico", ".woff", ".woff2", +]; + +/// Extract article links from a source page. +/// +/// Fetches the HTML at `source_url`, extracts all `<a href>` links, +/// filters to same-domain article-like URLs, deduplicates, and returns +/// up to `max_links` candidate URLs. 
+pub async fn extract_article_links( + http_client: &reqwest::Client, + source_url: &str, + max_links: usize, +) -> Result, AppError> { + let base_url = Url::parse(source_url) + .map_err(|e| AppError::BadRequest(format!("Invalid source URL: {}", e)))?; + let base_domain = base_url.host_str().unwrap_or("").to_lowercase(); + + let response = http_client + .get(source_url) + .send() + .await + .map_err(|e| { + tracing::warn!(url = source_url, error = %e, "Failed to fetch source page"); + AppError::Internal(anyhow::anyhow!("Failed to fetch source page")) + })?; + + if !response.status().is_success() { + tracing::warn!(url = source_url, status = %response.status(), "Source page returned non-200"); + return Ok(Vec::new()); + } + + let html_text = response.text().await.map_err(|e| { + AppError::Internal(anyhow::anyhow!("Failed to read source page body: {}", e)) + })?; + + let links = extract_links_from_html(&html_text, &base_url, &base_domain); + + Ok(links.into_iter().take(max_links).collect()) +} + +/// Extract and filter article links from HTML content. +/// +/// This is a pure function (no I/O) for easy testing. 
+pub fn extract_links_from_html( + html: &str, + base_url: &Url, + base_domain: &str, +) -> Vec { + let document = Html::parse_document(html); + let selector = Selector::parse("a[href]").unwrap(); + let mut seen = std::collections::HashSet::new(); + let mut links = Vec::new(); + + for element in document.select(&selector) { + if let Some(href) = element.value().attr("href") { + let resolved = match base_url.join(href) { + Ok(u) => u, + Err(_) => continue, + }; + + if resolved.scheme() != "http" && resolved.scheme() != "https" { + continue; + } + + let link_domain = resolved.host_str().unwrap_or("").to_lowercase(); + if link_domain != base_domain { + continue; + } + + let path = resolved.path(); + if path.is_empty() || path == "/" { + continue; + } + + let path_lower = path.to_lowercase(); + if EXCLUDED_PATH_PATTERNS.iter().any(|p| path_lower.contains(p)) { + continue; + } + + if EXCLUDED_EXTENSIONS.iter().any(|ext| path_lower.ends_with(ext)) { + continue; + } + + let mut normalized = resolved.clone(); + normalized.set_fragment(None); + let url_str = normalized.to_string(); + + if seen.insert(url_str.clone()) { + links.push(url_str); + } + } + } + + links +} + +#[cfg(test)] +mod tests { + use super::*; + + fn base_url(s: &str) -> Url { + Url::parse(s).unwrap() + } + + #[test] + fn extracts_article_links_from_html() { + let html = r#" + + Article 1 + Article 2 + Home + "#; + let base = base_url("https://example.com/blog"); + let links = extract_links_from_html(html, &base, "example.com"); + assert_eq!(links.len(), 2); + assert!(links[0].contains("/blog/article-1")); + assert!(links[1].contains("/blog/article-2")); + } + + #[test] + fn filters_external_links() { + let html = r#"External"#; + let base = base_url("https://example.com"); + let links = extract_links_from_html(html, &base, "example.com"); + assert!(links.is_empty()); + } + + #[test] + fn filters_non_article_patterns() { + let html = r#" + Tag + Category + Author + Login + "#; + let base = 
base_url("https://example.com"); + let links = extract_links_from_html(html, &base, "example.com"); + assert!(links.is_empty()); + } + + #[test] + fn filters_static_assets() { + let html = r#" + CSS + JS + Image + "#; + let base = base_url("https://example.com"); + let links = extract_links_from_html(html, &base, "example.com"); + assert!(links.is_empty()); + } + + #[test] + fn deduplicates_links() { + let html = r#" + Link 1 + Link 2 + Link 3 + "#; + let base = base_url("https://example.com"); + let links = extract_links_from_html(html, &base, "example.com"); + assert_eq!(links.len(), 1); + } + + #[test] + fn resolves_relative_urls() { + let html = r#"Relative"#; + let base = base_url("https://example.com/blog/"); + let links = extract_links_from_html(html, &base, "example.com"); + assert_eq!(links.len(), 1); + assert!(links[0].contains("/blog/my-post")); + } + + #[test] + fn allows_single_segment_paths() { + let html = r#"Article"#; + let base = base_url("https://example.com"); + let links = extract_links_from_html(html, &base, "example.com"); + assert_eq!(links.len(), 1); + } + + #[test] + fn empty_html_returns_empty() { + let links = extract_links_from_html("", &base_url("https://example.com"), "example.com"); + assert!(links.is_empty()); + } +}