From 357f06e40578670858262e3059c5b3fe2e79f53b Mon Sep 17 00:00:00 2001 From: oabrivard Date: Tue, 24 Mar 2026 10:45:14 +0100 Subject: [PATCH] feat: LLM-assisted source link extraction with heuristic fallback Co-Authored-By: Claude Sonnet 4.6 --- backend/src/services/source_scraper.rs | 104 +++++++++++++++++++++++++ 1 file changed, 104 insertions(+) diff --git a/backend/src/services/source_scraper.rs b/backend/src/services/source_scraper.rs index 2d64482..c272156 100644 --- a/backend/src/services/source_scraper.rs +++ b/backend/src/services/source_scraper.rs @@ -3,7 +3,11 @@ //! Used in Phase 1 of the generation pipeline to discover articles //! from user-configured sources before falling back to LLM web search. +use std::sync::Arc; use crate::errors::AppError; +use crate::services::llm::LlmProvider; +use crate::services::llm::schema::build_link_extraction_schema; +use crate::services::prompts::build_link_extraction_prompt; use scraper::{Html, Selector}; use url::Url; @@ -112,6 +116,90 @@ pub fn extract_links_from_html( links } +/// Extract `<head>` section and first 8000 chars of `<body>` from HTML (UTF-8 safe). 
+pub fn extract_head_and_body(html: &str) -> (String, String) { + let head_start = html.find("").map(|i| i + 7).unwrap_or(head_start); + let head = &html[head_start..head_end]; + + let body_start = html.find(", + model: &str, +) -> Result, AppError> { + let base_url = Url::parse(source_url) + .map_err(|e| AppError::BadRequest(format!("Invalid source URL: {}", e)))?; + let base_domain = base_url.host_str().unwrap_or("").to_lowercase(); + + let response = http_client.get(source_url).send().await.map_err(|e| { + tracing::warn!(url = source_url, error = %e, "Failed to fetch source page"); + AppError::Internal(anyhow::anyhow!("Failed to fetch source page")) + })?; + + if !response.status().is_success() { + tracing::warn!(url = source_url, status = %response.status(), "Source page returned non-200"); + return Ok(Vec::new()); + } + + let html_text = response.text().await.map_err(|e| { + AppError::Internal(anyhow::anyhow!("Failed to read source page body: {}", e)) + })?; + + let (head_html, body_html) = extract_head_and_body(&html_text); + let (system, user) = build_link_extraction_prompt(&head_html, &body_html); + let schema = build_link_extraction_schema(); + + match provider.generate_rewrite_pass(model, &system, &user, &schema).await { + Ok(llm_response) => { + let urls: Vec = llm_response + .get("urls") + .and_then(|u| u.as_array()) + .map(|arr| { + arr.iter() + .filter_map(|v| v.as_str()) + .filter_map(|href| { + let resolved = base_url.join(href).ok()?; + if resolved.scheme() != "http" && resolved.scheme() != "https" { + return None; + } + if resolved.host_str()?.to_lowercase() != base_domain { + return None; + } + Some(resolved.to_string()) + }) + .collect() + }) + .unwrap_or_default(); + + if urls.is_empty() { + tracing::warn!(url = source_url, "LLM returned no links, falling back to heuristic"); + let fallback = extract_links_from_html(&html_text, &base_url, &base_domain); + Ok(fallback.into_iter().take(max_links).collect()) + } else { + let mut seen = 
std::collections::HashSet::new(); + let deduped: Vec = urls.into_iter().filter(|u| seen.insert(u.clone())).collect(); + Ok(deduped.into_iter().take(max_links).collect()) + } + } + Err(e) => { + tracing::warn!(url = source_url, error = %e, "LLM link extraction failed, falling back to heuristic"); + let fallback = extract_links_from_html(&html_text, &base_url, &base_domain); + Ok(fallback.into_iter().take(max_links).collect()) + } + } +} + #[cfg(test)] mod tests { use super::*; @@ -202,4 +290,20 @@ mod tests { let links = extract_links_from_html("", &base_url("https://example.com"), "example.com"); assert!(links.is_empty()); } + + #[test] + fn extract_head_and_body_splits_correctly() { + let html = "T

Content

"; + let (head, body) = extract_head_and_body(html); + assert!(head.contains("T")); + assert!(body.contains("

Content

")); + } + + #[test] + fn extract_head_and_body_truncates_body_safely() { + let long_body = "x".repeat(20000); + let html = format!("{}", long_body); + let (_, body) = extract_head_and_body(&html); + assert_eq!(body.chars().count(), 8000); + } }