diff --git a/backend/src/services/source_scraper.rs b/backend/src/services/source_scraper.rs
index 2d64482..c272156 100644
--- a/backend/src/services/source_scraper.rs
+++ b/backend/src/services/source_scraper.rs
@@ -3,7 +3,11 @@
//! Used in Phase 1 of the generation pipeline to discover articles
//! from user-configured sources before falling back to LLM web search.
+use std::sync::Arc;
use crate::errors::AppError;
+use crate::services::llm::LlmProvider;
+use crate::services::llm::schema::build_link_extraction_schema;
+use crate::services::prompts::build_link_extraction_prompt;
use scraper::{Html, Selector};
use url::Url;
@@ -112,6 +116,90 @@ pub fn extract_links_from_html(
links
}
+/// Extract the `<head>` section and the first 8000 chars of `<body>` from HTML (UTF-8 safe).
+pub fn extract_head_and_body(html: &str) -> (String, String) {
+ let head_start = html.find("").map(|i| i + 7).unwrap_or(head_start);
+ let head = &html[head_start..head_end];
+
+ let body_start = html.find(",
+ model: &str,
+) -> Result, AppError> {
+ let base_url = Url::parse(source_url)
+ .map_err(|e| AppError::BadRequest(format!("Invalid source URL: {}", e)))?;
+ let base_domain = base_url.host_str().unwrap_or("").to_lowercase();
+
+ let response = http_client.get(source_url).send().await.map_err(|e| {
+ tracing::warn!(url = source_url, error = %e, "Failed to fetch source page");
+ AppError::Internal(anyhow::anyhow!("Failed to fetch source page"))
+ })?;
+
+ if !response.status().is_success() {
+ tracing::warn!(url = source_url, status = %response.status(), "Source page returned non-200");
+ return Ok(Vec::new());
+ }
+
+ let html_text = response.text().await.map_err(|e| {
+ AppError::Internal(anyhow::anyhow!("Failed to read source page body: {}", e))
+ })?;
+
+ let (head_html, body_html) = extract_head_and_body(&html_text);
+ let (system, user) = build_link_extraction_prompt(&head_html, &body_html);
+ let schema = build_link_extraction_schema();
+
+ match provider.generate_rewrite_pass(model, &system, &user, &schema).await {
+ Ok(llm_response) => {
+ let urls: Vec = llm_response
+ .get("urls")
+ .and_then(|u| u.as_array())
+ .map(|arr| {
+ arr.iter()
+ .filter_map(|v| v.as_str())
+ .filter_map(|href| {
+ let resolved = base_url.join(href).ok()?;
+ if resolved.scheme() != "http" && resolved.scheme() != "https" {
+ return None;
+ }
+ if resolved.host_str()?.to_lowercase() != base_domain {
+ return None;
+ }
+ Some(resolved.to_string())
+ })
+ .collect()
+ })
+ .unwrap_or_default();
+
+ if urls.is_empty() {
+ tracing::warn!(url = source_url, "LLM returned no links, falling back to heuristic");
+ let fallback = extract_links_from_html(&html_text, &base_url, &base_domain);
+ Ok(fallback.into_iter().take(max_links).collect())
+ } else {
+ let mut seen = std::collections::HashSet::new();
+ let deduped: Vec = urls.into_iter().filter(|u| seen.insert(u.clone())).collect();
+ Ok(deduped.into_iter().take(max_links).collect())
+ }
+ }
+ Err(e) => {
+ tracing::warn!(url = source_url, error = %e, "LLM link extraction failed, falling back to heuristic");
+ let fallback = extract_links_from_html(&html_text, &base_url, &base_domain);
+ Ok(fallback.into_iter().take(max_links).collect())
+ }
+ }
+}
+
#[cfg(test)]
mod tests {
use super::*;
@@ -202,4 +290,20 @@ mod tests {
let links = extract_links_from_html("", &base_url("https://example.com"), "example.com");
assert!(links.is_empty());
}
+
+ #[test]
+ fn extract_head_and_body_splits_correctly() {
+ let html = "TContent
";
+ let (head, body) = extract_head_and_body(html);
+ assert!(head.contains("T"));
+ assert!(body.contains("Content
"));
+ }
+
+ #[test]
+ fn extract_head_and_body_truncates_body_safely() {
+ let long_body = "x".repeat(20000);
+ let html = format!("{}", long_body);
+ let (_, body) = extract_head_and_body(&html);
+ assert_eq!(body.chars().count(), 8000);
+ }
}