|
|
|
@ -3,7 +3,11 @@
|
|
|
|
//! Used in Phase 1 of the generation pipeline to discover articles
|
|
|
|
//! Used in Phase 1 of the generation pipeline to discover articles
|
|
|
|
//! from user-configured sources before falling back to LLM web search.
|
|
|
|
//! from user-configured sources before falling back to LLM web search.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
use std::sync::Arc;
|
|
|
|
use crate::errors::AppError;
|
|
|
|
use crate::errors::AppError;
|
|
|
|
|
|
|
|
use crate::services::llm::LlmProvider;
|
|
|
|
|
|
|
|
use crate::services::llm::schema::build_link_extraction_schema;
|
|
|
|
|
|
|
|
use crate::services::prompts::build_link_extraction_prompt;
|
|
|
|
use scraper::{Html, Selector};
|
|
|
|
use scraper::{Html, Selector};
|
|
|
|
use url::Url;
|
|
|
|
use url::Url;
|
|
|
|
|
|
|
|
|
|
|
|
@ -112,6 +116,90 @@ pub fn extract_links_from_html(
|
|
|
|
links
|
|
|
|
links
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/// Extract the `<head>` section and the first 8000 chars of `<body>` from HTML (UTF-8 safe).
///
/// Returns `(head, body)` as owned strings:
///
/// * `head` runs from the first `<head` that precedes the first `</head>` (or from the
///   start of the document when no such opening tag exists) through the end of that
///   `</head>`. It is empty when the document has no `</head>` at all.
/// * `body` starts at the first `<body`, or directly after the head section when there
///   is no `<body`, and is capped at 8000 `char`s. The cap counts characters rather than
///   bytes, so a multi-byte code point is never split.
///
/// Matching is plain, case-sensitive substring search; no HTML parsing is performed.
pub fn extract_head_and_body(html: &str) -> (String, String) {
    const HEAD_CLOSE: &str = "</head>";
    const BODY_CHAR_LIMIT: usize = 8000;

    let head_close = html.find(HEAD_CLOSE);

    // Only look for the opening tag *before* the closing tag. Searching the whole
    // document could match a later `<header>` element when the `<head>` start tag is
    // omitted, yielding start > end and a slice panic.
    let head_start = match head_close {
        Some(close) => html[..close].find("<head"),
        None => html.find("<head"),
    }
    .unwrap_or(0);
    let head_end = head_close.map_or(head_start, |close| close + HEAD_CLOSE.len());
    let head = &html[head_start..head_end];

    let body_start = html.find("<body").unwrap_or(head_end);
    let body: String = html[body_start..].chars().take(BODY_CHAR_LIMIT).collect();

    (head.to_string(), body)
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/// Extract article links using LLM analysis of the page HTML.
|
|
|
|
|
|
|
|
///
|
|
|
|
|
|
|
|
/// Falls back to heuristic extraction if the LLM call fails or returns empty.
|
|
|
|
|
|
|
|
pub async fn extract_article_links_with_llm(
|
|
|
|
|
|
|
|
http_client: &reqwest::Client,
|
|
|
|
|
|
|
|
source_url: &str,
|
|
|
|
|
|
|
|
max_links: usize,
|
|
|
|
|
|
|
|
provider: &Arc<dyn LlmProvider>,
|
|
|
|
|
|
|
|
model: &str,
|
|
|
|
|
|
|
|
) -> Result<Vec<String>, AppError> {
|
|
|
|
|
|
|
|
let base_url = Url::parse(source_url)
|
|
|
|
|
|
|
|
.map_err(|e| AppError::BadRequest(format!("Invalid source URL: {}", e)))?;
|
|
|
|
|
|
|
|
let base_domain = base_url.host_str().unwrap_or("").to_lowercase();
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
let response = http_client.get(source_url).send().await.map_err(|e| {
|
|
|
|
|
|
|
|
tracing::warn!(url = source_url, error = %e, "Failed to fetch source page");
|
|
|
|
|
|
|
|
AppError::Internal(anyhow::anyhow!("Failed to fetch source page"))
|
|
|
|
|
|
|
|
})?;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if !response.status().is_success() {
|
|
|
|
|
|
|
|
tracing::warn!(url = source_url, status = %response.status(), "Source page returned non-200");
|
|
|
|
|
|
|
|
return Ok(Vec::new());
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
let html_text = response.text().await.map_err(|e| {
|
|
|
|
|
|
|
|
AppError::Internal(anyhow::anyhow!("Failed to read source page body: {}", e))
|
|
|
|
|
|
|
|
})?;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
let (head_html, body_html) = extract_head_and_body(&html_text);
|
|
|
|
|
|
|
|
let (system, user) = build_link_extraction_prompt(&head_html, &body_html);
|
|
|
|
|
|
|
|
let schema = build_link_extraction_schema();
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
match provider.generate_rewrite_pass(model, &system, &user, &schema).await {
|
|
|
|
|
|
|
|
Ok(llm_response) => {
|
|
|
|
|
|
|
|
let urls: Vec<String> = llm_response
|
|
|
|
|
|
|
|
.get("urls")
|
|
|
|
|
|
|
|
.and_then(|u| u.as_array())
|
|
|
|
|
|
|
|
.map(|arr| {
|
|
|
|
|
|
|
|
arr.iter()
|
|
|
|
|
|
|
|
.filter_map(|v| v.as_str())
|
|
|
|
|
|
|
|
.filter_map(|href| {
|
|
|
|
|
|
|
|
let resolved = base_url.join(href).ok()?;
|
|
|
|
|
|
|
|
if resolved.scheme() != "http" && resolved.scheme() != "https" {
|
|
|
|
|
|
|
|
return None;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
if resolved.host_str()?.to_lowercase() != base_domain {
|
|
|
|
|
|
|
|
return None;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
Some(resolved.to_string())
|
|
|
|
|
|
|
|
})
|
|
|
|
|
|
|
|
.collect()
|
|
|
|
|
|
|
|
})
|
|
|
|
|
|
|
|
.unwrap_or_default();
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if urls.is_empty() {
|
|
|
|
|
|
|
|
tracing::warn!(url = source_url, "LLM returned no links, falling back to heuristic");
|
|
|
|
|
|
|
|
let fallback = extract_links_from_html(&html_text, &base_url, &base_domain);
|
|
|
|
|
|
|
|
Ok(fallback.into_iter().take(max_links).collect())
|
|
|
|
|
|
|
|
} else {
|
|
|
|
|
|
|
|
let mut seen = std::collections::HashSet::new();
|
|
|
|
|
|
|
|
let deduped: Vec<String> = urls.into_iter().filter(|u| seen.insert(u.clone())).collect();
|
|
|
|
|
|
|
|
Ok(deduped.into_iter().take(max_links).collect())
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
Err(e) => {
|
|
|
|
|
|
|
|
tracing::warn!(url = source_url, error = %e, "LLM link extraction failed, falling back to heuristic");
|
|
|
|
|
|
|
|
let fallback = extract_links_from_html(&html_text, &base_url, &base_domain);
|
|
|
|
|
|
|
|
Ok(fallback.into_iter().take(max_links).collect())
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
#[cfg(test)]
|
|
|
|
#[cfg(test)]
|
|
|
|
mod tests {
|
|
|
|
mod tests {
|
|
|
|
use super::*;
|
|
|
|
use super::*;
|
|
|
|
@ -202,4 +290,20 @@ mod tests {
|
|
|
|
let links = extract_links_from_html("", &base_url("https://example.com"), "example.com");
|
|
|
|
let links = extract_links_from_html("", &base_url("https://example.com"), "example.com");
|
|
|
|
assert!(links.is_empty());
|
|
|
|
assert!(links.is_empty());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#[test]
fn extract_head_and_body_splits_correctly() {
    // A minimal document: head markup must land in the first half of the tuple,
    // body markup in the second.
    let (head, body) = extract_head_and_body(
        "<html><head><title>T</title></head><body><p>Content</p></body></html>",
    );
    assert!(head.contains("<title>T</title>"));
    assert!(body.contains("<p>Content</p>"));
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#[test]
fn extract_head_and_body_truncates_body_safely() {
    // Use a multi-byte character so that a byte-based truncation would either panic
    // on a char boundary or produce the wrong character count.
    let long_body = "é".repeat(20000);
    let html = format!("<head></head><body>{}</body>", long_body);
    let (_, body) = extract_head_and_body(&html);
    assert_eq!(body.chars().count(), 8000);
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|