|
|
|
@ -13,8 +13,9 @@ use url::Url;
|
|
|
|
|
|
|
|
|
|
|
|
/// Patterns in URL paths that indicate non-article pages.
|
|
|
|
/// Patterns in URL paths that indicate non-article pages.
|
|
|
|
const EXCLUDED_PATH_PATTERNS: &[&str] = &[
|
|
|
|
const EXCLUDED_PATH_PATTERNS: &[&str] = &[
|
|
|
|
"/tag/", "/category/", "/author/", "/page/", "/login", "/signup",
|
|
|
|
"/tag", "/category", "/author", "/page", "/login", "/signup",
|
|
|
|
"/privacy", "/terms", "/search", "/contact", "/about",
|
|
|
|
"/privacy", "/terms", "/search", "/contact", "/about", "/topics",
|
|
|
|
|
|
|
|
"/archive", "/companies", "/events", "/company", "/event", "/collections",
|
|
|
|
];
|
|
|
|
];
|
|
|
|
|
|
|
|
|
|
|
|
/// File extensions that indicate static assets, not articles.
|
|
|
|
/// File extensions that indicate static assets, not articles.
|
|
|
|
@ -137,6 +138,9 @@ pub async fn extract_article_links_with_llm(
|
|
|
|
max_links: usize,
|
|
|
|
max_links: usize,
|
|
|
|
provider: &Arc<dyn LlmProvider>,
|
|
|
|
provider: &Arc<dyn LlmProvider>,
|
|
|
|
model: &str,
|
|
|
|
model: &str,
|
|
|
|
|
|
|
|
pool: Option<&sqlx::PgPool>,
|
|
|
|
|
|
|
|
user_id: Option<uuid::Uuid>,
|
|
|
|
|
|
|
|
job_id: Option<uuid::Uuid>,
|
|
|
|
) -> Result<Vec<String>, AppError> {
|
|
|
|
) -> Result<Vec<String>, AppError> {
|
|
|
|
let base_url = Url::parse(source_url)
|
|
|
|
let base_url = Url::parse(source_url)
|
|
|
|
.map_err(|e| AppError::BadRequest(format!("Invalid source URL: {}", e)))?;
|
|
|
|
.map_err(|e| AppError::BadRequest(format!("Invalid source URL: {}", e)))?;
|
|
|
|
@ -160,7 +164,23 @@ pub async fn extract_article_links_with_llm(
|
|
|
|
let (system, user) = build_link_extraction_prompt(&head_html, &body_html);
|
|
|
|
let (system, user) = build_link_extraction_prompt(&head_html, &body_html);
|
|
|
|
let schema = build_link_extraction_schema();
|
|
|
|
let schema = build_link_extraction_schema();
|
|
|
|
|
|
|
|
|
|
|
|
match provider.call_llm(model, &system, &user, &schema).await {
|
|
|
|
let llm_start = std::time::Instant::now();
|
|
|
|
|
|
|
|
let llm_result = provider.call_llm(model, &system, &user, &schema).await;
|
|
|
|
|
|
|
|
let llm_duration = llm_start.elapsed().as_millis() as u64;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Log the LLM call if pool/user_id/job_id are provided
|
|
|
|
|
|
|
|
if let (Some(pool), Some(uid), Some(jid)) = (pool, user_id, job_id) {
|
|
|
|
|
|
|
|
let response_str = match &llm_result {
|
|
|
|
|
|
|
|
Ok(resp) => serde_json::to_string_pretty(resp).unwrap_or_default(),
|
|
|
|
|
|
|
|
Err(e) => format!("Error: {}", e),
|
|
|
|
|
|
|
|
};
|
|
|
|
|
|
|
|
crate::db::llm_call_log::insert(
|
|
|
|
|
|
|
|
pool, uid, jid, "link_extraction", model,
|
|
|
|
|
|
|
|
&system, &user, &response_str, llm_duration as i32,
|
|
|
|
|
|
|
|
).await.ok();
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
match llm_result {
|
|
|
|
Ok(llm_response) => {
|
|
|
|
Ok(llm_response) => {
|
|
|
|
let urls: Vec<String> = llm_response
|
|
|
|
let urls: Vec<String> = llm_response
|
|
|
|
.get("urls")
|
|
|
|
.get("urls")
|
|
|
|
|