fix: log LLM calls for source link extraction in llm_call_log

master
oabrivard 3 months ago
parent fb765d6c8f
commit a760220d44

@ -13,8 +13,9 @@ use url::Url;
/// Patterns in URL paths that indicate non-article pages.
const EXCLUDED_PATH_PATTERNS: &[&str] = &[
"/tag/", "/category/", "/author/", "/page/", "/login", "/signup",
"/privacy", "/terms", "/search", "/contact", "/about",
"/tag", "/category", "/author", "/page", "/login", "/signup",
"/privacy", "/terms", "/search", "/contact", "/about", "/topics",
"/archive", "/companies", "/events", "/company", "/event", "/collections",
];
/// File extensions that indicate static assets, not articles.
@ -137,6 +138,9 @@ pub async fn extract_article_links_with_llm(
max_links: usize,
provider: &Arc<dyn LlmProvider>,
model: &str,
pool: Option<&sqlx::PgPool>,
user_id: Option<uuid::Uuid>,
job_id: Option<uuid::Uuid>,
) -> Result<Vec<String>, AppError> {
let base_url = Url::parse(source_url)
.map_err(|e| AppError::BadRequest(format!("Invalid source URL: {}", e)))?;
@ -160,7 +164,23 @@ pub async fn extract_article_links_with_llm(
let (system, user) = build_link_extraction_prompt(&head_html, &body_html);
let schema = build_link_extraction_schema();
match provider.call_llm(model, &system, &user, &schema).await {
let llm_start = std::time::Instant::now();
let llm_result = provider.call_llm(model, &system, &user, &schema).await;
let llm_duration = llm_start.elapsed().as_millis() as u64;
// Log the LLM call only when pool, user_id, and job_id are all provided; a failed log insert is ignored (best-effort)
if let (Some(pool), Some(uid), Some(jid)) = (pool, user_id, job_id) {
let response_str = match &llm_result {
Ok(resp) => serde_json::to_string_pretty(resp).unwrap_or_default(),
Err(e) => format!("Error: {}", e),
};
crate::db::llm_call_log::insert(
pool, uid, jid, "link_extraction", model,
&system, &user, &response_str, llm_duration as i32,
).await.ok();
}
match llm_result {
Ok(llm_response) => {
let urls: Vec<String> = llm_response
.get("urls")

@ -301,10 +301,14 @@ async fn run_generation_inner(
let provider_clone = std::sync::Arc::clone(&provider);
let model = model_research.clone();
let max_l = max_links;
let pool = state.pool.clone();
let uid = user_id;
let jid = job_id;
join_set.spawn(async move {
let links = if use_llm {
source_scraper::extract_article_links_with_llm(
&client, &source_url, max_l, &provider_clone, &model,
Some(&pool), Some(uid), Some(jid),
).await
} else {
source_scraper::extract_article_links(
@ -342,10 +346,14 @@ async fn run_generation_inner(
let provider_clone = std::sync::Arc::clone(&provider);
let model = model_research.clone();
let max_l = max_links;
let pool = state.pool.clone();
let uid = user_id;
let jid = job_id;
join_set.spawn(async move {
let links = if use_llm {
source_scraper::extract_article_links_with_llm(
&client, &source_url, max_l, &provider_clone, &model,
Some(&pool), Some(uid), Some(jid),
).await
} else {
source_scraper::extract_article_links(

Loading…
Cancel
Save