|
|
|
|
@ -87,6 +87,12 @@ pub struct JobStore {
|
|
|
|
|
/// Time-to-live for a job: 3600 seconds, i.e. one hour.
/// Jobs expire after this long (allows SSE reconnection).
|
|
|
|
|
const JOB_TTL: Duration = Duration::from_secs(3600);
|
|
|
|
|
|
|
|
|
|
/// `Default` for `JobStore` delegates to `JobStore::new()`, so
/// `JobStore::default()` and `JobStore::new()` build the store the same way.
impl Default for JobStore {
|
|
|
|
|
/// Returns the value produced by `JobStore::new()`.
fn default() -> Self {
|
|
|
|
|
Self::new()
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
impl JobStore {
|
|
|
|
|
/// Create a new empty job store.
|
|
|
|
|
pub fn new() -> Self {
|
|
|
|
|
@ -593,7 +599,7 @@ async fn run_generation_inner(
|
|
|
|
|
// History dedup
|
|
|
|
|
if settings.article_history_days > 0 {
|
|
|
|
|
let hash = hash_article_url(&item.url);
|
|
|
|
|
let exists = db::article_history::check_urls_exist(&state.pool, user_id, &[hash.clone()]).await.unwrap_or_default();
|
|
|
|
|
let exists = db::article_history::check_urls_exist(&state.pool, user_id, std::slice::from_ref(&hash)).await.unwrap_or_default();
|
|
|
|
|
if exists.contains(&hash) {
|
|
|
|
|
trace_article(&state.pool, user_id, job_id, &item.url, &item.title, "web_search", None, None, None, "filtered_history", false).await;
|
|
|
|
|
continue;
|
|
|
|
|
@ -711,6 +717,7 @@ fn emit_progress(tx: &watch::Sender<ProgressEvent>, step: &str, message: &str, p
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Insert a trace entry into article_history for debugging pipeline behavior.
|
|
|
|
|
#[allow(clippy::too_many_arguments)]
|
|
|
|
|
async fn trace_article(
|
|
|
|
|
pool: &sqlx::PgPool,
|
|
|
|
|
user_id: Uuid,
|
|
|
|
|
@ -741,6 +748,7 @@ async fn trace_article(
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Log an LLM call with full prompt, response, and timing.
|
|
|
|
|
#[allow(clippy::too_many_arguments)]
|
|
|
|
|
async fn log_llm_call(
|
|
|
|
|
pool: &sqlx::PgPool,
|
|
|
|
|
user_id: Uuid,
|
|
|
|
|
@ -1024,14 +1032,8 @@ fn rotate_sources(sources: Vec<crate::models::source::Source>, last_source_url:
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Scrape a single article URL, returning (body_text, page_title, final_url, drop_reason), with empty strings on failure.
|
|
|
|
|
///
|
|
|
|
|
/// Handles all failure modes gracefully:
|
|
|
|
|
/// - Network errors → empty content (article kept)
|
|
|
|
|
/// - Soft 404 → article excluded (empty content)
|
|
|
|
|
/// - Article too old → article excluded (empty content)
|
|
|
|
|
/// Result of scraping a single article.
|
|
|
|
|
/// The 4th value is the drop reason if the article was rejected (None if OK).
|
|
|
|
|
/// Scrape a single article. Returns (body_text, page_title, final_url, drop_reason).
|
|
|
|
|
/// `drop_reason` is `Some("filtered_empty")` or `Some("filtered_too_old")` if rejected, `None` if OK.
|
|
|
|
|
async fn scrape_single_article(
|
|
|
|
|
http_client: &reqwest::Client,
|
|
|
|
|
url: &str,
|
|
|
|
|
|