|
|
|
@ -3,10 +3,43 @@
|
|
|
|
//! Prevents the same article from appearing in multiple syntheses.
|
|
|
|
//! Prevents the same article from appearing in multiple syntheses.
|
|
|
|
|
|
|
|
|
|
|
|
use std::collections::HashSet;
|
|
|
|
use std::collections::HashSet;
|
|
|
|
|
|
|
|
use chrono::{DateTime, Utc};
|
|
|
|
|
|
|
|
use serde::Serialize;
|
|
|
|
use sqlx::PgPool;
|
|
|
|
use sqlx::PgPool;
|
|
|
|
use uuid::Uuid;
|
|
|
|
use uuid::Uuid;
|
|
|
|
use crate::errors::AppError;
|
|
|
|
use crate::errors::AppError;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/// Entry for inserting into article_history with full tracing metadata.
|
|
|
|
|
|
|
|
pub struct ArticleHistoryEntry {
|
|
|
|
|
|
|
|
pub user_id: Uuid,
|
|
|
|
|
|
|
|
pub url: String,
|
|
|
|
|
|
|
|
pub url_hash: String,
|
|
|
|
|
|
|
|
pub title: String,
|
|
|
|
|
|
|
|
pub source_type: String,
|
|
|
|
|
|
|
|
pub source_url: Option<String>,
|
|
|
|
|
|
|
|
pub category: Option<String>,
|
|
|
|
|
|
|
|
pub synthesis_id: Option<Uuid>,
|
|
|
|
|
|
|
|
pub status: String,
|
|
|
|
|
|
|
|
pub scraped_ok: bool,
|
|
|
|
|
|
|
|
pub job_id: Uuid,
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/// Row returned from article_history queries.
|
|
|
|
|
|
|
|
#[derive(Debug, Clone, Serialize, sqlx::FromRow)]
|
|
|
|
|
|
|
|
pub struct ArticleHistoryRow {
|
|
|
|
|
|
|
|
pub id: Uuid,
|
|
|
|
|
|
|
|
pub url: String,
|
|
|
|
|
|
|
|
pub title: String,
|
|
|
|
|
|
|
|
pub source_type: String,
|
|
|
|
|
|
|
|
pub source_url: Option<String>,
|
|
|
|
|
|
|
|
pub category: Option<String>,
|
|
|
|
|
|
|
|
pub synthesis_id: Option<Uuid>,
|
|
|
|
|
|
|
|
pub status: String,
|
|
|
|
|
|
|
|
pub scraped_ok: bool,
|
|
|
|
|
|
|
|
pub job_id: Uuid,
|
|
|
|
|
|
|
|
pub created_at: DateTime<Utc>,
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/// Check which URL hashes already exist in history for this user.
|
|
|
|
/// Check which URL hashes already exist in history for this user.
|
|
|
|
///
|
|
|
|
///
|
|
|
|
/// Returns the set of url_hashes that were found (i.e., already used).
|
|
|
|
/// Returns the set of url_hashes that were found (i.e., already used).
|
|
|
|
@ -56,8 +89,109 @@ pub async fn insert_urls(
|
|
|
|
Ok(())
|
|
|
|
Ok(())
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/// Insert a single article history entry with full tracing metadata.
|
|
|
|
|
|
|
|
pub async fn insert_entry(pool: &PgPool, entry: &ArticleHistoryEntry) -> Result<(), AppError> {
|
|
|
|
|
|
|
|
sqlx::query(
|
|
|
|
|
|
|
|
r#"
|
|
|
|
|
|
|
|
INSERT INTO article_history (user_id, url_hash, url, title, source_type, source_url, category, synthesis_id, status, scraped_ok, job_id)
|
|
|
|
|
|
|
|
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11)
|
|
|
|
|
|
|
|
"#,
|
|
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
.bind(entry.user_id)
|
|
|
|
|
|
|
|
.bind(&entry.url_hash)
|
|
|
|
|
|
|
|
.bind(&entry.url)
|
|
|
|
|
|
|
|
.bind(&entry.title)
|
|
|
|
|
|
|
|
.bind(&entry.source_type)
|
|
|
|
|
|
|
|
.bind(&entry.source_url)
|
|
|
|
|
|
|
|
.bind(&entry.category)
|
|
|
|
|
|
|
|
.bind(entry.synthesis_id)
|
|
|
|
|
|
|
|
.bind(&entry.status)
|
|
|
|
|
|
|
|
.bind(entry.scraped_ok)
|
|
|
|
|
|
|
|
.bind(entry.job_id)
|
|
|
|
|
|
|
|
.execute(pool)
|
|
|
|
|
|
|
|
.await?;
|
|
|
|
|
|
|
|
Ok(())
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/// List article history with optional filters, paginated.
|
|
|
|
|
|
|
|
pub async fn list_history(
|
|
|
|
|
|
|
|
pool: &PgPool,
|
|
|
|
|
|
|
|
user_id: Uuid,
|
|
|
|
|
|
|
|
limit: i64,
|
|
|
|
|
|
|
|
offset: i64,
|
|
|
|
|
|
|
|
status_filter: Option<&str>,
|
|
|
|
|
|
|
|
source_type_filter: Option<&str>,
|
|
|
|
|
|
|
|
) -> Result<Vec<ArticleHistoryRow>, AppError> {
|
|
|
|
|
|
|
|
let rows = sqlx::query_as::<_, ArticleHistoryRow>(
|
|
|
|
|
|
|
|
r#"
|
|
|
|
|
|
|
|
SELECT id, url, title, source_type, source_url, category, synthesis_id, status, scraped_ok, job_id, created_at
|
|
|
|
|
|
|
|
FROM article_history
|
|
|
|
|
|
|
|
WHERE user_id = $1
|
|
|
|
|
|
|
|
AND ($4::TEXT IS NULL OR status = $4)
|
|
|
|
|
|
|
|
AND ($5::TEXT IS NULL OR source_type = $5)
|
|
|
|
|
|
|
|
ORDER BY created_at DESC
|
|
|
|
|
|
|
|
LIMIT $2 OFFSET $3
|
|
|
|
|
|
|
|
"#,
|
|
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
.bind(user_id)
|
|
|
|
|
|
|
|
.bind(limit)
|
|
|
|
|
|
|
|
.bind(offset)
|
|
|
|
|
|
|
|
.bind(status_filter)
|
|
|
|
|
|
|
|
.bind(source_type_filter)
|
|
|
|
|
|
|
|
.fetch_all(pool)
|
|
|
|
|
|
|
|
.await?;
|
|
|
|
|
|
|
|
Ok(rows)
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/// Count article history entries with optional filters.
|
|
|
|
|
|
|
|
pub async fn count_history(
|
|
|
|
|
|
|
|
pool: &PgPool,
|
|
|
|
|
|
|
|
user_id: Uuid,
|
|
|
|
|
|
|
|
status_filter: Option<&str>,
|
|
|
|
|
|
|
|
source_type_filter: Option<&str>,
|
|
|
|
|
|
|
|
) -> Result<i64, AppError> {
|
|
|
|
|
|
|
|
let row = sqlx::query_scalar::<_, i64>(
|
|
|
|
|
|
|
|
r#"
|
|
|
|
|
|
|
|
SELECT COUNT(*) FROM article_history
|
|
|
|
|
|
|
|
WHERE user_id = $1
|
|
|
|
|
|
|
|
AND ($2::TEXT IS NULL OR status = $2)
|
|
|
|
|
|
|
|
AND ($3::TEXT IS NULL OR source_type = $3)
|
|
|
|
|
|
|
|
"#,
|
|
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
.bind(user_id)
|
|
|
|
|
|
|
|
.bind(status_filter)
|
|
|
|
|
|
|
|
.bind(source_type_filter)
|
|
|
|
|
|
|
|
.fetch_one(pool)
|
|
|
|
|
|
|
|
.await?;
|
|
|
|
|
|
|
|
Ok(row)
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/// List all article history entries for a generation job.
|
|
|
|
|
|
|
|
pub async fn list_by_job_id(
|
|
|
|
|
|
|
|
pool: &PgPool,
|
|
|
|
|
|
|
|
user_id: Uuid,
|
|
|
|
|
|
|
|
job_id: Uuid,
|
|
|
|
|
|
|
|
) -> Result<Vec<ArticleHistoryRow>, AppError> {
|
|
|
|
|
|
|
|
let rows = sqlx::query_as::<_, ArticleHistoryRow>(
|
|
|
|
|
|
|
|
r#"
|
|
|
|
|
|
|
|
SELECT id, url, title, source_type, source_url, category, synthesis_id, status, scraped_ok, job_id, created_at
|
|
|
|
|
|
|
|
FROM article_history
|
|
|
|
|
|
|
|
WHERE user_id = $1 AND job_id = $2
|
|
|
|
|
|
|
|
ORDER BY created_at ASC
|
|
|
|
|
|
|
|
"#,
|
|
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
.bind(user_id)
|
|
|
|
|
|
|
|
.bind(job_id)
|
|
|
|
|
|
|
|
.fetch_all(pool)
|
|
|
|
|
|
|
|
.await?;
|
|
|
|
|
|
|
|
Ok(rows)
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/// Delete history entries older than N days for this user.
|
|
|
|
/// Delete history entries older than N days for this user.
|
|
|
|
///
|
|
|
|
///
|
|
|
|
|
|
|
|
/// Only removes entries where synthesis_id IS NULL (dropped articles).
|
|
|
|
|
|
|
|
/// Used articles linked to syntheses stay until the synthesis is deleted.
|
|
|
|
|
|
|
|
///
|
|
|
|
/// Returns the number of deleted rows.
|
|
|
|
/// Returns the number of deleted rows.
|
|
|
|
pub async fn cleanup_old(
|
|
|
|
pub async fn cleanup_old(
|
|
|
|
pool: &PgPool,
|
|
|
|
pool: &PgPool,
|
|
|
|
@ -65,7 +199,7 @@ pub async fn cleanup_old(
|
|
|
|
days: i32,
|
|
|
|
days: i32,
|
|
|
|
) -> Result<u64, AppError> {
|
|
|
|
) -> Result<u64, AppError> {
|
|
|
|
let result = sqlx::query(
|
|
|
|
let result = sqlx::query(
|
|
|
|
"DELETE FROM article_history WHERE user_id = $1 AND created_at < now() - make_interval(days => $2)",
|
|
|
|
"DELETE FROM article_history WHERE user_id = $1 AND created_at < now() - make_interval(days => $2) AND synthesis_id IS NULL",
|
|
|
|
)
|
|
|
|
)
|
|
|
|
.bind(user_id)
|
|
|
|
.bind(user_id)
|
|
|
|
.bind(days)
|
|
|
|
.bind(days)
|
|
|
|
|