From bb716b5dc253038a9782276c070c143e3e8e8cd7 Mon Sep 17 00:00:00 2001
From: oabrivard
Date: Wed, 25 Mar 2026 00:49:27 +0100
Subject: [PATCH] feat: add get_last_source_url + remove head_html from ScrapedContent

- Add get_last_source_url() to article_history db module for source rotation
- Remove head_html field from ScrapedContent struct and scrape_url function
- Fix synthesis.rs scrape_single_article_with_llm to pass empty string instead of removed field

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 backend/src/db/article_history.rs | 14 ++++++++++++++
 backend/src/services/scraper.rs   | 11 -----------
 backend/src/services/synthesis.rs |  2 +-
 3 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/backend/src/db/article_history.rs b/backend/src/db/article_history.rs
index 7152109..80bc29c 100644
--- a/backend/src/db/article_history.rs
+++ b/backend/src/db/article_history.rs
@@ -187,6 +187,20 @@ pub async fn list_by_job_id(
     Ok(rows)
 }
 
+/// Get the source_url from the most recent 'used' entry for source rotation.
+pub async fn get_last_source_url(
+    pool: &PgPool,
+    user_id: Uuid,
+) -> Result<Option<String>, AppError> {
+    let result = sqlx::query_scalar::<_, String>(
+        "SELECT source_url FROM article_history WHERE user_id = $1 AND status = 'used' AND source_url IS NOT NULL ORDER BY created_at DESC LIMIT 1",
+    )
+    .bind(user_id)
+    .fetch_optional(pool)
+    .await?;
+    Ok(result)
+}
+
 /// Delete history entries older than N days for this user.
 ///
 /// Only removes entries where synthesis_id IS NULL (dropped articles).
diff --git a/backend/src/services/scraper.rs b/backend/src/services/scraper.rs
index 11452e9..d63e40a 100644
--- a/backend/src/services/scraper.rs
+++ b/backend/src/services/scraper.rs
@@ -50,8 +50,6 @@ pub struct ScrapedContent {
     pub is_soft_404: bool,
     /// Final URL after following redirects.
     pub url: String,
-    /// Raw <head> section from the HTML, preserved for LLM extraction.
-    pub head_html: String,
 }
 
 /// Build a `reqwest::Client` configured for scraping.
@@ -144,7 +142,6 @@ pub async fn scrape_url(
             body_text: String::new(),
             is_soft_404: false,
             url: response.url().to_string(),
-            head_html: String::new(),
         });
     }
 
@@ -181,13 +178,6 @@ pub async fn scrape_url(
 
     let html_text = String::from_utf8_lossy(&bytes);
 
-    // Extract <head> section for potential LLM use
-    let head_html = {
-        let start = html_text.find("<head").unwrap_or(0);
-        let end = html_text.find("</head>").map(|i| i + 7).unwrap_or(start);
-        html_text[start..end].to_string()
-    };
-
     let document = Html::parse_document(&html_text);
 
     // Extract page title
@@ -214,7 +204,6 @@ pub async fn scrape_url(
         body_text,
         is_soft_404,
         url: final_url.to_string(),
-        head_html,
     })
 }
 
diff --git a/backend/src/services/synthesis.rs b/backend/src/services/synthesis.rs
index 13582ef..5ec1420 100644
--- a/backend/src/services/synthesis.rs
+++ b/backend/src/services/synthesis.rs
@@ -1725,7 +1725,7 @@ async fn scrape_single_article_with_llm(
     }
 
     let (system, user) = crate::services::prompts::build_article_extraction_prompt(
-        &content.head_html,
+        "",
         &content.body_text,
     );
     let schema = crate::services::llm::schema::build_article_extraction_schema();