feat: add get_last_source_url + remove head_html from ScrapedContent

- Add get_last_source_url() to article_history db module for source rotation
- Remove head_html field from ScrapedContent struct and scrape_url function
- Fix synthesis.rs scrape_single_article_with_llm to pass empty string instead of removed field

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
master
oabrivard 3 months ago
parent b2dbc3847a
commit bb716b5dc2

@ -187,6 +187,20 @@ pub async fn list_by_job_id(
Ok(rows) Ok(rows)
} }
/// Get the source_url from the most recent 'used' entry for source rotation.
pub async fn get_last_source_url(
pool: &PgPool,
user_id: Uuid,
) -> Result<Option<String>, AppError> {
let result = sqlx::query_scalar::<_, String>(
"SELECT source_url FROM article_history WHERE user_id = $1 AND status = 'used' AND source_url IS NOT NULL ORDER BY created_at DESC LIMIT 1",
)
.bind(user_id)
.fetch_optional(pool)
.await?;
Ok(result)
}
/// Delete history entries older than N days for this user. /// Delete history entries older than N days for this user.
/// ///
/// Only removes entries where synthesis_id IS NULL (dropped articles). /// Only removes entries where synthesis_id IS NULL (dropped articles).

@ -50,8 +50,6 @@ pub struct ScrapedContent {
pub is_soft_404: bool, pub is_soft_404: bool,
/// Final URL after following redirects. /// Final URL after following redirects.
pub url: String, pub url: String,
/// Raw <head> section from the HTML, preserved for LLM extraction.
pub head_html: String,
} }
/// Build a `reqwest::Client` configured for scraping. /// Build a `reqwest::Client` configured for scraping.
@ -144,7 +142,6 @@ pub async fn scrape_url(
body_text: String::new(), body_text: String::new(),
is_soft_404: false, is_soft_404: false,
url: response.url().to_string(), url: response.url().to_string(),
head_html: String::new(),
}); });
} }
@ -181,13 +178,6 @@ pub async fn scrape_url(
let html_text = String::from_utf8_lossy(&bytes); let html_text = String::from_utf8_lossy(&bytes);
// Extract <head> section for potential LLM use
let head_html = {
let start = html_text.find("<head").unwrap_or(0);
let end = html_text.find("</head>").map(|i| i + 7).unwrap_or(start);
html_text[start..end].to_string()
};
let document = Html::parse_document(&html_text); let document = Html::parse_document(&html_text);
// Extract page title // Extract page title
@ -214,7 +204,6 @@ pub async fn scrape_url(
body_text, body_text,
is_soft_404, is_soft_404,
url: final_url.to_string(), url: final_url.to_string(),
head_html,
}) })
} }

@ -1725,7 +1725,7 @@ async fn scrape_single_article_with_llm(
} }
let (system, user) = crate::services::prompts::build_article_extraction_prompt( let (system, user) = crate::services::prompts::build_article_extraction_prompt(
&content.head_html, "",
&content.body_text, &content.body_text,
); );
let schema = crate::services::llm::schema::build_article_extraction_schema(); let schema = crate::services::llm::schema::build_article_extraction_schema();

Loading…
Cancel
Save