feat: add get_last_source_url + remove head_html from ScrapedContent

- Add get_last_source_url() to the article_history db module for source rotation
- Remove the head_html field from the ScrapedContent struct and the scrape_url function
- Fix synthesis.rs scrape_single_article_with_llm to pass an empty string instead of the removed field

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
3 months ago · bb716b5dc2
parent b2dbc3847a
commit bb716b5dc2
3 changed files with 15 additions and 12 deletions
--- a/backend/src/db/article_history.rs
+++ b/backend/src/db/article_history.rs
@ -187,6 +187,20 @@ pub async fn list_by_job_id(
    Ok(rows)
 }
 /// Look up the `source_url` of this user's newest `'used'` history row.
 ///
 /// Rows whose `source_url` is NULL are skipped. The result is intended for
 /// source rotation.
 ///
 /// Returns `Ok(None)` when the user has no matching row.
 ///
 /// # Errors
 ///
 /// Any database error raised while running the query is converted into
 /// [`AppError`].
 pub async fn get_last_source_url(
    pool: &PgPool,
    user_id: Uuid,
 ) -> Result<Option<String>, AppError> {
    // Newest-first ordering plus LIMIT 1 leaves at most one row to decode.
    const LAST_USED_SOURCE_URL_SQL: &str = "SELECT source_url FROM article_history WHERE user_id = $1 AND status = 'used' AND source_url IS NOT NULL ORDER BY created_at DESC LIMIT 1";

    let latest_url = sqlx::query_scalar::<_, String>(LAST_USED_SOURCE_URL_SQL)
        .bind(user_id)
        .fetch_optional(pool)
        .await?;

    Ok(latest_url)
 }
 /// Delete history entries older than N days for this user.
 ///
 /// Only removes entries where synthesis_id IS NULL (dropped articles).
--- a/backend/src/services/scraper.rs
+++ b/backend/src/services/scraper.rs
@ -50,8 +50,6 @@ pub struct ScrapedContent {
    pub is_soft_404: bool,
    /// Final URL after following redirects.
    pub url: String,
    /// Raw <head> section from the HTML, preserved for LLM extraction.
    pub head_html: String,
 }
 /// Build a `reqwest::Client` configured for scraping.
@ -144,7 +142,6 @@ pub async fn scrape_url(
            body_text: String::new(),
            is_soft_404: false,
            url: response.url().to_string(),
            head_html: String::new(),
        });
    }
@ -181,13 +178,6 @@ pub async fn scrape_url(
    let html_text = String::from_utf8_lossy(&bytes);
    // Extract <head> section for potential LLM use
    let head_html = {
        let start = html_text.find("<head").unwrap_or(0);
        let end = html_text.find("</head>").map(|i| i + 7).unwrap_or(start);
        html_text[start..end].to_string()
    };
    let document = Html::parse_document(&html_text);
    // Extract page title
@ -214,7 +204,6 @@ pub async fn scrape_url(
        body_text,
        is_soft_404,
        url: final_url.to_string(),
        head_html,
    })
 }
--- a/backend/src/services/synthesis.rs
+++ b/backend/src/services/synthesis.rs
@ -1725,7 +1725,7 @@ async fn scrape_single_article_with_llm(
    }
    let (system, user) = crate::services::prompts::build_article_extraction_prompt(
-        &content.head_html,
+        "",
        &content.body_text,
    );
    let schema = crate::services::llm::schema::build_article_extraction_schema();