v2: pipeline user model selection, rate limiter, URL filter, original title, null-safe sections

- resolve_provider_and_key() now respects user ai_provider preference
- Dual model resolution: ai_model for search pass, ai_model_writing for rewrite pass
- Per-generation rate limiter with user override support
- Homepage URL filter removes domain-only URLs after search pass
- ScrapedNewsItem gains original_title field populated from page <title>
- SynthesisResponse::try_from handles null sections gracefully (returns empty vec)
- Search prompt warns LLM against returning homepage URLs
- Rewrite prompt instructs LLM to use originalTitle with language preservation rules

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
master
oabrivard 3 months ago
parent ed6b41fe52
commit 9b994e0528

@ -48,13 +48,16 @@ impl TryFrom<Synthesis> for SynthesisResponse {
type Error = crate::errors::AppError; type Error = crate::errors::AppError;
fn try_from(s: Synthesis) -> Result<Self, Self::Error> { fn try_from(s: Synthesis) -> Result<Self, Self::Error> {
let sections: Vec<NewsSection> = let sections: Vec<NewsSection> = if s.sections.is_null() {
Vec::new()
} else {
serde_json::from_value(s.sections).map_err(|e| { serde_json::from_value(s.sections).map_err(|e| {
crate::errors::AppError::Internal(anyhow::anyhow!( crate::errors::AppError::Internal(anyhow::anyhow!(
"Failed to parse synthesis sections: {}", "Failed to parse synthesis sections: {}",
e e
)) ))
})?; })?
};
Ok(Self { Ok(Self {
id: s.id, id: s.id,
@ -145,6 +148,8 @@ pub struct ScrapedNewsItem {
pub title: String, pub title: String,
pub url: String, pub url: String,
pub summary: String, pub summary: String,
#[serde(rename = "originalTitle")]
pub original_title: String,
#[serde(rename = "scrapedContent")] #[serde(rename = "scrapedContent")]
pub scraped_content: String, pub scraped_content: String,
} }
@ -312,6 +317,21 @@ mod tests {
assert!(SynthesisResponse::try_from(synthesis).is_err()); assert!(SynthesisResponse::try_from(synthesis).is_err());
} }
#[test]
fn synthesis_response_from_null_sections_returns_empty() {
    // A synthesis whose `sections` value is JSON null must still convert,
    // yielding a response with no sections rather than an error.
    let null_sections = serde_json::Value::Null;
    let row = Synthesis {
        id: Uuid::nil(),
        user_id: Uuid::nil(),
        week: "2026-W12".into(),
        sections: null_sections,
        status: "completed".into(),
        created_at: Utc::now(),
    };

    let converted = SynthesisResponse::try_from(row).unwrap();

    assert!(converted.sections.is_empty());
}
#[test] #[test]
fn send_email_request_valid_email() { fn send_email_request_valid_email() {
let req = SendEmailRequest { let req = SendEmailRequest {

@ -74,6 +74,8 @@ pub fn build_search_prompt(
Pour chaque categorie, fournis au maximum {max_items} actualites.\n\ Pour chaque categorie, fournis au maximum {max_items} actualites.\n\
Pour chaque actualite, fournis un titre provisoire, l'URL source exacte et complete, \ Pour chaque actualite, fournis un titre provisoire, l'URL source exacte et complete, \
et un resume provisoire.\n\ et un resume provisoire.\n\
Ne retourne JAMAIS des URLs de pages d'accueil (homepage). Fournis toujours des liens \
directs vers des articles specifiques avec un chemin complet (pas juste le nom de domaine).\n\
Retourne le resultat au format JSON en utilisant les cles category_0, category_1, etc. \ Retourne le resultat au format JSON en utilisant les cles category_0, category_1, etc. \
correspondant a l'ordre des sections ci-dessus.", correspondant a l'ordre des sections ci-dessus.",
date = current_date, date = current_date,
@ -112,6 +114,10 @@ pub fn build_rewrite_prompt(
brut extrait des sites web ('scrapedContent').\n\ brut extrait des sites web ('scrapedContent').\n\
Ta tache est de reecrire le 'title' et le 'summary' (4 ou 5 lignes) pour chaque article \ Ta tache est de reecrire le 'title' et le 'summary' (4 ou 5 lignes) pour chaque article \
afin qu'ils refletent EXACTEMENT et FIDELEMENT le contenu textuel fourni.\n\ afin qu'ils refletent EXACTEMENT et FIDELEMENT le contenu textuel fourni.\n\
Pour chaque article, un 'originalTitle' extrait de la page web est fourni. Utilise ce \
titre original comme base pour le titre final. Regles linguistiques: les titres en anglais \
restent en anglais, les titres en francais restent en francais, les autres langues sont \
traduites en francais.\n\
Si le 'scrapedContent' est vide ou insuffisant, utilise le titre et le resume originaux \ Si le 'scrapedContent' est vide ou insuffisant, utilise le titre et le resume originaux \
pour faire au mieux.\n\ pour faire au mieux.\n\
Conserve EXACTEMENT les memes URLs. Ne supprime aucun article de cette liste.\n\n\ Conserve EXACTEMENT les memes URLs. Ne supprime aucun article de cette liste.\n\n\
@ -237,6 +243,14 @@ mod tests {
assert!(user_prompt.contains("recherche Google")); assert!(user_prompt.contains("recherche Google"));
} }
#[test]
fn search_prompt_warns_against_homepage_urls() {
    // The user prompt of the search pass must carry both halves of the
    // homepage warning: what to avoid and what to return instead.
    let settings = test_settings();
    let prompts = build_search_prompt(&settings, &[], "lundi 21 mars 2026");
    let user_prompt = prompts.1;

    let mentions_homepages = user_prompt.contains("pages d'accueil");
    let asks_for_articles = user_prompt.contains("articles specifiques");

    assert!(mentions_homepages);
    assert!(asks_for_articles);
}
#[test] #[test]
fn rewrite_prompt_includes_instructions() { fn rewrite_prompt_includes_instructions() {
let mut data = std::collections::HashMap::new(); let mut data = std::collections::HashMap::new();
@ -246,6 +260,7 @@ mod tests {
title: "Test Article".into(), title: "Test Article".into(),
url: "https://example.com".into(), url: "https://example.com".into(),
summary: "A summary".into(), summary: "A summary".into(),
original_title: "Original Test Article".into(),
scraped_content: "Full article text here...".into(), scraped_content: "Full article text here...".into(),
}], }],
); );
@ -256,6 +271,8 @@ mod tests {
assert!(user_prompt.contains("Test Article")); assert!(user_prompt.contains("Test Article"));
assert!(user_prompt.contains("https://example.com")); assert!(user_prompt.contains("https://example.com"));
assert!(user_prompt.contains("Ne supprime aucun article")); assert!(user_prompt.contains("Ne supprime aucun article"));
assert!(user_prompt.contains("originalTitle"));
assert!(user_prompt.contains("titre original comme base"));
} }
#[test] #[test]

@ -18,9 +18,12 @@ use serde::Serialize;
use tokio::sync::watch; use tokio::sync::watch;
use uuid::Uuid; use uuid::Uuid;
use url::Url;
use crate::app_state::AppState; use crate::app_state::AppState;
use crate::db; use crate::db;
use crate::errors::AppError; use crate::errors::AppError;
use crate::models::settings::UserSettings;
use crate::models::synthesis::{ use crate::models::synthesis::{
get_iso_week_string, NewsItem, NewsSection, ScrapedNewsItem, get_iso_week_string, NewsItem, NewsSection, ScrapedNewsItem,
}; };
@ -267,19 +270,28 @@ async fn run_generation_inner(
// Step 3: Resolve provider + decrypt API key // Step 3: Resolve provider + decrypt API key
emit_progress(tx, "provider", "Configuration du fournisseur IA...", 15); emit_progress(tx, "provider", "Configuration du fournisseur IA...", 15);
let (provider_name, api_key) = resolve_provider_and_key(state, user_id).await?; let (provider_name, api_key) = resolve_provider_and_key(state, user_id, &settings).await?;
let provider = create_provider(&provider_name, api_key, &state.http_client)?; let provider = create_provider(&provider_name, api_key, &state.http_client)?;
// Step 4: Build schema from categories // Step 4: Build schema from categories
let schema = build_category_schema(&settings.categories); let schema = build_category_schema(&settings.categories);
// Step 4b: Resolve models — user overrides take priority over admin config
let model_research = if !settings.ai_model.is_empty() {
settings.ai_model.clone()
} else {
resolve_model(state, &provider_name).await?
};
let model_writing = if !settings.ai_model_writing.is_empty() {
settings.ai_model_writing.clone()
} else {
model_research.clone()
};
// Step 5: Rate limit check (pass 1) // Step 5: Rate limit check (pass 1)
if !state.provider_rate_limiter.check(&provider_name) { // User overrides take priority over global rate limiter
return Err(AppError::RateLimited( check_rate_limit(state, &settings, &provider_name)?;
"Limite de requetes atteinte. Veuillez reessayer dans quelques instants.".into(),
));
}
// Step 6: LLM search pass // Step 6: LLM search pass
emit_progress(tx, "search", "Recherche d'actualites en cours...", 30); emit_progress(tx, "search", "Recherche d'actualites en cours...", 30);
@ -289,16 +301,17 @@ async fn run_generation_inner(
let (system_prompt, user_prompt) = let (system_prompt, user_prompt) =
prompts::build_search_prompt(&settings, &sources, &current_date); prompts::build_search_prompt(&settings, &sources, &current_date);
let model = resolve_model(state, &provider_name).await?;
let raw_results = provider let raw_results = provider
.generate_search_pass(&model, &system_prompt, &user_prompt, &schema) .generate_search_pass(&model_research, &system_prompt, &user_prompt, &schema)
.await?; .await?;
// Step 7: Parse structured output into (category_key, Vec<NewsItem>) // Step 7: Parse structured output into (category_key, Vec<NewsItem>)
emit_progress(tx, "parsing", "Analyse des resultats...", 40); emit_progress(tx, "parsing", "Analyse des resultats...", 40);
let parsed = parse_llm_output(&raw_results, &settings.categories)?; let parsed = parse_llm_output(&raw_results, &settings.categories)?;
// Step 7b: Filter out homepage URLs (path == "/" or empty)
let parsed = filter_homepage_urls(parsed);
// Step 8: Adaptive pipeline — decide whether to scrape+rewrite or use search results directly // Step 8: Adaptive pipeline — decide whether to scrape+rewrite or use search results directly
// //
// If the provider supports native web search and the search pass produced high-quality // If the provider supports native web search and the search pass produced high-quality
@ -322,19 +335,14 @@ async fn run_generation_inner(
let scraped = scrape_articles(state, &parsed, settings.max_age_days as i64, tx).await; let scraped = scrape_articles(state, &parsed, settings.max_age_days as i64, tx).await;
// Rate limit check (pass 2) // Rate limit check (pass 2)
if !state.provider_rate_limiter.check(&provider_name) { check_rate_limit(state, &settings, &provider_name)?;
return Err(AppError::RateLimited(
"Limite de requetes atteinte pour la passe de reecriture. Veuillez reessayer."
.into(),
));
}
// LLM rewrite pass // LLM rewrite pass
emit_progress(tx, "rewrite", "Redaction des resumes...", 80); emit_progress(tx, "rewrite", "Redaction des resumes...", 80);
let (rewrite_system, rewrite_user) = prompts::build_rewrite_prompt(&scraped); let (rewrite_system, rewrite_user) = prompts::build_rewrite_prompt(&scraped);
let final_results = provider let final_results = provider
.generate_rewrite_pass(&model, &rewrite_system, &rewrite_user, &schema) .generate_rewrite_pass(&model_writing, &rewrite_system, &rewrite_user, &schema)
.await?; .await?;
emit_progress(tx, "finalizing", "Finalisation...", 90); emit_progress(tx, "finalizing", "Finalisation...", 90);
@ -368,13 +376,126 @@ fn emit_progress(tx: &watch::Sender<ProgressEvent>, step: &str, message: &str, p
.ok(); .ok();
} }
/// Enforce the rate limit for one LLM call, preferring the user's override over
/// the global provider limiter.
///
/// When the user has both `rate_limit_max_requests` and
/// `rate_limit_time_window_seconds` set, the call is counted against a limiter
/// built from those values. Otherwise the global `state.provider_rate_limiter`
/// is consulted for `provider_name`.
///
/// The override limiter is kept in a process-wide registry instead of being
/// rebuilt on every call: a freshly constructed limiter has no request history,
/// so a per-call instance could never reject anything and the override would
/// silently disable rate limiting.
///
/// NOTE(review): no user identifier is available in this function, so the
/// registry is keyed by provider name and the override values. Users with
/// identical overrides for the same provider therefore share one budget (the
/// global limiter is likewise keyed by provider only). If per-user isolation is
/// required, move the registry into `AppState` and key it by user id.
///
/// # Errors
///
/// Returns `AppError::RateLimited` when the applicable limiter rejects the call.
fn check_rate_limit(
    state: &AppState,
    settings: &UserSettings,
    provider_name: &str,
) -> Result<(), AppError> {
    // Registry of override limiters, keyed by (provider, max requests, window seconds).
    // It lives for the whole process so that request history survives between calls.
    static USER_LIMITERS: std::sync::OnceLock<
        std::sync::Mutex<
            std::collections::HashMap<
                (String, usize, u64),
                crate::services::rate_limiter::RateLimiter,
            >,
        >,
    > = std::sync::OnceLock::new();

    match (
        settings.rate_limit_max_requests,
        settings.rate_limit_time_window_seconds,
    ) {
        (Some(max_req), Some(window_sec)) => {
            // Casts kept from the original code.
            // NOTE(review): if these settings are signed, a negative value wraps to a
            // huge limit/window — confirm the field types or validate upstream.
            let max_requests = max_req as usize;
            let window_secs = window_sec as u64;

            let registry = USER_LIMITERS
                .get_or_init(|| std::sync::Mutex::new(std::collections::HashMap::new()));
            // The map is only ever inserted into, so the state left behind by a
            // panicking holder is still usable; recover instead of propagating poison.
            let mut limiters = registry
                .lock()
                .unwrap_or_else(std::sync::PoisonError::into_inner);
            let user_limiter = limiters
                .entry((provider_name.to_owned(), max_requests, window_secs))
                .or_insert_with(|| {
                    crate::services::rate_limiter::RateLimiter::new(
                        max_requests,
                        Duration::from_secs(window_secs),
                    )
                });

            let key = format!("user_gen_{}", provider_name);
            if !user_limiter.check(&key) {
                return Err(AppError::RateLimited(
                    "Limite de requetes personnalisee atteinte. Veuillez reessayer dans quelques instants.".into(),
                ));
            }
            Ok(())
        }
        _ => {
            // No complete override: fall back to the shared provider limiter.
            if !state.provider_rate_limiter.check(provider_name) {
                return Err(AppError::RateLimited(
                    "Limite de requetes atteinte. Veuillez reessayer dans quelques instants."
                        .into(),
                ));
            }
            Ok(())
        }
    }
}
/// Drop every search result whose URL points at a site root instead of an article.
///
/// A URL counts as a homepage when it parses successfully and its path component
/// is `"/"` or empty. URLs that fail to parse are kept unchanged. Category order
/// and the relative order of surviving items are preserved; a category that loses
/// all of its items is still returned, with an empty list. When at least one item
/// was removed, a single warning carrying the total count is logged.
fn filter_homepage_urls(
    parsed: Vec<(String, Vec<NewsItem>)>,
) -> Vec<(String, Vec<NewsItem>)> {
    // Only a successfully parsed URL can be classified as a homepage;
    // a parse failure means "not a homepage", i.e. the item is kept.
    let is_homepage = |item: &NewsItem| {
        Url::parse(&item.url)
            .map(|url| matches!(url.path(), "" | "/"))
            .unwrap_or(false)
    };

    let mut dropped = 0usize;
    let mut kept = Vec::with_capacity(parsed.len());
    for (cat_key, mut items) in parsed {
        let before = items.len();
        items.retain(|item| !is_homepage(item));
        // The length difference is exactly the number of homepage items removed.
        dropped += before - items.len();
        kept.push((cat_key, items));
    }

    if dropped > 0 {
        tracing::warn!(
            count = dropped,
            "Filtered out homepage URLs from search results"
        );
    }

    kept
}
/// Resolve the LLM provider and decrypt the user's API key. /// Resolve the LLM provider and decrypt the user's API key.
/// ///
/// Looks up the user's API key for the first available provider. /// If the user has a preferred provider in settings, looks for a key matching
/// that provider specifically. Otherwise falls back to the first available key.
async fn resolve_provider_and_key( async fn resolve_provider_and_key(
state: &AppState, state: &AppState,
user_id: Uuid, user_id: Uuid,
settings: &UserSettings,
) -> Result<(String, String), AppError> { ) -> Result<(String, String), AppError> {
let master_key = encryption::MasterKey::from_hex(&state.config.master_encryption_key)?;
// If the user has a preferred provider, look for that specific key
if !settings.ai_provider.is_empty() {
let key_record = db::api_keys::get_for_user_and_provider(
&state.pool,
user_id,
&settings.ai_provider,
)
.await?;
match key_record {
Some(record) => {
let api_key =
encryption::decrypt(&master_key, &record.encrypted_key, &record.nonce)?;
return Ok((record.provider_name.clone(), api_key));
}
None => {
return Err(AppError::BadRequest(format!(
"Aucune cle API configuree pour le fournisseur '{}'. \
Veuillez ajouter une cle API pour ce fournisseur dans vos parametres.",
settings.ai_provider
)));
}
}
}
// Fall back to first available key
let keys = db::api_keys::list_for_user(&state.pool, user_id).await?; let keys = db::api_keys::list_for_user(&state.pool, user_id).await?;
if keys.is_empty() { if keys.is_empty() {
@ -383,9 +504,7 @@ async fn resolve_provider_and_key(
)); ));
} }
// Use the first available key
let key_record = &keys[0]; let key_record = &keys[0];
let master_key = encryption::MasterKey::from_hex(&state.config.master_encryption_key)?;
let api_key = encryption::decrypt( let api_key = encryption::decrypt(
&master_key, &master_key,
&key_record.encrypted_key, &key_record.encrypted_key,
@ -509,11 +628,12 @@ async fn scrape_articles(
pct as u8, pct as u8,
); );
if let Ok((cat_key, item, scraped_content)) = join_result { if let Ok((cat_key, item, (scraped_content, page_title))) = join_result {
let scraped_item = ScrapedNewsItem { let scraped_item = ScrapedNewsItem {
title: item.title, title: item.title,
url: item.url, url: item.url,
summary: item.summary, summary: item.summary,
original_title: page_title,
scraped_content, scraped_content,
}; };
@ -538,7 +658,7 @@ async fn scrape_articles(
result result
} }
/// Scrape a single article URL, returning the body text or an empty string on failure. /// Scrape a single article URL, returning (body_text, page_title) or empty strings on failure.
/// ///
/// Handles all failure modes gracefully: /// Handles all failure modes gracefully:
/// - Network errors → empty content (article kept) /// - Network errors → empty content (article kept)
@ -548,24 +668,25 @@ async fn scrape_single_article(
http_client: &reqwest::Client, http_client: &reqwest::Client,
url: &str, url: &str,
max_age_days: i64, max_age_days: i64,
) -> String { ) -> (String, String) {
match scraper::scrape_url(http_client, url).await { match scraper::scrape_url(http_client, url).await {
Ok(content) => { Ok(content) => {
if !content.ok || content.is_soft_404 { if !content.ok || content.is_soft_404 {
tracing::warn!(url = url, "Soft 404 or error page detected, skipping content"); tracing::warn!(url = url, "Soft 404 or error page detected, skipping content");
return String::new(); return (String::new(), String::new());
} }
if scraper::is_article_too_old(content.published_date, max_age_days) { if scraper::is_article_too_old(content.published_date, max_age_days) {
tracing::warn!(url = url, "Article too old, skipping content"); tracing::warn!(url = url, "Article too old, skipping content");
return String::new(); return (String::new(), String::new());
} }
content.body_text let title = content.title.unwrap_or_default();
(content.body_text, title)
} }
Err(e) => { Err(e) => {
tracing::warn!(url = url, error = %e, "Failed to scrape URL, keeping article with empty content"); tracing::warn!(url = url, error = %e, "Failed to scrape URL, keeping article with empty content");
String::new() (String::new(), String::new())
} }
} }
} }
@ -1062,4 +1183,71 @@ mod tests {
let parsed: Vec<(String, Vec<NewsItem>)> = vec![]; let parsed: Vec<(String, Vec<NewsItem>)> = vec![];
assert!(!url_quality_sufficient(&parsed)); assert!(!url_quality_sufficient(&parsed));
} }
// ── filter_homepage_urls tests ──────────────────────────────
#[test]
fn test_homepage_url_filtered() {
    // Two spellings of a site root (with and without the trailing slash)
    // next to one genuine article link; only the article may survive.
    let item = |title: &str, url: &str| NewsItem {
        title: title.into(),
        url: url.into(),
        summary: "Sum".into(),
    };
    let parsed = vec![(
        String::from("category_0"),
        vec![
            item("Homepage", "https://example.com/"),
            item("Homepage no slash", "https://example.com"),
            item("Real article", "https://example.com/article/123"),
        ],
    )];

    let result = filter_homepage_urls(parsed);

    let (_, survivors) = &result[0];
    assert_eq!(survivors.len(), 1);
    assert_eq!(survivors[0].title, "Real article");
}
#[test]
fn test_article_url_not_filtered() {
    // URLs with a real path must pass through the filter untouched.
    let first = NewsItem {
        title: "Article 1".into(),
        url: "https://example.com/news/article-1".into(),
        summary: "Sum 1".into(),
    };
    let second = NewsItem {
        title: "Article 2".into(),
        url: "https://blog.example.org/2026/03/post".into(),
        summary: "Sum 2".into(),
    };
    let parsed = vec![(String::from("category_0"), vec![first, second])];

    let result = filter_homepage_urls(parsed);

    let (_, survivors) = &result[0];
    assert_eq!(survivors.len(), 2);
}
#[test]
fn test_homepage_filter_keeps_unparseable_urls() {
    // A string that is not a valid URL cannot be classified as a homepage,
    // so the filter has to leave the item in place.
    let malformed = NewsItem {
        title: "Bad URL".into(),
        url: "not-a-url".into(),
        summary: "Sum".into(),
    };
    let parsed = vec![(String::from("category_0"), vec![malformed])];

    let result = filter_homepage_urls(parsed);

    let (_, survivors) = &result[0];
    assert_eq!(survivors.len(), 1);
}
} }

Loading…
Cancel
Save