v2: pipeline user model selection, rate limiter, URL filter, original title, null-safe sections

- resolve_provider_and_key() now respects user ai_provider preference
- Dual model resolution: ai_model for search pass, ai_model_writing for rewrite pass
- Per-generation rate limiter with user override support
- Homepage URL filter removes domain-only URLs after search pass
- ScrapedNewsItem gains original_title field populated from page <title>
- SynthesisResponse::try_from handles null sections gracefully (returns empty vec)
- Search prompt warns LLM against returning homepage URLs
- Rewrite prompt instructs LLM to use originalTitle with language preservation rules

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
master
oabrivard 3 months ago
parent ed6b41fe52
commit 9b994e0528

@ -48,13 +48,16 @@ impl TryFrom<Synthesis> for SynthesisResponse {
type Error = crate::errors::AppError;
fn try_from(s: Synthesis) -> Result<Self, Self::Error> {
let sections: Vec<NewsSection> =
let sections: Vec<NewsSection> = if s.sections.is_null() {
Vec::new()
} else {
serde_json::from_value(s.sections).map_err(|e| {
crate::errors::AppError::Internal(anyhow::anyhow!(
"Failed to parse synthesis sections: {}",
e
))
})?;
})?
};
Ok(Self {
id: s.id,
@ -145,6 +148,8 @@ pub struct ScrapedNewsItem {
pub title: String,
pub url: String,
pub summary: String,
#[serde(rename = "originalTitle")]
pub original_title: String,
#[serde(rename = "scrapedContent")]
pub scraped_content: String,
}
@ -312,6 +317,21 @@ mod tests {
assert!(SynthesisResponse::try_from(synthesis).is_err());
}
/// A `Synthesis` row whose `sections` value is JSON `null` must convert
/// successfully and yield a response with no sections.
#[test]
fn synthesis_response_from_null_sections_returns_empty() {
    // Only `sections` matters here; the remaining fields are placeholders.
    let nil_id = Uuid::nil();
    let with_null_sections = Synthesis {
        sections: serde_json::Value::Null,
        id: nil_id,
        user_id: nil_id,
        week: "2026-W12".into(),
        status: "completed".into(),
        created_at: Utc::now(),
    };

    let response = SynthesisResponse::try_from(with_null_sections).unwrap();

    assert!(response.sections.is_empty());
}
#[test]
fn send_email_request_valid_email() {
let req = SendEmailRequest {

@ -74,6 +74,8 @@ pub fn build_search_prompt(
Pour chaque categorie, fournis au maximum {max_items} actualites.\n\
Pour chaque actualite, fournis un titre provisoire, l'URL source exacte et complete, \
et un resume provisoire.\n\
Ne retourne JAMAIS des URLs de pages d'accueil (homepage). Fournis toujours des liens \
directs vers des articles specifiques avec un chemin complet (pas juste le nom de domaine).\n\
Retourne le resultat au format JSON en utilisant les cles category_0, category_1, etc. \
correspondant a l'ordre des sections ci-dessus.",
date = current_date,
@ -112,6 +114,10 @@ pub fn build_rewrite_prompt(
brut extrait des sites web ('scrapedContent').\n\
Ta tache est de reecrire le 'title' et le 'summary' (4 ou 5 lignes) pour chaque article \
afin qu'ils refletent EXACTEMENT et FIDELEMENT le contenu textuel fourni.\n\
Pour chaque article, un 'originalTitle' extrait de la page web est fourni. Utilise ce \
titre original comme base pour le titre final. Regles linguistiques: les titres en anglais \
restent en anglais, les titres en francais restent en francais, les autres langues sont \
traduites en francais.\n\
Si le 'scrapedContent' est vide ou insuffisant, utilise le titre et le resume originaux \
pour faire au mieux.\n\
Conserve EXACTEMENT les memes URLs. Ne supprime aucun article de cette liste.\n\n\
@ -237,6 +243,14 @@ mod tests {
assert!(user_prompt.contains("recherche Google"));
}
/// The user half of the search prompt must carry the homepage warning and the
/// request for links to specific articles.
#[test]
fn search_prompt_warns_against_homepage_urls() {
    let current_date = "lundi 21 mars 2026";
    let settings = test_settings();

    // No sources are supplied; only the static instructions are under test.
    let (_, user_prompt) = build_search_prompt(&settings, &[], current_date);

    assert!(user_prompt.contains("pages d'accueil"));
    assert!(user_prompt.contains("articles specifiques"));
}
#[test]
fn rewrite_prompt_includes_instructions() {
let mut data = std::collections::HashMap::new();
@ -246,6 +260,7 @@ mod tests {
title: "Test Article".into(),
url: "https://example.com".into(),
summary: "A summary".into(),
original_title: "Original Test Article".into(),
scraped_content: "Full article text here...".into(),
}],
);
@ -256,6 +271,8 @@ mod tests {
assert!(user_prompt.contains("Test Article"));
assert!(user_prompt.contains("https://example.com"));
assert!(user_prompt.contains("Ne supprime aucun article"));
assert!(user_prompt.contains("originalTitle"));
assert!(user_prompt.contains("titre original comme base"));
}
#[test]

@ -18,9 +18,12 @@ use serde::Serialize;
use tokio::sync::watch;
use uuid::Uuid;
use url::Url;
use crate::app_state::AppState;
use crate::db;
use crate::errors::AppError;
use crate::models::settings::UserSettings;
use crate::models::synthesis::{
get_iso_week_string, NewsItem, NewsSection, ScrapedNewsItem,
};
@ -267,19 +270,28 @@ async fn run_generation_inner(
// Step 3: Resolve provider + decrypt API key
emit_progress(tx, "provider", "Configuration du fournisseur IA...", 15);
let (provider_name, api_key) = resolve_provider_and_key(state, user_id).await?;
let (provider_name, api_key) = resolve_provider_and_key(state, user_id, &settings).await?;
let provider = create_provider(&provider_name, api_key, &state.http_client)?;
// Step 4: Build schema from categories
let schema = build_category_schema(&settings.categories);
// Step 4b: Resolve models — user overrides take priority over admin config
let model_research = if !settings.ai_model.is_empty() {
settings.ai_model.clone()
} else {
resolve_model(state, &provider_name).await?
};
let model_writing = if !settings.ai_model_writing.is_empty() {
settings.ai_model_writing.clone()
} else {
model_research.clone()
};
// Step 5: Rate limit check (pass 1)
if !state.provider_rate_limiter.check(&provider_name) {
return Err(AppError::RateLimited(
"Limite de requetes atteinte. Veuillez reessayer dans quelques instants.".into(),
));
}
// User overrides take priority over global rate limiter
check_rate_limit(state, &settings, &provider_name)?;
// Step 6: LLM search pass
emit_progress(tx, "search", "Recherche d'actualites en cours...", 30);
@ -289,16 +301,17 @@ async fn run_generation_inner(
let (system_prompt, user_prompt) =
prompts::build_search_prompt(&settings, &sources, &current_date);
let model = resolve_model(state, &provider_name).await?;
let raw_results = provider
.generate_search_pass(&model, &system_prompt, &user_prompt, &schema)
.generate_search_pass(&model_research, &system_prompt, &user_prompt, &schema)
.await?;
// Step 7: Parse structured output into (category_key, Vec<NewsItem>)
emit_progress(tx, "parsing", "Analyse des resultats...", 40);
let parsed = parse_llm_output(&raw_results, &settings.categories)?;
// Step 7b: Filter out homepage URLs (path == "/" or empty)
let parsed = filter_homepage_urls(parsed);
// Step 8: Adaptive pipeline — decide whether to scrape+rewrite or use search results directly
//
// If the provider supports native web search and the search pass produced high-quality
@ -322,19 +335,14 @@ async fn run_generation_inner(
let scraped = scrape_articles(state, &parsed, settings.max_age_days as i64, tx).await;
// Rate limit check (pass 2)
if !state.provider_rate_limiter.check(&provider_name) {
return Err(AppError::RateLimited(
"Limite de requetes atteinte pour la passe de reecriture. Veuillez reessayer."
.into(),
));
}
check_rate_limit(state, &settings, &provider_name)?;
// LLM rewrite pass
emit_progress(tx, "rewrite", "Redaction des resumes...", 80);
let (rewrite_system, rewrite_user) = prompts::build_rewrite_prompt(&scraped);
let final_results = provider
.generate_rewrite_pass(&model, &rewrite_system, &rewrite_user, &schema)
.generate_rewrite_pass(&model_writing, &rewrite_system, &rewrite_user, &schema)
.await?;
emit_progress(tx, "finalizing", "Finalisation...", 90);
@ -368,13 +376,126 @@ fn emit_progress(tx: &watch::Sender<ProgressEvent>, step: &str, message: &str, p
.ok();
}
/// Enforce the rate limit that applies to one LLM call.
///
/// If the user configured both `rate_limit_max_requests` and
/// `rate_limit_time_window_seconds`, a rate limiter is built from those two values
/// and queried under the key `user_gen_<provider_name>`. If either value is missing,
/// the shared `state.provider_rate_limiter` is queried with `provider_name` instead.
///
/// # Errors
///
/// Returns `AppError::RateLimited` when the limiter that was consulted refuses the
/// request.
///
/// NOTE(review): the user-specific limiter is constructed anew on every call. Unless
/// `RateLimiter` keeps its counters somewhere outside the instance, it starts with no
/// request history each time, so this branch presumably never rejects a request in
/// practice (and bypasses the shared limiter) — confirm whether it should live in
/// `AppState`. The `as` casts would also wrap if the configured values can be
/// negative — verify against the settings validation.
fn check_rate_limit(
    state: &AppState,
    settings: &UserSettings,
    provider_name: &str,
) -> Result<(), AppError> {
    // The override only applies when *both* values are present.
    let user_override = settings
        .rate_limit_max_requests
        .zip(settings.rate_limit_time_window_seconds);

    let (allowed, message) = match user_override {
        Some((max_req, window_sec)) => {
            let limiter = crate::services::rate_limiter::RateLimiter::new(
                max_req as usize,
                Duration::from_secs(window_sec as u64),
            );
            let limiter_key = format!("user_gen_{}", provider_name);
            (
                limiter.check(&limiter_key),
                "Limite de requetes personnalisee atteinte. Veuillez reessayer dans quelques instants.",
            )
        }
        None => (
            state.provider_rate_limiter.check(provider_name),
            "Limite de requetes atteinte. Veuillez reessayer dans quelques instants.",
        ),
    };

    if allowed {
        Ok(())
    } else {
        Err(AppError::RateLimited(message.into()))
    }
}
/// Drop every article whose URL points at a site root instead of a specific page.
///
/// An item is removed when its URL parses successfully and the parsed path is
/// either empty or exactly `/`. Items whose URL cannot be parsed are kept
/// untouched. Categories are returned in their original order, including ones
/// that end up with no items. When at least one item was removed, a single
/// warning with the total count is logged.
fn filter_homepage_urls(
    parsed: Vec<(String, Vec<NewsItem>)>,
) -> Vec<(String, Vec<NewsItem>)> {
    let mut dropped = 0usize;
    let mut kept_by_category = Vec::with_capacity(parsed.len());

    for (cat_key, mut items) in parsed {
        let before = items.len();
        items.retain(|item| match Url::parse(&item.url) {
            // A root URL has no path beyond "/" (or none at all).
            Ok(url) => !matches!(url.path(), "" | "/"),
            // Unparseable URLs are not this filter's concern; keep them.
            Err(_) => true,
        });
        dropped += before - items.len();
        kept_by_category.push((cat_key, items));
    }

    if dropped > 0 {
        tracing::warn!(
            count = dropped,
            "Filtered out homepage URLs from search results"
        );
    }

    kept_by_category
}
/// Resolve the LLM provider and decrypt the user's API key.
///
/// If the user has a preferred provider in settings, only an API key stored for
/// that provider is used; when no such key exists an error is returned instead of
/// switching to another provider. Otherwise falls back to the first available key.
async fn resolve_provider_and_key(
state: &AppState,
user_id: Uuid,
settings: &UserSettings,
) -> Result<(String, String), AppError> {
let master_key = encryption::MasterKey::from_hex(&state.config.master_encryption_key)?;
// If the user has a preferred provider, look for that specific key
if !settings.ai_provider.is_empty() {
let key_record = db::api_keys::get_for_user_and_provider(
&state.pool,
user_id,
&settings.ai_provider,
)
.await?;
match key_record {
Some(record) => {
let api_key =
encryption::decrypt(&master_key, &record.encrypted_key, &record.nonce)?;
return Ok((record.provider_name.clone(), api_key));
}
None => {
return Err(AppError::BadRequest(format!(
"Aucune cle API configuree pour le fournisseur '{}'. \
Veuillez ajouter une cle API pour ce fournisseur dans vos parametres.",
settings.ai_provider
)));
}
}
}
// Fall back to first available key
let keys = db::api_keys::list_for_user(&state.pool, user_id).await?;
if keys.is_empty() {
@ -383,9 +504,7 @@ async fn resolve_provider_and_key(
));
}
// Use the first available key
let key_record = &keys[0];
let master_key = encryption::MasterKey::from_hex(&state.config.master_encryption_key)?;
let api_key = encryption::decrypt(
&master_key,
&key_record.encrypted_key,
@ -509,11 +628,12 @@ async fn scrape_articles(
pct as u8,
);
if let Ok((cat_key, item, scraped_content)) = join_result {
if let Ok((cat_key, item, (scraped_content, page_title))) = join_result {
let scraped_item = ScrapedNewsItem {
title: item.title,
url: item.url,
summary: item.summary,
original_title: page_title,
scraped_content,
};
@ -538,7 +658,7 @@ async fn scrape_articles(
result
}
/// Scrape a single article URL, returning the body text or an empty string on failure.
/// Scrape a single article URL, returning (body_text, page_title) or empty strings on failure.
///
/// Handles all failure modes gracefully:
/// - Network errors → empty content (article kept)
@ -548,24 +668,25 @@ async fn scrape_single_article(
http_client: &reqwest::Client,
url: &str,
max_age_days: i64,
) -> String {
) -> (String, String) {
match scraper::scrape_url(http_client, url).await {
Ok(content) => {
if !content.ok || content.is_soft_404 {
tracing::warn!(url = url, "Soft 404 or error page detected, skipping content");
return String::new();
return (String::new(), String::new());
}
if scraper::is_article_too_old(content.published_date, max_age_days) {
tracing::warn!(url = url, "Article too old, skipping content");
return String::new();
return (String::new(), String::new());
}
content.body_text
let title = content.title.unwrap_or_default();
(content.body_text, title)
}
Err(e) => {
tracing::warn!(url = url, error = %e, "Failed to scrape URL, keeping article with empty content");
String::new()
(String::new(), String::new())
}
}
}
@ -1062,4 +1183,71 @@ mod tests {
let parsed: Vec<(String, Vec<NewsItem>)> = vec![];
assert!(!url_quality_sufficient(&parsed));
}
// ── filter_homepage_urls tests ──────────────────────────────
/// Root URLs, written with or without a trailing slash, are removed while a URL
/// with a real path is kept.
#[test]
fn test_homepage_url_filtered() {
    // Small builder so each case reads as (title, url).
    let item = |title: &str, url: &str| NewsItem {
        title: title.into(),
        url: url.into(),
        summary: "Sum".into(),
    };
    let parsed = vec![(
        "category_0".into(),
        vec![
            item("Homepage", "https://example.com/"),
            item("Homepage no slash", "https://example.com"),
            item("Real article", "https://example.com/article/123"),
        ],
    )];

    let result = filter_homepage_urls(parsed);

    let (_, kept) = &result[0];
    assert_eq!(kept.len(), 1);
    assert_eq!(kept[0].title, "Real article");
}
/// URLs that carry a non-root path pass through the filter untouched.
#[test]
fn test_article_url_not_filtered() {
    let item = |title: &str, url: &str, summary: &str| NewsItem {
        title: title.into(),
        url: url.into(),
        summary: summary.into(),
    };
    let parsed = vec![(
        "category_0".into(),
        vec![
            item("Article 1", "https://example.com/news/article-1", "Sum 1"),
            item("Article 2", "https://blog.example.org/2026/03/post", "Sum 2"),
        ],
    )];

    let result = filter_homepage_urls(parsed);

    let (_, kept) = &result[0];
    assert_eq!(kept.len(), 2);
}
/// An item whose URL cannot be parsed is kept rather than dropped.
#[test]
fn test_homepage_filter_keeps_unparseable_urls() {
    let unparseable = NewsItem {
        title: "Bad URL".into(),
        url: "not-a-url".into(),
        summary: "Sum".into(),
    };
    let parsed = vec![("category_0".into(), vec![unparseable])];

    let result = filter_homepage_urls(parsed);

    let (_, kept) = &result[0];
    assert_eq!(kept.len(), 1);
}
}

Loading…
Cancel
Save