diff --git a/backend/src/models/synthesis.rs b/backend/src/models/synthesis.rs index b78a1ae..9fd7b77 100644 --- a/backend/src/models/synthesis.rs +++ b/backend/src/models/synthesis.rs @@ -48,13 +48,16 @@ impl TryFrom for SynthesisResponse { type Error = crate::errors::AppError; fn try_from(s: Synthesis) -> Result { - let sections: Vec = + let sections: Vec = if s.sections.is_null() { + Vec::new() + } else { serde_json::from_value(s.sections).map_err(|e| { crate::errors::AppError::Internal(anyhow::anyhow!( "Failed to parse synthesis sections: {}", e )) - })?; + })? + }; Ok(Self { id: s.id, @@ -145,6 +148,8 @@ pub struct ScrapedNewsItem { pub title: String, pub url: String, pub summary: String, + #[serde(rename = "originalTitle")] + pub original_title: String, #[serde(rename = "scrapedContent")] pub scraped_content: String, } @@ -312,6 +317,21 @@ mod tests { assert!(SynthesisResponse::try_from(synthesis).is_err()); } + #[test] + fn synthesis_response_from_null_sections_returns_empty() { + let synthesis = Synthesis { + id: Uuid::nil(), + user_id: Uuid::nil(), + week: "2026-W12".into(), + sections: serde_json::Value::Null, + status: "completed".into(), + created_at: Utc::now(), + }; + + let response = SynthesisResponse::try_from(synthesis).unwrap(); + assert!(response.sections.is_empty()); + } + #[test] fn send_email_request_valid_email() { let req = SendEmailRequest { diff --git a/backend/src/services/prompts.rs b/backend/src/services/prompts.rs index 5e90835..71710c9 100644 --- a/backend/src/services/prompts.rs +++ b/backend/src/services/prompts.rs @@ -74,6 +74,8 @@ pub fn build_search_prompt( Pour chaque categorie, fournis au maximum {max_items} actualites.\n\ Pour chaque actualite, fournis un titre provisoire, l'URL source exacte et complete, \ et un resume provisoire.\n\ + Ne retourne JAMAIS des URLs de pages d'accueil (homepage). Fournis toujours des liens \ + directs vers des articles specifiques avec un chemin complet (pas juste le nom de domaine).\n\ Retourne le resultat au format JSON en utilisant les cles category_0, category_1, etc. \ correspondant a l'ordre des sections ci-dessus.", date = current_date, @@ -112,6 +114,10 @@ pub fn build_rewrite_prompt( brut extrait des sites web ('scrapedContent').\n\ Ta tache est de reecrire le 'title' et le 'summary' (4 ou 5 lignes) pour chaque article \ afin qu'ils refletent EXACTEMENT et FIDELEMENT le contenu textuel fourni.\n\ + Pour chaque article, un 'originalTitle' extrait de la page web est fourni. Utilise ce \ + titre original comme base pour le titre final. Regles linguistiques: les titres en anglais \ + restent en anglais, les titres en francais restent en francais, les autres langues sont \ + traduites en francais.\n\ Si le 'scrapedContent' est vide ou insuffisant, utilise le titre et le resume originaux \ pour faire au mieux.\n\ Conserve EXACTEMENT les memes URLs. Ne supprime aucun article de cette liste.\n\n\ @@ -237,6 +243,14 @@ mod tests { assert!(user_prompt.contains("recherche Google")); } + #[test] + fn search_prompt_warns_against_homepage_urls() { + let settings = test_settings(); + let (_, user_prompt) = build_search_prompt(&settings, &[], "lundi 21 mars 2026"); + assert!(user_prompt.contains("pages d'accueil")); + assert!(user_prompt.contains("articles specifiques")); + } + #[test] fn rewrite_prompt_includes_instructions() { let mut data = std::collections::HashMap::new(); @@ -246,6 +260,7 @@ mod tests { title: "Test Article".into(), url: "https://example.com".into(), summary: "A summary".into(), + original_title: "Original Test Article".into(), scraped_content: "Full article text here...".into(), }], ); @@ -256,6 +271,8 @@ mod tests { assert!(user_prompt.contains("Test Article")); assert!(user_prompt.contains("https://example.com")); assert!(user_prompt.contains("Ne supprime aucun article")); + assert!(user_prompt.contains("originalTitle")); + assert!(user_prompt.contains("titre original comme base")); } #[test] diff --git a/backend/src/services/synthesis.rs b/backend/src/services/synthesis.rs index cfa96dc..784d068 100644 --- a/backend/src/services/synthesis.rs +++ b/backend/src/services/synthesis.rs @@ -18,9 +18,12 @@ use serde::Serialize; use tokio::sync::watch; use uuid::Uuid; +use url::Url; + use crate::app_state::AppState; use crate::db; use crate::errors::AppError; +use crate::models::settings::UserSettings; use crate::models::synthesis::{ get_iso_week_string, NewsItem, NewsSection, ScrapedNewsItem, }; @@ -267,19 +270,28 @@ async fn run_generation_inner( // Step 3: Resolve provider + decrypt API key emit_progress(tx, "provider", "Configuration du fournisseur IA...", 15); - let (provider_name, api_key) = resolve_provider_and_key(state, user_id).await?; + let (provider_name, api_key) = resolve_provider_and_key(state, user_id, &settings).await?; let provider = create_provider(&provider_name, api_key, &state.http_client)?; // Step 4: Build schema from categories let schema = build_category_schema(&settings.categories); + // Step 4b: Resolve models — user overrides take priority over admin config + let model_research = if !settings.ai_model.is_empty() { + settings.ai_model.clone() + } else { + resolve_model(state, &provider_name).await? + }; + let model_writing = if !settings.ai_model_writing.is_empty() { + settings.ai_model_writing.clone() + } else { + model_research.clone() + }; + // Step 5: Rate limit check (pass 1) - if !state.provider_rate_limiter.check(&provider_name) { - return Err(AppError::RateLimited( - "Limite de requetes atteinte. Veuillez reessayer dans quelques instants.".into(), - )); - } + // User overrides take priority over global rate limiter + check_rate_limit(state, &settings, &provider_name)?; // Step 6: LLM search pass emit_progress(tx, "search", "Recherche d'actualites en cours...", 30); @@ -289,16 +301,17 @@ async fn run_generation_inner( let (system_prompt, user_prompt) = prompts::build_search_prompt(&settings, &sources, ¤t_date); - let model = resolve_model(state, &provider_name).await?; - let raw_results = provider - .generate_search_pass(&model, &system_prompt, &user_prompt, &schema) + .generate_search_pass(&model_research, &system_prompt, &user_prompt, &schema) .await?; // Step 7: Parse structured output into (category_key, Vec) emit_progress(tx, "parsing", "Analyse des resultats...", 40); let parsed = parse_llm_output(&raw_results, &settings.categories)?; + // Step 7b: Filter out homepage URLs (path == "/" or empty) + let parsed = filter_homepage_urls(parsed); + // Step 8: Adaptive pipeline — decide whether to scrape+rewrite or use search results directly // // If the provider supports native web search and the search pass produced high-quality @@ -322,19 +335,14 @@ async fn run_generation_inner( let scraped = scrape_articles(state, &parsed, settings.max_age_days as i64, tx).await; // Rate limit check (pass 2) - if !state.provider_rate_limiter.check(&provider_name) { - return Err(AppError::RateLimited( - "Limite de requetes atteinte pour la passe de reecriture. Veuillez reessayer." - .into(), - )); - } + check_rate_limit(state, &settings, &provider_name)?; // LLM rewrite pass emit_progress(tx, "rewrite", "Redaction des resumes...", 80); let (rewrite_system, rewrite_user) = prompts::build_rewrite_prompt(&scraped); let final_results = provider - .generate_rewrite_pass(&model, &rewrite_system, &rewrite_user, &schema) + .generate_rewrite_pass(&model_writing, &rewrite_system, &rewrite_user, &schema) .await?; emit_progress(tx, "finalizing", "Finalisation...", 90); @@ -368,13 +376,126 @@ fn emit_progress(tx: &watch::Sender, step: &str, message: &str, p .ok(); } +/// Check rate limits, using user overrides if configured, otherwise the global limiter. +/// +/// When the user has both `rate_limit_max_requests` and `rate_limit_time_window_seconds` +/// set, a temporary per-user rate limiter is created with those values. Otherwise the +/// global provider rate limiter is used. +fn check_rate_limit( + state: &AppState, + settings: &UserSettings, + provider_name: &str, +) -> Result<(), AppError> { + match ( + settings.rate_limit_max_requests, + settings.rate_limit_time_window_seconds, + ) { + (Some(max_req), Some(window_sec)) => { + // Create a temporary rate limiter with user's config + let user_limiter = crate::services::rate_limiter::RateLimiter::new( + max_req as usize, + Duration::from_secs(window_sec as u64), + ); + let key = format!("user_gen_{}", provider_name); + if !user_limiter.check(&key) { + return Err(AppError::RateLimited( + "Limite de requetes personnalisee atteinte. Veuillez reessayer dans quelques instants.".into(), + )); + } + Ok(()) + } + _ => { + if !state.provider_rate_limiter.check(provider_name) { + return Err(AppError::RateLimited( + "Limite de requetes atteinte. Veuillez reessayer dans quelques instants." + .into(), + )); + } + Ok(()) + } + } +} + +/// Filter out articles whose URL is a homepage (path is "/" or empty). +/// +/// Homepage URLs are typically not useful as article sources and indicate +/// the LLM returned a domain root rather than a specific article. +fn filter_homepage_urls( + parsed: Vec<(String, Vec)>, +) -> Vec<(String, Vec)> { + let mut total_filtered = 0usize; + + let result: Vec<(String, Vec)> = parsed + .into_iter() + .map(|(cat_key, items)| { + let filtered: Vec = items + .into_iter() + .filter(|item| { + match Url::parse(&item.url) { + Ok(parsed_url) => { + let path = parsed_url.path(); + if path == "/" || path.is_empty() { + total_filtered += 1; + false + } else { + true + } + } + Err(_) => true, // Keep items with unparseable URLs (handled elsewhere) + } + }) + .collect(); + (cat_key, filtered) + }) + .collect(); + + if total_filtered > 0 { + tracing::warn!( + count = total_filtered, + "Filtered out homepage URLs from search results" + ); + } + + result +} + /// Resolve the LLM provider and decrypt the user's API key. /// -/// Looks up the user's API key for the first available provider. +/// If the user has a preferred provider in settings, looks for a key matching +/// that provider specifically. Otherwise falls back to the first available key. async fn resolve_provider_and_key( state: &AppState, user_id: Uuid, + settings: &UserSettings, ) -> Result<(String, String), AppError> { + let master_key = encryption::MasterKey::from_hex(&state.config.master_encryption_key)?; + + // If the user has a preferred provider, look for that specific key + if !settings.ai_provider.is_empty() { + let key_record = db::api_keys::get_for_user_and_provider( + &state.pool, + user_id, + &settings.ai_provider, + ) + .await?; + + match key_record { + Some(record) => { + let api_key = + encryption::decrypt(&master_key, &record.encrypted_key, &record.nonce)?; + return Ok((record.provider_name.clone(), api_key)); + } + None => { + return Err(AppError::BadRequest(format!( + "Aucune cle API configuree pour le fournisseur '{}'. \ + Veuillez ajouter une cle API pour ce fournisseur dans vos parametres.", + settings.ai_provider + ))); + } + } + } + + // Fall back to first available key let keys = db::api_keys::list_for_user(&state.pool, user_id).await?; if keys.is_empty() { @@ -383,9 +504,7 @@ async fn resolve_provider_and_key( )); } - // Use the first available key let key_record = &keys[0]; - let master_key = encryption::MasterKey::from_hex(&state.config.master_encryption_key)?; let api_key = encryption::decrypt( &master_key, &key_record.encrypted_key, @@ -509,11 +628,12 @@ async fn scrape_articles( pct as u8, ); - if let Ok((cat_key, item, scraped_content)) = join_result { + if let Ok((cat_key, item, (scraped_content, page_title))) = join_result { let scraped_item = ScrapedNewsItem { title: item.title, url: item.url, summary: item.summary, + original_title: page_title, scraped_content, }; @@ -538,7 +658,7 @@ async fn scrape_articles( result } -/// Scrape a single article URL, returning the body text or an empty string on failure. +/// Scrape a single article URL, returning (body_text, page_title) or empty strings on failure. /// /// Handles all failure modes gracefully: /// - Network errors → empty content (article kept) @@ -548,24 +668,25 @@ async fn scrape_single_article( http_client: &reqwest::Client, url: &str, max_age_days: i64, -) -> String { +) -> (String, String) { match scraper::scrape_url(http_client, url).await { Ok(content) => { if !content.ok || content.is_soft_404 { tracing::warn!(url = url, "Soft 404 or error page detected, skipping content"); - return String::new(); + return (String::new(), String::new()); } if scraper::is_article_too_old(content.published_date, max_age_days) { tracing::warn!(url = url, "Article too old, skipping content"); - return String::new(); + return (String::new(), String::new()); } - content.body_text + let title = content.title.unwrap_or_default(); + (content.body_text, title) } Err(e) => { tracing::warn!(url = url, error = %e, "Failed to scrape URL, keeping article with empty content"); - String::new() + (String::new(), String::new()) } } } @@ -1062,4 +1183,71 @@ mod tests { let parsed: Vec<(String, Vec)> = vec![]; assert!(!url_quality_sufficient(&parsed)); } + + // ── filter_homepage_urls tests ────────────────────────────── + + #[test] + fn test_homepage_url_filtered() { + let parsed = vec![( + "category_0".into(), + vec![ + NewsItem { + title: "Homepage".into(), + url: "https://example.com/".into(), + summary: "Sum".into(), + }, + NewsItem { + title: "Homepage no slash".into(), + url: "https://example.com".into(), + summary: "Sum".into(), + }, + NewsItem { + title: "Real article".into(), + url: "https://example.com/article/123".into(), + summary: "Sum".into(), + }, + ], + )]; + + let result = filter_homepage_urls(parsed); + assert_eq!(result[0].1.len(), 1); + assert_eq!(result[0].1[0].title, "Real article"); + } + + #[test] + fn test_article_url_not_filtered() { + let parsed = vec![( + "category_0".into(), + vec![ + NewsItem { + title: "Article 1".into(), + url: "https://example.com/news/article-1".into(), + summary: "Sum 1".into(), + }, + NewsItem { + title: "Article 2".into(), + url: "https://blog.example.org/2026/03/post".into(), + summary: "Sum 2".into(), + }, + ], + )]; + + let result = filter_homepage_urls(parsed); + assert_eq!(result[0].1.len(), 2); + } + + #[test] + fn test_homepage_filter_keeps_unparseable_urls() { + let parsed = vec![( + "category_0".into(), + vec![NewsItem { + title: "Bad URL".into(), + url: "not-a-url".into(), + summary: "Sum".into(), + }], + )]; + + let result = filter_homepage_urls(parsed); + assert_eq!(result[0].1.len(), 1); + } }