v2: pipeline user model selection, rate limiter, URL filter, original title, null-safe sections

- resolve_provider_and_key() now respects user ai_provider preference - Dual model resolution: ai_model for search pass, ai_model_writing for rewrite pass - Per-generation rate limiter with user override support - Homepage URL filter removes domain-only URLs after search pass - ScrapedNewsItem gains original_title field populated from page <title> - SynthesisResponse::try_from handles null sections gracefully (returns empty vec) - Search prompt warns LLM against returning homepage URLs - Rewrite prompt instructs LLM to use originalTitle with language preservation rules Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
3 months ago · 9b994e0528
parent ed6b41fe52
commit 9b994e0528
3 changed files with 253 additions and 28 deletions
--- a/backend/src/models/synthesis.rs
+++ b/backend/src/models/synthesis.rs
@ -48,13 +48,16 @@ impl TryFrom<Synthesis> for SynthesisResponse {
    type Error = crate::errors::AppError;

    fn try_from(s: Synthesis) -> Result<Self, Self::Error> {
-        let sections: Vec<NewsSection> =
+        let sections: Vec<NewsSection> = if s.sections.is_null() {
+            Vec::new()
+        } else {
            serde_json::from_value(s.sections).map_err(|e| {
                crate::errors::AppError::Internal(anyhow::anyhow!(
                    "Failed to parse synthesis sections: {}",
                    e
                ))
-            })?;
+            })?
+        };

        Ok(Self {
            id: s.id,
@ -145,6 +148,8 @@ pub struct ScrapedNewsItem {
    pub title: String,
    pub url: String,
    pub summary: String,
+    #[serde(rename = "originalTitle")]
+    pub original_title: String,
    #[serde(rename = "scrapedContent")]
    pub scraped_content: String,
 }
@ -312,6 +317,21 @@ mod tests {
        assert!(SynthesisResponse::try_from(synthesis).is_err());
    }

+    #[test]
+    fn synthesis_response_from_null_sections_returns_empty() {
+        let synthesis = Synthesis {
+            id: Uuid::nil(),
+            user_id: Uuid::nil(),
+            week: "2026-W12".into(),
+            sections: serde_json::Value::Null,
+            status: "completed".into(),
+            created_at: Utc::now(),
+        };
+
+        let response = SynthesisResponse::try_from(synthesis).unwrap();
+        assert!(response.sections.is_empty());
+    }
+
    #[test]
    fn send_email_request_valid_email() {
        let req = SendEmailRequest {
--- a/backend/src/services/prompts.rs
+++ b/backend/src/services/prompts.rs
@ -74,6 +74,8 @@ pub fn build_search_prompt(
         Pour chaque categorie, fournis au maximum {max_items} actualites.\n\
         Pour chaque actualite, fournis un titre provisoire, l'URL source exacte et complete, \
         et un resume provisoire.\n\
+         Ne retourne JAMAIS des URLs de pages d'accueil (homepage). Fournis toujours des liens \
+         directs vers des articles specifiques avec un chemin complet (pas juste le nom de domaine).\n\
         Retourne le resultat au format JSON en utilisant les cles category_0, category_1, etc. \
         correspondant a l'ordre des sections ci-dessus.",
        date = current_date,
@ -112,6 +114,10 @@ pub fn build_rewrite_prompt(
         brut extrait des sites web ('scrapedContent').\n\
         Ta tache est de reecrire le 'title' et le 'summary' (4 ou 5 lignes) pour chaque article \
         afin qu'ils refletent EXACTEMENT et FIDELEMENT le contenu textuel fourni.\n\
+         Pour chaque article, un 'originalTitle' extrait de la page web est fourni. Utilise ce \
+         titre original comme base pour le titre final. Regles linguistiques: les titres en anglais \
+         restent en anglais, les titres en francais restent en francais, les autres langues sont \
+         traduites en francais.\n\
         Si le 'scrapedContent' est vide ou insuffisant, utilise le titre et le resume originaux \
         pour faire au mieux.\n\
         Conserve EXACTEMENT les memes URLs. Ne supprime aucun article de cette liste.\n\n\
@ -237,6 +243,14 @@ mod tests {
        assert!(user_prompt.contains("recherche Google"));
    }

+    #[test]
+    fn search_prompt_warns_against_homepage_urls() {
+        let settings = test_settings();
+        let (_, user_prompt) = build_search_prompt(&settings, &[], "lundi 21 mars 2026");
+        assert!(user_prompt.contains("pages d'accueil"));
+        assert!(user_prompt.contains("articles specifiques"));
+    }
+
    #[test]
    fn rewrite_prompt_includes_instructions() {
        let mut data = std::collections::HashMap::new();
@ -246,6 +260,7 @@ mod tests {
                title: "Test Article".into(),
                url: "https://example.com".into(),
                summary: "A summary".into(),
+                original_title: "Original Test Article".into(),
                scraped_content: "Full article text here...".into(),
            }],
        );
@ -256,6 +271,8 @@ mod tests {
        assert!(user_prompt.contains("Test Article"));
        assert!(user_prompt.contains("https://example.com"));
        assert!(user_prompt.contains("Ne supprime aucun article"));
+        assert!(user_prompt.contains("originalTitle"));
+        assert!(user_prompt.contains("titre original comme base"));
    }

    #[test]
--- a/backend/src/services/synthesis.rs
+++ b/backend/src/services/synthesis.rs
@ -18,9 +18,12 @@ use serde::Serialize;
 use tokio::sync::watch;
 use uuid::Uuid;

+use url::Url;
+
 use crate::app_state::AppState;
 use crate::db;
 use crate::errors::AppError;
+use crate::models::settings::UserSettings;
 use crate::models::synthesis::{
    get_iso_week_string, NewsItem, NewsSection, ScrapedNewsItem,
 };
@ -267,19 +270,28 @@ async fn run_generation_inner(

    // Step 3: Resolve provider + decrypt API key
    emit_progress(tx, "provider", "Configuration du fournisseur IA...", 15);
-    let (provider_name, api_key) = resolve_provider_and_key(state, user_id).await?;
+    let (provider_name, api_key) = resolve_provider_and_key(state, user_id, &settings).await?;

    let provider = create_provider(&provider_name, api_key, &state.http_client)?;

    // Step 4: Build schema from categories
    let schema = build_category_schema(&settings.categories);

+    // Step 4b: Resolve models — user overrides take priority over admin config
+    let model_research = if !settings.ai_model.is_empty() {
+        settings.ai_model.clone()
+    } else {
+        resolve_model(state, &provider_name).await?
+    };
+    let model_writing = if !settings.ai_model_writing.is_empty() {
+        settings.ai_model_writing.clone()
+    } else {
+        model_research.clone()
+    };
+
    // Step 5: Rate limit check (pass 1)
-    if !state.provider_rate_limiter.check(&provider_name) {
-        return Err(AppError::RateLimited(
-            "Limite de requetes atteinte. Veuillez reessayer dans quelques instants.".into(),
-        ));
-    }
+    // User overrides take priority over global rate limiter
+    check_rate_limit(state, &settings, &provider_name)?;

    // Step 6: LLM search pass
    emit_progress(tx, "search", "Recherche d'actualites en cours...", 30);
@ -289,16 +301,17 @@ async fn run_generation_inner(
    let (system_prompt, user_prompt) =
        prompts::build_search_prompt(&settings, &sources, &current_date);

-    let model = resolve_model(state, &provider_name).await?;
-
    let raw_results = provider
-        .generate_search_pass(&model, &system_prompt, &user_prompt, &schema)
+        .generate_search_pass(&model_research, &system_prompt, &user_prompt, &schema)
        .await?;

    // Step 7: Parse structured output into (category_key, Vec<NewsItem>)
    emit_progress(tx, "parsing", "Analyse des resultats...", 40);
    let parsed = parse_llm_output(&raw_results, &settings.categories)?;

+    // Step 7b: Filter out homepage URLs (path == "/" or empty)
+    let parsed = filter_homepage_urls(parsed);
+
    // Step 8: Adaptive pipeline — decide whether to scrape+rewrite or use search results directly
    //
    // If the provider supports native web search and the search pass produced high-quality
@ -322,19 +335,14 @@ async fn run_generation_inner(
        let scraped = scrape_articles(state, &parsed, settings.max_age_days as i64, tx).await;

        // Rate limit check (pass 2)
-        if !state.provider_rate_limiter.check(&provider_name) {
-            return Err(AppError::RateLimited(
-                "Limite de requetes atteinte pour la passe de reecriture. Veuillez reessayer."
-                    .into(),
-            ));
-        }
+        check_rate_limit(state, &settings, &provider_name)?;

        // LLM rewrite pass
        emit_progress(tx, "rewrite", "Redaction des resumes...", 80);
        let (rewrite_system, rewrite_user) = prompts::build_rewrite_prompt(&scraped);

        let final_results = provider
-            .generate_rewrite_pass(&model, &rewrite_system, &rewrite_user, &schema)
+            .generate_rewrite_pass(&model_writing, &rewrite_system, &rewrite_user, &schema)
            .await?;

        emit_progress(tx, "finalizing", "Finalisation...", 90);
@ -368,13 +376,126 @@ fn emit_progress(tx: &watch::Sender<ProgressEvent>, step: &str, message: &str, p
    .ok();
 }

+/// Check rate limits, using user overrides if configured, otherwise the global limiter.
+///
+/// When the user has both `rate_limit_max_requests` and `rate_limit_time_window_seconds`
+/// set, a temporary per-user rate limiter is created with those values. Otherwise the
+/// global provider rate limiter is used.
+fn check_rate_limit(
+    state: &AppState,
+    settings: &UserSettings,
+    provider_name: &str,
+) -> Result<(), AppError> {
+    match (
+        settings.rate_limit_max_requests,
+        settings.rate_limit_time_window_seconds,
+    ) {
+        (Some(max_req), Some(window_sec)) => {
+            // Create a temporary rate limiter with user's config
+            let user_limiter = crate::services::rate_limiter::RateLimiter::new(
+                max_req as usize,
+                Duration::from_secs(window_sec as u64),
+            );
+            let key = format!("user_gen_{}", provider_name);
+            if !user_limiter.check(&key) {
+                return Err(AppError::RateLimited(
+                    "Limite de requetes personnalisee atteinte. Veuillez reessayer dans quelques instants.".into(),
+                ));
+            }
+            Ok(())
+        }
+        _ => {
+            if !state.provider_rate_limiter.check(provider_name) {
+                return Err(AppError::RateLimited(
+                    "Limite de requetes atteinte. Veuillez reessayer dans quelques instants."
+                        .into(),
+                ));
+            }
+            Ok(())
+        }
+    }
+}
+
+/// Filter out articles whose URL is a homepage (path is "/" or empty).
+///
+/// Homepage URLs are typically not useful as article sources and indicate
+/// the LLM returned a domain root rather than a specific article.
+fn filter_homepage_urls(
+    parsed: Vec<(String, Vec<NewsItem>)>,
+) -> Vec<(String, Vec<NewsItem>)> {
+    let mut total_filtered = 0usize;
+
+    let result: Vec<(String, Vec<NewsItem>)> = parsed
+        .into_iter()
+        .map(|(cat_key, items)| {
+            let filtered: Vec<NewsItem> = items
+                .into_iter()
+                .filter(|item| {
+                    match Url::parse(&item.url) {
+                        Ok(parsed_url) => {
+                            let path = parsed_url.path();
+                            if path == "/" || path.is_empty() {
+                                total_filtered += 1;
+                                false
+                            } else {
+                                true
+                            }
+                        }
+                        Err(_) => true, // Keep items with unparseable URLs (handled elsewhere)
+                    }
+                })
+                .collect();
+            (cat_key, filtered)
+        })
+        .collect();
+
+    if total_filtered > 0 {
+        tracing::warn!(
+            count = total_filtered,
+            "Filtered out homepage URLs from search results"
+        );
+    }
+
+    result
+}
+
 /// Resolve the LLM provider and decrypt the user's API key.
 ///
-/// Looks up the user's API key for the first available provider.
+/// If the user has a preferred provider in settings, looks for a key matching
+/// that provider specifically. Otherwise falls back to the first available key.
 async fn resolve_provider_and_key(
    state: &AppState,
    user_id: Uuid,
+    settings: &UserSettings,
 ) -> Result<(String, String), AppError> {
+    let master_key = encryption::MasterKey::from_hex(&state.config.master_encryption_key)?;
+
+    // If the user has a preferred provider, look for that specific key
+    if !settings.ai_provider.is_empty() {
+        let key_record = db::api_keys::get_for_user_and_provider(
+            &state.pool,
+            user_id,
+            &settings.ai_provider,
+        )
+        .await?;
+
+        match key_record {
+            Some(record) => {
+                let api_key =
+                    encryption::decrypt(&master_key, &record.encrypted_key, &record.nonce)?;
+                return Ok((record.provider_name.clone(), api_key));
+            }
+            None => {
+                return Err(AppError::BadRequest(format!(
+                    "Aucune cle API configuree pour le fournisseur '{}'. \
+                     Veuillez ajouter une cle API pour ce fournisseur dans vos parametres.",
+                    settings.ai_provider
+                )));
+            }
+        }
+    }
+
+    // Fall back to first available key
    let keys = db::api_keys::list_for_user(&state.pool, user_id).await?;

    if keys.is_empty() {
@ -383,9 +504,7 @@ async fn resolve_provider_and_key(
        ));
    }

-    // Use the first available key
    let key_record = &keys[0];
-    let master_key = encryption::MasterKey::from_hex(&state.config.master_encryption_key)?;
    let api_key = encryption::decrypt(
        &master_key,
        &key_record.encrypted_key,
@ -509,11 +628,12 @@ async fn scrape_articles(
            pct as u8,
        );

-        if let Ok((cat_key, item, scraped_content)) = join_result {
+        if let Ok((cat_key, item, (scraped_content, page_title))) = join_result {
            let scraped_item = ScrapedNewsItem {
                title: item.title,
                url: item.url,
                summary: item.summary,
+                original_title: page_title,
                scraped_content,
            };

@ -538,7 +658,7 @@ async fn scrape_articles(
    result
 }

-/// Scrape a single article URL, returning the body text or an empty string on failure.
+/// Scrape a single article URL, returning (body_text, page_title) or empty strings on failure.
 ///
 /// Handles all failure modes gracefully:
 /// - Network errors → empty content (article kept)
@ -548,24 +668,25 @@ async fn scrape_single_article(
    http_client: &reqwest::Client,
    url: &str,
    max_age_days: i64,
-) -> String {
+) -> (String, String) {
    match scraper::scrape_url(http_client, url).await {
        Ok(content) => {
            if !content.ok || content.is_soft_404 {
                tracing::warn!(url = url, "Soft 404 or error page detected, skipping content");
-                return String::new();
+                return (String::new(), String::new());
            }

            if scraper::is_article_too_old(content.published_date, max_age_days) {
                tracing::warn!(url = url, "Article too old, skipping content");
-                return String::new();
+                return (String::new(), String::new());
            }

-            content.body_text
+            let title = content.title.unwrap_or_default();
+            (content.body_text, title)
        }
        Err(e) => {
            tracing::warn!(url = url, error = %e, "Failed to scrape URL, keeping article with empty content");
-            String::new()
+            (String::new(), String::new())
        }
    }
 }
@ -1062,4 +1183,71 @@ mod tests {
        let parsed: Vec<(String, Vec<NewsItem>)> = vec![];
        assert!(!url_quality_sufficient(&parsed));
    }
+
+    // ── filter_homepage_urls tests ──────────────────────────────
+
+    #[test]
+    fn test_homepage_url_filtered() {
+        let parsed = vec![(
+            "category_0".into(),
+            vec![
+                NewsItem {
+                    title: "Homepage".into(),
+                    url: "https://example.com/".into(),
+                    summary: "Sum".into(),
+                },
+                NewsItem {
+                    title: "Homepage no slash".into(),
+                    url: "https://example.com".into(),
+                    summary: "Sum".into(),
+                },
+                NewsItem {
+                    title: "Real article".into(),
+                    url: "https://example.com/article/123".into(),
+                    summary: "Sum".into(),
+                },
+            ],
+        )];
+
+        let result = filter_homepage_urls(parsed);
+        assert_eq!(result[0].1.len(), 1);
+        assert_eq!(result[0].1[0].title, "Real article");
+    }
+
+    #[test]
+    fn test_article_url_not_filtered() {
+        let parsed = vec![(
+            "category_0".into(),
+            vec![
+                NewsItem {
+                    title: "Article 1".into(),
+                    url: "https://example.com/news/article-1".into(),
+                    summary: "Sum 1".into(),
+                },
+                NewsItem {
+                    title: "Article 2".into(),
+                    url: "https://blog.example.org/2026/03/post".into(),
+                    summary: "Sum 2".into(),
+                },
+            ],
+        )];
+
+        let result = filter_homepage_urls(parsed);
+        assert_eq!(result[0].1.len(), 2);
+    }
+
+    #[test]
+    fn test_homepage_filter_keeps_unparseable_urls() {
+        let parsed = vec![(
+            "category_0".into(),
+            vec![NewsItem {
+                title: "Bad URL".into(),
+                url: "not-a-url".into(),
+                summary: "Sum".into(),
+            }],
+        )];
+
+        let result = filter_homepage_urls(parsed);
+        assert_eq!(result[0].1.len(), 1);
+    }
 }