feat: add "Articles sans date" category for articles without publication date

Articles for which neither the scraper nor the LLM could extract a date are now placed in a separate "Articles sans date" section instead of their classified category. This keeps undated articles visible without mixing them with properly dated content.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
3 months ago · a89c61c5b6
parent fb086a706f
commit a89c61c5b6
1 changed file with 42 additions and 2 deletions
--- a/backend/src/services/synthesis.rs
+++ b/backend/src/services/synthesis.rs
@@ -522,6 +522,24 @@ pub async fn run_generation_inner(
                                }
                            }
                            let llm_date = class_response.get("date").and_then(|d| d.as_str()).filter(|s| !s.is_empty()).map(|s| s.to_string());
                            // Articles without any date go to "Articles sans date" category
                            if llm_date.is_none() {
                                let llm_title = class_response.get("title").and_then(|t| t.as_str()).unwrap_or(&page_title).to_string();
                                let llm_summary = class_response.get("summary").and_then(|s| s.as_str()).unwrap_or("").to_string();
                                article_scraped.entry("category_no_date".to_string()).or_default().push(NewsItem {
                                    title: llm_title,
                                    url: final_url.clone(),
                                    summary: llm_summary,
                                    date: None,
                                });
                                let source_domain = extract_domain(&source_url).unwrap_or_default();
                                *source_counts.entry(source_domain).or_insert(0) += 1;
                                continue;
                            }
                            let Some((final_cat_key, final_cat_name, llm_title, llm_summary)) = assign_category(
                                &class_response, &page_title, &user_categories, &classification_categories,
                                &filled_counts, settings.max_items_per_category as usize,
@@ -529,7 +547,6 @@ pub async fn run_generation_inner(
                                continue;
                            };
                            let llm_date = class_response.get("date").and_then(|d| d.as_str()).filter(|s| !s.is_empty()).map(|s| s.to_string());
                            article_scraped.entry(final_cat_key).or_default().push(NewsItem {
                                title: llm_title,
                                url: final_url.clone(),
@@ -747,6 +764,25 @@ pub async fn run_generation_inner(
                                }
                            }
                            let llm_date = class_response.get("date").and_then(|d| d.as_str()).filter(|s| !s.is_empty()).map(|s| s.to_string());
                            // Articles without any date go to "Articles sans date" category
                            if llm_date.is_none() {
                                let llm_title = class_response.get("title").and_then(|t| t.as_str()).unwrap_or(&page_title).to_string();
                                let llm_summary = class_response.get("summary").and_then(|s| s.as_str()).unwrap_or("").to_string();
                                article_scraped.entry("category_no_date".to_string()).or_default().push(NewsItem {
                                    title: llm_title,
                                    url: final_url.clone(),
                                    summary: llm_summary,
                                    date: None,
                                });
                                if let Some(domain) = extract_domain(&final_url) {
                                    *source_counts.entry(domain).or_insert(0) += 1;
                                }
                                continue;
                            }
                            let Some((final_cat_key, final_cat_name, llm_title, llm_summary)) = assign_category(
                                &class_response, &page_title, &user_categories, &classification_categories,
                                &filled_counts, settings.max_items_per_category as usize,
@@ -754,7 +790,6 @@ pub async fn run_generation_inner(
                                continue;
                            };
                            let llm_date = class_response.get("date").and_then(|d| d.as_str()).filter(|s| !s.is_empty()).map(|s| s.to_string());
                            article_scraped.entry(final_cat_key).or_default().push(NewsItem {
                                title: llm_title,
                                url: final_url.clone(),
@@ -885,6 +920,11 @@ pub async fn run_generation_inner(
            final_sections.push(NewsSection { title: "Divers".to_string(), items: autre_items.clone() });
        }
    }
    if let Some(no_date_items) = article_scraped.get("category_no_date") {
        if !no_date_items.is_empty() {
            final_sections.push(NewsSection { title: "Articles sans date".to_string(), items: no_date_items.clone() });
        }
    }
    let sections_json = serde_json::to_value(&final_sections).map_err(|e| AppError::Internal(anyhow::anyhow!("Failed to serialize: {}", e)))?;
    let sections_json = sanitize_json_null_bytes(sections_json);