//! CSV parsing and generation utilities for source import/export. //! //! Handles common real-world CSV quirks: BOM, mixed separators //! (comma and semicolon), quoted fields, header rows, and blank lines. use crate::errors::AppError; use crate::models::source::Source; /// Parse CSV content into `(title, url)` pairs. /// /// Supports: /// - Comma (`,`) and semicolon (`;`) as separators (auto-detected per line) /// - Quoted fields (double-quoted, with escaped `""` inside) /// - UTF-8 BOM (stripped if present) /// - Header row detection (skipped if it looks like a header) /// - Empty lines (silently skipped) /// - Windows (`\r\n`) and Unix (`\n`) line endings pub fn parse_csv(content: &str) -> Result, AppError> { // Strip UTF-8 BOM if present let content = content.strip_prefix('\u{FEFF}').unwrap_or(content); let lines: Vec<&str> = content.lines().collect(); if lines.is_empty() { return Ok(Vec::new()); } let mut results = Vec::new(); let mut start_index = 0; // Detect if the first line is a header row if is_header_line(lines[0]) { start_index = 1; } for line in &lines[start_index..] { let trimmed = line.trim(); if trimmed.is_empty() { continue; } let fields = parse_csv_line(trimmed); if fields.len() < 2 { continue; // Skip malformed rows } let title = fields[0].trim().to_string(); let url = fields[1].trim().to_string(); if title.is_empty() || url.is_empty() { continue; } results.push((title, url)); } Ok(results) } /// Generate CSV content from a list of sources. /// /// Produces a header row followed by one row per source. /// Fields are quoted if they contain commas, quotes, or newlines. pub fn generate_csv(sources: &[Source]) -> String { let mut output = String::from("title,url\n"); for source in sources { output.push_str(&csv_quote(&source.title)); output.push(','); output.push_str(&csv_quote(&source.url)); output.push('\n'); } output } /// Detect whether a line looks like a CSV header row. 
/// Detect whether a line looks like a CSV header row.
///
/// A header is detected if the lowercased line contains a common header
/// keyword such as "title", "url", "name", "link", "source", "adresse",
/// "titre" or "lien".
///
/// A line that contains `://` is never treated as a header: it carries an
/// actual URL, so it is a data row even when the URL or title happens to
/// include one of the keywords (e.g. `linkedin.com`, `sourceforge.net`).
fn is_header_line(line: &str) -> bool {
    const HEADER_KEYWORDS: [&str; 8] = [
        "title", "url", "name", "link", "source", "adresse", "titre", "lien",
    ];

    let lower = line.to_lowercase();

    // Guard against dropping the first data row as a false-positive header.
    if lower.contains("://") {
        return false;
    }

    HEADER_KEYWORDS
        .iter()
        .any(|keyword| lower.contains(*keyword))
}

/// Parse a single CSV line into fields, supporting both comma and semicolon
/// separators, and double-quoted fields.
///
/// The separator is auto-detected by `detect_separator`: if the line contains
/// a semicolon outside of quotes and no comma outside of quotes, semicolon is
/// used; otherwise comma is the default.
///
/// Inside a quoted section, `""` yields a literal `"` and the separator is
/// treated as ordinary text. The returned vector always contains at least one
/// element (an empty line yields a single empty field). Fields are returned
/// untrimmed.
fn parse_csv_line(line: &str) -> Vec<String> {
    let separator = detect_separator(line);

    let mut fields = Vec::new();
    let mut current = String::new();
    let mut in_quotes = false;
    let mut chars = line.chars().peekable();

    while let Some(ch) = chars.next() {
        match ch {
            '"' if in_quotes => {
                // A doubled quote inside a quoted section is an escaped quote;
                // a single one closes the section.
                if chars.peek() == Some(&'"') {
                    current.push('"');
                    chars.next();
                } else {
                    in_quotes = false;
                }
            }
            '"' => in_quotes = true,
            _ if !in_quotes && ch == separator => {
                // Move the finished field out without cloning it.
                fields.push(std::mem::take(&mut current));
            }
            _ => current.push(ch),
        }
    }

    // The last field has no trailing separator.
    fields.push(current);
    fields
}

/// Detect the field separator for a CSV line.
///
/// Counts unquoted commas and semicolons. If there are semicolons but no
/// commas (outside quotes), uses semicolon. Otherwise defaults to comma.
fn detect_separator(line: &str) -> char {
    let mut in_quotes = false;
    let mut commas = 0u32;
    let mut semicolons = 0u32;

    for ch in line.chars() {
        match ch {
            // Every quote character toggles the quoted state; an escaped `""`
            // toggles twice and therefore leaves the state unchanged.
            '"' => in_quotes = !in_quotes,
            ',' if !in_quotes => commas += 1,
            ';' if !in_quotes => semicolons += 1,
            _ => {}
        }
    }

    if semicolons > 0 && commas == 0 {
        ';'
    } else {
        ','
    }
}
/// Quote a CSV field if it contains special characters.
///
/// Wraps the field in double quotes if it contains a comma, double quote,
/// line feed, or carriage return. Internal double quotes are escaped as `""`.
/// Fields without any of these characters are returned unchanged.
fn csv_quote(field: &str) -> String {
    let needs_quoting = field.contains(|c: char| matches!(c, ',' | '"' | '\n' | '\r'));

    if needs_quoting {
        let escaped = field.replace('"', "\"\"");
        format!("\"{}\"", escaped)
    } else {
        field.to_string()
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use chrono::Utc;
    use uuid::Uuid;

    #[test]
    fn test_parse_csv_comma_separated() {
        let csv = "title,url\nMy Blog,https://blog.example.com\nNews Site,https://news.example.com";
        let result = parse_csv(csv).unwrap();
        assert_eq!(result.len(), 2);
        assert_eq!(result[0].0, "My Blog");
        assert_eq!(result[0].1, "https://blog.example.com");
        assert_eq!(result[1].0, "News Site");
        assert_eq!(result[1].1, "https://news.example.com");
    }

    #[test]
    fn test_parse_csv_semicolon_separated() {
        let csv = "titre;lien\nMon Blog;https://blog.example.com\nActus;https://news.example.com";
        let result = parse_csv(csv).unwrap();
        assert_eq!(result.len(), 2);
        assert_eq!(result[0].0, "Mon Blog");
        assert_eq!(result[0].1, "https://blog.example.com");
    }

    #[test]
    fn test_parse_csv_quoted_fields() {
        let csv = "title,url\n\"My, Blog\",https://blog.example.com\n\"He said \"\"hi\"\"\",https://example.com";
        let result = parse_csv(csv).unwrap();
        assert_eq!(result.len(), 2);
        assert_eq!(result[0].0, "My, Blog");
        assert_eq!(result[1].0, "He said \"hi\"");
    }

    #[test]
    fn test_parse_csv_header_skipping() {
        let csv = "title,url\nBlog,https://example.com";
        let result = parse_csv(csv).unwrap();
        assert_eq!(result.len(), 1);
        assert_eq!(result[0].0, "Blog");
    }

    #[test]
    fn test_parse_csv_no_header() {
        let csv = "Blog,https://example.com\nNews,https://news.com";
        let result = parse_csv(csv).unwrap();
        assert_eq!(result.len(), 2);
    }

    #[test]
    fn test_parse_csv_empty_lines() {
        let csv = "title,url\n\nBlog,https://example.com\n\n\nNews,https://news.com\n";
        let result = parse_csv(csv).unwrap();
        assert_eq!(result.len(), 2);
    }

    #[test]
    fn test_parse_csv_utf8_bom() {
        let csv = "\u{FEFF}title,url\nBlog,https://example.com";
        let result = parse_csv(csv).unwrap();
        assert_eq!(result.len(), 1);
        assert_eq!(result[0].0, "Blog");
    }

    #[test]
    fn test_parse_csv_empty_content() {
        let result = parse_csv("").unwrap();
        assert!(result.is_empty());
    }

    #[test]
    fn test_parse_csv_only_header() {
        let result = parse_csv("title,url").unwrap();
        assert!(result.is_empty());
    }

    #[test]
    fn test_parse_csv_malformed_single_field() {
        let csv = "Blog\nhttps://example.com";
        let result = parse_csv(csv).unwrap();
        // Single-field lines are skipped
        assert!(result.is_empty());
    }

    #[test]
    fn test_generate_csv_basic() {
        let sources = vec![
            Source {
                id: Uuid::new_v4(),
                user_id: Uuid::new_v4(),
                title: "My Blog".into(),
                url: "https://blog.example.com".into(),
                theme_id: None,
                is_preferred: false,
                rss_url: None,
                rss_discovered_at: None,
                created_at: Utc::now(),
            },
            Source {
                id: Uuid::new_v4(),
                user_id: Uuid::new_v4(),
                title: "News".into(),
                url: "https://news.example.com".into(),
                theme_id: None,
                is_preferred: false,
                rss_url: None,
                rss_discovered_at: None,
                created_at: Utc::now(),
            },
        ];
        let csv = generate_csv(&sources);
        let lines: Vec<&str> = csv.lines().collect();
        assert_eq!(lines[0], "title,url");
        assert_eq!(lines[1], "My Blog,https://blog.example.com");
        assert_eq!(lines[2], "News,https://news.example.com");
    }

    #[test]
    fn test_generate_csv_with_special_chars() {
        let sources = vec![Source {
            id: Uuid::new_v4(),
            user_id: Uuid::new_v4(),
            title: "Blog, with commas".into(),
            url: "https://example.com".into(),
            theme_id: None,
            is_preferred: false,
            rss_url: None,
            rss_discovered_at: None,
            created_at: Utc::now(),
        }];
        let csv = generate_csv(&sources);
        let lines: Vec<&str> = csv.lines().collect();
        assert_eq!(lines[1], "\"Blog, with commas\",https://example.com");
    }

    #[test]
    fn test_generate_csv_empty() {
        let csv = generate_csv(&[]);
        assert_eq!(csv, "title,url\n");
    }

    #[test]
    fn test_generate_csv_roundtrip() {
        let sources = vec![
            Source {
                id: Uuid::new_v4(),
                user_id: Uuid::new_v4(),
                title: "Simple Blog".into(),
                url: "https://blog.example.com".into(),
                theme_id: None,
                is_preferred: false,
                rss_url: None,
                rss_discovered_at: None,
                created_at: Utc::now(),
            },
            Source {
                id: Uuid::new_v4(),
                user_id: Uuid::new_v4(),
                title: "News, Quotes \"here\"".into(),
                url: "https://news.example.com".into(),
                theme_id: None,
                is_preferred: false,
                rss_url: None,
                rss_discovered_at: None,
                created_at: Utc::now(),
            },
        ];
        let csv = generate_csv(&sources);
        let parsed = parse_csv(&csv).unwrap();
        assert_eq!(parsed.len(), 2);
        assert_eq!(parsed[0].0, "Simple Blog");
        assert_eq!(parsed[0].1, "https://blog.example.com");
        assert_eq!(parsed[1].0, "News, Quotes \"here\"");
        assert_eq!(parsed[1].1, "https://news.example.com");
    }

    #[test]
    fn test_detect_separator_comma() {
        assert_eq!(detect_separator("a,b,c"), ',');
    }

    #[test]
    fn test_detect_separator_semicolon() {
        assert_eq!(detect_separator("a;b;c"), ';');
    }

    #[test]
    fn test_detect_separator_mixed_prefers_comma() {
        // If both are present outside quotes, comma wins
        assert_eq!(detect_separator("a,b;c"), ',');
    }

    #[test]
    fn test_detect_separator_semicolons_with_commas_in_quotes() {
        // Commas inside quotes don't count
        assert_eq!(detect_separator("\"a,b\";c"), ';');
    }

    #[test]
    fn test_csv_quote_plain_field_unchanged() {
        assert_eq!(csv_quote("plain"), "plain");
        assert_eq!(csv_quote(""), "");
    }

    #[test]
    fn test_csv_quote_special_characters() {
        assert_eq!(csv_quote("a,b"), "\"a,b\"");
        assert_eq!(csv_quote("say \"hi\""), "\"say \"\"hi\"\"\"");
        assert_eq!(csv_quote("a\nb"), "\"a\nb\"");
        assert_eq!(csv_quote("a\rb"), "\"a\rb\"");
    }
}