You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

372 lines
11 KiB
Rust

//! CSV parsing and generation utilities for source import/export.
//!
//! Handles common real-world CSV quirks: BOM, mixed separators
//! (comma and semicolon), quoted fields, header rows, and blank lines.
use crate::errors::AppError;
use crate::models::source::Source;
/// Parse CSV content into `(title, url)` pairs.
///
/// Supports:
/// - Comma (`,`) and semicolon (`;`) as separators (auto-detected per line)
/// - Quoted fields (double-quoted, with escaped `""` inside)
/// - UTF-8 BOM (stripped if present)
/// - Header row detection on the first non-blank line (skipped if it looks
///   like a header, even when blank lines precede it)
/// - Empty lines (silently skipped)
/// - Windows (`\r\n`) and Unix (`\n`) line endings
///
/// Rows with fewer than two fields, or whose title or URL is empty after
/// trimming, are skipped rather than reported. Fields beyond the second are
/// ignored.
///
/// The input is split into lines before quotes are interpreted, so a quoted
/// field cannot span several lines.
///
/// # Errors
///
/// The current implementation always returns `Ok`; malformed rows are
/// dropped instead of producing an error.
pub fn parse_csv(content: &str) -> Result<Vec<(String, String)>, AppError> {
    // Strip UTF-8 BOM if present.
    let content = content.strip_prefix('\u{FEFF}').unwrap_or(content);

    // Only non-blank rows matter, both for header detection and for data.
    let mut rows = content
        .lines()
        .map(str::trim)
        .filter(|row| !row.is_empty())
        .peekable();

    // The header is looked for on the first row that has content. Checking
    // physical line 0 only would let a leading blank line turn the header
    // into a bogus ("title", "url") entry.
    let has_header = rows.peek().map_or(false, |first| is_header_line(first));
    if has_header {
        rows.next();
    }

    let mut results = Vec::new();
    for row in rows {
        let fields = parse_csv_line(row);
        // Rows with fewer than two fields are malformed and skipped.
        if let [title, url, ..] = fields.as_slice() {
            let (title, url) = (title.trim(), url.trim());
            if !title.is_empty() && !url.is_empty() {
                results.push((title.to_string(), url.to_string()));
            }
        }
    }
    Ok(results)
}
/// Serialize a list of sources as CSV text.
///
/// The output starts with the `title,url` header line, followed by one
/// `title,url` line per source in input order. Every line, including the
/// last one, ends with `\n`. Each field is passed through `csv_quote`, which
/// wraps it in double quotes when it contains CSV special characters.
pub fn generate_csv(sources: &[Source]) -> String {
    sources
        .iter()
        .fold(String::from("title,url\n"), |mut csv, entry| {
            csv.push_str(&csv_quote(&entry.title));
            csv.push(',');
            csv.push_str(&csv_quote(&entry.url));
            csv.push('\n');
            csv
        })
}
/// Detect whether a line looks like a CSV header row.
///
/// A line is treated as a header when its lowercase text contains one of the
/// common column names ("title", "url", "name", "link", "source", "adresse",
/// "titre", "lien") and it does not contain a URL scheme separator (`://`).
///
/// The scheme check keeps real data rows from being mistaken for a header
/// just because the title or URL happens to contain a keyword, e.g.
/// `SourceForge,https://sourceforge.net`.
///
/// Keyword matching is by substring, so a scheme-less data row such as
/// `LinkedIn,linkedin.com` is still reported as a header.
fn is_header_line(line: &str) -> bool {
    const HEADER_KEYWORDS: [&str; 8] = [
        "title", "url", "name", "link", "source", "adresse", "titre", "lien",
    ];

    // A row that carries an actual URL is data, never a header.
    if line.contains("://") {
        return false;
    }

    let lowered = line.to_lowercase();
    HEADER_KEYWORDS
        .iter()
        .any(|&keyword| lowered.contains(keyword))
}
/// Split one CSV line into its fields.
///
/// The separator comes from `detect_separator`: semicolon when the line has
/// semicolons but no commas outside of quotes, comma otherwise.
///
/// A double quote opens a quoted section in which the separator is taken
/// literally; inside such a section `""` yields one literal `"` and a single
/// `"` closes the section. The quote characters themselves are not part of
/// the returned fields. The result always holds at least one field (an empty
/// line yields a single empty string).
fn parse_csv_line(line: &str) -> Vec<String> {
    let delimiter = detect_separator(line);
    let mut fields = Vec::new();
    let mut field = String::new();
    let mut quoted = false;
    let mut stream = line.chars().peekable();

    while let Some(c) = stream.next() {
        match (quoted, c) {
            (true, '"') => {
                // `""` inside quotes is an escaped quote; a lone `"` ends
                // the quoted section.
                if stream.peek() == Some(&'"') {
                    stream.next();
                    field.push('"');
                } else {
                    quoted = false;
                }
            }
            (true, other) => field.push(other),
            (false, '"') => quoted = true,
            // Hand the finished field over and start a fresh one.
            (false, sep) if sep == delimiter => fields.push(std::mem::take(&mut field)),
            (false, other) => field.push(other),
        }
    }

    // Whatever is left after the last separator is the final field.
    fields.push(field);
    fields
}
/// Pick the field separator used by a CSV line.
///
/// Only characters outside of double-quoted sections are considered. The
/// result is `;` when at least one semicolon and no comma appears outside
/// quotes; in every other case (including an empty line) it is `,`.
fn detect_separator(line: &str) -> char {
    let mut quoted = false;
    let mut saw_semicolon = false;

    for c in line.chars() {
        if c == '"' {
            quoted = !quoted;
        } else if !quoted {
            // A single unquoted comma settles the answer: comma always wins.
            if c == ',' {
                return ',';
            }
            saw_semicolon |= c == ';';
        }
    }

    if saw_semicolon {
        ';'
    } else {
        ','
    }
}
/// Quote a CSV field if it contains special characters.
///
/// The field is wrapped in double quotes when it contains a comma, a double
/// quote, a line feed (`\n`) or a carriage return (`\r`). Internal double
/// quotes are escaped as `""`. Any other field is returned unchanged.
///
/// Carriage returns are included because many CSV readers treat a bare `\r`
/// as a record break, which would split an unquoted field across rows.
fn csv_quote(field: &str) -> String {
    let needs_quoting = field.contains(|c: char| matches!(c, ',' | '"' | '\n' | '\r'));
    if needs_quoting {
        format!("\"{}\"", field.replace('"', "\"\""))
    } else {
        field.to_string()
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use chrono::Utc;
    use uuid::Uuid;

    /// Build a `Source` fixture with the given title and URL. Ids are random,
    /// the timestamp is "now", and every optional field is left unset.
    fn sample_source(title: &str, url: &str) -> Source {
        Source {
            id: Uuid::new_v4(),
            user_id: Uuid::new_v4(),
            title: title.into(),
            url: url.into(),
            theme_id: None,
            is_preferred: false,
            rss_url: None,
            rss_discovered_at: None,
            created_at: Utc::now(),
        }
    }

    /// View a parsed row as a pair of string slices for compact comparisons.
    fn pair(row: &(String, String)) -> (&str, &str) {
        (row.0.as_str(), row.1.as_str())
    }

    // ---- parse_csv ----

    #[test]
    fn test_parse_csv_comma_separated() {
        let input = "title,url\nMy Blog,https://blog.example.com\nNews Site,https://news.example.com";
        let rows = parse_csv(input).unwrap();
        assert_eq!(rows.len(), 2);
        assert_eq!(pair(&rows[0]), ("My Blog", "https://blog.example.com"));
        assert_eq!(pair(&rows[1]), ("News Site", "https://news.example.com"));
    }

    #[test]
    fn test_parse_csv_semicolon_separated() {
        let input = "titre;lien\nMon Blog;https://blog.example.com\nActus;https://news.example.com";
        let rows = parse_csv(input).unwrap();
        assert_eq!(rows.len(), 2);
        assert_eq!(pair(&rows[0]), ("Mon Blog", "https://blog.example.com"));
    }

    #[test]
    fn test_parse_csv_quoted_fields() {
        let input =
            "title,url\n\"My, Blog\",https://blog.example.com\n\"He said \"\"hi\"\"\",https://example.com";
        let rows = parse_csv(input).unwrap();
        assert_eq!(rows.len(), 2);
        assert_eq!(rows[0].0.as_str(), "My, Blog");
        assert_eq!(rows[1].0.as_str(), "He said \"hi\"");
    }

    #[test]
    fn test_parse_csv_header_skipping() {
        let rows = parse_csv("title,url\nBlog,https://example.com").unwrap();
        assert_eq!(rows.len(), 1);
        assert_eq!(rows[0].0.as_str(), "Blog");
    }

    #[test]
    fn test_parse_csv_no_header() {
        let rows = parse_csv("Blog,https://example.com\nNews,https://news.com").unwrap();
        assert_eq!(rows.len(), 2);
    }

    #[test]
    fn test_parse_csv_empty_lines() {
        let input = "title,url\n\nBlog,https://example.com\n\n\nNews,https://news.com\n";
        let rows = parse_csv(input).unwrap();
        assert_eq!(rows.len(), 2);
    }

    #[test]
    fn test_parse_csv_utf8_bom() {
        let rows = parse_csv("\u{FEFF}title,url\nBlog,https://example.com").unwrap();
        assert_eq!(rows.len(), 1);
        assert_eq!(rows[0].0.as_str(), "Blog");
    }

    #[test]
    fn test_parse_csv_empty_content() {
        let rows = parse_csv("").unwrap();
        assert!(rows.is_empty());
    }

    #[test]
    fn test_parse_csv_only_header() {
        let rows = parse_csv("title,url").unwrap();
        assert!(rows.is_empty());
    }

    #[test]
    fn test_parse_csv_malformed_single_field() {
        // Lines with a single field are skipped.
        let rows = parse_csv("Blog\nhttps://example.com").unwrap();
        assert!(rows.is_empty());
    }

    // ---- generate_csv ----

    #[test]
    fn test_generate_csv_basic() {
        let sources = vec![
            sample_source("My Blog", "https://blog.example.com"),
            sample_source("News", "https://news.example.com"),
        ];
        let csv = generate_csv(&sources);
        let lines: Vec<&str> = csv.lines().collect();
        assert_eq!(lines[0], "title,url");
        assert_eq!(lines[1], "My Blog,https://blog.example.com");
        assert_eq!(lines[2], "News,https://news.example.com");
    }

    #[test]
    fn test_generate_csv_with_special_chars() {
        let sources = vec![sample_source("Blog, with commas", "https://example.com")];
        let csv = generate_csv(&sources);
        let lines: Vec<&str> = csv.lines().collect();
        assert_eq!(lines[1], "\"Blog, with commas\",https://example.com");
    }

    #[test]
    fn test_generate_csv_empty() {
        assert_eq!(generate_csv(&[]), "title,url\n");
    }

    #[test]
    fn test_generate_csv_roundtrip() {
        let sources = vec![
            sample_source("Simple Blog", "https://blog.example.com"),
            sample_source("News, Quotes \"here\"", "https://news.example.com"),
        ];
        let rows = parse_csv(&generate_csv(&sources)).unwrap();
        assert_eq!(rows.len(), 2);
        assert_eq!(pair(&rows[0]), ("Simple Blog", "https://blog.example.com"));
        assert_eq!(
            pair(&rows[1]),
            ("News, Quotes \"here\"", "https://news.example.com")
        );
    }

    // ---- detect_separator ----

    #[test]
    fn test_detect_separator_comma() {
        assert_eq!(detect_separator("a,b,c"), ',');
    }

    #[test]
    fn test_detect_separator_semicolon() {
        assert_eq!(detect_separator("a;b;c"), ';');
    }

    #[test]
    fn test_detect_separator_mixed_prefers_comma() {
        // When both appear outside quotes, comma wins.
        assert_eq!(detect_separator("a,b;c"), ',');
    }

    #[test]
    fn test_detect_separator_semicolons_with_commas_in_quotes() {
        // Commas inside quotes do not count.
        assert_eq!(detect_separator("\"a,b\";c"), ';');
    }
}