You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
372 lines
11 KiB
Rust
372 lines
11 KiB
Rust
//! CSV parsing and generation utilities for source import/export.
//!
//! Handles common real-world CSV quirks: BOM, mixed separators
//! (comma and semicolon), quoted fields, header rows, and blank lines.
|
|
|
|
use crate::errors::AppError;
|
|
use crate::models::source::Source;
|
|
|
|
/// Parse CSV content into `(title, url)` pairs.
|
|
///
|
|
/// Supports:
|
|
/// - Comma (`,`) and semicolon (`;`) as separators (auto-detected per line)
|
|
/// - Quoted fields (double-quoted, with escaped `""` inside)
|
|
/// - UTF-8 BOM (stripped if present)
|
|
/// - Header row detection (skipped if it looks like a header)
|
|
/// - Empty lines (silently skipped)
|
|
/// - Windows (`\r\n`) and Unix (`\n`) line endings
|
|
pub fn parse_csv(content: &str) -> Result<Vec<(String, String)>, AppError> {
|
|
// Strip UTF-8 BOM if present
|
|
let content = content.strip_prefix('\u{FEFF}').unwrap_or(content);
|
|
|
|
let lines: Vec<&str> = content.lines().collect();
|
|
if lines.is_empty() {
|
|
return Ok(Vec::new());
|
|
}
|
|
|
|
let mut results = Vec::new();
|
|
let mut start_index = 0;
|
|
|
|
// Detect if the first line is a header row
|
|
if is_header_line(lines[0]) {
|
|
start_index = 1;
|
|
}
|
|
|
|
for line in &lines[start_index..] {
|
|
let trimmed = line.trim();
|
|
if trimmed.is_empty() {
|
|
continue;
|
|
}
|
|
|
|
let fields = parse_csv_line(trimmed);
|
|
if fields.len() < 2 {
|
|
continue; // Skip malformed rows
|
|
}
|
|
|
|
let title = fields[0].trim().to_string();
|
|
let url = fields[1].trim().to_string();
|
|
|
|
if title.is_empty() || url.is_empty() {
|
|
continue;
|
|
}
|
|
|
|
results.push((title, url));
|
|
}
|
|
|
|
Ok(results)
|
|
}
|
|
|
|
/// Generate CSV content from a list of sources.
|
|
///
|
|
/// Produces a header row followed by one row per source.
|
|
/// Fields are quoted if they contain commas, quotes, or newlines.
|
|
pub fn generate_csv(sources: &[Source]) -> String {
|
|
let mut output = String::from("title,url\n");
|
|
|
|
for source in sources {
|
|
output.push_str(&csv_quote(&source.title));
|
|
output.push(',');
|
|
output.push_str(&csv_quote(&source.url));
|
|
output.push('\n');
|
|
}
|
|
|
|
output
|
|
}
|
|
|
|
/// Detect whether a line looks like a CSV header row.
///
/// A line is treated as a header when both of the following hold:
///
/// - its lowercase form contains one of the common column-name keywords:
///   `title`, `url`, `name`, `link`, `source`, `adresse`, `titre`, `lien`;
/// - it does not contain a URL scheme separator (`://`).
///
/// The second condition prevents a real data row from being mistaken for a
/// header (and then dropped by the caller) merely because a keyword occurs
/// inside its title or URL, e.g. `Useful links,https://example.com` or
/// `Blog,https://example.com/source/feed`.
fn is_header_line(line: &str) -> bool {
    const HEADER_KEYWORDS: [&str; 8] = [
        "title", "url", "name", "link", "source", "adresse", "titre", "lien",
    ];

    let lower = line.to_lowercase();

    // A row that already carries an absolute URL is data, not column names.
    if lower.contains("://") {
        return false;
    }

    HEADER_KEYWORDS
        .iter()
        .any(|keyword| lower.contains(keyword))
}
|
|
|
|
/// Parse a single CSV line into fields, supporting both comma and semicolon
|
|
/// separators, and double-quoted fields.
|
|
///
|
|
/// The separator is auto-detected: if the line contains a semicolon outside
|
|
/// of quotes and no comma outside of quotes, semicolon is used; otherwise
|
|
/// comma is the default.
|
|
fn parse_csv_line(line: &str) -> Vec<String> {
|
|
let separator = detect_separator(line);
|
|
let mut fields = Vec::new();
|
|
let mut current = String::new();
|
|
let mut in_quotes = false;
|
|
let mut chars = line.chars().peekable();
|
|
|
|
while let Some(ch) = chars.next() {
|
|
if in_quotes {
|
|
if ch == '"' {
|
|
// Check for escaped quote ("")
|
|
if chars.peek() == Some(&'"') {
|
|
current.push('"');
|
|
chars.next();
|
|
} else {
|
|
in_quotes = false;
|
|
}
|
|
} else {
|
|
current.push(ch);
|
|
}
|
|
} else if ch == '"' {
|
|
in_quotes = true;
|
|
} else if ch == separator {
|
|
fields.push(current.clone());
|
|
current.clear();
|
|
} else {
|
|
current.push(ch);
|
|
}
|
|
}
|
|
|
|
fields.push(current);
|
|
fields
|
|
}
|
|
|
|
/// Choose the field separator for a single CSV line.
///
/// Only characters outside double-quoted sections are considered (every `"`
/// toggles the quoted state). Semicolon is returned when the line has at
/// least one unquoted semicolon and no unquoted comma; in every other case
/// the result is a comma.
fn detect_separator(line: &str) -> char {
    let mut quoted = false;
    let mut saw_semicolon = false;

    for c in line.chars() {
        if c == '"' {
            quoted = !quoted;
            continue;
        }
        if quoted {
            continue;
        }
        if c == ',' {
            // A single unquoted comma is enough to settle on comma.
            return ',';
        }
        if c == ';' {
            saw_semicolon = true;
        }
    }

    if saw_semicolon {
        ';'
    } else {
        ','
    }
}
|
|
|
|
/// Quote a CSV field if it contains special characters.
///
/// The field is wrapped in double quotes when it contains a comma, a double
/// quote, a line feed (`\n`) or a carriage return (`\r`); internal double
/// quotes are escaped as `""`. Any other field is returned unchanged.
///
/// Carriage returns are included because RFC 4180 requires fields containing
/// line-break characters to be quoted, and many consumers treat a bare `\r`
/// as a row terminator.
fn csv_quote(field: &str) -> String {
    let needs_quoting = field.contains(|c: char| matches!(c, ',' | '"' | '\n' | '\r'));

    if needs_quoting {
        format!("\"{}\"", field.replace('"', "\"\""))
    } else {
        field.to_string()
    }
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;
    use chrono::Utc;
    use uuid::Uuid;

    /// Build a `Source` fixture with the given title and URL; every other
    /// field gets the same placeholder value the tests have always used.
    fn sample_source(title: &str, url: &str) -> Source {
        Source {
            id: Uuid::new_v4(),
            user_id: Uuid::new_v4(),
            title: title.into(),
            url: url.into(),
            theme_id: None,
            is_preferred: false,
            rss_url: None,
            rss_discovered_at: None,
            created_at: Utc::now(),
        }
    }

    // ---- parse_csv -------------------------------------------------------

    #[test]
    fn test_parse_csv_comma_separated() {
        let input = "title,url\nMy Blog,https://blog.example.com\nNews Site,https://news.example.com";
        let rows = parse_csv(input).unwrap();
        assert_eq!(rows.len(), 2);
        assert_eq!(rows[0].0, "My Blog");
        assert_eq!(rows[0].1, "https://blog.example.com");
        assert_eq!(rows[1].0, "News Site");
        assert_eq!(rows[1].1, "https://news.example.com");
    }

    #[test]
    fn test_parse_csv_semicolon_separated() {
        let input = "titre;lien\nMon Blog;https://blog.example.com\nActus;https://news.example.com";
        let rows = parse_csv(input).unwrap();
        assert_eq!(rows.len(), 2);
        assert_eq!(rows[0].0, "Mon Blog");
        assert_eq!(rows[0].1, "https://blog.example.com");
    }

    #[test]
    fn test_parse_csv_quoted_fields() {
        let input =
            "title,url\n\"My, Blog\",https://blog.example.com\n\"He said \"\"hi\"\"\",https://example.com";
        let rows = parse_csv(input).unwrap();
        assert_eq!(rows.len(), 2);
        assert_eq!(rows[0].0, "My, Blog");
        assert_eq!(rows[1].0, "He said \"hi\"");
    }

    #[test]
    fn test_parse_csv_header_skipping() {
        let rows = parse_csv("title,url\nBlog,https://example.com").unwrap();
        assert_eq!(rows.len(), 1);
        assert_eq!(rows[0].0, "Blog");
    }

    #[test]
    fn test_parse_csv_no_header() {
        let rows = parse_csv("Blog,https://example.com\nNews,https://news.com").unwrap();
        assert_eq!(rows.len(), 2);
    }

    #[test]
    fn test_parse_csv_empty_lines() {
        let input = "title,url\n\nBlog,https://example.com\n\n\nNews,https://news.com\n";
        let rows = parse_csv(input).unwrap();
        assert_eq!(rows.len(), 2);
    }

    #[test]
    fn test_parse_csv_utf8_bom() {
        let rows = parse_csv("\u{FEFF}title,url\nBlog,https://example.com").unwrap();
        assert_eq!(rows.len(), 1);
        assert_eq!(rows[0].0, "Blog");
    }

    #[test]
    fn test_parse_csv_empty_content() {
        let rows = parse_csv("").unwrap();
        assert!(rows.is_empty());
    }

    #[test]
    fn test_parse_csv_only_header() {
        let rows = parse_csv("title,url").unwrap();
        assert!(rows.is_empty());
    }

    #[test]
    fn test_parse_csv_malformed_single_field() {
        // Lines with a single field are dropped.
        let rows = parse_csv("Blog\nhttps://example.com").unwrap();
        assert!(rows.is_empty());
    }

    // ---- generate_csv ----------------------------------------------------

    #[test]
    fn test_generate_csv_basic() {
        let sources = vec![
            sample_source("My Blog", "https://blog.example.com"),
            sample_source("News", "https://news.example.com"),
        ];

        let csv = generate_csv(&sources);
        let lines: Vec<&str> = csv.lines().collect();
        assert_eq!(lines[0], "title,url");
        assert_eq!(lines[1], "My Blog,https://blog.example.com");
        assert_eq!(lines[2], "News,https://news.example.com");
    }

    #[test]
    fn test_generate_csv_with_special_chars() {
        let sources = vec![sample_source("Blog, with commas", "https://example.com")];

        let csv = generate_csv(&sources);
        let lines: Vec<&str> = csv.lines().collect();
        assert_eq!(lines[1], "\"Blog, with commas\",https://example.com");
    }

    #[test]
    fn test_generate_csv_empty() {
        let csv = generate_csv(&[]);
        assert_eq!(csv, "title,url\n");
    }

    #[test]
    fn test_generate_csv_roundtrip() {
        let sources = vec![
            sample_source("Simple Blog", "https://blog.example.com"),
            sample_source("News, Quotes \"here\"", "https://news.example.com"),
        ];

        let csv = generate_csv(&sources);
        let parsed = parse_csv(&csv).unwrap();

        assert_eq!(parsed.len(), 2);
        assert_eq!(parsed[0].0, "Simple Blog");
        assert_eq!(parsed[0].1, "https://blog.example.com");
        assert_eq!(parsed[1].0, "News, Quotes \"here\"");
        assert_eq!(parsed[1].1, "https://news.example.com");
    }

    // ---- detect_separator ------------------------------------------------

    #[test]
    fn test_detect_separator_comma() {
        assert_eq!(detect_separator("a,b,c"), ',');
    }

    #[test]
    fn test_detect_separator_semicolon() {
        assert_eq!(detect_separator("a;b;c"), ';');
    }

    #[test]
    fn test_detect_separator_mixed_prefers_comma() {
        // When both appear outside quotes, comma takes precedence.
        assert_eq!(detect_separator("a,b;c"), ',');
    }

    #[test]
    fn test_detect_separator_semicolons_with_commas_in_quotes() {
        // Quoted commas are not counted.
        assert_eq!(detect_separator("\"a,b\";c"), ';');
    }
}
|