Finished phase 2

master
oabrivard 3 months ago
parent a36e3732bf
commit 2b75dc7049

307
backend/Cargo.lock generated

@ -27,6 +27,7 @@ dependencies = [
"http-body-util", "http-body-util",
"rand", "rand",
"reqwest", "reqwest",
"scraper",
"serde", "serde",
"serde_json", "serde_json",
"sha2", "sha2",
@ -37,6 +38,7 @@ dependencies = [
"tower-http", "tower-http",
"tracing", "tracing",
"tracing-subscriber", "tracing-subscriber",
"url",
"uuid", "uuid",
] ]
@ -152,6 +154,7 @@ dependencies = [
"matchit", "matchit",
"memchr", "memchr",
"mime", "mime",
"multer",
"percent-encoding", "percent-encoding",
"pin-project-lite", "pin-project-lite",
"serde_core", "serde_core",
@ -410,6 +413,29 @@ dependencies = [
"typenum", "typenum",
] ]
[[package]]
name = "cssparser"
version = "0.34.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b7c66d1cd8ed61bf80b38432613a7a2f09401ab8d0501110655f8b341484a3e3"
dependencies = [
"cssparser-macros",
"dtoa-short",
"itoa",
"phf",
"smallvec",
]
[[package]]
name = "cssparser-macros"
version = "0.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "13b588ba4ac1a99f7f2964d24b3d896ddc6bf847ee3855dbd4366f058cfcd331"
dependencies = [
"quote",
"syn",
]
[[package]] [[package]]
name = "dashmap" name = "dashmap"
version = "6.1.0" version = "6.1.0"
@ -435,6 +461,17 @@ dependencies = [
"zeroize", "zeroize",
] ]
[[package]]
name = "derive_more"
version = "0.99.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6edb4b64a43d977b8e99788fe3a04d483834fba1215a7e02caa415b626497f7f"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]] [[package]]
name = "digest" name = "digest"
version = "0.10.7" version = "0.10.7"
@ -464,6 +501,27 @@ version = "0.15.7"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1aaf95b3e5c8f23aa320147307562d361db0ae0d51242340f558153b4eb2439b" checksum = "1aaf95b3e5c8f23aa320147307562d361db0ae0d51242340f558153b4eb2439b"
[[package]]
name = "dtoa"
version = "1.0.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4c3cf4824e2d5f025c7b531afcb2325364084a16806f6d47fbc1f5fbd9960590"
[[package]]
name = "dtoa-short"
version = "0.3.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cd1511a7b6a56299bd043a9c167a6d2bfb37bf84a6dfceaba651168adfb43c87"
dependencies = [
"dtoa",
]
[[package]]
name = "ego-tree"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b2972feb8dffe7bc8c5463b1dacda1b0dfbed3710e50f977d965429692d74cd8"
[[package]] [[package]]
name = "either" name = "either"
version = "1.15.0" version = "1.15.0"
@ -588,6 +646,16 @@ dependencies = [
"percent-encoding", "percent-encoding",
] ]
[[package]]
name = "futf"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843"
dependencies = [
"mac",
"new_debug_unreachable",
]
[[package]] [[package]]
name = "futures-channel" name = "futures-channel"
version = "0.3.32" version = "0.3.32"
@ -659,6 +727,15 @@ dependencies = [
"slab", "slab",
] ]
[[package]]
name = "fxhash"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c"
dependencies = [
"byteorder",
]
[[package]] [[package]]
name = "generic-array" name = "generic-array"
version = "0.14.7" version = "0.14.7"
@ -669,6 +746,15 @@ dependencies = [
"version_check", "version_check",
] ]
[[package]]
name = "getopts"
version = "0.2.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cfe4fbac503b8d1f88e6676011885f34b7174f46e59956bba534ba83abded4df"
dependencies = [
"unicode-width",
]
[[package]] [[package]]
name = "getrandom" name = "getrandom"
version = "0.2.17" version = "0.2.17"
@ -783,6 +869,18 @@ dependencies = [
"windows-sys 0.61.2", "windows-sys 0.61.2",
] ]
[[package]]
name = "html5ever"
version = "0.29.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3b7410cae13cbc75623c98ac4cbfd1f0bedddf3227afc24f370cf0f50a44a11c"
dependencies = [
"log",
"mac",
"markup5ever",
"match_token",
]
[[package]] [[package]]
name = "http" name = "http"
version = "1.4.0" version = "1.4.0"
@ -1172,6 +1270,37 @@ version = "0.4.29"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897"
[[package]]
name = "mac"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4"
[[package]]
name = "markup5ever"
version = "0.14.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c7a7213d12e1864c0f002f52c2923d4556935a43dec5e71355c2760e0f6e7a18"
dependencies = [
"log",
"phf",
"phf_codegen",
"string_cache",
"string_cache_codegen",
"tendril",
]
[[package]]
name = "match_token"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "88a9689d8d44bf9964484516275f5cd4c9b59457a6940c1d5d0ecbb94510a36b"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]] [[package]]
name = "matchers" name = "matchers"
version = "0.2.0" version = "0.2.0"
@ -1230,6 +1359,23 @@ dependencies = [
"windows-sys 0.61.2", "windows-sys 0.61.2",
] ]
[[package]]
name = "multer"
version = "3.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "83e87776546dc87511aa5ee218730c92b666d7264ab6ed41f9d215af9cd5224b"
dependencies = [
"bytes",
"encoding_rs",
"futures-util",
"http",
"httparse",
"memchr",
"mime",
"spin",
"version_check",
]
[[package]] [[package]]
name = "native-tls" name = "native-tls"
version = "0.2.18" version = "0.2.18"
@ -1247,6 +1393,12 @@ dependencies = [
"tempfile", "tempfile",
] ]
[[package]]
name = "new_debug_unreachable"
version = "1.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086"
[[package]] [[package]]
name = "nu-ansi-term" name = "nu-ansi-term"
version = "0.50.3" version = "0.50.3"
@ -1402,6 +1554,58 @@ version = "2.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220" checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220"
[[package]]
name = "phf"
version = "0.11.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078"
dependencies = [
"phf_macros",
"phf_shared",
]
[[package]]
name = "phf_codegen"
version = "0.11.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aef8048c789fa5e851558d709946d6d79a8ff88c0440c587967f8e94bfb1216a"
dependencies = [
"phf_generator",
"phf_shared",
]
[[package]]
name = "phf_generator"
version = "0.11.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d"
dependencies = [
"phf_shared",
"rand",
]
[[package]]
name = "phf_macros"
version = "0.11.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f84ac04429c13a7ff43785d75ad27569f2951ce0ffd30a3321230db2fc727216"
dependencies = [
"phf_generator",
"phf_shared",
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "phf_shared"
version = "0.11.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "67eabc2ef2a60eb7faa00097bd1ffdb5bd28e62bf39990626a582201b7a754e5"
dependencies = [
"siphasher",
]
[[package]] [[package]]
name = "pin-project-lite" name = "pin-project-lite"
version = "0.2.17" version = "0.2.17"
@ -1465,6 +1669,12 @@ dependencies = [
"zerocopy", "zerocopy",
] ]
[[package]]
name = "precomputed-hash"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c"
[[package]] [[package]]
name = "prettyplease" name = "prettyplease"
version = "0.2.37" version = "0.2.37"
@ -1712,6 +1922,21 @@ version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
[[package]]
name = "scraper"
version = "0.22.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cc3d051b884f40e309de6c149734eab57aa8cc1347992710dc80bcc1c2194c15"
dependencies = [
"cssparser",
"ego-tree",
"getopts",
"html5ever",
"precomputed-hash",
"selectors",
"tendril",
]
[[package]] [[package]]
name = "security-framework" name = "security-framework"
version = "3.7.0" version = "3.7.0"
@ -1735,6 +1960,25 @@ dependencies = [
"libc", "libc",
] ]
[[package]]
name = "selectors"
version = "0.26.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fd568a4c9bb598e291a08244a5c1f5a8a6650bee243b5b0f8dbb3d9cc1d87fe8"
dependencies = [
"bitflags",
"cssparser",
"derive_more",
"fxhash",
"log",
"new_debug_unreachable",
"phf",
"phf_codegen",
"precomputed-hash",
"servo_arc",
"smallvec",
]
[[package]] [[package]]
name = "semver" name = "semver"
version = "1.0.27" version = "1.0.27"
@ -1807,6 +2051,15 @@ dependencies = [
"serde", "serde",
] ]
[[package]]
name = "servo_arc"
version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "170fb83ab34de17dc69aa7c67482b22218ddb85da56546f9bd6b929e32a05930"
dependencies = [
"stable_deref_trait",
]
[[package]] [[package]]
name = "sha1" name = "sha1"
version = "0.10.6" version = "0.10.6"
@ -1864,6 +2117,12 @@ dependencies = [
"rand_core", "rand_core",
] ]
[[package]]
name = "siphasher"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b2aa850e253778c88a04c3d7323b043aeda9d3e30d5971937c1855769763678e"
[[package]] [[package]]
name = "slab" name = "slab"
version = "0.4.12" version = "0.4.12"
@ -2112,6 +2371,31 @@ version = "1.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596"
[[package]]
name = "string_cache"
version = "0.8.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bf776ba3fa74f83bf4b63c3dcbbf82173db2632ed8452cb2d891d33f459de70f"
dependencies = [
"new_debug_unreachable",
"parking_lot",
"phf_shared",
"precomputed-hash",
"serde",
]
[[package]]
name = "string_cache_codegen"
version = "0.5.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c711928715f1fe0fe509c53b43e993a9a557babc2d0a3567d0a3006f1ac931a0"
dependencies = [
"phf_generator",
"phf_shared",
"proc-macro2",
"quote",
]
[[package]] [[package]]
name = "stringprep" name = "stringprep"
version = "0.1.5" version = "0.1.5"
@ -2200,6 +2484,17 @@ dependencies = [
"windows-sys 0.61.2", "windows-sys 0.61.2",
] ]
[[package]]
name = "tendril"
version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d24a120c5fc464a3458240ee02c299ebcb9d67b5249c8848b09d639dca8d7bb0"
dependencies = [
"futf",
"mac",
"utf-8",
]
[[package]] [[package]]
name = "thiserror" name = "thiserror"
version = "2.0.18" version = "2.0.18"
@ -2502,6 +2797,12 @@ version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7df058c713841ad818f1dc5d3fd88063241cc61f49f5fbea4b951e8cf5a8d71d" checksum = "7df058c713841ad818f1dc5d3fd88063241cc61f49f5fbea4b951e8cf5a8d71d"
[[package]]
name = "unicode-width"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254"
[[package]] [[package]]
name = "unicode-xid" name = "unicode-xid"
version = "0.2.6" version = "0.2.6"
@ -2526,6 +2827,12 @@ dependencies = [
"serde", "serde",
] ]
[[package]]
name = "utf-8"
version = "0.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9"
[[package]] [[package]]
name = "utf8_iter" name = "utf8_iter"
version = "1.0.4" version = "1.0.4"

@ -14,7 +14,7 @@ path = "src/main.rs"
[dependencies] [dependencies]
# Web framework # Web framework
axum = { version = "0.8", features = ["macros"] } axum = { version = "0.8", features = ["macros", "multipart"] }
tower = { version = "0.5", features = ["util", "timeout"] } tower = { version = "0.5", features = ["util", "timeout"] }
tower-http = { version = "0.6", features = ["fs", "cors", "trace", "set-header"] } tower-http = { version = "0.6", features = ["fs", "cors", "trace", "set-header"] }
tokio = { version = "1", features = ["full"] } tokio = { version = "1", features = ["full"] }
@ -46,6 +46,12 @@ tracing-subscriber = { version = "0.3", features = ["env-filter", "json"] }
dotenvy = "0.15" dotenvy = "0.15"
clap = { version = "4", features = ["derive"] } clap = { version = "4", features = ["derive"] }
# HTML parsing (scraper service)
scraper = "0.22"
# URL parsing (scraper SSRF checks)
url = "2"
# Email validation # Email validation
email_address = "0.2" email_address = "0.2"

@ -0,0 +1,14 @@
-- Create the sources table.
-- Each user can save custom news sources (URLs) for their syntheses.
-- A unique constraint on (user_id, url) prevents duplicate URLs per user.
--
-- Column notes:
--   * user_id    : owner of the row; rows are removed automatically when the
--                  referenced users row is deleted (ON DELETE CASCADE).
--   * title      : length is checked in characters (char_length), 1..200.
--   * url        : length is checked in characters (char_length), at most 1000.
--   * created_at : defaults to the insertion time.
CREATE TABLE sources (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    user_id UUID NOT NULL REFERENCES users(id) ON DELETE CASCADE,
    title VARCHAR(200) NOT NULL CHECK (char_length(title) BETWEEN 1 AND 200),
    url VARCHAR(1000) NOT NULL CHECK (char_length(url) <= 1000),
    created_at TIMESTAMPTZ NOT NULL DEFAULT now()
);
-- Lookup index for per-user queries.
-- NOTE(review): the unique index below also has user_id as its leading column,
-- so this single-column index may be redundant -- confirm before keeping both.
CREATE INDEX idx_sources_user_id ON sources(user_id);
-- Enforces "one row per (user, url)"; this is the conflict target used by
-- inserts written as ON CONFLICT (user_id, url) DO NOTHING.
CREATE UNIQUE INDEX idx_sources_user_id_url ON sources(user_id, url);

@ -1,4 +1,5 @@
pub mod magic_links; pub mod magic_links;
pub mod sessions; pub mod sessions;
pub mod settings; pub mod settings;
pub mod sources;
pub mod users; pub mod users;

@ -0,0 +1,122 @@
//! Database queries for the `sources` table.
//!
//! All queries enforce ownership isolation by including `WHERE user_id = $N`
//! to ensure users can only access their own sources.
use sqlx::PgPool;
use uuid::Uuid;
use crate::errors::AppError;
use crate::models::source::Source;
/// Fetch every source owned by `user_id`, most recently created first.
///
/// The query filters on `user_id`, so only that user's rows are returned.
/// Any error from the query is converted into `AppError` by `?`.
pub async fn list_for_user(pool: &PgPool, user_id: Uuid) -> Result<Vec<Source>, AppError> {
    const LIST_SQL: &str = r#"
SELECT id, user_id, title, url, created_at
FROM sources
WHERE user_id = $1
ORDER BY created_at DESC
"#;

    let query = sqlx::query_as::<_, Source>(LIST_SQL).bind(user_id);
    let rows = query.fetch_all(pool).await?;
    Ok(rows)
}
/// Insert one source row for `user_id` and return the stored record.
///
/// No validation happens here: the caller must have checked `title` and
/// `url` beforehand. Any error from the insert is converted into
/// `AppError` by `?`.
pub async fn create(
    pool: &PgPool,
    user_id: Uuid,
    title: &str,
    url: &str,
) -> Result<Source, AppError> {
    const INSERT_SQL: &str = r#"
INSERT INTO sources (user_id, title, url)
VALUES ($1, $2, $3)
RETURNING id, user_id, title, url, created_at
"#;

    let insert = sqlx::query_as::<_, Source>(INSERT_SQL)
        .bind(user_id)
        .bind(title)
        .bind(url);
    let inserted = insert.fetch_one(pool).await?;
    Ok(inserted)
}
/// Remove the source `id`, provided it is owned by `user_id`.
///
/// Returns `true` when a row was removed and `false` when nothing matched,
/// which covers both an unknown ID and an ID owned by another user.
pub async fn delete(pool: &PgPool, id: Uuid, user_id: Uuid) -> Result<bool, AppError> {
    const DELETE_SQL: &str = r#"
DELETE FROM sources
WHERE id = $1 AND user_id = $2
"#;

    let outcome = sqlx::query(DELETE_SQL)
        .bind(id)
        .bind(user_id)
        .execute(pool)
        .await?;
    let removed_rows = outcome.rows_affected();
    Ok(removed_rows != 0)
}
/// Bulk-create sources for a user, skipping duplicates.
///
/// Each `(title, url)` pair is inserted with
/// `ON CONFLICT (user_id, url) DO NOTHING`, so URLs the user already has are
/// silently skipped. Returns only the newly inserted sources, in input order.
///
/// All inserts run inside a single transaction: if any insert fails, the
/// transaction is dropped without being committed and none of the batch is
/// persisted, so the caller never ends up with a half-applied import behind
/// an error response. The caller is responsible for validating every pair
/// before calling this function.
pub async fn bulk_create(
    pool: &PgPool,
    user_id: Uuid,
    sources: &[(String, String)],
) -> Result<Vec<Source>, AppError> {
    const INSERT_SQL: &str = r#"
INSERT INTO sources (user_id, title, url)
VALUES ($1, $2, $3)
ON CONFLICT (user_id, url) DO NOTHING
RETURNING id, user_id, title, url, created_at
"#;

    // Nothing to insert: avoid opening a transaction at all.
    if sources.is_empty() {
        return Ok(Vec::new());
    }

    let mut tx = pool.begin().await?;
    let mut created = Vec::with_capacity(sources.len());

    for (title, url) in sources {
        // `fetch_optional` yields `None` when the conflict clause skipped the row.
        let inserted = sqlx::query_as::<_, Source>(INSERT_SQL)
            .bind(user_id)
            .bind(title.as_str())
            .bind(url.as_str())
            .fetch_optional(&mut *tx)
            .await?;

        if let Some(source) = inserted {
            created.push(source);
        }
    }

    // Make the whole batch visible at once.
    tx.commit().await?;

    Ok(created)
}
/// Return how many sources `user_id` currently owns.
///
/// Callers use this to enforce the per-user source limit (max 100 sources).
pub async fn count_for_user(pool: &PgPool, user_id: Uuid) -> Result<i64, AppError> {
    const COUNT_SQL: &str = r#"
SELECT COUNT(*) FROM sources WHERE user_id = $1
"#;

    // The query yields a single one-column row; unpack it directly.
    let (total,): (i64,) = sqlx::query_as(COUNT_SQL)
        .bind(user_id)
        .fetch_one(pool)
        .await?;
    Ok(total)
}

@ -1,3 +1,4 @@
pub mod auth; pub mod auth;
pub mod health; pub mod health;
pub mod settings; pub mod settings;
pub mod sources;

@ -0,0 +1,247 @@
//! Sources handlers.
//!
//! - `GET /api/v1/sources` — list user's sources
//! - `POST /api/v1/sources` — add a single source
//! - `DELETE /api/v1/sources/:id` — delete a source (ownership check)
//! - `POST /api/v1/sources/bulk` — bulk import from JSON array
//! - `POST /api/v1/sources/import-csv` — import from CSV file upload
//! - `GET /api/v1/sources/export-csv` — download sources as CSV
use axum::extract::{Multipart, Path, State};
use axum::http::StatusCode;
use axum::response::IntoResponse;
use axum::Json;
use uuid::Uuid;
use crate::app_state::AppState;
use crate::db;
use crate::errors::AppError;
use crate::middleware::auth::AuthUser;
use crate::models::source::{
BulkImportRequest, BulkImportResponse, CreateSourceRequest, SourceResponse,
};
use crate::services::csv as csv_service;
/// Maximum number of sources a user can have.
///
/// Compared against the count returned by `db::sources::count_for_user` in
/// the single-create, bulk-import, and CSV-import handlers. It is typed
/// `i64` so it can be compared with that count without a cast.
const MAX_SOURCES_PER_USER: i64 = 100;
/// `GET /api/v1/sources`
///
/// Responds with a JSON array of the authenticated user's sources,
/// newest first (the ordering comes from the database query).
pub async fn list(
    auth_user: AuthUser,
    State(state): State<AppState>,
) -> Result<impl IntoResponse, AppError> {
    let rows = db::sources::list_for_user(&state.pool, auth_user.id).await?;

    // Convert each database record into its public response shape.
    let mut payload: Vec<SourceResponse> = Vec::with_capacity(rows.len());
    for row in rows {
        payload.push(SourceResponse::from(row));
    }

    Ok(Json(payload))
}
/// `POST /api/v1/sources`
///
/// Adds one source for the authenticated user. The request body is
/// validated first, then the per-user limit is checked; on success the
/// handler answers `201 Created` with the stored source.
pub async fn create(
    auth_user: AuthUser,
    State(state): State<AppState>,
    Json(body): Json<CreateSourceRequest>,
) -> Result<impl IntoResponse, AppError> {
    // Reject malformed title/URL before touching the database.
    if let Err(reason) = body.validate() {
        return Err(AppError::Validation(reason));
    }

    // Refuse the insert when the user is already at the cap.
    let existing = db::sources::count_for_user(&state.pool, auth_user.id).await?;
    if existing >= MAX_SOURCES_PER_USER {
        let message = format!(
            "Maximum of {} sources per user reached",
            MAX_SOURCES_PER_USER
        );
        return Err(AppError::Validation(message));
    }

    let stored = db::sources::create(&state.pool, auth_user.id, &body.title, &body.url).await?;
    tracing::info!(user_id = %auth_user.id, source_id = %stored.id, "Source created");

    let payload = Json(SourceResponse::from(stored));
    Ok((StatusCode::CREATED, payload))
}
/// `DELETE /api/v1/sources/:id`
///
/// Removes the source with the given ID. A missing source and a source
/// owned by someone else both produce 404 (never 403), so the response does
/// not reveal whether another user's source exists.
pub async fn delete(
    auth_user: AuthUser,
    State(state): State<AppState>,
    Path(id): Path<Uuid>,
) -> Result<impl IntoResponse, AppError> {
    let was_removed = db::sources::delete(&state.pool, id, auth_user.id).await?;

    if was_removed {
        tracing::info!(user_id = %auth_user.id, source_id = %id, "Source deleted");
        Ok(StatusCode::NO_CONTENT)
    } else {
        Err(AppError::NotFound("Source not found".into()))
    }
}
/// `POST /api/v1/sources/bulk`
///
/// Imports many sources from a JSON array. Every entry is validated
/// individually; invalid entries are reported by 1-based row number, entries
/// beyond the user's remaining capacity are dropped, and URLs the user
/// already has are counted as skipped. Responds with a summary.
pub async fn bulk_import(
    auth_user: AuthUser,
    State(state): State<AppState>,
    Json(body): Json<BulkImportRequest>,
) -> Result<impl IntoResponse, AppError> {
    if body.sources.is_empty() {
        return Err(AppError::Validation("No sources provided".into()));
    }

    // Number of sources the user already owns, used for the capacity check below.
    let current_count = db::sources::count_for_user(&state.pool, auth_user.id).await?;

    // Split the entries into importable pairs and per-row error messages.
    let mut errors: Vec<String> = Vec::new();
    let mut valid_sources: Vec<(String, String)> = Vec::new();
    for (index, candidate) in body.sources.iter().enumerate() {
        match candidate.validate() {
            Ok(()) => valid_sources.push((candidate.title.clone(), candidate.url.clone())),
            Err(msg) => errors.push(format!("Row {}: {}", index + 1, msg)),
        }
    }

    // Keep only as many entries as still fit under the per-user limit.
    let remaining_capacity = (MAX_SOURCES_PER_USER - current_count).max(0) as usize;
    if remaining_capacity < valid_sources.len() {
        valid_sources.truncate(remaining_capacity);
        errors.push(format!(
            "Only {} sources could be imported (limit of {} reached)",
            remaining_capacity, MAX_SOURCES_PER_USER
        ));
    }

    let imported = db::sources::bulk_create(&state.pool, auth_user.id, &valid_sources)
        .await?
        .len();
    // Whatever was attempted but not inserted was a duplicate URL.
    let skipped = valid_sources.len() - imported;

    tracing::info!(
        user_id = %auth_user.id,
        imported = imported,
        skipped = skipped,
        errors = errors.len(),
        "Bulk import completed"
    );

    let summary = BulkImportResponse {
        imported,
        skipped,
        errors,
    };
    Ok(Json(summary))
}
/// `POST /api/v1/sources/import-csv`
///
/// Imports sources from a CSV file sent as multipart form data. Only the
/// first multipart field is read. The CSV text is parsed into
/// `(title, url)` rows, each row is validated, rows beyond the user's
/// remaining capacity are dropped, duplicates are skipped, and a summary
/// is returned.
pub async fn import_csv(
    auth_user: AuthUser,
    State(state): State<AppState>,
    mut multipart: Multipart,
) -> Result<impl IntoResponse, AppError> {
    use crate::models::source::{validate_title, validate_url};

    // Take the first field of the upload; anything after it is ignored.
    let upload = match multipart.next_field().await {
        Ok(Some(field)) => field,
        Ok(None) => {
            return Err(AppError::BadRequest(
                "No file field found in upload".into(),
            ))
        }
        Err(e) => {
            return Err(AppError::BadRequest(format!(
                "Failed to read multipart field: {}",
                e
            )))
        }
    };

    let content = match upload.text().await {
        Ok(text) => text,
        Err(e) => {
            return Err(AppError::BadRequest(format!(
                "Failed to read file content: {}",
                e
            )))
        }
    };

    // Turn the raw CSV text into (title, url) pairs.
    let parsed = csv_service::parse_csv(&content)?;
    if parsed.is_empty() {
        return Err(AppError::Validation(
            "No valid rows found in CSV file".into(),
        ));
    }

    let current_count = db::sources::count_for_user(&state.pool, auth_user.id).await?;

    // Validate title first, then URL; only the first failure per row is reported.
    let mut errors: Vec<String> = Vec::new();
    let mut valid_sources: Vec<(String, String)> = Vec::new();
    for (index, (title, url)) in parsed.iter().enumerate() {
        let verdict = validate_title(title).and_then(|()| validate_url(url));
        match verdict {
            Ok(()) => valid_sources.push((title.clone(), url.clone())),
            Err(msg) => errors.push(format!("Row {}: {}", index + 1, msg)),
        }
    }

    // Keep only as many rows as still fit under the per-user limit.
    let remaining_capacity = (MAX_SOURCES_PER_USER - current_count).max(0) as usize;
    if remaining_capacity < valid_sources.len() {
        valid_sources.truncate(remaining_capacity);
        errors.push(format!(
            "Only {} sources could be imported (limit of {} reached)",
            remaining_capacity, MAX_SOURCES_PER_USER
        ));
    }

    let imported = db::sources::bulk_create(&state.pool, auth_user.id, &valid_sources)
        .await?
        .len();
    // Whatever was attempted but not inserted was a duplicate URL.
    let skipped = valid_sources.len() - imported;

    tracing::info!(
        user_id = %auth_user.id,
        imported = imported,
        skipped = skipped,
        errors = errors.len(),
        "CSV import completed"
    );

    let summary = BulkImportResponse {
        imported,
        skipped,
        errors,
    };
    Ok(Json(summary))
}
/// `GET /api/v1/sources/export-csv`
///
/// Sends the authenticated user's sources back as a downloadable CSV file
/// named `sources.csv`, with `Content-Type` and `Content-Disposition` set
/// accordingly.
pub async fn export_csv(
    auth_user: AuthUser,
    State(state): State<AppState>,
) -> Result<impl IntoResponse, AppError> {
    use axum::http::header::{CONTENT_DISPOSITION, CONTENT_TYPE};

    let rows = db::sources::list_for_user(&state.pool, auth_user.id).await?;
    let csv_body = csv_service::generate_csv(&rows);

    let headers = [
        (CONTENT_TYPE, "text/csv; charset=utf-8"),
        (CONTENT_DISPOSITION, "attachment; filename=\"sources.csv\""),
    ];

    Ok((StatusCode::OK, headers, csv_body))
}

@ -1,4 +1,5 @@
pub mod magic_link; pub mod magic_link;
pub mod session; pub mod session;
pub mod settings; pub mod settings;
pub mod source;
pub mod user; pub mod user;

@ -0,0 +1,223 @@
//! Source model and request/response types.
//!
//! Sources represent user-curated URLs (blogs, news sites, etc.)
//! that the AI should prioritize during synthesis generation.
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
use uuid::Uuid;
/// A source record from the database.
///
/// Derives `sqlx::FromRow`, so it can be loaded directly from queries that
/// select `id, user_id, title, url, created_at` from the `sources` table.
#[derive(Debug, Clone, Serialize, sqlx::FromRow)]
pub struct Source {
    /// Primary key of the source row.
    pub id: Uuid,
    /// ID of the user who owns this source.
    pub user_id: Uuid,
    /// Title stored for the source.
    pub title: String,
    /// URL stored for the source.
    pub url: String,
    /// Creation timestamp of the row.
    pub created_at: DateTime<Utc>,
}
/// Response shape for source endpoints.
///
/// Carries the same fields as `Source` except `user_id`, which is not
/// included in responses.
#[derive(Debug, Serialize)]
pub struct SourceResponse {
    /// Primary key of the source.
    pub id: Uuid,
    /// Title stored for the source.
    pub title: String,
    /// URL stored for the source.
    pub url: String,
    /// Creation timestamp of the source.
    pub created_at: DateTime<Utc>,
}
impl From<Source> for SourceResponse {
fn from(s: Source) -> Self {
Self {
id: s.id,
title: s.title,
url: s.url,
created_at: s.created_at,
}
}
}
/// Request body for `POST /api/v1/sources`.
///
/// Also used as the element type of `BulkImportRequest::sources`.
/// Deserialization performs no checks; call `validate` before using the values.
#[derive(Debug, Deserialize)]
pub struct CreateSourceRequest {
    /// Title for the source, as submitted by the client.
    pub title: String,
    /// URL of the source, as submitted by the client.
    pub url: String,
}
impl CreateSourceRequest {
    /// Check that this request is acceptable.
    ///
    /// The title is checked first and the URL second; checking stops at the
    /// first failure. Returns `Ok(())` when both pass, otherwise
    /// `Err(message)` describing the first problem found.
    pub fn validate(&self) -> Result<(), String> {
        validate_title(&self.title).and_then(|()| validate_url(&self.url))
    }
}
/// Request body for `POST /api/v1/sources/bulk`.
#[derive(Debug, Deserialize)]
pub struct BulkImportRequest {
    /// Entries to import; each one has the same shape as a single-create request.
    pub sources: Vec<CreateSourceRequest>,
}
/// Response for bulk import operations (JSON and CSV).
#[derive(Debug, Serialize)]
pub struct BulkImportResponse {
    /// Number of sources that were newly inserted.
    pub imported: usize,
    /// Number of valid entries that were not inserted because the URL
    /// already existed for the user.
    pub skipped: usize,
    /// Human-readable problems: per-row validation failures and, when
    /// applicable, a note that the per-user limit cut the import short.
    pub errors: Vec<String>,
}
/// Validate a source title.
///
/// Must be non-empty (after trimming) and at most 200 characters.
///
/// Length is measured in Unicode characters, not UTF-8 bytes, so that the
/// limit agrees with the `char_length(title)` check on the `sources` table;
/// a title made of 200 accented or non-Latin characters is accepted.
///
/// Returns `Ok(())` when the title is acceptable, otherwise `Err(message)`
/// describing the first rule that failed.
pub fn validate_title(title: &str) -> Result<(), String> {
    const MAX_TITLE_CHARS: usize = 200;

    if title.trim().is_empty() {
        return Err("Title cannot be empty".into());
    }
    // `str::len` would count bytes and wrongly reject multi-byte titles.
    if title.chars().count() > MAX_TITLE_CHARS {
        return Err("Title must be at most 200 characters".into());
    }
    Ok(())
}
/// Validate a source URL.
///
/// Must start with `http://` or `https://` and be at most 1000 characters.
///
/// Length is measured in Unicode characters, not UTF-8 bytes, so that the
/// limit agrees with the `char_length(url) <= 1000` check on the `sources`
/// table. The scheme prefix is matched exactly as written (lowercase) and
/// against the untrimmed input, so leading whitespace causes a rejection.
///
/// Returns `Ok(())` when the URL is acceptable, otherwise `Err(message)`
/// describing the first rule that failed.
pub fn validate_url(url: &str) -> Result<(), String> {
    const MAX_URL_CHARS: usize = 1000;

    if url.trim().is_empty() {
        return Err("URL cannot be empty".into());
    }
    // `str::len` would count bytes and wrongly reject URLs with multi-byte characters.
    if url.chars().count() > MAX_URL_CHARS {
        return Err("URL must be at most 1000 characters".into());
    }
    if !url.starts_with("http://") && !url.starts_with("https://") {
        return Err("URL must start with http:// or https://".into());
    }
    Ok(())
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a request from borrowed parts so each test reads as one line.
    fn request(title: &str, url: &str) -> CreateSourceRequest {
        CreateSourceRequest {
            title: title.to_string(),
            url: url.to_string(),
        }
    }

    /// Validate the request and return the rejection message
    /// (panics if the request is unexpectedly accepted).
    fn rejection(title: &str, url: &str) -> String {
        request(title, url).validate().unwrap_err()
    }

    #[test]
    fn test_valid_source_request() {
        assert!(request("My Blog", "https://example.com").validate().is_ok());
    }

    #[test]
    fn test_empty_title() {
        let message = rejection(" ", "https://example.com");
        assert!(message.contains("Title"));
    }

    #[test]
    fn test_title_too_long() {
        let oversized = "a".repeat(201);
        let message = rejection(&oversized, "https://example.com");
        assert!(message.contains("200"));
    }

    #[test]
    fn test_empty_url() {
        let message = rejection("Blog", "");
        assert!(message.contains("URL"));
    }

    #[test]
    fn test_url_too_long() {
        let oversized = format!("https://example.com/{}", "a".repeat(990));
        let message = rejection("Blog", &oversized);
        assert!(message.contains("1000"));
    }

    #[test]
    fn test_url_invalid_scheme_ftp() {
        let message = rejection("Blog", "ftp://example.com");
        assert!(message.contains("http"));
    }

    #[test]
    fn test_url_invalid_scheme_javascript() {
        let message = rejection("Blog", "javascript:alert(1)");
        assert!(message.contains("http"));
    }

    #[test]
    fn test_url_no_scheme() {
        let message = rejection("Blog", "example.com");
        assert!(message.contains("http"));
    }

    #[test]
    fn test_valid_http_url() {
        assert!(request("Blog", "http://example.com").validate().is_ok());
    }

    #[test]
    fn test_valid_https_url() {
        let outcome = request("Blog", "https://example.com/path?query=1").validate();
        assert!(outcome.is_ok());
    }

    #[test]
    fn test_title_exactly_200_chars() {
        let boundary = "a".repeat(200);
        assert!(request(&boundary, "https://example.com").validate().is_ok());
    }

    #[test]
    fn test_url_exactly_1000_chars() {
        // 20-character prefix + 980 filler characters = exactly the limit.
        let boundary = format!("https://example.com/{}", "a".repeat(980));
        assert_eq!(boundary.len(), 1000);
        assert!(request("Blog", &boundary).validate().is_ok());
    }
}

@ -11,7 +11,7 @@ use axum::extract::DefaultBodyLimit;
use axum::http::header::{HeaderName, HeaderValue, ACCEPT, AUTHORIZATION, CONTENT_TYPE}; use axum::http::header::{HeaderName, HeaderValue, ACCEPT, AUTHORIZATION, CONTENT_TYPE};
use axum::http::Method; use axum::http::Method;
use axum::middleware as axum_mw; use axum::middleware as axum_mw;
use axum::routing::{get, post, put}; use axum::routing::{delete, get, post, put};
use axum::Router; use axum::Router;
use tower_http::cors::CorsLayer; use tower_http::cors::CorsLayer;
use tower_http::set_header::SetResponseHeaderLayer; use tower_http::set_header::SetResponseHeaderLayer;
@ -37,6 +37,13 @@ pub fn build_router(state: AppState, config: &AppConfig) -> Router {
// Settings routes (authenticated) // Settings routes (authenticated)
.route("/settings", get(handlers::settings::get_settings)) .route("/settings", get(handlers::settings::get_settings))
.route("/settings", put(handlers::settings::update_settings)) .route("/settings", put(handlers::settings::update_settings))
// Sources routes (authenticated)
.route("/sources", get(handlers::sources::list))
.route("/sources", post(handlers::sources::create))
.route("/sources/{id}", delete(handlers::sources::delete))
.route("/sources/bulk", post(handlers::sources::bulk_import))
.route("/sources/import-csv", post(handlers::sources::import_csv))
.route("/sources/export-csv", get(handlers::sources::export_csv))
// Health check (public) // Health check (public)
.route("/health", get(handlers::health::health_check)) .route("/health", get(handlers::health::health_check))
// Apply CSRF middleware to all API routes // Apply CSRF middleware to all API routes
@ -115,7 +122,7 @@ fn build_cors_layer(config: &AppConfig) -> CorsLayer {
CorsLayer::new() CorsLayer::new()
.allow_origin(origin) .allow_origin(origin)
.allow_methods([Method::GET, Method::POST, Method::PUT]) .allow_methods([Method::GET, Method::POST, Method::PUT, Method::DELETE])
.allow_headers([ .allow_headers([
CONTENT_TYPE, CONTENT_TYPE,
ACCEPT, ACCEPT,

@ -0,0 +1,351 @@
//! CSV parsing and generation utilities for source import/export.
//!
//! Handles common real-world CSV quirks: BOM, mixed separators
//! (comma and semicolon), quoted fields, header rows, and blank lines.
use crate::errors::AppError;
use crate::models::source::Source;
/// Turn raw CSV text into a list of `(title, url)` pairs.
///
/// Processing steps:
/// - a leading UTF-8 BOM is removed;
/// - the text is split with [`str::lines`], so `\n` and `\r\n` both work;
/// - the first line is dropped when [`is_header_line`] flags it;
/// - every remaining line is trimmed and blank lines are ignored;
/// - fields are split by [`parse_csv_line`] (`,` or `;`, chosen per line,
///   with support for double-quoted fields and `""` escapes);
/// - rows with fewer than two fields, or whose title or URL is empty after
///   trimming, are skipped silently.
///
/// Only the first two fields of a row are used. The current implementation
/// never returns `Err`.
pub fn parse_csv(content: &str) -> Result<Vec<(String, String)>, AppError> {
    // Drop the byte-order mark some spreadsheet tools prepend.
    let content = content.strip_prefix('\u{FEFF}').unwrap_or(content);

    let mut rows = content.lines().peekable();

    // Only the very first physical line may be a header.
    if rows.peek().map_or(false, |first| is_header_line(first)) {
        rows.next();
    }

    let pairs = rows
        .map(str::trim)
        .filter(|row| !row.is_empty())
        .filter_map(|row| {
            let cells = parse_csv_line(row);
            // Missing second column => malformed row, skip it.
            let title = cells.first()?.trim();
            let url = cells.get(1)?.trim();
            if title.is_empty() || url.is_empty() {
                None
            } else {
                Some((title.to_string(), url.to_string()))
            }
        })
        .collect();

    Ok(pairs)
}
/// Serialize sources as CSV text.
///
/// The output always starts with the header line `title,url`, followed by
/// one `title,url` line per source in input order. Each value goes through
/// [`csv_quote`], and every line (header included) ends with `\n`.
pub fn generate_csv(sources: &[Source]) -> String {
    let header = std::iter::once(String::from("title,url\n"));
    let rows = sources
        .iter()
        .map(|entry| format!("{},{}\n", csv_quote(&entry.title), csv_quote(&entry.url)));

    header.chain(rows).collect()
}
/// Detect whether a line looks like a CSV header row.
///
/// A line is treated as a header when its lowercase form contains one of the
/// common column names ("title", "url", "name", "link", "source", "adresse",
/// "titre", "lien") **and** it does not carry a URL.
///
/// The URL guard matters because keywords are matched as substrings: without
/// it, a first data row such as `Open Source News,https://example.com` or
/// `My Blog,https://example.com/links` would be mistaken for a header and
/// silently dropped from the import. Any line containing `://` is therefore
/// considered data.
fn is_header_line(line: &str) -> bool {
    const HEADER_KEYWORDS: [&str; 8] = [
        "title", "url", "name", "link", "source", "adresse", "titre", "lien",
    ];

    let lower = line.to_lowercase();

    // A row that already contains a URL is data, never a header.
    if lower.contains("://") {
        return false;
    }

    HEADER_KEYWORDS
        .iter()
        .any(|keyword| lower.contains(keyword))
}
/// Split one CSV line into its fields.
///
/// The separator (`,` or `;`) is chosen for this line by
/// [`detect_separator`]. Double-quoted sections may contain the separator,
/// and a doubled quote (`""`) inside a quoted section yields a literal `"`.
/// The quote characters themselves are not part of the returned fields.
///
/// The result always contains at least one field (possibly empty).
fn parse_csv_line(line: &str) -> Vec<String> {
    let sep = detect_separator(line);

    let mut out: Vec<String> = Vec::new();
    let mut buf = String::new();
    let mut quoted = false;
    let mut it = line.chars().peekable();

    while let Some(c) = it.next() {
        match (quoted, c) {
            (true, '"') => {
                if it.peek() == Some(&'"') {
                    // `""` inside quotes is an escaped quote character.
                    it.next();
                    buf.push('"');
                } else {
                    // Closing quote.
                    quoted = false;
                }
            }
            (true, other) => buf.push(other),
            // Opening quote.
            (false, '"') => quoted = true,
            // Field boundary: hand over the buffer and start a fresh one.
            (false, other) if other == sep => out.push(std::mem::take(&mut buf)),
            (false, other) => buf.push(other),
        }
    }

    // Whatever is left is the last field.
    out.push(buf);
    out
}
/// Choose the field separator for one CSV line.
///
/// Commas and semicolons are counted only while outside double quotes (each
/// `"` toggles the quoted state). Semicolon is returned only when at least
/// one unquoted semicolon and no unquoted comma was seen; in every other
/// case, including an empty line, the answer is comma.
fn detect_separator(line: &str) -> char {
    let (_, commas, semicolons) = line.chars().fold(
        (false, 0u32, 0u32),
        |(quoted, commas, semicolons), ch| match ch {
            '"' => (!quoted, commas, semicolons),
            ',' if !quoted => (quoted, commas + 1, semicolons),
            ';' if !quoted => (quoted, commas, semicolons + 1),
            _ => (quoted, commas, semicolons),
        },
    );

    if commas == 0 && semicolons > 0 {
        ';'
    } else {
        ','
    }
}
/// Quote a CSV field if it contains special characters.
///
/// The field is wrapped in double quotes when it contains a comma, a double
/// quote, a line feed (`\n`) or a carriage return (`\r`). Inside the quoted
/// form every `"` is doubled (`""`). Fields without any of these characters
/// are returned unchanged.
///
/// A bare carriage return is treated like a newline because CSV consumers
/// (spreadsheets, RFC 4180 parsers) interpret it as a line break; leaving it
/// unquoted would split the record when the exported file is read back.
fn csv_quote(field: &str) -> String {
    let needs_quoting = field.contains(|c: char| matches!(c, ',' | '"' | '\n' | '\r'));

    if needs_quoting {
        format!("\"{}\"", field.replace('"', "\"\""))
    } else {
        field.to_string()
    }
}
// Unit tests for the CSV helpers: `parse_csv`, `generate_csv` and
// `detect_separator`.
#[cfg(test)]
mod tests {
    use super::*;
    use chrono::Utc;
    use uuid::Uuid;

    // ── parse_csv ───────────────────────────────────────────────────

    /// Comma-separated input with a header row yields one pair per data row.
    #[test]
    fn test_parse_csv_comma_separated() {
        let csv = "title,url\nMy Blog,https://blog.example.com\nNews Site,https://news.example.com";
        let result = parse_csv(csv).unwrap();
        assert_eq!(result.len(), 2);
        assert_eq!(result[0].0, "My Blog");
        assert_eq!(result[0].1, "https://blog.example.com");
        assert_eq!(result[1].0, "News Site");
        assert_eq!(result[1].1, "https://news.example.com");
    }

    /// Semicolon-separated input with a French header row is parsed as well.
    #[test]
    fn test_parse_csv_semicolon_separated() {
        let csv = "titre;lien\nMon Blog;https://blog.example.com\nActus;https://news.example.com";
        let result = parse_csv(csv).unwrap();
        assert_eq!(result.len(), 2);
        assert_eq!(result[0].0, "Mon Blog");
        assert_eq!(result[0].1, "https://blog.example.com");
    }

    /// Quoted fields may contain the separator and escaped (`""`) quotes.
    #[test]
    fn test_parse_csv_quoted_fields() {
        let csv =
            "title,url\n\"My, Blog\",https://blog.example.com\n\"He said \"\"hi\"\"\",https://example.com";
        let result = parse_csv(csv).unwrap();
        assert_eq!(result.len(), 2);
        assert_eq!(result[0].0, "My, Blog");
        assert_eq!(result[1].0, "He said \"hi\"");
    }

    /// The header row is not returned as data.
    #[test]
    fn test_parse_csv_header_skipping() {
        let csv = "title,url\nBlog,https://example.com";
        let result = parse_csv(csv).unwrap();
        assert_eq!(result.len(), 1);
        assert_eq!(result[0].0, "Blog");
    }

    /// Without a header row, the first line is kept as data.
    #[test]
    fn test_parse_csv_no_header() {
        let csv = "Blog,https://example.com\nNews,https://news.com";
        let result = parse_csv(csv).unwrap();
        assert_eq!(result.len(), 2);
    }

    /// Blank lines between rows and at the end are ignored.
    #[test]
    fn test_parse_csv_empty_lines() {
        let csv = "title,url\n\nBlog,https://example.com\n\n\nNews,https://news.com\n";
        let result = parse_csv(csv).unwrap();
        assert_eq!(result.len(), 2);
    }

    /// A leading UTF-8 BOM does not prevent header detection.
    #[test]
    fn test_parse_csv_utf8_bom() {
        let csv = "\u{FEFF}title,url\nBlog,https://example.com";
        let result = parse_csv(csv).unwrap();
        assert_eq!(result.len(), 1);
        assert_eq!(result[0].0, "Blog");
    }

    /// Empty input yields an empty result rather than an error.
    #[test]
    fn test_parse_csv_empty_content() {
        let result = parse_csv("").unwrap();
        assert!(result.is_empty());
    }

    /// A file that only contains the header yields no rows.
    #[test]
    fn test_parse_csv_only_header() {
        let result = parse_csv("title,url").unwrap();
        assert!(result.is_empty());
    }

    /// Rows with a single field are considered malformed and dropped.
    #[test]
    fn test_parse_csv_malformed_single_field() {
        let csv = "Blog\nhttps://example.com";
        let result = parse_csv(csv).unwrap();
        // Single-field lines are skipped
        assert!(result.is_empty());
    }

    // ── generate_csv ────────────────────────────────────────────────

    /// Plain values are written unquoted, after the `title,url` header.
    #[test]
    fn test_generate_csv_basic() {
        let sources = vec![
            Source {
                id: Uuid::new_v4(),
                user_id: Uuid::new_v4(),
                title: "My Blog".into(),
                url: "https://blog.example.com".into(),
                created_at: Utc::now(),
            },
            Source {
                id: Uuid::new_v4(),
                user_id: Uuid::new_v4(),
                title: "News".into(),
                url: "https://news.example.com".into(),
                created_at: Utc::now(),
            },
        ];
        let csv = generate_csv(&sources);
        let lines: Vec<&str> = csv.lines().collect();
        assert_eq!(lines[0], "title,url");
        assert_eq!(lines[1], "My Blog,https://blog.example.com");
        assert_eq!(lines[2], "News,https://news.example.com");
    }

    /// A title containing a comma is wrapped in double quotes.
    #[test]
    fn test_generate_csv_with_special_chars() {
        let sources = vec![Source {
            id: Uuid::new_v4(),
            user_id: Uuid::new_v4(),
            title: "Blog, with commas".into(),
            url: "https://example.com".into(),
            created_at: Utc::now(),
        }];
        let csv = generate_csv(&sources);
        let lines: Vec<&str> = csv.lines().collect();
        assert_eq!(lines[1], "\"Blog, with commas\",https://example.com");
    }

    /// With no sources, only the header line is produced.
    #[test]
    fn test_generate_csv_empty() {
        let csv = generate_csv(&[]);
        assert_eq!(csv, "title,url\n");
    }

    /// Output of `generate_csv` can be read back by `parse_csv`, including
    /// titles that need quoting.
    #[test]
    fn test_generate_csv_roundtrip() {
        let sources = vec![
            Source {
                id: Uuid::new_v4(),
                user_id: Uuid::new_v4(),
                title: "Simple Blog".into(),
                url: "https://blog.example.com".into(),
                created_at: Utc::now(),
            },
            Source {
                id: Uuid::new_v4(),
                user_id: Uuid::new_v4(),
                title: "News, Quotes \"here\"".into(),
                url: "https://news.example.com".into(),
                created_at: Utc::now(),
            },
        ];
        let csv = generate_csv(&sources);
        let parsed = parse_csv(&csv).unwrap();
        assert_eq!(parsed.len(), 2);
        assert_eq!(parsed[0].0, "Simple Blog");
        assert_eq!(parsed[0].1, "https://blog.example.com");
        assert_eq!(parsed[1].0, "News, Quotes \"here\"");
        assert_eq!(parsed[1].1, "https://news.example.com");
    }

    // ── detect_separator ────────────────────────────────────────────

    /// Only commas present: comma is chosen.
    #[test]
    fn test_detect_separator_comma() {
        assert_eq!(detect_separator("a,b,c"), ',');
    }

    /// Only semicolons present: semicolon is chosen.
    #[test]
    fn test_detect_separator_semicolon() {
        assert_eq!(detect_separator("a;b;c"), ';');
    }

    /// Both separators present outside quotes: comma takes precedence.
    #[test]
    fn test_detect_separator_mixed_prefers_comma() {
        // If both are present outside quotes, comma wins
        assert_eq!(detect_separator("a,b;c"), ',');
    }

    /// Commas inside a quoted field are ignored when choosing the separator.
    #[test]
    fn test_detect_separator_semicolons_with_commas_in_quotes() {
        // Commas inside quotes don't count
        assert_eq!(detect_separator("\"a,b\";c"), ';');
    }
}

@ -1,4 +1,6 @@
pub mod auth; pub mod auth;
pub mod csv;
pub mod email; pub mod email;
pub mod rate_limiter; pub mod rate_limiter;
pub mod scraper;
pub mod turnstile; pub mod turnstile;

@ -0,0 +1,856 @@
//! URL scraper service for fetching and parsing web pages.
//!
//! Provides SSRF-safe HTTP fetching, HTML parsing with soft-404 detection,
//! publication date extraction, and body text extraction. Used during
//! synthesis generation (Phase 5) to validate and enrich news articles.
use std::net::IpAddr;
use chrono::{DateTime, NaiveDate, Utc};
use scraper::{Html, Selector};
use serde::Serialize;
use crate::errors::AppError;
/// Custom User-Agent sent with every scraper request (set on the client in
/// `build_scraper_client`).
const USER_AGENT: &str = "AISynth/1.0 (+https://github.com/ai-synth)";
/// Maximum accepted response body size in bytes (5 MB); `scrape_url`
/// rejects larger bodies.
const MAX_BODY_SIZE: usize = 5_000_000;
/// Upper bound for the extracted body text.
///
/// Note: `extract_body_text` compares this against `String::len()`, so the
/// limit is effectively counted in UTF-8 bytes, not Unicode characters.
const MAX_BODY_TEXT_CHARS: usize = 4000;
/// Lowercase keywords that indicate a soft-404 or access-denied page.
///
/// `detect_soft_404` looks for these as substrings of the lowercased
/// `<title>` and first `<h1>` text. Some entries are subsumed by others
/// ("page not found" by "not found", "page introuvable" by "introuvable").
const ERROR_KEYWORDS: &[&str] = &[
    "page not found",
    "404",
    "access denied",
    "forbidden",
    "not found",
    "403",
    "introuvable",
    "page introuvable",
];
/// Result of scraping a URL, as produced by `scrape_url`.
///
/// Serializable so it can be returned or logged as JSON.
#[derive(Debug, Clone, Serialize)]
pub struct ScrapedContent {
    /// Whether the scrape was successful overall: `false` for a non-success
    /// HTTP status, otherwise the negation of `is_soft_404`.
    pub ok: bool,
    /// HTTP status code returned by the server.
    pub status: u16,
    /// Page title extracted from `<title>`; `None` when missing or blank.
    pub title: Option<String>,
    /// Publication date extracted from meta tags, JSON-LD, or `<time>`.
    pub published_date: Option<DateTime<Utc>>,
    /// Extracted body text with whitespace normalized, truncated to
    /// `MAX_BODY_TEXT_CHARS` (4000). Empty for non-success responses.
    pub body_text: String,
    /// Whether the page appears to be a soft-404 (error page with 200 status).
    pub is_soft_404: bool,
}
/// Create the `reqwest::Client` used for scraping.
///
/// Configuration: the scraper `USER_AGENT`, a 5 s connect timeout, a 15 s
/// overall request timeout and at most 3 followed redirects. The doc of the
/// original block recommends storing the client in `AppState` and reusing it.
///
/// NOTE(review): redirects are followed by the client itself, so the DNS/IP
/// pre-check performed before the request in `scrape_url` does not see the
/// intermediate redirect hops.
///
/// # Errors
/// Returns `AppError::Internal` when the client cannot be constructed.
pub fn build_scraper_client() -> Result<reqwest::Client, AppError> {
    use std::time::Duration;

    let connect_timeout = Duration::from_secs(5);
    let total_timeout = Duration::from_secs(15);
    let redirects = reqwest::redirect::Policy::limited(3);

    let built = reqwest::Client::builder()
        .user_agent(USER_AGENT)
        .connect_timeout(connect_timeout)
        .timeout(total_timeout)
        .redirect(redirects)
        .build();

    match built {
        Ok(client) => Ok(client),
        Err(e) => Err(AppError::Internal(anyhow::anyhow!(
            "Failed to build scraper client: {}",
            e
        ))),
    }
}
/// Scrape a URL, returning parsed content with SSRF protection.
///
/// Steps:
/// 1. Parse the URL and allow only `http` / `https`.
/// 2. Resolve the host and reject private/internal addresses (`check_ssrf`).
/// 3. Fetch the page. If redirects led to a different URL, the final URL is
///    validated the same way before any of the response is used, so content
///    served by an internal host is never parsed or returned.
/// 4. For a non-success status, return `ok: false` with empty content.
/// 5. Read the body incrementally, aborting as soon as it exceeds
///    `MAX_BODY_SIZE`, so an oversized response is never fully buffered.
/// 6. Parse the HTML for title, soft-404 indicators, publication date and
///    body text.
///
/// Limitation: the post-redirect check runs after the request has been sent;
/// it prevents reading internal responses but not the request itself, and
/// intermediate redirect hops are not inspected.
///
/// # Errors
/// - `AppError::BadRequest` for an invalid URL, a blocked scheme, a
///   private/internal target (initial or final URL), or a body over 5 MB.
/// - `AppError::Internal` when the request or reading the body fails.
pub async fn scrape_url(
    http_client: &reqwest::Client,
    url: &str,
) -> Result<ScrapedContent, AppError> {
    // Parse and validate the URL
    let parsed_url = url::Url::parse(url)
        .map_err(|e| AppError::BadRequest(format!("Invalid URL: {}", e)))?;

    // Check scheme
    validate_scheme(&parsed_url)?;

    // SSRF prevention: resolve DNS and check IPs
    check_ssrf(&parsed_url).await?;

    // Fetch the page
    let mut response = http_client
        .get(url)
        .send()
        .await
        .map_err(|e| AppError::Internal(anyhow::anyhow!("Failed to fetch URL: {}", e)))?;

    // The client follows redirects on its own; re-validate where we ended up
    // before looking at the status or the body.
    if response.url().as_str() != parsed_url.as_str() {
        let final_url = url::Url::parse(response.url().as_str())
            .map_err(|e| AppError::BadRequest(format!("Invalid URL: {}", e)))?;
        validate_scheme(&final_url)?;
        check_ssrf(&final_url).await?;
    }

    let status = response.status().as_u16();

    // Check for HTTP errors
    if !response.status().is_success() {
        return Ok(ScrapedContent {
            ok: false,
            status,
            title: None,
            published_date: None,
            body_text: String::new(),
            is_soft_404: false,
        });
    }

    // Fast path: the server announced a body that is too large.
    let announced_too_large = response
        .content_length()
        .map_or(false, |len| len > MAX_BODY_SIZE as u64);
    if announced_too_large {
        return Err(AppError::BadRequest(
            "Response body exceeds 5 MB limit".into(),
        ));
    }

    // Read the body chunk by chunk so the limit also holds when no (or a
    // wrong) Content-Length was sent.
    let mut body: Vec<u8> = Vec::new();
    while let Some(chunk) = response
        .chunk()
        .await
        .map_err(|e| AppError::Internal(anyhow::anyhow!("Failed to read response body: {}", e)))?
    {
        if body.len() + chunk.len() > MAX_BODY_SIZE {
            return Err(AppError::BadRequest(
                "Response body exceeds 5 MB limit".into(),
            ));
        }
        body.extend_from_slice(&chunk);
    }

    // Invalid UTF-8 sequences are replaced rather than treated as an error.
    let html_text = String::from_utf8_lossy(&body);
    let document = Html::parse_document(&html_text);

    // Extract page title
    let title = extract_page_title(&document);
    // Detect soft-404
    let is_soft_404 = detect_soft_404(&document);
    // Extract publication date
    let published_date = extract_publication_date(&document);
    // Extract body text
    let body_text = extract_body_text(&document);

    Ok(ScrapedContent {
        ok: !is_soft_404,
        status,
        title,
        published_date,
        body_text,
        is_soft_404,
    })
}
/// Decide whether an article exceeds the allowed age.
///
/// Returns `true` only when a publication date is known and the number of
/// whole days between that date and now is strictly greater than
/// `max_age_days`. An unknown date (`None`) is given the benefit of the
/// doubt and yields `false`.
pub fn is_article_too_old(published_date: Option<DateTime<Utc>>, max_age_days: i64) -> bool {
    published_date.map_or(false, |published| {
        let elapsed = Utc::now().signed_duration_since(published);
        elapsed.num_days() > max_age_days
    })
}
// ────────────────────────────────────────────────────────────────────────────
// URL and SSRF Validation
// ────────────────────────────────────────────────────────────────────────────
/// Accept only `http` and `https` URLs.
///
/// # Errors
/// Returns `AppError::BadRequest` naming the offending scheme for anything
/// else.
fn validate_scheme(url: &url::Url) -> Result<(), AppError> {
    let scheme = url.scheme();

    if scheme == "http" || scheme == "https" {
        return Ok(());
    }

    Err(AppError::BadRequest(format!(
        "Blocked URL scheme: {}. Only http and https are allowed.",
        scheme
    )))
}
/// SSRF guard: resolve the URL's host and make sure no resolved address is
/// rejected by `is_private_ip`.
///
/// The port used for resolution is the explicit URL port, or 443 for
/// `https` and 80 otherwise.
///
/// # Errors
/// Returns `AppError::BadRequest` when the URL has no host, DNS resolution
/// fails or yields no address, or any resolved address is private/internal.
async fn check_ssrf(url: &url::Url) -> Result<(), AppError> {
    let host = match url.host_str() {
        Some(h) => h,
        None => return Err(AppError::BadRequest("URL has no host".into())),
    };

    let port = match (url.port(), url.scheme()) {
        (Some(explicit), _) => explicit,
        (None, "https") => 443,
        (None, _) => 80,
    };

    let target = format!("{}:{}", host, port);

    let resolved: Vec<_> = match tokio::net::lookup_host(&target).await {
        Ok(found) => found.collect(),
        Err(e) => {
            return Err(AppError::BadRequest(format!(
                "DNS resolution failed for {}: {}",
                host, e
            )))
        }
    };

    if resolved.is_empty() {
        return Err(AppError::BadRequest(format!(
            "DNS resolution returned no addresses for {}",
            host
        )));
    }

    // A single internal address among the results is enough to refuse.
    if resolved.iter().any(|addr| is_private_ip(addr.ip())) {
        return Err(AppError::BadRequest(
            "URL resolves to a private/internal IP address".into(),
        ));
    }

    Ok(())
}
/// Check whether an IP address points at a private, local, or otherwise
/// non-public destination.
///
/// This is the core SSRF prevention check. Rejects:
/// - 127.0.0.0/8 (loopback)
/// - 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16 (private)
/// - 169.254.0.0/16 (link-local)
/// - 0.0.0.0/8 ("this network", including the unspecified address)
/// - 100.64.0.0/10 (carrier-grade NAT shared address space)
/// - 255.255.255.255 (broadcast)
/// - ::1 (IPv6 loopback)
/// - :: (IPv6 unspecified)
/// - fe80::/10 (IPv6 link-local)
/// - fc00::/7 (IPv6 unique local)
/// - IPv4-mapped IPv6 addresses (`::ffff:a.b.c.d`) whose embedded IPv4
///   address falls into one of the IPv4 ranges above; without this an
///   address such as `::ffff:127.0.0.1` would bypass the check.
fn is_private_ip(ip: IpAddr) -> bool {
    /// IPv4 part of the check, shared by plain and IPv4-mapped addresses.
    fn is_blocked_v4(v4: std::net::Ipv4Addr) -> bool {
        let [first, second, _, _] = v4.octets();
        v4.is_loopback()                                // 127.0.0.0/8
            || v4.is_private()                          // 10/8, 172.16/12, 192.168/16
            || v4.is_link_local()                       // 169.254.0.0/16
            || v4.is_broadcast()                        // 255.255.255.255
            || first == 0                               // 0.0.0.0/8
            || (first == 100 && (second & 0xc0) == 64)  // 100.64.0.0/10
    }

    match ip {
        IpAddr::V4(v4) => is_blocked_v4(v4),
        IpAddr::V6(v6) => {
            // ::ffff:a.b.c.d reaches the same host as a.b.c.d.
            if let Some(mapped) = v6.to_ipv4_mapped() {
                return is_blocked_v4(mapped);
            }

            let first_segment = v6.segments()[0];
            v6.is_loopback()                            // ::1
                || v6.is_unspecified()                  // ::
                || (first_segment & 0xffc0) == 0xfe80   // fe80::/10
                || (first_segment & 0xfe00) == 0xfc00   // fc00::/7
        }
    }
}
// ────────────────────────────────────────────────────────────────────────────
// HTML Parsing
// ────────────────────────────────────────────────────────────────────────────
/// Return the trimmed text of the first `<title>` element.
///
/// Yields `None` when there is no `<title>` or when its text is empty or
/// whitespace only.
fn extract_page_title(doc: &Html) -> Option<String> {
    let selector = match Selector::parse("title") {
        Ok(parsed) => parsed,
        Err(_) => return None,
    };

    let first = doc.select(&selector).next()?;
    let joined: String = first.text().collect();
    let trimmed = joined.trim();

    if trimmed.is_empty() {
        None
    } else {
        Some(trimmed.to_string())
    }
}
/// Heuristic soft-404 detection.
///
/// Lowercases the text of the first `<title>` and of the first `<h1>` and
/// reports `true` when either contains one of the `ERROR_KEYWORDS` as a
/// substring. Missing elements count as empty text.
fn detect_soft_404(doc: &Html) -> bool {
    // Lowercased text of the first element matching `css`, or "" if none.
    let first_text_lower = |css: &str| -> String {
        let selector = match Selector::parse(css) {
            Ok(parsed) => parsed,
            Err(_) => return String::new(),
        };
        let found = doc.select(&selector).next();
        match found {
            Some(el) => el.text().collect::<String>().to_lowercase(),
            None => String::new(),
        }
    };

    let title = first_text_lower("title");
    let heading = first_text_lower("h1");

    for keyword in ERROR_KEYWORDS {
        if title.contains(*keyword) || heading.contains(*keyword) {
            return true;
        }
    }
    false
}
/// Extract the publication date from structured data and meta tags.
///
/// Tries sources in priority order:
/// 1. JSON-LD `datePublished` in `<script type="application/ld+json">`
/// 2. `<meta property="article:published_time">`
/// 3. `<meta property="og:article:published_time">`
/// 4. `<meta itemprop="datePublished">`
/// 5. `<meta name="date">`, `<meta name="pubdate">`
/// 6. `<time datetime="...">`
fn extract_publication_date(doc: &Html) -> Option<DateTime<Utc>> {
// 1. JSON-LD
if let Some(sel) = Selector::parse(r#"script[type="application/ld+json"]"#).ok() {
for el in doc.select(&sel) {
let text = el.text().collect::<String>();
if let Ok(json) = serde_json::from_str::<serde_json::Value>(&text) {
if let Some(dt) = extract_date_from_json_ld(&json) {
return Some(dt);
}
}
}
}
// 2-5. Meta tags in priority order
let meta_selectors = [
r#"meta[property="article:published_time"]"#,
r#"meta[property="og:article:published_time"]"#,
r#"meta[itemprop="datePublished"]"#,
r#"meta[name="date"]"#,
r#"meta[name="pubdate"]"#,
];
for sel_str in &meta_selectors {
if let Ok(sel) = Selector::parse(sel_str) {
if let Some(el) = doc.select(&sel).next() {
if let Some(content) = el.value().attr("content") {
if let Some(dt) = parse_date_string(content) {
return Some(dt);
}
}
}
}
}
// 6. <time datetime="...">
if let Ok(sel) = Selector::parse("time[datetime]") {
if let Some(el) = doc.select(&sel).next() {
if let Some(dt_str) = el.value().attr("datetime") {
if let Some(dt) = parse_date_string(dt_str) {
return Some(dt);
}
}
}
}
None
}
/// Extract `datePublished` from a JSON-LD value.
///
/// Handles both single objects and `@graph` arrays.
fn extract_date_from_json_ld(json: &serde_json::Value) -> Option<DateTime<Utc>> {
// Direct datePublished field
if let Some(date_str) = json.get("datePublished").and_then(|v| v.as_str()) {
if let Some(dt) = parse_date_string(date_str) {
return Some(dt);
}
}
// Check @graph array (common in WordPress JSON-LD)
if let Some(graph) = json.get("@graph").and_then(|v| v.as_array()) {
for item in graph {
if let Some(date_str) = item.get("datePublished").and_then(|v| v.as_str()) {
if let Some(dt) = parse_date_string(date_str) {
return Some(dt);
}
}
}
}
None
}
/// Parse a date string in one of the supported formats.
///
/// After trimming, the formats are tried in this order:
/// 1. RFC 3339 with offset (e.g. `2026-03-15T10:00:00+02:00`), converted to UTC;
/// 2. a bare date `%Y-%m-%d`, interpreted as midnight UTC;
/// 3. `%Y-%m-%dT%H:%M:%S` without offset, interpreted as UTC.
///
/// Returns `None` when none of them matches.
fn parse_date_string(s: &str) -> Option<DateTime<Utc>> {
    let text = s.trim();

    // 1. Full timestamp carrying its own offset.
    match DateTime::parse_from_rfc3339(text) {
        Ok(with_offset) => return Some(with_offset.with_timezone(&Utc)),
        Err(_) => {}
    }

    // 2. Date only: assume 00:00:00 UTC.
    match NaiveDate::parse_from_str(text, "%Y-%m-%d") {
        Ok(day) => {
            return day
                .and_hms_opt(0, 0, 0)
                .map(|midnight| midnight.and_utc())
        }
        Err(_) => {}
    }

    // 3. Date and time without offset: assume UTC.
    chrono::NaiveDateTime::parse_from_str(text, "%Y-%m-%dT%H:%M:%S")
        .ok()
        .map(|local| local.and_utc())
}
/// Extract visible body text from the HTML document.
///
/// Walks every text node below `<body>` in document order and keeps it
/// unless one of its ancestors is a `script`, `style`, `noscript`, `iframe`,
/// `nav`, `footer`, `header` or `aside` element. The kept fragments are
/// joined, whitespace is collapsed to single spaces, and the result is
/// truncated to at most [`MAX_BODY_TEXT_CHARS`] bytes, cut back to the
/// nearest UTF-8 character boundary.
///
/// Returns an empty string when the document has no `<body>`.
///
/// The previous implementation collected the excluded elements but never
/// consulted that set, so script/style/navigation text leaked into the
/// output; the exclusion is now applied per text node.
fn extract_body_text(doc: &Html) -> String {
    // Tags whose content should be excluded
    const EXCLUDED_TAGS: [&str; 8] = [
        "script", "style", "noscript", "iframe", "nav", "footer", "header", "aside",
    ];

    let body_sel = match Selector::parse("body") {
        Ok(sel) => sel,
        Err(_) => return String::new(),
    };
    let body = match doc.select(&body_sel).next() {
        Some(b) => b,
        None => return String::new(),
    };

    // Collect text from nodes that are not inside an excluded element.
    let mut text_parts: Vec<&str> = Vec::new();
    for node in body.descendants() {
        let text = match node.value().as_text() {
            Some(t) => t,
            None => continue, // elements, comments, ...
        };

        let inside_excluded = node.ancestors().any(|ancestor| {
            ancestor
                .value()
                .as_element()
                .map_or(false, |el| EXCLUDED_TAGS.iter().any(|tag| *tag == el.name()))
        });
        if inside_excluded {
            continue;
        }

        let fragment: &str = text;
        text_parts.push(fragment);
    }

    // Join and normalize whitespace
    let raw_text = text_parts.join(" ");
    let normalized: String = raw_text
        .split_whitespace()
        .collect::<Vec<_>>()
        .join(" ");

    // Truncate to the byte limit without splitting a multi-byte character.
    if normalized.len() > MAX_BODY_TEXT_CHARS {
        let mut end = MAX_BODY_TEXT_CHARS;
        while !normalized.is_char_boundary(end) && end > 0 {
            end -= 1;
        }
        normalized[..end].to_string()
    } else {
        normalized
    }
}
// Unit tests for the scraper helpers. Only pure functions are covered here
// (IP classification, HTML parsing, date parsing, scheme validation); no
// test performs network access.
#[cfg(test)]
mod tests {
    use super::*;
    use std::net::{Ipv4Addr, Ipv6Addr};

    // ── SSRF IP Checks ──────────────────────────────────────────────

    /// 127.0.0.1 is loopback and must be rejected.
    #[test]
    fn test_loopback_ipv4_rejected() {
        let ip = IpAddr::V4(Ipv4Addr::new(127, 0, 0, 1));
        assert!(is_private_ip(ip));
    }

    /// Any other address in 127.0.0.0/8 is rejected too.
    #[test]
    fn test_loopback_ipv4_other_rejected() {
        let ip = IpAddr::V4(Ipv4Addr::new(127, 0, 0, 2));
        assert!(is_private_ip(ip));
    }

    /// 10.0.0.0/8 is private.
    #[test]
    fn test_private_10_rejected() {
        let ip = IpAddr::V4(Ipv4Addr::new(10, 0, 0, 1));
        assert!(is_private_ip(ip));
    }

    /// 172.16.0.0/12 is private.
    #[test]
    fn test_private_172_rejected() {
        let ip = IpAddr::V4(Ipv4Addr::new(172, 16, 0, 1));
        assert!(is_private_ip(ip));
    }

    /// 192.168.0.0/16 is private.
    #[test]
    fn test_private_192_rejected() {
        let ip = IpAddr::V4(Ipv4Addr::new(192, 168, 1, 1));
        assert!(is_private_ip(ip));
    }

    /// 169.254.0.0/16 is link-local.
    #[test]
    fn test_link_local_rejected() {
        let ip = IpAddr::V4(Ipv4Addr::new(169, 254, 0, 1));
        assert!(is_private_ip(ip));
    }

    /// 0.0.0.0 (unspecified) is rejected.
    #[test]
    fn test_unspecified_rejected() {
        let ip = IpAddr::V4(Ipv4Addr::UNSPECIFIED);
        assert!(is_private_ip(ip));
    }

    /// ::1 is the IPv6 loopback.
    #[test]
    fn test_ipv6_loopback_rejected() {
        let ip = IpAddr::V6(Ipv6Addr::LOCALHOST);
        assert!(is_private_ip(ip));
    }

    /// :: (IPv6 unspecified) is rejected.
    #[test]
    fn test_ipv6_unspecified_rejected() {
        let ip = IpAddr::V6(Ipv6Addr::UNSPECIFIED);
        assert!(is_private_ip(ip));
    }

    /// fe80::/10 is IPv6 link-local.
    #[test]
    fn test_ipv6_link_local_rejected() {
        // fe80::1 is link-local
        let ip = IpAddr::V6(Ipv6Addr::new(0xfe80, 0, 0, 0, 0, 0, 0, 1));
        assert!(is_private_ip(ip));
    }

    /// A well-known public IPv4 address is allowed.
    #[test]
    fn test_public_ipv4_allowed() {
        let ip = IpAddr::V4(Ipv4Addr::new(8, 8, 8, 8));
        assert!(!is_private_ip(ip));
    }

    /// Another public IPv4 address is allowed.
    #[test]
    fn test_public_ipv4_allowed_2() {
        let ip = IpAddr::V4(Ipv4Addr::new(104, 21, 45, 67));
        assert!(!is_private_ip(ip));
    }

    /// A public (global unicast) IPv6 address is allowed.
    #[test]
    fn test_public_ipv6_allowed() {
        let ip = IpAddr::V6(Ipv6Addr::new(0x2607, 0xf8b0, 0x4004, 0x800, 0, 0, 0, 0x200e));
        assert!(!is_private_ip(ip));
    }

    // ── Soft-404 Detection ──────────────────────────────────────────

    /// "Page not found" in the title marks the page as a soft-404.
    #[test]
    fn test_soft_404_in_title() {
        let html = r#"<html><head><title>Page not found - Example</title></head><body><p>Sorry</p></body></html>"#;
        let doc = Html::parse_document(html);
        assert!(detect_soft_404(&doc));
    }

    /// A bare "404" in the title is enough.
    #[test]
    fn test_soft_404_404_in_title() {
        let html = r#"<html><head><title>404 Error</title></head><body><p>Oops</p></body></html>"#;
        let doc = Html::parse_document(html);
        assert!(detect_soft_404(&doc));
    }

    /// Keywords are also looked for in the first `<h1>`, case-insensitively.
    #[test]
    fn test_soft_404_in_h1() {
        let html = r#"<html><head><title>My Site</title></head><body><h1>Access Denied</h1></body></html>"#;
        let doc = Html::parse_document(html);
        assert!(detect_soft_404(&doc));
    }

    /// "Forbidden" in the `<h1>` is detected.
    #[test]
    fn test_soft_404_forbidden_in_h1() {
        let html = r#"<html><head><title>My Site</title></head><body><h1>Forbidden</h1></body></html>"#;
        let doc = Html::parse_document(html);
        assert!(detect_soft_404(&doc));
    }

    /// A regular article page is not flagged.
    #[test]
    fn test_not_soft_404_normal_page() {
        let html = r#"<html><head><title>My Article</title></head><body><h1>Great news today</h1><p>Content here.</p></body></html>"#;
        let doc = Html::parse_document(html);
        assert!(!detect_soft_404(&doc));
    }

    // ── Date Extraction ─────────────────────────────────────────────

    /// `datePublished` on a top-level JSON-LD object is used.
    #[test]
    fn test_date_from_json_ld() {
        let html = r#"<html><head>
<script type="application/ld+json">{"@type":"Article","datePublished":"2026-03-15T10:00:00Z"}</script>
</head><body></body></html>"#;
        let doc = Html::parse_document(html);
        let date = extract_publication_date(&doc);
        assert!(date.is_some());
        assert_eq!(date.unwrap().format("%Y-%m-%d").to_string(), "2026-03-15");
    }

    /// `datePublished` inside a JSON-LD `@graph` array is found.
    #[test]
    fn test_date_from_json_ld_graph() {
        let html = r#"<html><head>
<script type="application/ld+json">{"@graph":[{"@type":"Article","datePublished":"2026-03-14T08:00:00+02:00"}]}</script>
</head><body></body></html>"#;
        let doc = Html::parse_document(html);
        let date = extract_publication_date(&doc);
        assert!(date.is_some());
        assert_eq!(date.unwrap().format("%Y-%m-%d").to_string(), "2026-03-14");
    }

    /// `<meta property="article:published_time">` is used.
    #[test]
    fn test_date_from_meta_article_published() {
        let html = r#"<html><head>
<meta property="article:published_time" content="2026-03-10T12:00:00Z">
</head><body></body></html>"#;
        let doc = Html::parse_document(html);
        let date = extract_publication_date(&doc);
        assert!(date.is_some());
        assert_eq!(date.unwrap().format("%Y-%m-%d").to_string(), "2026-03-10");
    }

    /// `<meta property="og:article:published_time">` with a bare date is used.
    #[test]
    fn test_date_from_meta_og_published() {
        let html = r#"<html><head>
<meta property="og:article:published_time" content="2026-03-09">
</head><body></body></html>"#;
        let doc = Html::parse_document(html);
        let date = extract_publication_date(&doc);
        assert!(date.is_some());
        assert_eq!(date.unwrap().format("%Y-%m-%d").to_string(), "2026-03-09");
    }

    /// `<meta itemprop="datePublished">` is used.
    #[test]
    fn test_date_from_meta_itemprop() {
        let html = r#"<html><head>
<meta itemprop="datePublished" content="2026-03-08">
</head><body></body></html>"#;
        let doc = Html::parse_document(html);
        let date = extract_publication_date(&doc);
        assert!(date.is_some());
    }

    /// `<meta name="date">` is used.
    #[test]
    fn test_date_from_meta_name_date() {
        let html = r#"<html><head>
<meta name="date" content="2026-03-07">
</head><body></body></html>"#;
        let doc = Html::parse_document(html);
        let date = extract_publication_date(&doc);
        assert!(date.is_some());
    }

    /// `<meta name="pubdate">` is used.
    #[test]
    fn test_date_from_meta_name_pubdate() {
        let html = r#"<html><head>
<meta name="pubdate" content="2026-03-06">
</head><body></body></html>"#;
        let doc = Html::parse_document(html);
        let date = extract_publication_date(&doc);
        assert!(date.is_some());
    }

    /// The `datetime` attribute of a `<time>` element is the last fallback.
    #[test]
    fn test_date_from_time_element() {
        let html = r#"<html><head></head><body>
<time datetime="2026-03-05T14:30:00Z">March 5, 2026</time>
</body></html>"#;
        let doc = Html::parse_document(html);
        let date = extract_publication_date(&doc);
        assert!(date.is_some());
        assert_eq!(date.unwrap().format("%Y-%m-%d").to_string(), "2026-03-05");
    }

    /// When both JSON-LD and a meta tag carry a date, JSON-LD wins.
    #[test]
    fn test_date_priority_json_ld_over_meta() {
        let html = r#"<html><head>
<script type="application/ld+json">{"datePublished":"2026-03-15T10:00:00Z"}</script>
<meta property="article:published_time" content="2026-01-01T00:00:00Z">
</head><body></body></html>"#;
        let doc = Html::parse_document(html);
        let date = extract_publication_date(&doc);
        assert!(date.is_some());
        // JSON-LD should take priority
        assert_eq!(date.unwrap().format("%Y-%m-%d").to_string(), "2026-03-15");
    }

    /// A page without any date source yields `None`.
    #[test]
    fn test_no_date_found() {
        let html = r#"<html><head><title>No Date</title></head><body><p>Hello</p></body></html>"#;
        let doc = Html::parse_document(html);
        assert!(extract_publication_date(&doc).is_none());
    }

    // ── Body Text Extraction ────────────────────────────────────────

    /// Text from several paragraphs ends up in the result.
    #[test]
    fn test_body_text_basic() {
        let html = r#"<html><head></head><body><p>Hello world</p><p>Second paragraph</p></body></html>"#;
        let doc = Html::parse_document(html);
        let text = extract_body_text(&doc);
        assert!(text.contains("Hello world"));
        assert!(text.contains("Second paragraph"));
    }

    /// Visible paragraphs around a `<script>` are kept. This test only
    /// asserts the visible text; it does not assert that script content is
    /// absent.
    #[test]
    fn test_body_text_strips_scripts() {
        let html = r#"<html><head></head><body>
<p>Visible text</p>
<script>var x = "hidden";</script>
<p>More visible text</p>
</body></html>"#;
        let doc = Html::parse_document(html);
        let text = extract_body_text(&doc);
        assert!(text.contains("Visible text"));
        assert!(text.contains("More visible text"));
        // Script content will still appear because body.text() collects all text nodes.
        // The improved version should filter these, but the basic extraction
        // still provides usable content.
    }

    /// Long bodies are cut to at most `MAX_BODY_TEXT_CHARS` bytes.
    #[test]
    fn test_body_text_truncates_to_4000() {
        let long_text = "word ".repeat(2000); // ~10000 chars
        let html = format!(
            r#"<html><head></head><body><p>{}</p></body></html>"#,
            long_text
        );
        let doc = Html::parse_document(&html);
        let text = extract_body_text(&doc);
        assert!(text.len() <= MAX_BODY_TEXT_CHARS);
    }

    /// Whitespace in the extracted text is normalized.
    #[test]
    fn test_body_text_normalizes_whitespace() {
        let html = r#"<html><head></head><body><p> Hello world </p></body></html>"#;
        let doc = Html::parse_document(html);
        let text = extract_body_text(&doc);
        assert!(!text.contains(" ")); // No double spaces
    }

    /// An empty `<body>` yields an empty string.
    #[test]
    fn test_body_text_empty_body() {
        let html = r#"<html><head></head><body></body></html>"#;
        let doc = Html::parse_document(html);
        let text = extract_body_text(&doc);
        assert!(text.is_empty());
    }

    /// Markup without an explicit `<body>` tag yields an empty string.
    #[test]
    fn test_body_text_no_body() {
        let html = r#"<html><head></head></html>"#;
        let doc = Html::parse_document(html);
        let text = extract_body_text(&doc);
        assert!(text.is_empty());
    }

    // ── Title Extraction ────────────────────────────────────────────

    /// The text of `<title>` is returned.
    #[test]
    fn test_extract_title() {
        let html = r#"<html><head><title>My Page Title</title></head><body></body></html>"#;
        let doc = Html::parse_document(html);
        assert_eq!(extract_page_title(&doc), Some("My Page Title".into()));
    }

    /// An empty `<title>` counts as no title.
    #[test]
    fn test_extract_title_empty() {
        let html = r#"<html><head><title></title></head><body></body></html>"#;
        let doc = Html::parse_document(html);
        assert_eq!(extract_page_title(&doc), None);
    }

    /// A whitespace-only `<title>` counts as no title.
    #[test]
    fn test_extract_title_whitespace_only() {
        let html = r#"<html><head><title> </title></head><body></body></html>"#;
        let doc = Html::parse_document(html);
        assert_eq!(extract_page_title(&doc), None);
    }

    /// A document without `<title>` yields `None`.
    #[test]
    fn test_extract_title_no_title_element() {
        let html = r#"<html><head></head><body></body></html>"#;
        let doc = Html::parse_document(html);
        assert_eq!(extract_page_title(&doc), None);
    }

    // ── is_article_too_old ──────────────────────────────────────────

    /// 30 days old with a 7-day limit is too old.
    #[test]
    fn test_article_too_old() {
        let old_date = Utc::now() - chrono::Duration::days(30);
        assert!(is_article_too_old(Some(old_date), 7));
    }

    /// 3 days old with a 7-day limit is fine.
    #[test]
    fn test_article_not_too_old() {
        let recent_date = Utc::now() - chrono::Duration::days(3);
        assert!(!is_article_too_old(Some(recent_date), 7));
    }

    /// A missing date is never considered too old.
    #[test]
    fn test_article_no_date_not_too_old() {
        assert!(!is_article_too_old(None, 7));
    }

    /// The limit is exclusive: exactly `max_age_days` old is still accepted.
    #[test]
    fn test_article_exactly_at_boundary() {
        let boundary_date = Utc::now() - chrono::Duration::days(7);
        // At exactly 7 days, num_days() returns 7, which is NOT > 7
        assert!(!is_article_too_old(Some(boundary_date), 7));
    }

    // ── Date Parsing ────────────────────────────────────────────────

    /// RFC 3339 with `Z` is parsed.
    #[test]
    fn test_parse_rfc3339() {
        let dt = parse_date_string("2026-03-15T10:00:00Z");
        assert!(dt.is_some());
        assert_eq!(dt.unwrap().format("%Y-%m-%d").to_string(), "2026-03-15");
    }

    /// RFC 3339 with a numeric offset is parsed.
    #[test]
    fn test_parse_rfc3339_with_offset() {
        let dt = parse_date_string("2026-03-15T10:00:00+02:00");
        assert!(dt.is_some());
    }

    /// A bare `YYYY-MM-DD` date is parsed.
    #[test]
    fn test_parse_date_only() {
        let dt = parse_date_string("2026-03-15");
        assert!(dt.is_some());
        assert_eq!(dt.unwrap().format("%Y-%m-%d").to_string(), "2026-03-15");
    }

    /// A date-time without timezone is parsed.
    #[test]
    fn test_parse_datetime_no_tz() {
        let dt = parse_date_string("2026-03-15T10:30:00");
        assert!(dt.is_some());
    }

    /// Garbage and empty input yield `None`.
    #[test]
    fn test_parse_invalid_date() {
        assert!(parse_date_string("not a date").is_none());
        assert!(parse_date_string("").is_none());
    }

    // ── Scheme Validation ───────────────────────────────────────────

    /// `https` is allowed.
    #[test]
    fn test_valid_https_scheme() {
        let url = url::Url::parse("https://example.com").unwrap();
        assert!(validate_scheme(&url).is_ok());
    }

    /// `http` is allowed.
    #[test]
    fn test_valid_http_scheme() {
        let url = url::Url::parse("http://example.com").unwrap();
        assert!(validate_scheme(&url).is_ok());
    }

    /// `ftp` is blocked.
    #[test]
    fn test_invalid_ftp_scheme() {
        let url = url::Url::parse("ftp://example.com").unwrap();
        assert!(validate_scheme(&url).is_err());
    }

    /// `file` is blocked.
    #[test]
    fn test_invalid_file_scheme() {
        let url = url::Url::parse("file:///etc/passwd").unwrap();
        assert!(validate_scheme(&url).is_err());
    }
}

File diff suppressed because it is too large Load Diff

@ -179,6 +179,44 @@ impl TestApp {
.await .await
} }
/// Issue a DELETE for `uri`, authenticated with `session_cookie`.
///
/// Delegates to `self.request` with no request body and returns whatever
/// status / JSON pair it yields. The CSRF header is presumably attached by
/// `self.request` itself (not visible here) — confirm.
pub async fn delete_with_session(
    &self,
    uri: &str,
    session_cookie: &str,
) -> (StatusCode, serde_json::Value) {
    let cookie = Some(session_cookie);
    self.request(Method::DELETE, uri, None, cookie).await
}
/// Drive an arbitrary `Request<Body>` through a clone of the router.
///
/// Returns the status code, the response body decoded as text, and a copy
/// of all response headers. Intended for endpoints whose body is not JSON
/// (e.g. CSV export).
///
/// Panics if the request cannot be sent or the body cannot be read.
pub async fn raw_request_text(
    &self,
    req: Request<Body>,
) -> (StatusCode, String, axum::http::HeaderMap) {
    let router = self.router.clone();
    let response = router
        .oneshot(req)
        .await
        .expect("Failed to send raw request");

    // Grab status and headers up front: `into_body` consumes the response.
    let status = response.status();
    let headers = response.headers().clone();

    let collected = response
        .into_body()
        .collect()
        .await
        .expect("Failed to read response body");
    let bytes = collected.to_bytes();

    // Lossy decoding: invalid UTF-8 sequences are replaced rather than rejected.
    let text = String::from_utf8_lossy(&bytes).into_owned();

    (status, text, headers)
}
/// Send a POST request *without* the CSRF header (to test CSRF rejection). /// Send a POST request *without* the CSRF header (to test CSRF rejection).
pub async fn post_without_csrf( pub async fn post_without_csrf(
&self, &self,

@ -12,6 +12,7 @@ const Register = lazy(() => import('~/pages/Register'));
const AuthVerify = lazy(() => import('~/pages/AuthVerify')); const AuthVerify = lazy(() => import('~/pages/AuthVerify'));
const Home = lazy(() => import('~/pages/Home')); const Home = lazy(() => import('~/pages/Home'));
const Settings = lazy(() => import('~/pages/Settings')); const Settings = lazy(() => import('~/pages/Settings'));
const Sources = lazy(() => import('~/pages/Sources'));
const ProtectedLayout: ParentComponent = (props) => { const ProtectedLayout: ParentComponent = (props) => {
const { user, loading } = useAuth(); const { user, loading } = useAuth();
@ -41,6 +42,7 @@ const App: Component = () => {
<Route path="/" component={ProtectedLayout}> <Route path="/" component={ProtectedLayout}>
<Route path="/" component={Home} /> <Route path="/" component={Home} />
<Route path="/settings" component={Settings} /> <Route path="/settings" component={Settings} />
<Route path="/sources" component={Sources} />
</Route> </Route>
{/* Catch-all redirect */} {/* Catch-all redirect */}

@ -0,0 +1,77 @@
import { describe, it, expect } from 'vitest';
import { normalizeUrl, isValidUrl } from '~/pages/Sources';
describe('normalizeUrl', () => {
  // One row per test: [test name, inputs to normalise, output expected for every input].
  const cases: ReadonlyArray<readonly [string, readonly string[], string]> = [
    [
      'should prepend https:// when no scheme is provided',
      ['example.com'],
      'https://example.com',
    ],
    [
      'should not modify URLs that already have https://',
      ['https://example.com'],
      'https://example.com',
    ],
    [
      'should not modify URLs that already have http://',
      ['http://example.com'],
      'http://example.com',
    ],
    [
      'should trim whitespace before processing',
      [' example.com '],
      'https://example.com',
    ],
    ['should return empty string for empty input', ['', ' '], ''],
    [
      'should handle URLs with paths',
      ['example.com/path/to/page'],
      'https://example.com/path/to/page',
    ],
    [
      'should handle URLs with www prefix',
      ['www.example.com'],
      'https://www.example.com',
    ],
  ];

  for (const [name, inputs, expected] of cases) {
    it(name, () => {
      for (const input of inputs) {
        expect(normalizeUrl(input)).toBe(expected);
      }
    });
  }
});
describe('isValidUrl', () => {
  // One row per test: [test name, candidate URL, expected verdict].
  const cases: ReadonlyArray<readonly [string, string, boolean]> = [
    ['should return true for valid https URL', 'https://example.com', true],
    ['should return true for valid http URL', 'http://example.com', true],
    [
      'should return true for URL with path',
      'https://blog.example.com/post/123',
      true,
    ],
    [
      'should return false for URL without a dot in the hostname',
      'https://localhost',
      false,
    ],
    [
      'should return false for non-http protocols',
      'ftp://example.com',
      false,
    ],
    ['should return false for empty string', '', false],
    ['should return false for random text', 'not a url', false],
    [
      'should return true for URLs with subdomains',
      'https://www.blog.example.com',
      true,
    ],
    [
      'should return true for URLs with query parameters',
      'https://example.com/search?q=test',
      true,
    ],
    [
      'should return true for URLs with port numbers',
      'https://example.com:8080',
      true,
    ],
  ];

  for (const [name, candidate, expected] of cases) {
    it(name, () => {
      expect(isValidUrl(candidate)).toBe(expected);
    });
  }
});

@ -0,0 +1,54 @@
import { api } from './client';
import type {
Source,
CreateSourceRequest,
BulkImportRequest,
BulkImportResponse,
} from '~/types';
/** Path prefix for the one request below that calls `fetch` directly instead of going through `api`. */
const API_BASE = '/api/v1';

/**
 * Client for the custom-sources endpoints.
 *
 * Everything except `exportCsv` delegates to the shared `api` client.
 */
export const sourcesApi = {
  /** Fetch the list of sources. */
  list(): Promise<Source[]> {
    return api.get<Source[]>('/sources');
  },

  /** Create one source from a title/URL pair. */
  create(data: CreateSourceRequest): Promise<Source> {
    return api.post<Source>('/sources', data);
  },

  /** Delete the source identified by `id`. */
  remove(id: string): Promise<void> {
    return api.delete<void>(`/sources/${id}`);
  },

  /** Submit several sources in a single request. */
  bulkImport(data: BulkImportRequest): Promise<BulkImportResponse> {
    return api.post<BulkImportResponse>('/sources/bulk', data);
  },

  /** Upload a CSV file as multipart form data under the `file` field. */
  async importCsv(file: File): Promise<BulkImportResponse> {
    const payload = new FormData();
    payload.append('file', file);
    return api.post<BulkImportResponse>('/sources/import-csv', payload);
  },

  /**
   * Download the sources as `sources.csv`.
   *
   * Uses `fetch` directly because the response is read as a blob. On a 401
   * the browser is sent to `/login` before the error is thrown.
   *
   * @throws Error when the response status is not OK.
   */
  async exportCsv(): Promise<void> {
    const response = await fetch(`${API_BASE}/sources/export-csv`, {
      method: 'GET',
      // NOTE(review): presumably the marker header the backend's CSRF check expects — confirm.
      headers: { 'X-Requested-With': 'XMLHttpRequest' },
      credentials: 'same-origin',
    });

    if (!response.ok) {
      if (response.status === 401) {
        window.location.href = '/login';
      }
      throw new Error(`Export failed: HTTP ${response.status}`);
    }

    // Trigger the download through a temporary anchor pointing at an object URL.
    const objectUrl = URL.createObjectURL(await response.blob());
    const anchor = document.createElement('a');
    anchor.href = objectUrl;
    anchor.download = 'sources.csv';
    document.body.appendChild(anchor);
    anchor.click();
    anchor.remove();
    URL.revokeObjectURL(objectUrl);
  },
};

@ -76,6 +76,47 @@ const fr = {
'settings.saveError': "Erreur lors de l'enregistrement des parametres.", 'settings.saveError': "Erreur lors de l'enregistrement des parametres.",
'settings.loadError': 'Erreur lors du chargement des parametres.', 'settings.loadError': 'Erreur lors du chargement des parametres.',
// Sources
'sources.title': 'Sources Personnalisees',
'sources.subtitle':
"Ajoutez des sites web ou des blogs que l'IA devra obligatoirement consulter lors de la generation de vos syntheses. Ces sources s'ajoutent aux sources par defaut.",
'sources.addTitle': 'Ajouter une source',
'sources.titleLabel': 'Titre',
'sources.titlePlaceholder': 'Nom de la source (ex: Blog de Yann LeCun)',
'sources.urlLabel': 'URL',
'sources.urlPlaceholder': 'https://...',
'sources.add': 'Ajouter',
'sources.csvSection': 'Import / Export CSV',
'sources.csvDescription':
'Sauvegardez vos sources ou importez-en de nouvelles depuis un fichier CSV.',
'sources.exportCsv': 'Exporter en CSV',
'sources.importCsv': 'Importer depuis un CSV',
'sources.bulkSection': 'Import en masse',
'sources.bulkDescription':
"Ajoutez plusieurs sources d'un coup. Une source par ligne, au format :",
'sources.bulkFormat': 'Nom de la source;URL',
'sources.bulkPlaceholder':
'Blog IA;https://blog.ia.com\nNews Tech;https://tech.news.fr',
'sources.bulkImport': 'Importer les sources',
'sources.importing': 'Importation...',
'sources.empty': 'Aucune source personnalisee pour le moment.',
'sources.emptyHint':
"L'ajout de sources permet a l'IA de consulter vos sites preferes en priorite.",
'sources.deleteTitle': 'Supprimer',
'sources.confirmDelete': 'Confirmer ?',
'sources.addError': "Erreur lors de l'ajout de la source.",
'sources.deleteError': 'Erreur lors de la suppression de la source.',
'sources.bulkImportError':
"Aucune source valide trouvee. Verifiez le format (Nom;URL).",
'sources.csvImportError':
"Erreur lors de l'importation du fichier CSV.",
'sources.csvNoValidSources':
'Aucune source valide trouvee dans le fichier CSV.',
'sources.exportError': "Erreur lors de l'export CSV.",
'sources.titleRequired': 'Le titre est requis.',
'sources.urlRequired': "L'URL est requise.",
'sources.urlInvalid': "L'URL n'est pas valide.",
// Common // Common
'common.loading': 'Chargement...', 'common.loading': 'Chargement...',
'common.error': 'Une erreur est survenue.', 'common.error': 'Une erreur est survenue.',

@ -0,0 +1,468 @@
import {
type Component,
createSignal,
onMount,
onCleanup,
Show,
For,
} from 'solid-js';
import {
Plus,
Trash2,
Link as LinkIcon,
Download,
Upload,
} from 'lucide-solid';
import { sourcesApi } from '~/api/sources';
import { useI18n } from '~/i18n';
import { isApiError } from '~/types';
import type { Source } from '~/types';
import LoadingSpinner from '~/components/ui/LoadingSpinner';
/**
 * Prepend `https://` if the URL has no http(s) scheme.
 *
 * The input is trimmed first; an empty or whitespace-only input yields an
 * empty string. The scheme check is case-insensitive because URL schemes
 * are, so `HTTPS://example.com` is returned unchanged rather than being
 * turned into `https://HTTPS://example.com`.
 *
 * @param url - Raw user input, possibly without a scheme.
 * @returns The trimmed URL, prefixed with `https://` when it did not
 *   already start with `http://` or `https://`.
 */
export function normalizeUrl(url: string): string {
  const trimmed = url.trim();
  if (!trimmed) return trimmed;
  return /^https?:\/\//i.test(trimmed) ? trimmed : 'https://' + trimmed;
}
/**
 * Lightweight URL check: the string must parse as a URL, use the `http:`
 * or `https:` protocol, and have a hostname containing at least one dot.
 *
 * @param url - Candidate URL, expected to already carry a scheme.
 * @returns `true` when all three conditions hold, `false` otherwise
 *   (including when parsing throws).
 */
export function isValidUrl(url: string): boolean {
  try {
    const { protocol, hostname } = new URL(url);
    const isHttp = protocol === 'http:' || protocol === 'https:';
    return isHttp && hostname.includes('.');
  } catch {
    // `new URL` throws on unparsable input.
    return false;
  }
}
const Sources: Component = () => {
const { t } = useI18n();
// ---- State ----
const [sources, setSources] = createSignal<Source[]>([]);
const [loading, setLoading] = createSignal(true);
const [newTitle, setNewTitle] = createSignal('');
const [newUrl, setNewUrl] = createSignal('');
const [adding, setAdding] = createSignal(false);
const [addError, setAddError] = createSignal<string | null>(null);
const [bulkText, setBulkText] = createSignal('');
const [importing, setImporting] = createSignal(false);
const [importError, setImportError] = createSignal<string | null>(null);
const [csvError, setCsvError] = createSignal<string | null>(null);
const [confirmingDeleteId, setConfirmingDeleteId] = createSignal<
string | null
>(null);
let deleteTimer: ReturnType<typeof setTimeout> | undefined;
let fileInputRef: HTMLInputElement | undefined;
onCleanup(() => {
if (deleteTimer) clearTimeout(deleteTimer);
});
// ---- Data loading ----
// Reload the source list into state. A failure is only logged; the loading
// flag is cleared whether the request succeeded or not.
const fetchSources = async () => {
  try {
    setSources(await sourcesApi.list());
  } catch (loadErr) {
    console.error('Failed to load sources:', loadErr);
  } finally {
    setLoading(false);
  }
};
onMount(fetchSources);
// ---- Add a single source ----
// Submit handler of the "add a source" form: validate locally, create the
// source, clear the inputs and reload the list.
const handleAddSource = async (e: SubmitEvent) => {
  e.preventDefault();
  setAddError(null);

  const title = newTitle().trim();
  const typedUrl = newUrl().trim();
  const url = normalizeUrl(typedUrl);

  // Rules are checked in order; only the first failing one is reported.
  const problem = !title
    ? t('sources.titleRequired')
    : !typedUrl
      ? t('sources.urlRequired')
      : !isValidUrl(url)
        ? t('sources.urlInvalid')
        : null;
  if (problem !== null) {
    setAddError(problem);
    return;
  }

  setAdding(true);
  try {
    await sourcesApi.create({ title, url });
    setNewTitle('');
    setNewUrl('');
    await fetchSources();
  } catch (err) {
    // Prefer the API's own message when the error is an API error.
    setAddError(isApiError(err) ? err.message : t('sources.addError'));
  } finally {
    setAdding(false);
  }
};
// ---- Delete with confirmation ----
// Two-step delete: the first click on a row arms confirmation, a second
// click on the same row performs the deletion.
const handleDeleteClick = (id: string) => {
  if (confirmingDeleteId() === id) {
    // Second click on the armed row.
    void performDelete(id);
    return;
  }

  // First click: arm this row, replacing any pending timer, and disarm
  // automatically after 3 seconds.
  setConfirmingDeleteId(id);
  if (deleteTimer) clearTimeout(deleteTimer);
  deleteTimer = setTimeout(() => {
    setConfirmingDeleteId(null);
  }, 3000);
};
// Delete the source with the given id and reload the list.
const performDelete = async (id: string) => {
  // Leave the confirmation state before contacting the server.
  if (deleteTimer) {
    clearTimeout(deleteTimer);
  }
  setConfirmingDeleteId(null);

  try {
    await sourcesApi.remove(id);
    await fetchSources();
  } catch (deleteErr) {
    // NOTE(review): the failure is only logged, nothing is shown to the user.
    console.error('Failed to delete source:', deleteErr);
  }
};
// ---- CSV Export ----
// Start the CSV download; any failure is reported with the generic export message.
const handleExportCsv = async () => {
  setCsvError(null);
  try {
    await sourcesApi.exportCsv();
  } catch {
    setCsvError(t('sources.exportError'));
  }
};
// ---- CSV Import ----
// Change handler of the hidden file input: upload the chosen CSV file and
// reload the list.
const handleImportCsv = async (e: Event) => {
  const input = e.target as HTMLInputElement;
  const picked = input.files?.[0];
  if (!picked) return;

  setImporting(true);
  setCsvError(null);
  try {
    await sourcesApi.importCsv(picked);
    await fetchSources();
  } catch (err) {
    // Prefer the API's own message when the error is an API error.
    setCsvError(isApiError(err) ? err.message : t('sources.csvImportError'));
  } finally {
    setImporting(false);
    // Reset the file input so the same file can be re-selected
    input.value = '';
  }
};
// ---- Bulk Import ----
// Submit handler of the bulk-import form.
//
// Each non-empty line is expected as `Title;URL`. The text before the first
// `;` is the title; everything after it is the URL (so a URL may itself
// contain `;`). URLs are normalised with `normalizeUrl` and must pass
// `isValidUrl`, the same check the single-source form applies, so a line
// such as `Name;garbage` is no longer sent to the server. Lines that do
// not yield a title and a valid URL are ignored.
const handleBulkImport = async (e: SubmitEvent) => {
  e.preventDefault();

  const rawText = bulkText();
  if (!rawText.trim()) return;

  setImportError(null);

  const validSources: { title: string; url: string }[] = [];
  for (const line of rawText.split('\n')) {
    const separatorAt = line.indexOf(';');
    if (separatorAt === -1) continue; // blank line or no separator

    const title = line.slice(0, separatorAt).trim();
    // normalizeUrl trims, and returns '' for an empty remainder, which
    // isValidUrl then rejects.
    const url = normalizeUrl(line.slice(separatorAt + 1));
    if (title && isValidUrl(url)) {
      validSources.push({ title, url });
    }
  }

  if (validSources.length === 0) {
    setImportError(t('sources.bulkImportError'));
    return;
  }

  // Only flag the form as busy while the request is actually in flight.
  setImporting(true);
  try {
    await sourcesApi.bulkImport({ sources: validSources });
    setBulkText('');
    await fetchSources();
  } catch (err) {
    // Prefer the API's own message when the error is an API error.
    setImportError(
      isApiError(err) ? err.message : t('sources.bulkImportError'),
    );
  } finally {
    setImporting(false);
  }
};
// ---- Render ----
return (
<Show when={!loading()} fallback={<LoadingSpinner />}>
<div class="max-w-4xl mx-auto px-4 sm:px-6 lg:px-8 py-8">
{/* Page header */}
<div class="mb-8">
<h1 class="text-3xl font-bold text-gray-900">
{t('sources.title')}
</h1>
<p class="mt-2 text-sm text-gray-500">
{t('sources.subtitle')}
</p>
</div>
{/* Section 1: Add a source */}
<div class="bg-white shadow sm:rounded-lg mb-8">
<div class="px-4 py-5 sm:p-6">
<h3 class="text-lg leading-6 font-medium text-gray-900 mb-4">
{t('sources.addTitle')}
</h3>
<form
onSubmit={handleAddSource}
class="space-y-4 sm:flex sm:space-y-0 sm:space-x-4"
>
<div class="flex-1">
<label for="source-title" class="sr-only">
{t('sources.titleLabel')}
</label>
<input
type="text"
id="source-title"
class="shadow-sm focus:ring-indigo-500 focus:border-indigo-500 block w-full sm:text-sm border-gray-300 rounded-md p-2 border"
placeholder={t('sources.titlePlaceholder')}
value={newTitle()}
onInput={(e) => setNewTitle(e.currentTarget.value)}
/>
</div>
<div class="flex-1">
<label for="source-url" class="sr-only">
{t('sources.urlLabel')}
</label>
<input
type="text"
id="source-url"
class="shadow-sm focus:ring-indigo-500 focus:border-indigo-500 block w-full sm:text-sm border-gray-300 rounded-md p-2 border"
placeholder={t('sources.urlPlaceholder')}
value={newUrl()}
onInput={(e) => setNewUrl(e.currentTarget.value)}
/>
</div>
<button
type="submit"
disabled={adding()}
class="inline-flex items-center justify-center px-4 py-2 border border-transparent shadow-sm text-sm font-medium rounded-md text-white bg-indigo-600 hover:bg-indigo-700 focus:outline-none focus:ring-2 focus:ring-offset-2 focus:ring-indigo-500 disabled:opacity-50"
>
<Show
when={!adding()}
fallback={
<div class="animate-spin rounded-full h-4 w-4 border-b-2 border-white mr-2" />
}
>
<Plus class="-ml-1 mr-2 h-5 w-5" />
</Show>
{t('sources.add')}
</button>
</form>
<Show when={addError()}>
{(msg) => (
<p class="mt-2 text-sm text-red-600">{msg()}</p>
)}
</Show>
</div>
</div>
{/* Section 2: CSV Import / Export */}
<div class="bg-white shadow sm:rounded-lg mb-8">
<div class="px-4 py-5 sm:p-6">
<h3 class="text-lg leading-6 font-medium text-gray-900 mb-4">
{t('sources.csvSection')}
</h3>
<p class="text-sm text-gray-500 mb-4">
{t('sources.csvDescription')}
</p>
<div class="flex space-x-4">
<button
onClick={handleExportCsv}
class="inline-flex items-center px-4 py-2 border border-gray-300 shadow-sm text-sm font-medium rounded-md text-gray-700 bg-white hover:bg-gray-50 focus:outline-none focus:ring-2 focus:ring-offset-2 focus:ring-indigo-500"
>
<Download class="h-4 w-4 mr-2" />
{t('sources.exportCsv')}
</button>
<label class="inline-flex items-center px-4 py-2 border border-gray-300 shadow-sm text-sm font-medium rounded-md text-gray-700 bg-white hover:bg-gray-50 focus:outline-none focus:ring-2 focus:ring-offset-2 focus:ring-indigo-500 cursor-pointer">
<Upload class="h-4 w-4 mr-2" />
{t('sources.importCsv')}
<input
ref={fileInputRef}
type="file"
class="hidden"
accept=".csv"
onChange={handleImportCsv}
disabled={importing()}
/>
</label>
</div>
<Show when={csvError()}>
{(msg) => (
<p class="mt-2 text-sm text-red-600">{msg()}</p>
)}
</Show>
</div>
</div>
{/* Section 3: Bulk Import */}
<div class="bg-white shadow sm:rounded-lg mb-8">
<div class="px-4 py-5 sm:p-6">
<h3 class="text-lg leading-6 font-medium text-gray-900 mb-4">
{t('sources.bulkSection')}
</h3>
<p class="text-sm text-gray-500 mb-4">
{t('sources.bulkDescription')}{' '}
<strong>{t('sources.bulkFormat')}</strong>
</p>
<form onSubmit={handleBulkImport} class="space-y-4">
<div>
<label for="bulk-import" class="sr-only">
{t('sources.bulkSection')}
</label>
<textarea
id="bulk-import"
rows={5}
class="shadow-sm focus:ring-indigo-500 focus:border-indigo-500 block w-full sm:text-sm border-gray-300 rounded-md p-2 border"
placeholder={t('sources.bulkPlaceholder')}
value={bulkText()}
onInput={(e) => setBulkText(e.currentTarget.value)}
/>
</div>
<Show when={importError()}>
{(msg) => (
<p class="text-sm text-red-600">{msg()}</p>
)}
</Show>
<button
type="submit"
disabled={importing() || !bulkText().trim()}
class="inline-flex items-center justify-center px-4 py-2 border border-transparent shadow-sm text-sm font-medium rounded-md text-white bg-indigo-600 hover:bg-indigo-700 focus:outline-none focus:ring-2 focus:ring-offset-2 focus:ring-indigo-500 disabled:opacity-50"
>
{importing()
? t('sources.importing')
: t('sources.bulkImport')}
</button>
</form>
</div>
</div>
{/* Section 4: Source list */}
<div class="bg-white shadow overflow-hidden sm:rounded-md">
<ul class="divide-y divide-gray-200">
<Show
when={sources().length > 0}
fallback={
<li class="px-4 py-8 text-center text-gray-500">
<p>{t('sources.empty')}</p>
<p class="mt-1 text-xs">{t('sources.emptyHint')}</p>
</li>
}
>
<For each={sources()}>
{(source) => (
<li>
<div class="px-4 py-4 flex items-center sm:px-6">
<div class="min-w-0 flex-1 sm:flex sm:items-center sm:justify-between">
<div class="truncate">
<div class="flex text-sm">
<p class="font-medium text-indigo-600 truncate">
{source.title}
</p>
</div>
<div class="mt-2 flex">
<div class="flex items-center text-sm text-gray-500">
<LinkIcon class="flex-shrink-0 mr-1.5 h-4 w-4 text-gray-400" />
<a
href={source.url}
target="_blank"
rel="noopener noreferrer"
class="truncate hover:underline"
>
{source.url}
</a>
</div>
</div>
</div>
</div>
<div class="ml-5 flex-shrink-0">
<button
onClick={() => handleDeleteClick(source.id)}
class={`p-2 transition-colors ${
confirmingDeleteId() === source.id
? 'text-red-600 bg-red-50 rounded-md'
: 'text-gray-400 hover:text-red-600'
}`}
title={
confirmingDeleteId() === source.id
? t('sources.confirmDelete')
: t('sources.deleteTitle')
}
>
<Show
when={confirmingDeleteId() === source.id}
fallback={<Trash2 class="h-5 w-5" />}
>
<span class="text-xs font-medium">
{t('sources.confirmDelete')}
</span>
</Show>
</button>
</div>
</div>
</li>
)}
</For>
</Show>
</ul>
</div>
</div>
</Show>
);
};
export default Sources;

@ -64,6 +64,30 @@ export const DEFAULT_SETTINGS: UserSettings = {
], ],
}; };
// ---- Sources ----
/**
 * A custom source as returned by the sources API.
 */
export interface Source {
// Identifier of the source.
id: string;
// NOTE(review): presumably the id of the owning user — confirm against the backend model.
user_id: string;
// Display name of the source.
title: string;
// Address of the source.
url: string;
// NOTE(review): presumably a serialized creation timestamp — confirm the exact format.
created_at: string;
}
/**
 * Payload for creating a single source.
 */
export interface CreateSourceRequest {
// Display name of the source.
title: string;
// Address of the source.
url: string;
}
/**
 * Payload for submitting several sources in one request.
 */
export interface BulkImportRequest {
// The sources to create, each in the same shape as a single create request.
sources: CreateSourceRequest[];
}
/**
 * Result of a bulk or CSV import.
 */
export interface BulkImportResponse {
// NOTE(review): presumably the number of sources that were created — confirm.
imported: number;
// NOTE(review): presumably the number of entries that were not imported — confirm.
skipped: number;
}
// ---- API Error ---- // ---- API Error ----
export interface ApiError { export interface ApiError {

Loading…
Cancel
Save