feat: add batch_size setting for Phase 1 parallelism

Add a user-configurable batch_size setting (default 5, range 1-20)
that controls how many articles are processed in parallel during
Phase 1 scrape+classify. Previously hardcoded to 5.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
master
oabrivard 3 months ago
parent a5e6cf2ac0
commit 4c6381b09a

@ -117,7 +117,7 @@ cd frontend && npx tsc --noEmit
- `GET /api/v1/admin/users` — user list - `GET /api/v1/admin/users` — user list
- `PUT /api/v1/admin/users/:id/role` — role management - `PUT /api/v1/admin/users/:id/role` — role management
## Database (19 migrations) ## Database (20 migrations)
Tables: `users`, `sessions`, `magic_link_tokens`, `user_settings`, `sources`, `syntheses`, `admin_providers`, `admin_rate_limits`, `user_api_keys`, `audit_log` Tables: `users`, `sessions`, `magic_link_tokens`, `user_settings`, `sources`, `syntheses`, `admin_providers`, `admin_rate_limits`, `user_api_keys`, `audit_log`
## Environment Variables ## Environment Variables

@ -0,0 +1,2 @@
-- Add batch_size column to settings: the number of articles processed in
-- parallel during Phase 1 scrape+classify.
-- NOT NULL DEFAULT 5 gives every existing row the value that was previously
-- hardcoded in the generation pipeline, so behavior is unchanged until a user
-- edits the setting.
-- NOTE(review): the accepted range (1-20) is enforced by request validation in
-- the application only; this column carries no CHECK constraint.
ALTER TABLE settings ADD COLUMN batch_size INTEGER NOT NULL DEFAULT 5;

@ -20,6 +20,7 @@ struct SettingsRow {
max_articles_per_source: i32, max_articles_per_source: i32,
use_llm_for_source_links: bool, use_llm_for_source_links: bool,
article_history_days: i32, article_history_days: i32,
batch_size: i32,
search_agent_behavior: String, search_agent_behavior: String,
ai_provider: String, ai_provider: String,
ai_model: String, ai_model: String,
@ -46,6 +47,7 @@ impl TryFrom<SettingsRow> for UserSettings {
max_articles_per_source: row.max_articles_per_source, max_articles_per_source: row.max_articles_per_source,
use_llm_for_source_links: row.use_llm_for_source_links, use_llm_for_source_links: row.use_llm_for_source_links,
article_history_days: row.article_history_days, article_history_days: row.article_history_days,
batch_size: row.batch_size,
search_agent_behavior: row.search_agent_behavior, search_agent_behavior: row.search_agent_behavior,
ai_provider: row.ai_provider, ai_provider: row.ai_provider,
ai_model: row.ai_model, ai_model: row.ai_model,
@ -72,10 +74,10 @@ pub async fn get_or_create_default(
let row = sqlx::query_as::<_, SettingsRow>( let row = sqlx::query_as::<_, SettingsRow>(
r#" r#"
INSERT INTO settings (user_id, theme, max_age_days, categories, max_items_per_category, search_agent_behavior, ai_provider, ai_model, ai_model_websearch, rate_limit_max_requests, rate_limit_time_window_seconds, max_articles_per_source, use_llm_for_source_links, article_history_days) INSERT INTO settings (user_id, theme, max_age_days, categories, max_items_per_category, search_agent_behavior, ai_provider, ai_model, ai_model_websearch, rate_limit_max_requests, rate_limit_time_window_seconds, max_articles_per_source, use_llm_for_source_links, article_history_days, batch_size)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15)
ON CONFLICT (user_id) DO UPDATE SET user_id = settings.user_id ON CONFLICT (user_id) DO UPDATE SET user_id = settings.user_id
RETURNING user_id, theme, max_age_days, categories, max_items_per_category, search_agent_behavior, ai_provider, ai_model, ai_model_websearch, rate_limit_max_requests, rate_limit_time_window_seconds, max_articles_per_source, use_llm_for_source_links, article_history_days, updated_at RETURNING user_id, theme, max_age_days, categories, max_items_per_category, search_agent_behavior, ai_provider, ai_model, ai_model_websearch, rate_limit_max_requests, rate_limit_time_window_seconds, max_articles_per_source, use_llm_for_source_links, article_history_days, batch_size, updated_at
"#, "#,
) )
.bind(user_id) .bind(user_id)
@ -92,6 +94,7 @@ pub async fn get_or_create_default(
.bind(defaults.max_articles_per_source) .bind(defaults.max_articles_per_source)
.bind(defaults.use_llm_for_source_links) .bind(defaults.use_llm_for_source_links)
.bind(defaults.article_history_days) .bind(defaults.article_history_days)
.bind(defaults.batch_size)
.fetch_one(pool) .fetch_one(pool)
.await?; .await?;
@ -110,8 +113,8 @@ pub async fn upsert(
let row = sqlx::query_as::<_, SettingsRow>( let row = sqlx::query_as::<_, SettingsRow>(
r#" r#"
INSERT INTO settings (user_id, theme, max_age_days, categories, max_items_per_category, search_agent_behavior, ai_provider, ai_model, ai_model_websearch, rate_limit_max_requests, rate_limit_time_window_seconds, max_articles_per_source, use_llm_for_source_links, article_history_days) INSERT INTO settings (user_id, theme, max_age_days, categories, max_items_per_category, search_agent_behavior, ai_provider, ai_model, ai_model_websearch, rate_limit_max_requests, rate_limit_time_window_seconds, max_articles_per_source, use_llm_for_source_links, article_history_days, batch_size)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15)
ON CONFLICT (user_id) DO UPDATE SET ON CONFLICT (user_id) DO UPDATE SET
theme = EXCLUDED.theme, theme = EXCLUDED.theme,
max_age_days = EXCLUDED.max_age_days, max_age_days = EXCLUDED.max_age_days,
@ -126,8 +129,9 @@ pub async fn upsert(
max_articles_per_source = EXCLUDED.max_articles_per_source, max_articles_per_source = EXCLUDED.max_articles_per_source,
use_llm_for_source_links = EXCLUDED.use_llm_for_source_links, use_llm_for_source_links = EXCLUDED.use_llm_for_source_links,
article_history_days = EXCLUDED.article_history_days, article_history_days = EXCLUDED.article_history_days,
batch_size = EXCLUDED.batch_size,
updated_at = now() updated_at = now()
RETURNING user_id, theme, max_age_days, categories, max_items_per_category, search_agent_behavior, ai_provider, ai_model, ai_model_websearch, rate_limit_max_requests, rate_limit_time_window_seconds, max_articles_per_source, use_llm_for_source_links, article_history_days, updated_at RETURNING user_id, theme, max_age_days, categories, max_items_per_category, search_agent_behavior, ai_provider, ai_model, ai_model_websearch, rate_limit_max_requests, rate_limit_time_window_seconds, max_articles_per_source, use_llm_for_source_links, article_history_days, batch_size, updated_at
"#, "#,
) )
.bind(user_id) .bind(user_id)
@ -144,6 +148,7 @@ pub async fn upsert(
.bind(req.max_articles_per_source) .bind(req.max_articles_per_source)
.bind(req.use_llm_for_source_links) .bind(req.use_llm_for_source_links)
.bind(req.article_history_days) .bind(req.article_history_days)
.bind(req.batch_size)
.fetch_one(pool) .fetch_one(pool)
.await?; .await?;

@ -15,6 +15,7 @@ pub struct UserSettings {
pub max_articles_per_source: i32, pub max_articles_per_source: i32,
pub use_llm_for_source_links: bool, pub use_llm_for_source_links: bool,
pub article_history_days: i32, pub article_history_days: i32,
pub batch_size: i32,
pub search_agent_behavior: String, pub search_agent_behavior: String,
pub ai_provider: String, pub ai_provider: String,
pub ai_model: String, pub ai_model: String,
@ -34,6 +35,7 @@ pub struct SettingsResponse {
pub max_articles_per_source: i32, pub max_articles_per_source: i32,
pub use_llm_for_source_links: bool, pub use_llm_for_source_links: bool,
pub article_history_days: i32, pub article_history_days: i32,
pub batch_size: i32,
pub search_agent_behavior: String, pub search_agent_behavior: String,
pub ai_provider: String, pub ai_provider: String,
pub ai_model: String, pub ai_model: String,
@ -52,6 +54,7 @@ impl From<UserSettings> for SettingsResponse {
max_articles_per_source: s.max_articles_per_source, max_articles_per_source: s.max_articles_per_source,
use_llm_for_source_links: s.use_llm_for_source_links, use_llm_for_source_links: s.use_llm_for_source_links,
article_history_days: s.article_history_days, article_history_days: s.article_history_days,
batch_size: s.batch_size,
search_agent_behavior: s.search_agent_behavior, search_agent_behavior: s.search_agent_behavior,
ai_provider: s.ai_provider, ai_provider: s.ai_provider,
ai_model: s.ai_model, ai_model: s.ai_model,
@ -72,6 +75,7 @@ pub struct UpdateSettingsRequest {
pub max_articles_per_source: i32, pub max_articles_per_source: i32,
pub use_llm_for_source_links: bool, pub use_llm_for_source_links: bool,
pub article_history_days: i32, pub article_history_days: i32,
pub batch_size: i32,
pub search_agent_behavior: String, pub search_agent_behavior: String,
pub ai_provider: String, pub ai_provider: String,
pub ai_model: String, pub ai_model: String,
@ -121,6 +125,9 @@ impl UpdateSettingsRequest {
if !(0..=365).contains(&self.article_history_days) { if !(0..=365).contains(&self.article_history_days) {
return Err("article_history_days must be between 0 and 365".into()); return Err("article_history_days must be between 0 and 365".into());
} }
if !(1..=20).contains(&self.batch_size) {
return Err("batch_size must be between 1 and 20".into());
}
if self.search_agent_behavior.len() > 2000 { if self.search_agent_behavior.len() > 2000 {
return Err("search_agent_behavior must be at most 2000 characters".into()); return Err("search_agent_behavior must be at most 2000 characters".into());
} }
@ -165,6 +172,7 @@ impl Default for UserSettings {
max_articles_per_source: 3, max_articles_per_source: 3,
use_llm_for_source_links: false, use_llm_for_source_links: false,
article_history_days: 90, article_history_days: 90,
batch_size: 5,
search_agent_behavior: String::new(), search_agent_behavior: String::new(),
ai_provider: String::new(), ai_provider: String::new(),
ai_model: String::new(), ai_model: String::new(),
@ -190,6 +198,7 @@ mod tests {
max_articles_per_source: 3, max_articles_per_source: 3,
use_llm_for_source_links: false, use_llm_for_source_links: false,
article_history_days: 90, article_history_days: 90,
batch_size: 5,
search_agent_behavior: String::new(), search_agent_behavior: String::new(),
ai_provider: String::new(), ai_provider: String::new(),
ai_model: String::new(), ai_model: String::new(),
@ -385,6 +394,26 @@ mod tests {
assert!(err.contains("ai_model")); assert!(err.contains("ai_model"));
} }
/// Zero is below the smallest batch size `validate()` accepts, so the request
/// must be rejected with an error that names the `batch_size` field.
#[test]
fn test_validate_batch_size_below_range() {
    // Start from a known-good request and break only the field under test.
    let mut request = valid_request();
    request.batch_size = 0;

    let message = request.validate().unwrap_err();

    assert!(message.contains("batch_size"));
}
/// Twenty-one is above the largest batch size `validate()` accepts, so the
/// request must be rejected with an error that names the `batch_size` field.
#[test]
fn test_validate_batch_size_above_range() {
    // Start from a known-good request and break only the field under test.
    let mut request = valid_request();
    request.batch_size = 21;

    let message = request.validate().unwrap_err();

    assert!(message.contains("batch_size"));
}
#[test] #[test]
fn test_validate_ai_model_websearch_too_long_rejected() { fn test_validate_ai_model_websearch_too_long_rejected() {
let req = UpdateSettingsRequest { let req = UpdateSettingsRequest {

@ -199,6 +199,7 @@ mod tests {
max_articles_per_source: 3, max_articles_per_source: 3,
use_llm_for_source_links: false, use_llm_for_source_links: false,
article_history_days: 90, article_history_days: 90,
batch_size: 5,
search_agent_behavior: String::new(), search_agent_behavior: String::new(),
ai_provider: String::new(), ai_provider: String::new(),
ai_model: String::new(), ai_model: String::new(),

@ -398,7 +398,7 @@ async fn run_generation_inner(
// 1b. Scrape, classify, summarize in batches of 5 // 1b. Scrape, classify, summarize in batches of `settings.batch_size` (user-configurable, default 5)
emit_progress(tx, "processing", "Traitement des articles...", 25); emit_progress(tx, "processing", "Traitement des articles...", 25);
let total_candidates = candidate_urls.len(); let total_candidates = candidate_urls.len();
let batch_size = 5; let batch_size = settings.batch_size.max(1) as usize;
let mut processed = 0usize; let mut processed = 0usize;
let mut candidates_iter = candidate_urls.into_iter(); let mut candidates_iter = candidate_urls.into_iter();
let mut done = false; let mut done = false;

@ -633,7 +633,8 @@ async fn generate_pipeline_resolves_model_from_admin_config() {
"ai_model_websearch": "", "ai_model_websearch": "",
"use_llm_for_source_links": false, "use_llm_for_source_links": false,
"use_llm_for_article_extraction": false, "use_llm_for_article_extraction": false,
"article_history_days": 90 "article_history_days": 90,
"batch_size": 5
}); });
let (settings_status, _) = app let (settings_status, _) = app
.put_with_session("/api/v1/settings", &settings, &session) .put_with_session("/api/v1/settings", &settings, &session)

@ -143,6 +143,7 @@ test.describe('Live generation with OpenAI', () => {
ai_model_websearch: 'gpt-4o-mini', ai_model_websearch: 'gpt-4o-mini',
use_llm_for_source_links: false, use_llm_for_source_links: false,
article_history_days: 90, article_history_days: 90,
batch_size: 5,
}); });
expect(settingsResp.status).toBe(200); expect(settingsResp.status).toBe(200);

@ -153,6 +153,8 @@ const fr = {
'settings.advancedExtraction': 'Extraction avancee', 'settings.advancedExtraction': 'Extraction avancee',
'settings.useLlmForSourceLinks': "Utiliser l'IA pour extraire les liens", 'settings.useLlmForSourceLinks': "Utiliser l'IA pour extraire les liens",
'settings.articleHistoryDays': 'Historique des articles (jours)', 'settings.articleHistoryDays': 'Historique des articles (jours)',
'settings.batchSize': 'Taille du lot de traitement',
'settings.batchSizeHelp': 'Nombre d\'articles traites en parallele lors de la generation (defaut: 5).',
'settings.export': 'Exporter', 'settings.export': 'Exporter',
'settings.import': 'Importer', 'settings.import': 'Importer',
'settings.exportIncludeKeys': 'Inclure les cles API', 'settings.exportIncludeKeys': 'Inclure les cles API',

@ -458,6 +458,33 @@ const Settings: Component = () => {
</A> </A>
</div> </div>
</div> </div>
<div>
<label
for="batchSize"
class="block text-sm font-medium text-gray-700"
>
{t('settings.batchSize')}
</label>
<p class="text-xs text-gray-500 mb-1">{t('settings.batchSizeHelp')}</p>
<div class="mt-1">
<input
type="number"
id="batchSize"
min="1"
max="20"
class="shadow-sm focus:ring-indigo-500 focus:border-indigo-500 block w-full sm:text-sm border-gray-300 rounded-md py-2 px-3 border"
value={settings().batch_size}
onInput={(e) =>
setSettings((prev) => ({
...prev,
batch_size:
parseInt(e.currentTarget.value) || 5,
}))
}
/>
</div>
</div>
</div> </div>
{/* Advanced extraction */} {/* Advanced extraction */}

@ -46,6 +46,7 @@ export interface UserSettings {
max_articles_per_source: number; max_articles_per_source: number;
use_llm_for_source_links: boolean; use_llm_for_source_links: boolean;
article_history_days: number; article_history_days: number;
batch_size: number;
search_agent_behavior: string; search_agent_behavior: string;
ai_model: string; ai_model: string;
ai_model_websearch: string; ai_model_websearch: string;
@ -62,6 +63,7 @@ export const DEFAULT_SETTINGS: UserSettings = {
max_articles_per_source: 3, max_articles_per_source: 3,
use_llm_for_source_links: false, use_llm_for_source_links: false,
article_history_days: 90, article_history_days: 90,
batch_size: 5,
search_agent_behavior: search_agent_behavior:
"Tu peux egalement utiliser d'autres sources pertinentes trouvees via la recherche Google.", "Tu peux egalement utiliser d'autres sources pertinentes trouvees via la recherche Google.",
ai_model: '', ai_model: '',

Loading…
Cancel
Save