|
|
|
@ -236,7 +236,7 @@ pub async fn run_generation_inner(
|
|
|
|
let mut pending_traces: Vec<db::article_history::ArticleHistoryEntry> = Vec::new();
|
|
|
|
let mut pending_traces: Vec<db::article_history::ArticleHistoryEntry> = Vec::new();
|
|
|
|
|
|
|
|
|
|
|
|
// === INITIALIZATION ===
|
|
|
|
// === INITIALIZATION ===
|
|
|
|
emit_progress(tx, "settings", "Chargement des parametres...", 5);
|
|
|
|
emit_progress(tx, "sources", "Chargement des parametres...", 5);
|
|
|
|
let settings = db::settings::get_or_create_default(&state.pool, user_id).await?;
|
|
|
|
let settings = db::settings::get_or_create_default(&state.pool, user_id).await?;
|
|
|
|
|
|
|
|
|
|
|
|
if settings.article_history_days > 0 {
|
|
|
|
if settings.article_history_days > 0 {
|
|
|
|
@ -255,7 +255,7 @@ pub async fn run_generation_inner(
|
|
|
|
emit_progress(tx, "sources", "Chargement des sources...", 10);
|
|
|
|
emit_progress(tx, "sources", "Chargement des sources...", 10);
|
|
|
|
let sources = db::sources::list_for_user(&state.pool, user_id).await?;
|
|
|
|
let sources = db::sources::list_for_user(&state.pool, user_id).await?;
|
|
|
|
|
|
|
|
|
|
|
|
emit_progress(tx, "provider", "Configuration du fournisseur IA...", 12);
|
|
|
|
emit_progress(tx, "sources", "Configuration du fournisseur IA...", 12);
|
|
|
|
let (provider_name, provider) = if let Some(mock_provider) = provider_override {
|
|
|
|
let (provider_name, provider) = if let Some(mock_provider) = provider_override {
|
|
|
|
("mock".to_string(), mock_provider)
|
|
|
|
("mock".to_string(), mock_provider)
|
|
|
|
} else {
|
|
|
|
} else {
|
|
|
|
@ -287,7 +287,7 @@ pub async fn run_generation_inner(
|
|
|
|
|
|
|
|
|
|
|
|
// === PHASE 1: Personalized Sources ===
|
|
|
|
// === PHASE 1: Personalized Sources ===
|
|
|
|
if !sources.is_empty() {
|
|
|
|
if !sources.is_empty() {
|
|
|
|
emit_progress(tx, "sources_scrape", "Analyse des sources personnalisees...", 15);
|
|
|
|
emit_progress(tx, "sources", "Analyse des sources personnalisees...", 15);
|
|
|
|
|
|
|
|
|
|
|
|
let last_source = db::article_history::get_last_source_url(&state.pool, user_id).await.unwrap_or(None);
|
|
|
|
let last_source = db::article_history::get_last_source_url(&state.pool, user_id).await.unwrap_or(None);
|
|
|
|
let rotated_sources = rotate_sources(sources.clone(), last_source.as_deref());
|
|
|
|
let rotated_sources = rotate_sources(sources.clone(), last_source.as_deref());
|
|
|
|
@ -302,9 +302,11 @@ pub async fn run_generation_inner(
|
|
|
|
let total_waves = source_chunks.len();
|
|
|
|
let total_waves = source_chunks.len();
|
|
|
|
|
|
|
|
|
|
|
|
'wave_loop: for (wave_idx, wave_sources) in source_chunks.iter().enumerate() {
|
|
|
|
'wave_loop: for (wave_idx, wave_sources) in source_chunks.iter().enumerate() {
|
|
|
|
emit_progress(tx, "sources_scrape",
|
|
|
|
let articles_so_far: usize = article_scraped.values().map(|v| v.len()).sum();
|
|
|
|
&format!("Extraction des sources (vague {}/{})", wave_idx + 1, total_waves),
|
|
|
|
let pct = 5 + ((articles_so_far as u32 * 60) / max_total.max(1) as u32).min(60);
|
|
|
|
15 + ((wave_idx as u32 * 10) / total_waves.max(1) as u32).min(10) as u8);
|
|
|
|
emit_progress(tx, "sources",
|
|
|
|
|
|
|
|
&format!("Vague {}/{} : extraction des sources...", wave_idx + 1, total_waves),
|
|
|
|
|
|
|
|
pct as u8);
|
|
|
|
|
|
|
|
|
|
|
|
// 1a. Extract links from this wave's sources (all in parallel)
|
|
|
|
// 1a. Extract links from this wave's sources (all in parallel)
|
|
|
|
let mut wave_urls: Vec<(String, String)> = Vec::new();
|
|
|
|
let mut wave_urls: Vec<(String, String)> = Vec::new();
|
|
|
|
@ -407,8 +409,9 @@ pub async fn run_generation_inner(
|
|
|
|
break;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
let pct = 25 + ((processed as u32 * 40) / total_candidates.max(1) as u32).min(40);
|
|
|
|
let articles_so_far: usize = article_scraped.values().map(|v| v.len()).sum();
|
|
|
|
emit_progress(tx, "processing", &format!("Articles {}-{}/{}...", processed + 1, processed + batch.len(), total_candidates), pct as u8);
|
|
|
|
let pct = 5 + ((articles_so_far as u32 * 60) / max_total.max(1) as u32).min(60);
|
|
|
|
|
|
|
|
emit_progress(tx, "sources", &format!("Vague {}/{} : articles {}/{}...", wave_idx + 1, total_waves, processed + 1, total_candidates), pct as u8);
|
|
|
|
|
|
|
|
|
|
|
|
// Phase A: Scrape batch in parallel
|
|
|
|
// Phase A: Scrape batch in parallel
|
|
|
|
let mut scrape_set = tokio::task::JoinSet::new();
|
|
|
|
let mut scrape_set = tokio::task::JoinSet::new();
|
|
|
|
@ -575,7 +578,7 @@ pub async fn run_generation_inner(
|
|
|
|
if !category_gaps.is_empty() {
|
|
|
|
if !category_gaps.is_empty() {
|
|
|
|
if settings.use_brave_search {
|
|
|
|
if settings.use_brave_search {
|
|
|
|
// === BRAVE SEARCH PATH ===
|
|
|
|
// === BRAVE SEARCH PATH ===
|
|
|
|
emit_progress(tx, "search", "Recherche Brave Search...", 70);
|
|
|
|
emit_progress(tx, "websearch", "Recherche Brave Search...", 70);
|
|
|
|
|
|
|
|
|
|
|
|
let brave_key = resolve_brave_key(state, user_id).await?;
|
|
|
|
let brave_key = resolve_brave_key(state, user_id).await?;
|
|
|
|
let query = format!("{} actualites", settings.theme);
|
|
|
|
let query = format!("{} actualites", settings.theme);
|
|
|
|
@ -614,7 +617,7 @@ pub async fn run_generation_inner(
|
|
|
|
|
|
|
|
|
|
|
|
// Scrape + classify in batches (same as Phase 1)
|
|
|
|
// Scrape + classify in batches (same as Phase 1)
|
|
|
|
if !brave_urls.is_empty() {
|
|
|
|
if !brave_urls.is_empty() {
|
|
|
|
emit_progress(tx, "processing", "Traitement des articles Brave...", 75);
|
|
|
|
emit_progress(tx, "websearch", "Traitement des articles Brave...", 75);
|
|
|
|
let total_candidates = brave_urls.len();
|
|
|
|
let total_candidates = brave_urls.len();
|
|
|
|
let batch_size = settings.batch_size.max(1) as usize;
|
|
|
|
let batch_size = settings.batch_size.max(1) as usize;
|
|
|
|
let mut processed = 0usize;
|
|
|
|
let mut processed = 0usize;
|
|
|
|
@ -630,8 +633,8 @@ pub async fn run_generation_inner(
|
|
|
|
|
|
|
|
|
|
|
|
if batch.is_empty() { break; }
|
|
|
|
if batch.is_empty() { break; }
|
|
|
|
|
|
|
|
|
|
|
|
let pct = 75 + ((processed as u32 * 15) / total_candidates.max(1) as u32).min(15);
|
|
|
|
let pct = 75 + ((processed as u32 * 10) / total_candidates.max(1) as u32).min(10);
|
|
|
|
emit_progress(tx, "processing", &format!("Articles Brave {}-{}/{}...", processed + 1, processed + batch.len(), total_candidates), pct as u8);
|
|
|
|
emit_progress(tx, "websearch", &format!("Verification des sources {}/{}...", processed + 1, total_candidates), pct as u8);
|
|
|
|
|
|
|
|
|
|
|
|
// Scrape batch in parallel
|
|
|
|
// Scrape batch in parallel
|
|
|
|
let mut scrape_set = tokio::task::JoinSet::new();
|
|
|
|
let mut scrape_set = tokio::task::JoinSet::new();
|
|
|
|
@ -782,7 +785,7 @@ pub async fn run_generation_inner(
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
} else {
|
|
|
|
// === EXISTING LLM SEARCH PATH ===
|
|
|
|
// === EXISTING LLM SEARCH PATH ===
|
|
|
|
emit_progress(tx, "search", "Recherche d'actualites complementaires...", 70);
|
|
|
|
emit_progress(tx, "websearch", "Recherche d'actualites...", 70);
|
|
|
|
check_rate_limit(state, &user_rate_limiter, &provider_name).await?;
|
|
|
|
check_rate_limit(state, &user_rate_limiter, &provider_name).await?;
|
|
|
|
|
|
|
|
|
|
|
|
let search_schema = crate::services::llm::schema::build_category_schema(&user_categories, settings.max_items_per_category);
|
|
|
|
let search_schema = crate::services::llm::schema::build_category_schema(&user_categories, settings.max_items_per_category);
|
|
|
|
@ -794,7 +797,7 @@ pub async fn run_generation_inner(
|
|
|
|
let llm_duration = llm_start.elapsed().as_millis() as u64;
|
|
|
|
let llm_duration = llm_start.elapsed().as_millis() as u64;
|
|
|
|
log_llm_call(&state.pool, user_id, job_id, "search", &model_websearch, &sys_prompt, &usr_prompt, &raw_results, llm_duration, None).await;
|
|
|
|
log_llm_call(&state.pool, user_id, job_id, "search", &model_websearch, &sys_prompt, &usr_prompt, &raw_results, llm_duration, None).await;
|
|
|
|
|
|
|
|
|
|
|
|
emit_progress(tx, "parsing", "Analyse des resultats...", 75);
|
|
|
|
emit_progress(tx, "websearch", "Analyse des resultats...", 75);
|
|
|
|
let parsed = parse_llm_output(&raw_results, &user_categories)?;
|
|
|
|
let parsed = parse_llm_output(&raw_results, &user_categories)?;
|
|
|
|
|
|
|
|
|
|
|
|
// Filter and validate Phase 2 articles
|
|
|
|
// Filter and validate Phase 2 articles
|
|
|
|
@ -827,7 +830,7 @@ pub async fn run_generation_inner(
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// Scrape Phase 2 for validation
|
|
|
|
// Scrape Phase 2 for validation
|
|
|
|
emit_progress(tx, "scraping", "Verification des sources web...", 80);
|
|
|
|
emit_progress(tx, "websearch", "Verification des sources...", 80);
|
|
|
|
for (cat_key, item) in phase2_items {
|
|
|
|
for (cat_key, item) in phase2_items {
|
|
|
|
let (_body_text, _, final_url, drop_reason) = scrape_single_article(&state.http_client, &item.url, settings.max_age_days as i64).await;
|
|
|
|
let (_body_text, _, final_url, drop_reason) = scrape_single_article(&state.http_client, &item.url, settings.max_age_days as i64).await;
|
|
|
|
|
|
|
|
|
|
|
|
|