feat: restructure Phase 1 into windowed source extraction waves

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
master
oabrivard 3 months ago
parent 0f1b0306e4
commit 37d17e577a

@ -292,17 +292,25 @@ pub async fn run_generation_inner(
let last_source = db::article_history::get_last_source_url(&state.pool, user_id).await.unwrap_or(None);
let rotated_sources = rotate_sources(sources.clone(), last_source.as_deref());
let max_links = 15usize;
let window_size = settings.source_extraction_window.max(1) as usize;
// 1a. Extract article links from source pages (parallel, max 5 concurrent)
let mut candidate_urls: Vec<(String, String)> = Vec::new();
// Process sources in waves of `window_size`
let source_chunks: Vec<Vec<&crate::models::source::Source>> = rotated_sources
.chunks(window_size)
.map(|chunk| chunk.iter().collect())
.collect();
let total_waves = source_chunks.len();
'wave_loop: for (wave_idx, wave_sources) in source_chunks.iter().enumerate() {
emit_progress(tx, "sources_scrape",
&format!("Extraction des sources (vague {}/{})", wave_idx + 1, total_waves),
15 + ((wave_idx as u32 * 10) / total_waves.max(1) as u32).min(10) as u8);
// 1a. Extract links from this wave's sources (all in parallel)
let mut wave_urls: Vec<(String, String)> = Vec::new();
{
let mut join_set = tokio::task::JoinSet::new();
let mut pending = rotated_sources.iter().peekable();
let max_concurrent = 5;
// Seed initial tasks
for _ in 0..max_concurrent {
if let Some(source) = pending.next() {
for source in wave_sources {
let client = state.http_client.clone();
let source_url = source.url.clone();
let source_title = source.title.clone();
@ -327,7 +335,6 @@ pub async fn run_generation_inner(
(source_url, source_title, links)
});
}
}
while let Some(join_result) = join_set.join_next().await {
if let Ok((source_url, source_title, links_result)) = join_result {
@ -336,7 +343,7 @@ pub async fn run_generation_inner(
tracing::info!(source = %source_title, links = links.len(), "Extracted links from source");
for link in links {
if seen_urls.insert(link.to_lowercase()) {
candidate_urls.push((link, source_url.clone()));
wave_urls.push((link, source_url.clone()));
}
}
}
@ -345,42 +352,15 @@ pub async fn run_generation_inner(
}
}
}
// Spawn next task
if let Some(source) = pending.next() {
let client = state.http_client.clone();
let source_url = source.url.clone();
let source_title = source.title.clone();
let use_llm = settings.use_llm_for_source_links;
let provider_clone = std::sync::Arc::clone(&provider);
let model = Arc::clone(&model_research);
let max_l = max_links;
let pool = state.pool.clone();
let uid = user_id;
let jid = job_id;
join_set.spawn(async move {
let links = if use_llm {
source_scraper::extract_article_links_with_llm(
&client, &source_url, max_l, &provider_clone, &model,
Some(&pool), Some(uid), Some(jid),
).await
} else {
source_scraper::extract_article_links(
&client, &source_url, max_l,
).await
};
(source_url, source_title, links)
});
}
}
}
// Filter against article history
if settings.article_history_days > 0 && !candidate_urls.is_empty() {
let hashes: Vec<String> = candidate_urls.iter().map(|(url, _)| hash_article_url(url)).collect();
// 1b. Filter against article history
if settings.article_history_days > 0 && !wave_urls.is_empty() {
let hashes: Vec<String> = wave_urls.iter().map(|(url, _)| hash_article_url(url)).collect();
let existing = db::article_history::check_urls_exist(&state.pool, user_id, &hashes).await.unwrap_or_default();
if !existing.is_empty() {
for (url, source_url) in &candidate_urls {
for (url, source_url) in &wave_urls {
if existing.contains(&hash_article_url(url)) {
pending_traces.push(build_trace_entry(user_id, job_id, &ArticleTrace {
url, title: "", source_type: "personalized_source",
@ -390,7 +370,7 @@ pub async fn run_generation_inner(
}));
}
}
candidate_urls.retain(|(url, _)| !existing.contains(&hash_article_url(url)));
wave_urls.retain(|(url, _)| !existing.contains(&hash_article_url(url)));
// Flush history dedup traces
if !pending_traces.is_empty() {
db::article_history::batch_insert_entries(&state.pool, &pending_traces).await.ok();
@ -399,25 +379,26 @@ pub async fn run_generation_inner(
}
}
// Shuffle candidates to interleave articles from different sources
// 1c. Shuffle this wave's candidates
use rand::seq::SliceRandom;
candidate_urls.shuffle(&mut rand::thread_rng());
wave_urls.shuffle(&mut rand::thread_rng());
// Track url -> source
for (url, source_url) in &candidate_urls {
for (url, source_url) in &wave_urls {
url_source.insert(url.clone(), source_url.clone());
}
// 1b. Scrape, classify, summarize in batches of 5
emit_progress(tx, "processing", "Traitement des articles...", 25);
let total_candidates = candidate_urls.len();
// 1d. Batch scrape+classify (operates on this wave's URLs)
if !wave_urls.is_empty() {
let total_candidates = wave_urls.len();
let batch_size = settings.batch_size.max(1) as usize;
let snippet_size = match settings.summary_length { 1 => 500, 2 => 2000, _ => 4000 };
let mut processed = 0usize;
let mut candidates_iter = candidate_urls.into_iter();
let mut candidates_iter = wave_urls.into_iter();
let mut done = false;
while !done {
// Take next batch of candidates (up to 5), filtering source limits
// Take next batch of candidates, filtering source limits
let mut batch: Vec<(String, String)> = Vec::new();
while batch.len() < batch_size {
let Some((url, source_url)) = candidates_iter.next() else {
@ -487,11 +468,6 @@ pub async fn run_generation_inner(
let model = Arc::clone(&model_research);
let schema = Arc::clone(&classify_schema);
let cats = Arc::clone(&classification_categories);
let snippet_size = match settings.summary_length {
1 => 500,
2 => 2000,
_ => 4000,
};
let body_snippet: String = body_text.chars().take(snippet_size).collect();
let title = page_title.clone();
let url = final_url.clone();
@ -574,13 +550,22 @@ pub async fn run_generation_inner(
done = true;
}
}
}
// Flush Phase 1 traces
// 1e. Check if full after this wave
let total: usize = article_scraped.values().map(|v| v.len()).sum();
if total >= max_total {
tracing::info!(wave = wave_idx + 1, total_waves = total_waves, "Synthesis full after wave, skipping remaining sources");
break 'wave_loop;
}
// 1f. Flush traces between waves
if !pending_traces.is_empty() {
db::article_history::batch_insert_entries(&state.pool, &pending_traces).await.ok();
pending_traces.clear();
}
}
}
// === PHASE 2: Web Search Fallback ===
let category_gaps: Vec<(String, i32)> = user_categories.iter().filter_map(|cat| {

Loading…
Cancel
Save