|
|
|
@ -292,17 +292,25 @@ pub async fn run_generation_inner(
|
|
|
|
let last_source = db::article_history::get_last_source_url(&state.pool, user_id).await.unwrap_or(None);
|
|
|
|
let last_source = db::article_history::get_last_source_url(&state.pool, user_id).await.unwrap_or(None);
|
|
|
|
let rotated_sources = rotate_sources(sources.clone(), last_source.as_deref());
|
|
|
|
let rotated_sources = rotate_sources(sources.clone(), last_source.as_deref());
|
|
|
|
let max_links = 15usize;
|
|
|
|
let max_links = 15usize;
|
|
|
|
|
|
|
|
let window_size = settings.source_extraction_window.max(1) as usize;
|
|
|
|
|
|
|
|
|
|
|
|
// 1a. Extract article links from source pages (parallel, max 5 concurrent)
|
|
|
|
// Process sources in waves of `window_size`
|
|
|
|
let mut candidate_urls: Vec<(String, String)> = Vec::new();
|
|
|
|
let source_chunks: Vec<Vec<&crate::models::source::Source>> = rotated_sources
|
|
|
|
|
|
|
|
.chunks(window_size)
|
|
|
|
|
|
|
|
.map(|chunk| chunk.iter().collect())
|
|
|
|
|
|
|
|
.collect();
|
|
|
|
|
|
|
|
let total_waves = source_chunks.len();
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
'wave_loop: for (wave_idx, wave_sources) in source_chunks.iter().enumerate() {
|
|
|
|
|
|
|
|
emit_progress(tx, "sources_scrape",
|
|
|
|
|
|
|
|
&format!("Extraction des sources (vague {}/{})", wave_idx + 1, total_waves),
|
|
|
|
|
|
|
|
15 + ((wave_idx as u32 * 10) / total_waves.max(1) as u32).min(10) as u8);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// 1a. Extract links from this wave's sources (all in parallel)
|
|
|
|
|
|
|
|
let mut wave_urls: Vec<(String, String)> = Vec::new();
|
|
|
|
{
|
|
|
|
{
|
|
|
|
let mut join_set = tokio::task::JoinSet::new();
|
|
|
|
let mut join_set = tokio::task::JoinSet::new();
|
|
|
|
let mut pending = rotated_sources.iter().peekable();
|
|
|
|
for source in wave_sources {
|
|
|
|
let max_concurrent = 5;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Seed initial tasks
|
|
|
|
|
|
|
|
for _ in 0..max_concurrent {
|
|
|
|
|
|
|
|
if let Some(source) = pending.next() {
|
|
|
|
|
|
|
|
let client = state.http_client.clone();
|
|
|
|
let client = state.http_client.clone();
|
|
|
|
let source_url = source.url.clone();
|
|
|
|
let source_url = source.url.clone();
|
|
|
|
let source_title = source.title.clone();
|
|
|
|
let source_title = source.title.clone();
|
|
|
|
@ -327,7 +335,6 @@ pub async fn run_generation_inner(
|
|
|
|
(source_url, source_title, links)
|
|
|
|
(source_url, source_title, links)
|
|
|
|
});
|
|
|
|
});
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
while let Some(join_result) = join_set.join_next().await {
|
|
|
|
while let Some(join_result) = join_set.join_next().await {
|
|
|
|
if let Ok((source_url, source_title, links_result)) = join_result {
|
|
|
|
if let Ok((source_url, source_title, links_result)) = join_result {
|
|
|
|
@ -336,7 +343,7 @@ pub async fn run_generation_inner(
|
|
|
|
tracing::info!(source = %source_title, links = links.len(), "Extracted links from source");
|
|
|
|
tracing::info!(source = %source_title, links = links.len(), "Extracted links from source");
|
|
|
|
for link in links {
|
|
|
|
for link in links {
|
|
|
|
if seen_urls.insert(link.to_lowercase()) {
|
|
|
|
if seen_urls.insert(link.to_lowercase()) {
|
|
|
|
candidate_urls.push((link, source_url.clone()));
|
|
|
|
wave_urls.push((link, source_url.clone()));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
@ -345,42 +352,15 @@ pub async fn run_generation_inner(
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// Spawn next task
|
|
|
|
|
|
|
|
if let Some(source) = pending.next() {
|
|
|
|
|
|
|
|
let client = state.http_client.clone();
|
|
|
|
|
|
|
|
let source_url = source.url.clone();
|
|
|
|
|
|
|
|
let source_title = source.title.clone();
|
|
|
|
|
|
|
|
let use_llm = settings.use_llm_for_source_links;
|
|
|
|
|
|
|
|
let provider_clone = std::sync::Arc::clone(&provider);
|
|
|
|
|
|
|
|
let model = Arc::clone(&model_research);
|
|
|
|
|
|
|
|
let max_l = max_links;
|
|
|
|
|
|
|
|
let pool = state.pool.clone();
|
|
|
|
|
|
|
|
let uid = user_id;
|
|
|
|
|
|
|
|
let jid = job_id;
|
|
|
|
|
|
|
|
join_set.spawn(async move {
|
|
|
|
|
|
|
|
let links = if use_llm {
|
|
|
|
|
|
|
|
source_scraper::extract_article_links_with_llm(
|
|
|
|
|
|
|
|
&client, &source_url, max_l, &provider_clone, &model,
|
|
|
|
|
|
|
|
Some(&pool), Some(uid), Some(jid),
|
|
|
|
|
|
|
|
).await
|
|
|
|
|
|
|
|
} else {
|
|
|
|
|
|
|
|
source_scraper::extract_article_links(
|
|
|
|
|
|
|
|
&client, &source_url, max_l,
|
|
|
|
|
|
|
|
).await
|
|
|
|
|
|
|
|
};
|
|
|
|
|
|
|
|
(source_url, source_title, links)
|
|
|
|
|
|
|
|
});
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// Filter against article history
|
|
|
|
// 1b. Filter against article history
|
|
|
|
if settings.article_history_days > 0 && !candidate_urls.is_empty() {
|
|
|
|
if settings.article_history_days > 0 && !wave_urls.is_empty() {
|
|
|
|
let hashes: Vec<String> = candidate_urls.iter().map(|(url, _)| hash_article_url(url)).collect();
|
|
|
|
let hashes: Vec<String> = wave_urls.iter().map(|(url, _)| hash_article_url(url)).collect();
|
|
|
|
let existing = db::article_history::check_urls_exist(&state.pool, user_id, &hashes).await.unwrap_or_default();
|
|
|
|
let existing = db::article_history::check_urls_exist(&state.pool, user_id, &hashes).await.unwrap_or_default();
|
|
|
|
if !existing.is_empty() {
|
|
|
|
if !existing.is_empty() {
|
|
|
|
for (url, source_url) in &candidate_urls {
|
|
|
|
for (url, source_url) in &wave_urls {
|
|
|
|
if existing.contains(&hash_article_url(url)) {
|
|
|
|
if existing.contains(&hash_article_url(url)) {
|
|
|
|
pending_traces.push(build_trace_entry(user_id, job_id, &ArticleTrace {
|
|
|
|
pending_traces.push(build_trace_entry(user_id, job_id, &ArticleTrace {
|
|
|
|
url, title: "", source_type: "personalized_source",
|
|
|
|
url, title: "", source_type: "personalized_source",
|
|
|
|
@ -390,7 +370,7 @@ pub async fn run_generation_inner(
|
|
|
|
}));
|
|
|
|
}));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
candidate_urls.retain(|(url, _)| !existing.contains(&hash_article_url(url)));
|
|
|
|
wave_urls.retain(|(url, _)| !existing.contains(&hash_article_url(url)));
|
|
|
|
// Flush history dedup traces
|
|
|
|
// Flush history dedup traces
|
|
|
|
if !pending_traces.is_empty() {
|
|
|
|
if !pending_traces.is_empty() {
|
|
|
|
db::article_history::batch_insert_entries(&state.pool, &pending_traces).await.ok();
|
|
|
|
db::article_history::batch_insert_entries(&state.pool, &pending_traces).await.ok();
|
|
|
|
@ -399,25 +379,26 @@ pub async fn run_generation_inner(
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// Shuffle candidates to interleave articles from different sources
|
|
|
|
// 1c. Shuffle this wave's candidates
|
|
|
|
use rand::seq::SliceRandom;
|
|
|
|
use rand::seq::SliceRandom;
|
|
|
|
candidate_urls.shuffle(&mut rand::thread_rng());
|
|
|
|
wave_urls.shuffle(&mut rand::thread_rng());
|
|
|
|
|
|
|
|
|
|
|
|
// Track url -> source
|
|
|
|
// Track url -> source
|
|
|
|
for (url, source_url) in &candidate_urls {
|
|
|
|
for (url, source_url) in &wave_urls {
|
|
|
|
url_source.insert(url.clone(), source_url.clone());
|
|
|
|
url_source.insert(url.clone(), source_url.clone());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 1b. Scrape, classify, summarize in batches of 5
|
|
|
|
// 1d. Batch scrape+classify (operates on this wave's URLs)
|
|
|
|
emit_progress(tx, "processing", "Traitement des articles...", 25);
|
|
|
|
if !wave_urls.is_empty() {
|
|
|
|
let total_candidates = candidate_urls.len();
|
|
|
|
let total_candidates = wave_urls.len();
|
|
|
|
let batch_size = settings.batch_size.max(1) as usize;
|
|
|
|
let batch_size = settings.batch_size.max(1) as usize;
|
|
|
|
|
|
|
|
let snippet_size = match settings.summary_length { 1 => 500, 2 => 2000, _ => 4000 };
|
|
|
|
let mut processed = 0usize;
|
|
|
|
let mut processed = 0usize;
|
|
|
|
let mut candidates_iter = candidate_urls.into_iter();
|
|
|
|
let mut candidates_iter = wave_urls.into_iter();
|
|
|
|
let mut done = false;
|
|
|
|
let mut done = false;
|
|
|
|
|
|
|
|
|
|
|
|
while !done {
|
|
|
|
while !done {
|
|
|
|
// Take next batch of candidates (up to 5), filtering source limits
|
|
|
|
// Take next batch of candidates, filtering source limits
|
|
|
|
let mut batch: Vec<(String, String)> = Vec::new();
|
|
|
|
let mut batch: Vec<(String, String)> = Vec::new();
|
|
|
|
while batch.len() < batch_size {
|
|
|
|
while batch.len() < batch_size {
|
|
|
|
let Some((url, source_url)) = candidates_iter.next() else {
|
|
|
|
let Some((url, source_url)) = candidates_iter.next() else {
|
|
|
|
@ -487,11 +468,6 @@ pub async fn run_generation_inner(
|
|
|
|
let model = Arc::clone(&model_research);
|
|
|
|
let model = Arc::clone(&model_research);
|
|
|
|
let schema = Arc::clone(&classify_schema);
|
|
|
|
let schema = Arc::clone(&classify_schema);
|
|
|
|
let cats = Arc::clone(&classification_categories);
|
|
|
|
let cats = Arc::clone(&classification_categories);
|
|
|
|
let snippet_size = match settings.summary_length {
|
|
|
|
|
|
|
|
1 => 500,
|
|
|
|
|
|
|
|
2 => 2000,
|
|
|
|
|
|
|
|
_ => 4000,
|
|
|
|
|
|
|
|
};
|
|
|
|
|
|
|
|
let body_snippet: String = body_text.chars().take(snippet_size).collect();
|
|
|
|
let body_snippet: String = body_text.chars().take(snippet_size).collect();
|
|
|
|
let title = page_title.clone();
|
|
|
|
let title = page_title.clone();
|
|
|
|
let url = final_url.clone();
|
|
|
|
let url = final_url.clone();
|
|
|
|
@ -574,13 +550,22 @@ pub async fn run_generation_inner(
|
|
|
|
done = true;
|
|
|
|
done = true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// Flush Phase 1 traces
|
|
|
|
// 1e. Check if full after this wave
|
|
|
|
|
|
|
|
let total: usize = article_scraped.values().map(|v| v.len()).sum();
|
|
|
|
|
|
|
|
if total >= max_total {
|
|
|
|
|
|
|
|
tracing::info!(wave = wave_idx + 1, total_waves = total_waves, "Synthesis full after wave, skipping remaining sources");
|
|
|
|
|
|
|
|
break 'wave_loop;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// 1f. Flush traces between waves
|
|
|
|
if !pending_traces.is_empty() {
|
|
|
|
if !pending_traces.is_empty() {
|
|
|
|
db::article_history::batch_insert_entries(&state.pool, &pending_traces).await.ok();
|
|
|
|
db::article_history::batch_insert_entries(&state.pool, &pending_traces).await.ok();
|
|
|
|
pending_traces.clear();
|
|
|
|
pending_traces.clear();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// === PHASE 2: Web Search Fallback ===
|
|
|
|
// === PHASE 2: Web Search Fallback ===
|
|
|
|
let category_gaps: Vec<(String, i32)> = user_categories.iter().filter_map(|cat| {
|
|
|
|
let category_gaps: Vec<(String, i32)> = user_categories.iter().filter_map(|cat| {
|
|
|
|
|