feat: integrate site_search fallback into Phase 1 pipeline

Build SiteSearchProvider before wave loop, chain as third fallback
after RSS + HTML when both return 0 links.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
master
oabrivard 2 months ago
parent 71c791fec0
commit 926c7ec709

@ -28,6 +28,7 @@ use crate::services::llm::factory::create_provider;
use crate::services::scraper;
use crate::services::source_scraper;
use crate::services::feed_parser;
use crate::services::site_search;
mod helpers;
pub(crate) use helpers::{
@ -156,6 +157,22 @@ pub async fn run_generation_inner(
let model_research = Arc::new(model_research);
let classification_categories = Arc::new(classification_categories);
// Build the site-search fallback provider once, ahead of Phase 1.
// Brave is selected only when `settings.use_brave_search` is set AND an API key
// resolves; in every other case the LLM web-search model is used instead.
let site_search_provider = {
    let brave_key = if settings.use_brave_search {
        match resolve_brave_key(state, user_id).await {
            Ok(key) => Some(key),
            Err(_) => {
                // Best-effort: an unresolvable key is not fatal, but log it so the
                // downgrade to the LLM provider does not happen silently.
                tracing::warn!(
                    "Brave search is enabled but its API key could not be resolved; site search will use the LLM provider"
                );
                None
            }
        }
    } else {
        None
    };

    // Single construction site for each variant (the LLM fallback used to be duplicated).
    Arc::new(match brave_key {
        Some(api_key) => site_search::SiteSearchProvider::Brave { api_key },
        None => site_search::SiteSearchProvider::Llm {
            provider: provider.clone(),
            model: model_websearch.clone(),
        },
    })
};
// === PHASE 1: Personalized Sources ===
if !sources.is_empty() {
emit_progress(tx, "sources", "Analyse des sources personnalisees...", 15);
@ -203,6 +220,9 @@ pub async fn run_generation_inner(
let rss_url = source.rss_url.clone();
let rss_discovered_at = source.rss_discovered_at;
let max_l = max_links;
// Owned copies for the site-search fallback: the `async move` task spawned just
// below takes ownership of everything it captures, so the shared provider handle
// and the theme values are cloned/copied here rather than borrowed.
let ss_provider = site_search_provider.clone();
let ss_theme = theme.theme.clone();
let ss_max_age = theme.max_age_days;
join_set.spawn(async move {
// Try RSS feed first
let feed_result = feed_parser::detect_and_parse_feed(
@ -239,8 +259,30 @@ pub async fn run_generation_inner(
feed_parser::FeedResult::Found { .. } => {
// Feed found but too few entries — keep the cache, fall back to HTML
let links = source_scraper::extract_article_links(&client, &source_url, max_l).await;
match links {
Ok(ref l) if l.is_empty() => {
// HTML also returned 0 links — try site search fallback
if let Some(domain) = crate::services::synthesis::extract_domain(&source_url) {
let ss_config = site_search::SiteSearchConfig {
domain,
theme: ss_theme,
max_results: max_l,
max_age_days: ss_max_age,
};
let ss_links = site_search::search(&client, &ss_config, &ss_provider).await;
if !ss_links.is_empty() {
tracing::info!(source = %source_title, links = ss_links.len(), "Site search fallback produced links");
(source_url, source_title, Ok(ss_links), None)
} else {
(source_url, source_title, links, None)
}
} else {
(source_url, source_title, links, None)
}
}
_ => (source_url, source_title, links, None),
}
}
feed_parser::FeedResult::NotFound => {
// No feed discovered — fall back to HTML and clear any stale cache
let links = source_scraper::extract_article_links(&client, &source_url, max_l).await;
@ -249,9 +291,31 @@ pub async fn run_generation_inner(
} else {
None
};
match links {
Ok(ref l) if l.is_empty() => {
// HTML also returned 0 links — try site search fallback
if let Some(domain) = crate::services::synthesis::extract_domain(&source_url) {
let ss_config = site_search::SiteSearchConfig {
domain,
theme: ss_theme,
max_results: max_l,
max_age_days: ss_max_age,
};
let ss_links = site_search::search(&client, &ss_config, &ss_provider).await;
if !ss_links.is_empty() {
tracing::info!(source = %source_title, links = ss_links.len(), "Site search fallback produced links");
(source_url, source_title, Ok(ss_links), update)
} else {
(source_url, source_title, links, update)
}
} else {
(source_url, source_title, links, update)
}
}
_ => (source_url, source_title, links, update),
}
}
}
});
}

Loading…
Cancel
Save