|
|
|
|
@ -486,31 +486,11 @@ async fn run_generation_inner(
|
|
|
|
|
}
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
let llm_title = class_response.get("title").and_then(|t| t.as_str()).unwrap_or(&page_title).to_string();
|
|
|
|
|
let llm_summary = class_response.get("summary").and_then(|s| s.as_str()).unwrap_or("").to_string();
|
|
|
|
|
let mut llm_category = class_response.get("category").and_then(|c| c.as_str()).unwrap_or("Autre").to_string();
|
|
|
|
|
|
|
|
|
|
if !classification_categories.iter().any(|c| c.to_lowercase() == llm_category.to_lowercase()) {
|
|
|
|
|
llm_category = "Autre".to_string();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
let cat_key = if llm_category.to_lowercase() == "autre" {
|
|
|
|
|
"category_autre".to_string()
|
|
|
|
|
} else {
|
|
|
|
|
user_categories.iter().position(|c| c.to_lowercase() == llm_category.to_lowercase())
|
|
|
|
|
.map(|i| format!("category_{}", i))
|
|
|
|
|
.unwrap_or_else(|| "category_autre".to_string())
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
let cat_filled = filled_counts.get(&llm_category).copied().unwrap_or(0);
|
|
|
|
|
let (final_cat_key, final_cat_name) = if cat_filled >= settings.max_items_per_category as usize && llm_category.to_lowercase() != "autre" {
|
|
|
|
|
let autre_filled = filled_counts.get("Autre").copied().unwrap_or(0);
|
|
|
|
|
if autre_filled >= settings.max_items_per_category as usize {
|
|
|
|
|
let Some((final_cat_key, final_cat_name, llm_title, llm_summary)) = assign_category(
|
|
|
|
|
&class_response, &page_title, &user_categories, &classification_categories,
|
|
|
|
|
&filled_counts, settings.max_items_per_category as usize,
|
|
|
|
|
) else {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
("category_autre".to_string(), "Autre".to_string())
|
|
|
|
|
} else {
|
|
|
|
|
(cat_key, llm_category)
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
article_scraped.entry(final_cat_key).or_default().push(NewsItem {
|
|
|
|
|
@ -558,43 +538,15 @@ async fn run_generation_inner(
|
|
|
|
|
// Filter Brave results
|
|
|
|
|
let mut brave_urls: Vec<String> = Vec::new();
|
|
|
|
|
for result in &brave_results {
|
|
|
|
|
let url_lower = result.url.to_lowercase();
|
|
|
|
|
|
|
|
|
|
// Homepage filter
|
|
|
|
|
if let Ok(parsed_url) = url::Url::parse(&result.url) {
|
|
|
|
|
let path = parsed_url.path();
|
|
|
|
|
if path.is_empty() || path == "/" {
|
|
|
|
|
trace_article(&state.pool, user_id, job_id, &result.url, &result.title, "brave_search", None, None, None, "filtered_homepage", false).await;
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Cross-phase dedup
|
|
|
|
|
if seen_urls.contains(&url_lower) {
|
|
|
|
|
trace_article(&state.pool, user_id, job_id, &result.url, &result.title, "brave_search", None, None, None, "filtered_cross_phase_dedup", false).await;
|
|
|
|
|
if let Some(reason) = filter_phase2_url(
|
|
|
|
|
&state.pool, user_id, &result.url, &seen_urls, &source_counts,
|
|
|
|
|
settings.article_history_days, settings.max_articles_per_source as usize,
|
|
|
|
|
).await {
|
|
|
|
|
trace_article(&state.pool, user_id, job_id, &result.url, &result.title, "brave_search", None, None, None, reason, false).await;
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// History dedup
|
|
|
|
|
if settings.article_history_days > 0 {
|
|
|
|
|
let hash = hash_article_url(&result.url);
|
|
|
|
|
let exists = db::article_history::check_urls_exist(&state.pool, user_id, std::slice::from_ref(&hash)).await.unwrap_or_default();
|
|
|
|
|
if exists.contains(&hash) {
|
|
|
|
|
trace_article(&state.pool, user_id, job_id, &result.url, &result.title, "brave_search", None, None, None, "filtered_history", false).await;
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Source diversity
|
|
|
|
|
if let Some(domain) = extract_domain(&result.url) {
|
|
|
|
|
let count = source_counts.get(&domain).copied().unwrap_or(0);
|
|
|
|
|
if count >= settings.max_articles_per_source as usize {
|
|
|
|
|
trace_article(&state.pool, user_id, job_id, &result.url, &result.title, "brave_search", None, None, None, "filtered_diversity", false).await;
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
seen_urls.insert(url_lower);
|
|
|
|
|
seen_urls.insert(result.url.to_lowercase());
|
|
|
|
|
url_source.insert(result.url.clone(), "brave_search".to_string());
|
|
|
|
|
brave_urls.push(result.url.clone());
|
|
|
|
|
}
|
|
|
|
|
@ -693,31 +645,11 @@ async fn run_generation_inner(
|
|
|
|
|
}
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
let llm_title = class_response.get("title").and_then(|t| t.as_str()).unwrap_or(&page_title).to_string();
|
|
|
|
|
let llm_summary = class_response.get("summary").and_then(|s| s.as_str()).unwrap_or("").to_string();
|
|
|
|
|
let mut llm_category = class_response.get("category").and_then(|c| c.as_str()).unwrap_or("Autre").to_string();
|
|
|
|
|
|
|
|
|
|
if !classification_categories.iter().any(|c| c.to_lowercase() == llm_category.to_lowercase()) {
|
|
|
|
|
llm_category = "Autre".to_string();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
let cat_key = if llm_category.to_lowercase() == "autre" {
|
|
|
|
|
"category_autre".to_string()
|
|
|
|
|
} else {
|
|
|
|
|
user_categories.iter().position(|c| c.to_lowercase() == llm_category.to_lowercase())
|
|
|
|
|
.map(|i| format!("category_{}", i))
|
|
|
|
|
.unwrap_or_else(|| "category_autre".to_string())
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
let cat_filled = filled_counts.get(&llm_category).copied().unwrap_or(0);
|
|
|
|
|
let (final_cat_key, final_cat_name) = if cat_filled >= settings.max_items_per_category as usize && llm_category.to_lowercase() != "autre" {
|
|
|
|
|
let autre_filled = filled_counts.get("Autre").copied().unwrap_or(0);
|
|
|
|
|
if autre_filled >= settings.max_items_per_category as usize {
|
|
|
|
|
let Some((final_cat_key, final_cat_name, llm_title, llm_summary)) = assign_category(
|
|
|
|
|
&class_response, &page_title, &user_categories, &classification_categories,
|
|
|
|
|
&filled_counts, settings.max_items_per_category as usize,
|
|
|
|
|
) else {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
("category_autre".to_string(), "Autre".to_string())
|
|
|
|
|
} else {
|
|
|
|
|
(cat_key, llm_category)
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
article_scraped.entry(final_cat_key).or_default().push(NewsItem {
|
|
|
|
|
@ -763,43 +695,15 @@ async fn run_generation_inner(
|
|
|
|
|
|
|
|
|
|
for (cat_key, items) in parsed {
|
|
|
|
|
for item in items {
|
|
|
|
|
let url_lower = item.url.to_lowercase();
|
|
|
|
|
|
|
|
|
|
// Homepage filter
|
|
|
|
|
if let Ok(parsed_url) = url::Url::parse(&item.url) {
|
|
|
|
|
let path = parsed_url.path();
|
|
|
|
|
if path.is_empty() || path == "/" {
|
|
|
|
|
trace_article(&state.pool, user_id, job_id, &item.url, &item.title, "web_search", None, None, None, "filtered_homepage", false).await;
|
|
|
|
|
if let Some(reason) = filter_phase2_url(
|
|
|
|
|
&state.pool, user_id, &item.url, &seen_urls, &source_counts,
|
|
|
|
|
settings.article_history_days, settings.max_articles_per_source as usize,
|
|
|
|
|
).await {
|
|
|
|
|
trace_article(&state.pool, user_id, job_id, &item.url, &item.title, "web_search", None, None, None, reason, false).await;
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Cross-phase dedup
|
|
|
|
|
if seen_urls.contains(&url_lower) {
|
|
|
|
|
trace_article(&state.pool, user_id, job_id, &item.url, &item.title, "web_search", None, None, None, "filtered_cross_phase_dedup", false).await;
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// History dedup
|
|
|
|
|
if settings.article_history_days > 0 {
|
|
|
|
|
let hash = hash_article_url(&item.url);
|
|
|
|
|
let exists = db::article_history::check_urls_exist(&state.pool, user_id, std::slice::from_ref(&hash)).await.unwrap_or_default();
|
|
|
|
|
if exists.contains(&hash) {
|
|
|
|
|
trace_article(&state.pool, user_id, job_id, &item.url, &item.title, "web_search", None, None, None, "filtered_history", false).await;
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Source limit
|
|
|
|
|
if let Some(domain) = extract_domain(&item.url) {
|
|
|
|
|
let count = source_counts.get(&domain).copied().unwrap_or(0);
|
|
|
|
|
if count >= settings.max_articles_per_source as usize {
|
|
|
|
|
trace_article(&state.pool, user_id, job_id, &item.url, &item.title, "web_search", None, None, None, "filtered_diversity", false).await;
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
seen_urls.insert(url_lower);
|
|
|
|
|
seen_urls.insert(item.url.to_lowercase());
|
|
|
|
|
phase2_items.push((cat_key.clone(), item));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
@ -1046,6 +950,80 @@ fn extract_domain(url: &str) -> Option<String> {
|
|
|
|
|
.and_then(|u| u.host_str().map(|h| h.to_lowercase()))
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Assign an article to a category based on LLM classification response.
|
|
|
|
|
/// Returns `Some((cat_key, cat_name, title, summary))` or `None` if all categories full.
|
|
|
|
|
fn assign_category(
|
|
|
|
|
llm_response: &serde_json::Value,
|
|
|
|
|
page_title: &str,
|
|
|
|
|
user_categories: &[String],
|
|
|
|
|
classification_categories: &[String],
|
|
|
|
|
filled_counts: &HashMap<String, usize>,
|
|
|
|
|
max_items_per_category: usize,
|
|
|
|
|
) -> Option<(String, String, String, String)> {
|
|
|
|
|
let llm_title = llm_response.get("title").and_then(|t| t.as_str()).unwrap_or(page_title).to_string();
|
|
|
|
|
let llm_summary = llm_response.get("summary").and_then(|s| s.as_str()).unwrap_or("").to_string();
|
|
|
|
|
let mut llm_category = llm_response.get("category").and_then(|c| c.as_str()).unwrap_or("Autre").to_string();
|
|
|
|
|
|
|
|
|
|
if !classification_categories.iter().any(|c| c.to_lowercase() == llm_category.to_lowercase()) {
|
|
|
|
|
llm_category = "Autre".to_string();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
let cat_key = if llm_category.to_lowercase() == "autre" {
|
|
|
|
|
"category_autre".to_string()
|
|
|
|
|
} else {
|
|
|
|
|
user_categories.iter().position(|c| c.to_lowercase() == llm_category.to_lowercase())
|
|
|
|
|
.map(|i| format!("category_{}", i))
|
|
|
|
|
.unwrap_or_else(|| "category_autre".to_string())
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
let cat_filled = filled_counts.get(&llm_category).copied().unwrap_or(0);
|
|
|
|
|
if cat_filled >= max_items_per_category && llm_category.to_lowercase() != "autre" {
|
|
|
|
|
let autre_filled = filled_counts.get("Autre").copied().unwrap_or(0);
|
|
|
|
|
if autre_filled >= max_items_per_category {
|
|
|
|
|
return None;
|
|
|
|
|
}
|
|
|
|
|
Some(("category_autre".to_string(), "Autre".to_string(), llm_title, llm_summary))
|
|
|
|
|
} else {
|
|
|
|
|
Some((cat_key, llm_category, llm_title, llm_summary))
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Check if a Phase 2 URL passes all filters.
|
|
|
|
|
/// Returns the filter reason if rejected, None if accepted.
|
|
|
|
|
async fn filter_phase2_url(
|
|
|
|
|
pool: &sqlx::PgPool,
|
|
|
|
|
user_id: Uuid,
|
|
|
|
|
url: &str,
|
|
|
|
|
seen_urls: &std::collections::HashSet<String>,
|
|
|
|
|
source_counts: &HashMap<String, usize>,
|
|
|
|
|
article_history_days: i32,
|
|
|
|
|
max_articles_per_source: usize,
|
|
|
|
|
) -> Option<&'static str> {
|
|
|
|
|
if let Ok(parsed_url) = url::Url::parse(url) {
|
|
|
|
|
let path = parsed_url.path();
|
|
|
|
|
if path.is_empty() || path == "/" {
|
|
|
|
|
return Some("filtered_homepage");
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
if seen_urls.contains(&url.to_lowercase()) {
|
|
|
|
|
return Some("filtered_cross_phase_dedup");
|
|
|
|
|
}
|
|
|
|
|
if article_history_days > 0 {
|
|
|
|
|
let hash = hash_article_url(url);
|
|
|
|
|
let exists = db::article_history::check_urls_exist(pool, user_id, std::slice::from_ref(&hash)).await.unwrap_or_default();
|
|
|
|
|
if exists.contains(&hash) {
|
|
|
|
|
return Some("filtered_history");
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
if let Some(domain) = extract_domain(url) {
|
|
|
|
|
let count = source_counts.get(&domain).copied().unwrap_or(0);
|
|
|
|
|
if count >= max_articles_per_source {
|
|
|
|
|
return Some("filtered_diversity");
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
None
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Normalize an article URL for consistent history hashing.
|
|
|
|
|
///
|
|
|
|
|
/// Strips fragments, trailing slashes, and known tracking query parameters
|
|
|
|
|
|