refactor: extract assign_category and filter_phase2_url helpers from synthesis pipeline

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
master
oabrivard 3 months ago
parent 030fe6f941
commit e056ef9d3e

@ -486,31 +486,11 @@ async fn run_generation_inner(
} }
}; };
let llm_title = class_response.get("title").and_then(|t| t.as_str()).unwrap_or(&page_title).to_string(); let Some((final_cat_key, final_cat_name, llm_title, llm_summary)) = assign_category(
let llm_summary = class_response.get("summary").and_then(|s| s.as_str()).unwrap_or("").to_string(); &class_response, &page_title, &user_categories, &classification_categories,
let mut llm_category = class_response.get("category").and_then(|c| c.as_str()).unwrap_or("Autre").to_string(); &filled_counts, settings.max_items_per_category as usize,
) else {
if !classification_categories.iter().any(|c| c.to_lowercase() == llm_category.to_lowercase()) {
llm_category = "Autre".to_string();
}
let cat_key = if llm_category.to_lowercase() == "autre" {
"category_autre".to_string()
} else {
user_categories.iter().position(|c| c.to_lowercase() == llm_category.to_lowercase())
.map(|i| format!("category_{}", i))
.unwrap_or_else(|| "category_autre".to_string())
};
let cat_filled = filled_counts.get(&llm_category).copied().unwrap_or(0);
let (final_cat_key, final_cat_name) = if cat_filled >= settings.max_items_per_category as usize && llm_category.to_lowercase() != "autre" {
let autre_filled = filled_counts.get("Autre").copied().unwrap_or(0);
if autre_filled >= settings.max_items_per_category as usize {
continue; continue;
}
("category_autre".to_string(), "Autre".to_string())
} else {
(cat_key, llm_category)
}; };
article_scraped.entry(final_cat_key).or_default().push(NewsItem { article_scraped.entry(final_cat_key).or_default().push(NewsItem {
@ -558,43 +538,15 @@ async fn run_generation_inner(
// Filter Brave results // Filter Brave results
let mut brave_urls: Vec<String> = Vec::new(); let mut brave_urls: Vec<String> = Vec::new();
for result in &brave_results { for result in &brave_results {
let url_lower = result.url.to_lowercase(); if let Some(reason) = filter_phase2_url(
&state.pool, user_id, &result.url, &seen_urls, &source_counts,
// Homepage filter settings.article_history_days, settings.max_articles_per_source as usize,
if let Ok(parsed_url) = url::Url::parse(&result.url) { ).await {
let path = parsed_url.path(); trace_article(&state.pool, user_id, job_id, &result.url, &result.title, "brave_search", None, None, None, reason, false).await;
if path.is_empty() || path == "/" {
trace_article(&state.pool, user_id, job_id, &result.url, &result.title, "brave_search", None, None, None, "filtered_homepage", false).await;
continue;
}
}
// Cross-phase dedup
if seen_urls.contains(&url_lower) {
trace_article(&state.pool, user_id, job_id, &result.url, &result.title, "brave_search", None, None, None, "filtered_cross_phase_dedup", false).await;
continue; continue;
} }
// History dedup seen_urls.insert(result.url.to_lowercase());
if settings.article_history_days > 0 {
let hash = hash_article_url(&result.url);
let exists = db::article_history::check_urls_exist(&state.pool, user_id, std::slice::from_ref(&hash)).await.unwrap_or_default();
if exists.contains(&hash) {
trace_article(&state.pool, user_id, job_id, &result.url, &result.title, "brave_search", None, None, None, "filtered_history", false).await;
continue;
}
}
// Source diversity
if let Some(domain) = extract_domain(&result.url) {
let count = source_counts.get(&domain).copied().unwrap_or(0);
if count >= settings.max_articles_per_source as usize {
trace_article(&state.pool, user_id, job_id, &result.url, &result.title, "brave_search", None, None, None, "filtered_diversity", false).await;
continue;
}
}
seen_urls.insert(url_lower);
url_source.insert(result.url.clone(), "brave_search".to_string()); url_source.insert(result.url.clone(), "brave_search".to_string());
brave_urls.push(result.url.clone()); brave_urls.push(result.url.clone());
} }
@ -693,31 +645,11 @@ async fn run_generation_inner(
} }
}; };
let llm_title = class_response.get("title").and_then(|t| t.as_str()).unwrap_or(&page_title).to_string(); let Some((final_cat_key, final_cat_name, llm_title, llm_summary)) = assign_category(
let llm_summary = class_response.get("summary").and_then(|s| s.as_str()).unwrap_or("").to_string(); &class_response, &page_title, &user_categories, &classification_categories,
let mut llm_category = class_response.get("category").and_then(|c| c.as_str()).unwrap_or("Autre").to_string(); &filled_counts, settings.max_items_per_category as usize,
) else {
if !classification_categories.iter().any(|c| c.to_lowercase() == llm_category.to_lowercase()) {
llm_category = "Autre".to_string();
}
let cat_key = if llm_category.to_lowercase() == "autre" {
"category_autre".to_string()
} else {
user_categories.iter().position(|c| c.to_lowercase() == llm_category.to_lowercase())
.map(|i| format!("category_{}", i))
.unwrap_or_else(|| "category_autre".to_string())
};
let cat_filled = filled_counts.get(&llm_category).copied().unwrap_or(0);
let (final_cat_key, final_cat_name) = if cat_filled >= settings.max_items_per_category as usize && llm_category.to_lowercase() != "autre" {
let autre_filled = filled_counts.get("Autre").copied().unwrap_or(0);
if autre_filled >= settings.max_items_per_category as usize {
continue; continue;
}
("category_autre".to_string(), "Autre".to_string())
} else {
(cat_key, llm_category)
}; };
article_scraped.entry(final_cat_key).or_default().push(NewsItem { article_scraped.entry(final_cat_key).or_default().push(NewsItem {
@ -763,43 +695,15 @@ async fn run_generation_inner(
for (cat_key, items) in parsed { for (cat_key, items) in parsed {
for item in items { for item in items {
let url_lower = item.url.to_lowercase(); if let Some(reason) = filter_phase2_url(
&state.pool, user_id, &item.url, &seen_urls, &source_counts,
// Homepage filter settings.article_history_days, settings.max_articles_per_source as usize,
if let Ok(parsed_url) = url::Url::parse(&item.url) { ).await {
let path = parsed_url.path(); trace_article(&state.pool, user_id, job_id, &item.url, &item.title, "web_search", None, None, None, reason, false).await;
if path.is_empty() || path == "/" {
trace_article(&state.pool, user_id, job_id, &item.url, &item.title, "web_search", None, None, None, "filtered_homepage", false).await;
continue; continue;
} }
}
// Cross-phase dedup
if seen_urls.contains(&url_lower) {
trace_article(&state.pool, user_id, job_id, &item.url, &item.title, "web_search", None, None, None, "filtered_cross_phase_dedup", false).await;
continue;
}
// History dedup
if settings.article_history_days > 0 {
let hash = hash_article_url(&item.url);
let exists = db::article_history::check_urls_exist(&state.pool, user_id, std::slice::from_ref(&hash)).await.unwrap_or_default();
if exists.contains(&hash) {
trace_article(&state.pool, user_id, job_id, &item.url, &item.title, "web_search", None, None, None, "filtered_history", false).await;
continue;
}
}
// Source limit seen_urls.insert(item.url.to_lowercase());
if let Some(domain) = extract_domain(&item.url) {
let count = source_counts.get(&domain).copied().unwrap_or(0);
if count >= settings.max_articles_per_source as usize {
trace_article(&state.pool, user_id, job_id, &item.url, &item.title, "web_search", None, None, None, "filtered_diversity", false).await;
continue;
}
}
seen_urls.insert(url_lower);
phase2_items.push((cat_key.clone(), item)); phase2_items.push((cat_key.clone(), item));
} }
} }
@ -1046,6 +950,80 @@ fn extract_domain(url: &str) -> Option<String> {
.and_then(|u| u.host_str().map(|h| h.to_lowercase())) .and_then(|u| u.host_str().map(|h| h.to_lowercase()))
} }
/// Assign an article to a category based on LLM classification response.
/// Returns `Some((cat_key, cat_name, title, summary))` or `None` if all categories full.
fn assign_category(
llm_response: &serde_json::Value,
page_title: &str,
user_categories: &[String],
classification_categories: &[String],
filled_counts: &HashMap<String, usize>,
max_items_per_category: usize,
) -> Option<(String, String, String, String)> {
let llm_title = llm_response.get("title").and_then(|t| t.as_str()).unwrap_or(page_title).to_string();
let llm_summary = llm_response.get("summary").and_then(|s| s.as_str()).unwrap_or("").to_string();
let mut llm_category = llm_response.get("category").and_then(|c| c.as_str()).unwrap_or("Autre").to_string();
if !classification_categories.iter().any(|c| c.to_lowercase() == llm_category.to_lowercase()) {
llm_category = "Autre".to_string();
}
let cat_key = if llm_category.to_lowercase() == "autre" {
"category_autre".to_string()
} else {
user_categories.iter().position(|c| c.to_lowercase() == llm_category.to_lowercase())
.map(|i| format!("category_{}", i))
.unwrap_or_else(|| "category_autre".to_string())
};
let cat_filled = filled_counts.get(&llm_category).copied().unwrap_or(0);
if cat_filled >= max_items_per_category && llm_category.to_lowercase() != "autre" {
let autre_filled = filled_counts.get("Autre").copied().unwrap_or(0);
if autre_filled >= max_items_per_category {
return None;
}
Some(("category_autre".to_string(), "Autre".to_string(), llm_title, llm_summary))
} else {
Some((cat_key, llm_category, llm_title, llm_summary))
}
}
/// Run a Phase 2 candidate URL through the rejection filters.
///
/// The checks are applied in a fixed order and the first one that rejects the
/// URL determines the returned reason:
///
/// 1. `"filtered_homepage"` – the URL parses and its path is empty or `/`.
/// 2. `"filtered_cross_phase_dedup"` – the lowercased URL is already in `seen_urls`.
/// 3. `"filtered_history"` – only when `article_history_days > 0`: the URL hash is
///    reported as existing by `db::article_history::check_urls_exist`. A failed
///    lookup is treated as "not found".
/// 4. `"filtered_diversity"` – the URL's domain already has at least
///    `max_articles_per_source` entries in `source_counts`.
///
/// Returns `None` when the URL passes every filter. This function does not
/// modify `seen_urls` or `source_counts`.
async fn filter_phase2_url(
    pool: &sqlx::PgPool,
    user_id: Uuid,
    url: &str,
    seen_urls: &std::collections::HashSet<String>,
    source_counts: &HashMap<String, usize>,
    article_history_days: i32,
    max_articles_per_source: usize,
) -> Option<&'static str> {
    // Homepage filter: unparseable URLs are not considered homepages.
    let points_at_homepage = url::Url::parse(url)
        .map(|parsed| matches!(parsed.path(), "" | "/"))
        .unwrap_or(false);
    if points_at_homepage {
        return Some("filtered_homepage");
    }

    // Cross-phase dedup: `seen_urls` is compared against the lowercased URL.
    let lowered = url.to_lowercase();
    if seen_urls.contains(&lowered) {
        return Some("filtered_cross_phase_dedup");
    }

    // History dedup, skipped entirely when history is disabled.
    if article_history_days > 0 {
        let url_hash = hash_article_url(url);
        let already_known = db::article_history::check_urls_exist(
            pool,
            user_id,
            std::slice::from_ref(&url_hash),
        )
        .await
        .unwrap_or_default();
        if already_known.contains(&url_hash) {
            return Some("filtered_history");
        }
    }

    // Source diversity: URLs without an extractable domain are never rejected here.
    match extract_domain(url) {
        Some(domain)
            if source_counts.get(&domain).copied().unwrap_or(0) >= max_articles_per_source =>
        {
            Some("filtered_diversity")
        }
        _ => None,
    }
}
/// Normalize an article URL for consistent history hashing. /// Normalize an article URL for consistent history hashing.
/// ///
/// Strips fragments, trailing slashes, and known tracking query parameters /// Strips fragments, trailing slashes, and known tracking query parameters

Loading…
Cancel
Save