diff --git a/backend/src/services/prompts.rs b/backend/src/services/prompts.rs
index 84577ef..9dbbe2d 100644
--- a/backend/src/services/prompts.rs
+++ b/backend/src/services/prompts.rs
@@ -120,24 +120,24 @@ pub fn build_search_prompt(
}
/// Build a prompt for LLM-assisted link extraction from a source page.
-pub fn build_link_extraction_prompt(body_html: &str) -> (String, String) {
+///
+/// Receives a pre-formatted list of (href, anchor_text) pairs, not raw HTML.
+pub fn build_link_extraction_prompt(links_text: &str) -> (String, String) {
let system_prompt =
- "Tu es un assistant qui analyse des pages web. \
+ "Tu es un assistant qui analyse des listes de liens. \
Tu dois identifier les liens vers des articles d'actualite. \
Reponds uniquement au format JSON demande."
.to_string();
- let body_truncated: String = body_html.chars().take(12000).collect();
-
let user_prompt = format!(
- "Voici le contenu HTML du body d'une page de blog ou de site d'actualites.\n\n\
- {body}\n\n\
- Extrais UNIQUEMENT les URLs qui pointent vers des articles \
+ "Voici une liste de liens extraits d'une page de blog ou de site d'actualites.\n\n\
+ {links}\n\n\
+ Selectionne UNIQUEMENT les URLs qui pointent vers des articles \
(pas les liens de navigation, tags, categories, login, pages statiques, topics, \
archive, companies, events, company, event, collections, etc.).\n\
Retourne les URLs completes, sans les modifier, dans le format JSON demande. \
Ne change jamais les URLs retournees, et ne les tronque jamais.",
- body = body_truncated,
+ links = links_text,
);
(system_prompt, user_prompt)
@@ -353,19 +353,18 @@ mod tests {
}
#[test]
- fn link_extraction_prompt_includes_body() {
- let (sys, user) = build_link_extraction_prompt("P");
- assert!(user.contains(""));
- assert!(user.contains("articles"));
+ fn link_extraction_prompt_includes_links() {
+ let links = "- https://example.com/post-1 | \"Breaking News\"\n- https://example.com/post-2 | \"Update\"";
+ let (sys, user) = build_link_extraction_prompt(links);
+ assert!(user.contains("https://example.com/post-1"));
+ assert!(user.contains("Breaking News"));
assert!(sys.contains("liens"));
- assert!(!user.contains(""));
}
#[test]
- fn link_extraction_prompt_truncates_body() {
- let long_body = "x".repeat(20000);
- let (_, user) = build_link_extraction_prompt(&long_body);
- assert!(user.len() < 18000); // 12000 chars of body + prompt text
+ fn link_extraction_prompt_empty_links() {
+ let (_, user) = build_link_extraction_prompt("");
+ assert!(user.contains("articles"));
}
#[test]
diff --git a/backend/src/services/source_scraper.rs b/backend/src/services/source_scraper.rs
index 2b1656a..ae07afc 100644
--- a/backend/src/services/source_scraper.rs
+++ b/backend/src/services/source_scraper.rs
@@ -117,10 +117,65 @@ pub fn extract_links_from_html(
links
}
-/// Extract the section from HTML (UTF-8 safe, up to 12000 chars).
-pub fn extract_body_html(html: &str) -> String {
- let body_start = html.find(" Vec<(String, String)> {
+ let base_domain = base_url.host_str().unwrap_or("").to_lowercase();
+ let document = Html::parse_document(html);
+ let selector = Selector::parse("a[href]").unwrap();
+ let mut pairs = Vec::new();
+
+ for element in document.select(&selector) {
+ if let Some(href) = element.value().attr("href") {
+ let resolved = match base_url.join(href) {
+ Ok(u) => u,
+ Err(_) => continue,
+ };
+
+ if resolved.scheme() != "http" && resolved.scheme() != "https" {
+ continue;
+ }
+
+ let link_domain = resolved.host_str().unwrap_or("").to_lowercase();
+ if link_domain != base_domain {
+ continue;
+ }
+
+ let path = resolved.path();
+ if path.is_empty() || path == "/" {
+ continue;
+ }
+
+            let anchor_text: String = element.text().collect::<Vec<_>>().join(" ");
+ let anchor_text = anchor_text.trim().to_string();
+
+ pairs.push((resolved.to_string(), anchor_text));
+ }
+ }
+
+ pairs
+}
+
+/// Format link pairs as a text list for the LLM prompt.
+/// Caps at 200 links to limit token usage.
+fn format_links_for_llm(pairs: &[(String, String)]) -> String {
+ pairs
+ .iter()
+ .take(200)
+ .map(|(href, text)| {
+ if text.is_empty() {
+ format!("- {}", href)
+ } else {
+ format!("- {} | \"{}\"", href, text)
+ }
+ })
+        .collect::<Vec<_>>()
+ .join("\n")
}
/// Extract article links using LLM analysis of the page HTML.
@@ -155,8 +210,9 @@ pub async fn extract_article_links_with_llm(
AppError::Internal(anyhow::anyhow!("Failed to read source page body: {}", e))
})?;
- let body_html = extract_body_html(&html_text);
- let (system, user) = build_link_extraction_prompt(&body_html);
+ let pairs = extract_links_as_pairs(&html_text, &base_url);
+ let links_text = format_links_for_llm(&pairs);
+ let (system, user) = build_link_extraction_prompt(&links_text);
let schema = build_link_extraction_schema();
let llm_start = std::time::Instant::now();
@@ -308,18 +364,65 @@ mod tests {
}
#[test]
- fn extract_body_html_gets_body_content() {
- let html = "TContent
";
- let body = extract_body_html(html);
- assert!(body.contains("Content
"));
- assert!(!body.contains(""));
+ fn extract_pairs_returns_href_and_text() {
+ let html = r#"
+
+            <a href="/blog/article-1">Breaking AI News</a>
+            <a href="/blog/article-2">GPT-6 Released</a>
+ "#;
+ let base = base_url("https://example.com/blog");
+ let pairs = extract_links_as_pairs(html, &base);
+ assert_eq!(pairs.len(), 2);
+ assert!(pairs[0].0.contains("/blog/article-1"));
+ assert_eq!(pairs[0].1, "Breaking AI News");
+ assert!(pairs[1].0.contains("/blog/article-2"));
+ assert_eq!(pairs[1].1, "GPT-6 Released");
+ }
+
+ #[test]
+ fn extract_pairs_filters_external_links() {
+        let html = r#"<a href="https://other.com/page">External</a>"#;
+ let base = base_url("https://example.com");
+ let pairs = extract_links_as_pairs(html, &base);
+ assert!(pairs.is_empty());
+ }
+
+ #[test]
+ fn extract_pairs_filters_root_path() {
+        let html = r#"<a href="/">Home</a>"#;
+ let base = base_url("https://example.com");
+ let pairs = extract_links_as_pairs(html, &base);
+ assert!(pairs.is_empty());
+ }
+
+ #[test]
+ fn extract_pairs_handles_empty_anchor_text() {
+        let html = r#"<a href="/post"><img src="thumb.png"></a>"#;
+ let base = base_url("https://example.com");
+ let pairs = extract_links_as_pairs(html, &base);
+ assert_eq!(pairs.len(), 1);
+ assert_eq!(pairs[0].1, "");
+ }
+
+ #[test]
+ fn format_links_for_llm_formats_correctly() {
+ let pairs = vec![
+ ("https://example.com/a".to_string(), "Article One".to_string()),
+ ("https://example.com/b".to_string(), "".to_string()),
+ ];
+ let result = format_links_for_llm(&pairs);
+ assert!(result.contains("- https://example.com/a | \"Article One\""));
+ assert!(result.contains("- https://example.com/b"));
+ assert!(!result.contains("| \"\""));
}
#[test]
- fn extract_body_html_truncates_safely() {
- let long_body = "x".repeat(20000);
- let html = format!("{}", long_body);
- let body = extract_body_html(&html);
- assert_eq!(body.chars().count(), 12000);
+ fn format_links_for_llm_caps_at_200() {
+ let pairs: Vec<(String, String)> = (0..300)
+ .map(|i| (format!("https://example.com/{}", i), format!("Link {}", i)))
+ .collect();
+ let result = format_links_for_llm(&pairs);
+ let line_count = result.lines().count();
+ assert_eq!(line_count, 200);
}
}