|
|
|
@ -120,25 +120,23 @@ pub fn build_search_prompt(
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/// Build a prompt for LLM-assisted link extraction from a source page.
|
|
|
|
/// Build a prompt for LLM-assisted link extraction from a source page.
|
|
|
|
pub fn build_link_extraction_prompt(head_html: &str, body_html: &str) -> (String, String) {
|
|
|
|
pub fn build_link_extraction_prompt(body_html: &str) -> (String, String) {
|
|
|
|
let system_prompt =
|
|
|
|
let system_prompt =
|
|
|
|
"Tu es un assistant qui analyse des pages web. \
|
|
|
|
"Tu es un assistant qui analyse des pages web. \
|
|
|
|
Tu dois identifier les liens vers des articles d'actualite. \
|
|
|
|
Tu dois identifier les liens vers des articles d'actualite. \
|
|
|
|
Reponds uniquement au format JSON demande."
|
|
|
|
Reponds uniquement au format JSON demande."
|
|
|
|
.to_string();
|
|
|
|
.to_string();
|
|
|
|
|
|
|
|
|
|
|
|
let body_truncated: String = body_html.chars().take(8000).collect();
|
|
|
|
let body_truncated: String = body_html.chars().take(12000).collect();
|
|
|
|
|
|
|
|
|
|
|
|
let user_prompt = format!(
|
|
|
|
let user_prompt = format!(
|
|
|
|
"Voici le contenu HTML d'une page de blog ou de site d'actualites.\n\n\
|
|
|
|
"Voici le contenu HTML du body d'une page de blog ou de site d'actualites.\n\n\
|
|
|
|
<head>\n{head}\n</head>\n\n\
|
|
|
|
{body}\n\n\
|
|
|
|
<body (extrait)>\n{body}\n</body>\n\n\
|
|
|
|
|
|
|
|
Extrais UNIQUEMENT les URLs qui pointent vers des articles \
|
|
|
|
Extrais UNIQUEMENT les URLs qui pointent vers des articles \
|
|
|
|
(pas les liens de navigation, tags, categories, login, pages statiques, topics, \
|
|
|
|
(pas les liens de navigation, tags, categories, login, pages statiques, topics, \
|
|
|
|
archive, companies, events, company, event, collections, etc.).\n\
|
|
|
|
archive, companies, events, company, event, collections, etc.).\n\
|
|
|
|
Retourne les URLs completes, sans les modifier, dans le format JSON demande. \
|
|
|
|
Retourne les URLs completes, sans les modifier, dans le format JSON demande. \
|
|
|
|
Ne change jamais les URLs retournees, et ne les tronque jamais.",
|
|
|
|
Ne change jamais les URLs retournees, et ne les tronque jamais.",
|
|
|
|
head = head_html,
|
|
|
|
|
|
|
|
body = body_truncated,
|
|
|
|
body = body_truncated,
|
|
|
|
);
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
|
|
@ -354,18 +352,19 @@ mod tests {
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
#[test]
|
|
|
|
#[test]
|
|
|
|
fn link_extraction_prompt_includes_html() {
|
|
|
|
fn link_extraction_prompt_includes_body() {
|
|
|
|
let (sys, user) = build_link_extraction_prompt("<title>Blog</title>", "<a href='/post'>P</a>");
|
|
|
|
let (sys, user) = build_link_extraction_prompt("<a href='/post'>P</a>");
|
|
|
|
assert!(user.contains("<title>Blog</title>"));
|
|
|
|
assert!(user.contains("<a href='/post'>"));
|
|
|
|
assert!(user.contains("articles"));
|
|
|
|
assert!(user.contains("articles"));
|
|
|
|
assert!(sys.contains("liens"));
|
|
|
|
assert!(sys.contains("liens"));
|
|
|
|
|
|
|
|
assert!(!user.contains("<head>"));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
#[test]
|
|
|
|
#[test]
|
|
|
|
fn link_extraction_prompt_truncates_body() {
|
|
|
|
fn link_extraction_prompt_truncates_body() {
|
|
|
|
let long_body = "x".repeat(20000);
|
|
|
|
let long_body = "x".repeat(20000);
|
|
|
|
let (_, user) = build_link_extraction_prompt("", &long_body);
|
|
|
|
let (_, user) = build_link_extraction_prompt(&long_body);
|
|
|
|
assert!(user.len() < 15000);
|
|
|
|
assert!(user.len() < 18000); // 12000 chars of body + prompt text
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
#[test]
|
|
|
|
#[test]
|
|
|
|
|