|
|
|
@ -156,6 +156,55 @@ pub fn build_rewrite_prompt(
|
|
|
|
(system_prompt, user_prompt)
|
|
|
|
(system_prompt, user_prompt)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/// Build a prompt for LLM-assisted link extraction from a source page.
///
/// The `<head>` markup is embedded unchanged, while the body markup is capped
/// at 8000 characters (Unicode scalar values, not bytes) before being embedded.
///
/// # Arguments
///
/// * `head_html` - Markup placed between the `<head>` markers of the prompt.
/// * `body_html` - Markup placed between the body markers; only its first
///   8000 characters are used.
///
/// # Returns
///
/// A `(system_prompt, user_prompt)` pair.
pub fn build_link_extraction_prompt(head_html: &str, body_html: &str) -> (String, String) {
    // Upper bound, in characters, on the body excerpt embedded in the prompt.
    const MAX_BODY_CHARS: usize = 8000;

    let system_prompt =
        "Tu es un assistant qui analyse des pages web. \
         Tu dois identifier les liens vers des articles d'actualite. \
         Reponds uniquement au format JSON demande."
            .to_string();

    // Borrow the prefix instead of collecting it into a temporary `String`.
    // `char_indices` only yields byte offsets that sit on character
    // boundaries, so the slice below cannot split a multi-byte character.
    let body_excerpt = match body_html.char_indices().nth(MAX_BODY_CHARS) {
        Some((cut, _)) => &body_html[..cut],
        None => body_html,
    };

    let user_prompt = format!(
        "Voici le contenu HTML d'une page de blog ou de site d'actualites.\n\n\
         <head>\n{head}\n</head>\n\n\
         <body (extrait)>\n{body}\n</body>\n\n\
         Extrais UNIQUEMENT les URLs qui pointent vers des articles \
         (pas les liens de navigation, tags, categories, login, pages statiques, etc.).\n\
         Retourne les URLs completes dans le format JSON demande.",
        head = head_html,
        body = body_excerpt,
    );

    (system_prompt, user_prompt)
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/// Build a prompt for LLM-assisted article content extraction.
///
/// # Arguments
///
/// * `head_html` - Markup placed between the `<head>` markers of the prompt.
/// * `body_text` - Text placed after the "Contenu textuel de la page" label;
///   it is embedded in full.
///
/// # Returns
///
/// A `(system_prompt, user_prompt)` pair. The user prompt ends with the list
/// of fields the model is asked to return (`title`, `published_date`,
/// `body_text`, `is_error_page`).
pub fn build_article_extraction_prompt(head_html: &str, body_text: &str) -> (String, String) {
    // Fixed role description sent as the system message.
    const SYSTEM_PROMPT: &str = "Tu es un assistant qui analyse des articles web. \
         Tu dois extraire les informations structurees de l'article. \
         Reponds uniquement au format JSON demande.";

    // Field-by-field instructions appended after the page content.
    const FIELD_INSTRUCTIONS: &str = "Extrais les informations suivantes :\n\
         - title : le titre de l'article\n\
         - published_date : la date de publication au format ISO 8601 (YYYY-MM-DDTHH:MM:SSZ), \
         ou une chaine vide si introuvable\n\
         - body_text : le contenu principal de l'article (pas la navigation, pas les pubs)\n\
         - is_error_page : true si c'est une page d'erreur/404, false sinon";

    let user_prompt = format!(
        "Voici le contenu d'une page web.\n\n\
         <head>\n{head}\n</head>\n\n\
         Contenu textuel de la page :\n{body}\n\n\
         {fields}",
        head = head_html,
        body = body_text,
        fields = FIELD_INSTRUCTIONS,
    );

    (String::from(SYSTEM_PROMPT), user_prompt)
}
|
|
|
|
|
|
|
|
|
|
|
|
/// Build a prompt for classifying scraped articles into categories.
|
|
|
|
/// Build a prompt for classifying scraped articles into categories.
|
|
|
|
///
|
|
|
|
///
|
|
|
|
/// # Arguments
|
|
|
|
/// # Arguments
|
|
|
|
@ -462,4 +511,27 @@ mod tests {
|
|
|
|
let (_, user_prompt) = build_search_prompt(&settings, &sources, date, &[], None);
|
|
|
|
let (_, user_prompt) = build_search_prompt(&settings, &sources, date, &[], None);
|
|
|
|
assert!(user_prompt.contains("exactement"));
|
|
|
|
assert!(user_prompt.contains("exactement"));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#[test]
fn link_extraction_prompt_includes_html() {
    // The head markup must be carried into the user prompt, and both prompts
    // must mention what the model is asked to look for.
    let head_markup = "<title>Blog</title>";
    let body_markup = "<a href='/post'>P</a>";

    let (system_prompt, user_prompt) = build_link_extraction_prompt(head_markup, body_markup);

    assert!(user_prompt.contains(head_markup));
    assert!(user_prompt.contains("articles"));
    assert!(system_prompt.contains("liens"));
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#[test]
fn link_extraction_prompt_truncates_body() {
    let long_body = "x".repeat(20000);
    let (_, user) = build_link_extraction_prompt("", &long_body);

    // Coarse upper bound on the overall prompt size.
    assert!(user.len() < 15000);

    // Exact check: the embedded excerpt sits on its own line, so a run of
    // exactly 8000 `x` delimited by newlines only matches when the body was
    // cut to 8000 characters — no more, no fewer.
    let expected_excerpt = format!("\n{}\n", "x".repeat(8000));
    assert!(user.contains(&expected_excerpt));
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#[test]
fn article_extraction_prompt_includes_content() {
    // The page text and the names of the requested fields must all appear in
    // the user prompt.
    let page_text = "Article body here";
    let (_, user_prompt) = build_article_extraction_prompt("<meta name='date'>", page_text);

    for expected in [page_text, "published_date", "is_error_page"] {
        assert!(user_prompt.contains(expected));
    }
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|