feat: add LLM prompts and schemas for link and article extraction

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
master
oabrivard 3 months ago
parent 23f121a58d
commit e6e8aa1eeb

@ -107,6 +107,36 @@ pub fn build_classification_schema() -> Value {
}) })
} }
/// Returns the JSON Schema that constrains the LLM's link-extraction response.
///
/// The schema describes an object with a single required `urls` property,
/// which is an array of strings. `additionalProperties` is `false`, so no
/// other top-level keys are permitted.
pub fn build_link_extraction_schema() -> Value {
    // Each element of `urls` is a plain string.
    let url_item = serde_json::json!({ "type": "string" });

    let urls = serde_json::json!({
        "type": "array",
        "items": url_item
    });

    serde_json::json!({
        "type": "object",
        "properties": { "urls": urls },
        "required": ["urls"],
        "additionalProperties": false
    })
}
/// Returns the JSON Schema that constrains the LLM's article-extraction response.
///
/// The schema describes an object with four properties, all listed in
/// `required`: `title`, `published_date` and `body_text` (strings) and
/// `is_error_page` (boolean). `published_date` is a plain string whose
/// description asks for an empty string when no date is found.
/// `additionalProperties` is `false`.
pub fn build_article_extraction_schema() -> Value {
    // The three textual fields share the same shape and differ only in description.
    let string_field = |description: &str| {
        serde_json::json!({ "type": "string", "description": description })
    };

    serde_json::json!({
        "type": "object",
        "properties": {
            "title": string_field("Article title"),
            "published_date": string_field("ISO 8601 date or empty string if not found"),
            "body_text": string_field("Main article content"),
            "is_error_page": {
                "type": "boolean",
                "description": "True if this is an error/404 page"
            }
        },
        "required": ["title", "published_date", "body_text", "is_error_page"],
        "additionalProperties": false
    })
}
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;
@ -299,4 +329,24 @@ mod tests {
assert_eq!(assignments["items"]["additionalProperties"], false); assert_eq!(assignments["items"]["additionalProperties"], false);
assert_eq!(schema["additionalProperties"], false); assert_eq!(schema["additionalProperties"], false);
} }
/// The link-extraction schema exposes `urls` as an array and forbids extra keys.
#[test]
fn link_extraction_schema_has_urls_array() {
    let link_schema = build_link_extraction_schema();
    let urls_property = &link_schema["properties"]["urls"];

    assert_eq!(urls_property["type"], "array");
    assert_eq!(link_schema["additionalProperties"], false);
}
/// The article-extraction schema satisfies the constraints of OpenAI strict mode:
/// every property is present and listed in `required`, extra keys are
/// forbidden, and `published_date` is a plain string rather than a union type.
#[test]
fn article_extraction_schema_strict_mode_compatible() {
    let schema = build_article_extraction_schema();
    let props = schema["properties"]
        .as_object()
        .expect("`properties` must be a JSON object");

    for key in ["title", "published_date", "body_text", "is_error_page"] {
        assert!(props.contains_key(key), "missing property `{}`", key);
    }
    assert_eq!(schema["additionalProperties"], false);

    // Strict mode rejects schemas where a property is not listed in `required`,
    // so the two sets must match exactly.
    let required: Vec<&str> = schema["required"]
        .as_array()
        .expect("`required` must be a JSON array")
        .iter()
        .filter_map(|entry| entry.as_str())
        .collect();
    assert_eq!(
        required.len(),
        props.len(),
        "`required` and `properties` must list the same fields"
    );
    for name in props.keys() {
        assert!(
            required.contains(&name.as_str()),
            "property `{}` is not listed in `required`",
            name
        );
    }

    // published_date is string (not union type) for OpenAI strict mode
    assert_eq!(props["published_date"]["type"], "string");
}
} }

@ -156,6 +156,55 @@ pub fn build_rewrite_prompt(
(system_prompt, user_prompt) (system_prompt, user_prompt)
} }
/// Builds the `(system_prompt, user_prompt)` pair asking an LLM to extract
/// article links from a source page.
///
/// # Arguments
///
/// * `head_html` - content placed between the `<head>` markers, inserted as-is.
/// * `body_html` - content placed between the body markers; only its first
///   8000 characters are included.
///
/// # Returns
///
/// A tuple of the fixed French system prompt and the user prompt embedding
/// both HTML fragments followed by the extraction instructions.
pub fn build_link_extraction_prompt(head_html: &str, body_html: &str) -> (String, String) {
    // Maximum number of characters (not bytes) of the body included in the prompt.
    const MAX_BODY_CHARS: usize = 8000;

    let system = String::from(
        "Tu es un assistant qui analyse des pages web. \
         Tu dois identifier les liens vers des articles d'actualite. \
         Reponds uniquement au format JSON demande.",
    );

    // Cut on a character boundary: the byte offset of the char at index
    // MAX_BODY_CHARS is exactly where the first MAX_BODY_CHARS chars end.
    let body_excerpt = body_html
        .char_indices()
        .nth(MAX_BODY_CHARS)
        .map_or(body_html, |(cut, _)| &body_html[..cut]);

    let user = [
        "Voici le contenu HTML d'une page de blog ou de site d'actualites.\n\n",
        "<head>\n",
        head_html,
        "\n</head>\n\n",
        "<body (extrait)>\n",
        body_excerpt,
        "\n</body>\n\n",
        "Extrais UNIQUEMENT les URLs qui pointent vers des articles ",
        "(pas les liens de navigation, tags, categories, login, pages statiques, etc.).\n",
        "Retourne les URLs completes dans le format JSON demande.",
    ]
    .concat();

    (system, user)
}
/// Builds the `(system_prompt, user_prompt)` pair asking an LLM to extract
/// structured article data from a page.
///
/// # Arguments
///
/// * `head_html` - content placed between the `<head>` markers, inserted as-is.
/// * `body_text` - textual content of the page, inserted as-is.
///
/// # Returns
///
/// A tuple of the fixed French system prompt and the user prompt, which
/// embeds both inputs and then lists the four fields to extract: `title`,
/// `published_date`, `body_text` and `is_error_page`.
pub fn build_article_extraction_prompt(head_html: &str, body_text: &str) -> (String, String) {
    let system = String::from(
        "Tu es un assistant qui analyse des articles web. \
         Tu dois extraire les informations structurees de l'article. \
         Reponds uniquement au format JSON demande.",
    );

    let user = [
        "Voici le contenu d'une page web.\n\n",
        "<head>\n",
        head_html,
        "\n</head>\n\n",
        "Contenu textuel de la page :\n",
        body_text,
        "\n\n",
        "Extrais les informations suivantes :\n",
        "- title : le titre de l'article\n",
        "- published_date : la date de publication au format ISO 8601 (YYYY-MM-DDTHH:MM:SSZ), ",
        "ou une chaine vide si introuvable\n",
        "- body_text : le contenu principal de l'article (pas la navigation, pas les pubs)\n",
        "- is_error_page : true si c'est une page d'erreur/404, false sinon",
    ]
    .concat();

    (system, user)
}
/// Build a prompt for classifying scraped articles into categories. /// Build a prompt for classifying scraped articles into categories.
/// ///
/// # Arguments /// # Arguments
@ -462,4 +511,27 @@ mod tests {
let (_, user_prompt) = build_search_prompt(&settings, &sources, date, &[], None); let (_, user_prompt) = build_search_prompt(&settings, &sources, date, &[], None);
assert!(user_prompt.contains("exactement")); assert!(user_prompt.contains("exactement"));
} }
/// The link-extraction prompts embed the supplied head HTML and mention
/// articles (user prompt) and links (system prompt).
#[test]
fn link_extraction_prompt_includes_html() {
    let head = "<title>Blog</title>";
    let body = "<a href='/post'>P</a>";

    let (system_prompt, user_prompt) = build_link_extraction_prompt(head, body);

    assert!(user_prompt.contains(head));
    assert!(user_prompt.contains("articles"));
    assert!(system_prompt.contains("liens"));
}
/// A 20000-character body must not make it into the user prompt in full.
#[test]
fn link_extraction_prompt_truncates_body() {
    let oversized_body = "x".repeat(20000);

    let (_, user_prompt) = build_link_extraction_prompt("", &oversized_body);

    // Upper bound on the prompt size in bytes once the body has been cut.
    assert!(user_prompt.len() < 15000);
}
/// The article-extraction user prompt embeds the page text and names the
/// `published_date` and `is_error_page` fields.
#[test]
fn article_extraction_prompt_includes_content() {
    let page_text = "Article body here";

    let (_, user_prompt) = build_article_extraction_prompt("<meta name='date'>", page_text);

    assert!(user_prompt.contains(page_text));
    assert!(user_prompt.contains("published_date"));
    assert!(user_prompt.contains("is_error_page"));
}
} }

Loading…
Cancel
Save