diff --git a/backend/src/services/llm/schema.rs b/backend/src/services/llm/schema.rs
index f137b92..17a244d 100644
--- a/backend/src/services/llm/schema.rs
+++ b/backend/src/services/llm/schema.rs
@@ -107,6 +107,44 @@ pub fn build_classification_schema() -> Value {
     })
 }
 
+/// Build the JSON Schema for the LLM link extraction response.
+///
+/// The schema describes an object with a single required property, `urls`
+/// (an array of strings), and forbids any additional properties.
+pub fn build_link_extraction_schema() -> Value {
+    serde_json::json!({
+        "type": "object",
+        "properties": {
+            "urls": {
+                "type": "array",
+                "items": { "type": "string" }
+            }
+        },
+        "required": ["urls"],
+        "additionalProperties": false
+    })
+}
+
+/// Build the JSON Schema for the LLM article content extraction response.
+///
+/// The schema describes an object with four required properties (`title`,
+/// `published_date` and `body_text` as strings, `is_error_page` as a boolean)
+/// and forbids any additional properties. A missing publication date is
+/// represented by an empty string, as stated in the property description.
+pub fn build_article_extraction_schema() -> Value {
+    serde_json::json!({
+        "type": "object",
+        "properties": {
+            "title": { "type": "string", "description": "Article title" },
+            "published_date": { "type": "string", "description": "ISO 8601 date or empty string if not found" },
+            "body_text": { "type": "string", "description": "Main article content" },
+            "is_error_page": { "type": "boolean", "description": "True if this is an error/404 page" }
+        },
+        "required": ["title", "published_date", "body_text", "is_error_page"],
+        "additionalProperties": false
+    })
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -299,4 +337,32 @@ mod tests {
         assert_eq!(assignments["items"]["additionalProperties"], false);
         assert_eq!(schema["additionalProperties"], false);
     }
+
+    #[test]
+    fn link_extraction_schema_has_urls_array() {
+        let schema = build_link_extraction_schema();
+        assert_eq!(schema["properties"]["urls"]["type"], "array");
+        assert_eq!(schema["properties"]["urls"]["items"]["type"], "string");
+        assert_eq!(schema["required"], serde_json::json!(["urls"]));
+        assert_eq!(schema["additionalProperties"], false);
+    }
+
+    #[test]
+    fn article_extraction_schema_strict_mode_compatible() {
+        let schema = build_article_extraction_schema();
+        let props = schema["properties"].as_object().unwrap();
+        assert!(props.contains_key("title"));
+        assert!(props.contains_key("published_date"));
+        assert!(props.contains_key("body_text"));
+        assert!(props.contains_key("is_error_page"));
+        assert_eq!(schema["additionalProperties"], false);
+        // published_date is string (not union type) for OpenAI strict mode
+        assert_eq!(props["published_date"]["type"], "string");
+        // Strict mode also needs every declared property listed in `required`.
+        let required = schema["required"].as_array().unwrap();
+        assert_eq!(required.len(), props.len());
+        for key in props.keys() {
+            assert!(required.iter().any(|name| name.as_str() == Some(key.as_str())));
+        }
+    }
 }
diff --git a/backend/src/services/prompts.rs b/backend/src/services/prompts.rs
index 6d13d47..5659664 100644
--- a/backend/src/services/prompts.rs
+++ b/backend/src/services/prompts.rs
@@ -156,6 +156,83 @@ pub fn build_rewrite_prompt(
     (system_prompt, user_prompt)
 }
 
+/// Build a prompt for LLM-assisted link extraction from a source page.
+///
+/// # Arguments
+/// * `head_html` - text placed verbatim in the `{head}` slot of the user prompt
+/// * `body_html` - text placed in the `{body}` slot, cut to at most 8000
+///   characters (Unicode scalar values, not bytes)
+///
+/// # Returns
+/// A `(system_prompt, user_prompt)` tuple; both prompts are written in French.
+pub fn build_link_extraction_prompt(head_html: &str, body_html: &str) -> (String, String) {
+    // Upper bound on how many characters of `body_html` reach the prompt.
+    const MAX_BODY_HTML_CHARS: usize = 8000;
+
+    let system_prompt =
+        "Tu es un assistant qui analyse des pages web. \
+         Tu dois identifier les liens vers des articles d'actualite. \
+         Reponds uniquement au format JSON demande."
+            .to_string();
+
+    // Borrow the prefix instead of collecting a new `String`: the byte offset
+    // of the first character past the limit is always a valid char boundary.
+    let body_truncated = body_html
+        .char_indices()
+        .nth(MAX_BODY_HTML_CHARS)
+        .map_or(body_html, |(cut, _)| &body_html[..cut]);
+
+    // NOTE(review): the `{head}` and `{body}` slots are separated only by blank
+    // lines; wrapper markup may be missing here - confirm the intended delimiters.
+    let user_prompt = format!(
+        "Voici le contenu HTML d'une page de blog ou de site d'actualites.\n\n\
+         \n{head}\n\n\n\
+         \n{body}\n\n\n\
+         Extrais UNIQUEMENT les URLs qui pointent vers des articles \
+         (pas les liens de navigation, tags, categories, login, pages statiques, etc.).\n\
+         Retourne les URLs completes dans le format JSON demande.",
+        head = head_html,
+        body = body_truncated,
+    );
+
+    (system_prompt, user_prompt)
+}
+
+/// Build a prompt for LLM-assisted article content extraction.
+///
+/// # Arguments
+/// * `head_html` - text placed verbatim in the `{head}` slot of the user prompt
+/// * `body_text` - page text placed verbatim after the "Contenu textuel" label;
+///   no truncation is applied here
+///
+/// # Returns
+/// A `(system_prompt, user_prompt)` tuple; both prompts are written in French.
+pub fn build_article_extraction_prompt(head_html: &str, body_text: &str) -> (String, String) {
+    let system_prompt =
+        "Tu es un assistant qui analyse des articles web. \
+         Tu dois extraire les informations structurees de l'article. \
+         Reponds uniquement au format JSON demande."
+            .to_string();
+
+    // NOTE(review): the `{head}` slot is surrounded only by blank lines;
+    // wrapper markup may be missing here - confirm the intended delimiters.
+    let user_prompt = format!(
+        "Voici le contenu d'une page web.\n\n\
+         \n{head}\n\n\n\
+         Contenu textuel de la page :\n{body}\n\n\
+         Extrais les informations suivantes :\n\
+         - title : le titre de l'article\n\
+         - published_date : la date de publication au format ISO 8601 (YYYY-MM-DDTHH:MM:SSZ), \
+         ou une chaine vide si introuvable\n\
+         - body_text : le contenu principal de l'article (pas la navigation, pas les pubs)\n\
+         - is_error_page : true si c'est une page d'erreur/404, false sinon",
+        head = head_html,
+        body = body_text,
+    );
+
+    (system_prompt, user_prompt)
+}
+
 /// Build a prompt for classifying scraped articles into categories.
 ///
 /// # Arguments
@@ -462,4 +539,38 @@ mod tests {
         let (_, user_prompt) = build_search_prompt(&settings, &sources, date, &[], None);
         assert!(user_prompt.contains("exactement"));
     }
+
+    #[test]
+    fn link_extraction_prompt_includes_html() {
+        let (sys, user) = build_link_extraction_prompt("Blog", "P");
+        assert!(user.contains("Blog"));
+        assert!(user.contains("articles"));
+        assert!(sys.contains("liens"));
+    }
+
+    #[test]
+    fn link_extraction_prompt_truncates_body() {
+        let long_body = "x".repeat(20000);
+        let (_, user) = build_link_extraction_prompt("", &long_body);
+        assert!(user.len() < 15000);
+        // Exactly the first 8000 characters of the body survive.
+        assert!(user.contains(&"x".repeat(8000)));
+        assert!(!user.contains(&"x".repeat(8001)));
+    }
+
+    #[test]
+    fn link_extraction_prompt_truncates_on_char_boundary() {
+        // Multi-byte input: the limit counts characters and must not panic.
+        let long_body = "é".repeat(9000);
+        let (_, user) = build_link_extraction_prompt("", &long_body);
+        assert_eq!(user.matches('é').count(), 8000);
+    }
+
+    #[test]
+    fn article_extraction_prompt_includes_content() {
+        let (_, user) = build_article_extraction_prompt("", "Article body here");
+        assert!(user.contains("Article body here"));
+        assert!(user.contains("published_date"));
+        assert!(user.contains("is_error_page"));
+    }
 }