You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

303 lines
9.9 KiB
Rust

//! JSON Schema builder for structured LLM output.
//!
//! Constructs the JSON Schema that is passed to the LLM provider
//! to enforce structured output matching the user's categories.
use serde_json::Value;
/// Build a JSON Schema for structured output based on user categories.
///
/// Each category is mapped, in input order, to a property named `category_0`,
/// `category_1`, etc. Each property is an array of news items with `title`,
/// `url`, and `summary` fields, and carries the category name as its
/// `description`.
///
/// Every object in the schema sets `additionalProperties` to `false` and lists
/// all of its properties under `required`.
///
/// # Arguments
///
/// * `categories` - Category names. An empty slice yields a schema with no
///   properties and an empty `required` array.
/// * `max_items_per_category` - Item count written to both `minItems` and
///   `maxItems` of every category array, so each array is constrained to
///   exactly this length. Negative values are clamped to `0`, because JSON
///   Schema only accepts non-negative integers for these keywords.
///
/// # Example
///
/// For categories `["Major Announcements", "Research"]` and
/// `max_items_per_category = 5`, produces:
/// ```json
/// {
///   "type": "object",
///   "properties": {
///     "category_0": {
///       "type": "array",
///       "description": "Major Announcements",
///       "items": {
///         "type": "object",
///         "properties": {
///           "title": { "type": "string", "description": "The title of the news article" },
///           "url": { "type": "string", "description": "The URL of the source article" },
///           "summary": { "type": "string", "description": "A concise summary of the article" }
///         },
///         "required": ["title", "url", "summary"],
///         "additionalProperties": false
///       },
///       "minItems": 5,
///       "maxItems": 5
///     },
///     "category_1": { ... }
///   },
///   "required": ["category_0", "category_1"],
///   "additionalProperties": false
/// }
/// ```
pub fn build_category_schema(categories: &[String], max_items_per_category: i32) -> Value {
    // `minItems` / `maxItems` must be non-negative integers; a negative count
    // would make the whole schema invalid, so treat it as zero.
    let item_count = max_items_per_category.max(0);

    // Schema shared by the elements of every category array.
    let news_item_schema = serde_json::json!({
        "type": "object",
        "properties": {
            "title": {
                "type": "string",
                "description": "The title of the news article"
            },
            "url": {
                "type": "string",
                "description": "The URL of the source article"
            },
            "summary": {
                "type": "string",
                "description": "A concise summary of the article"
            }
        },
        "required": ["title", "url", "summary"],
        "additionalProperties": false
    });

    let mut properties = serde_json::Map::new();
    let mut required = Vec::with_capacity(categories.len());

    for (i, category_name) in categories.iter().enumerate() {
        // Property keys are positional; the human-readable category name is
        // carried in the `description` only.
        let key = format!("category_{}", i);
        properties.insert(
            key.clone(),
            serde_json::json!({
                "type": "array",
                "description": category_name,
                "items": news_item_schema,
                "minItems": item_count,
                "maxItems": item_count
            }),
        );
        required.push(Value::String(key));
    }

    serde_json::json!({
        "type": "object",
        "properties": properties,
        "required": required,
        "additionalProperties": false
    })
}
/// Build the JSON Schema describing an article classification response.
///
/// The schema is an object with a single required `assignments` array. Each
/// element of that array is an object with an integer `index` and a string
/// `category`, both required. Neither the root object nor the assignment
/// objects allow additional properties.
pub fn build_classification_schema() -> Value {
    // One (article index, category name) pair.
    let assignment = serde_json::json!({
        "type": "object",
        "properties": {
            "index": { "type": "integer", "description": "Article index from the input list" },
            "category": { "type": "string", "description": "Category name to assign this article to" }
        },
        "required": ["index", "category"],
        "additionalProperties": false
    });

    // The list of pairs returned for the whole input.
    let assignments = serde_json::json!({
        "type": "array",
        "items": assignment
    });

    serde_json::json!({
        "type": "object",
        "properties": {
            "assignments": assignments
        },
        "required": ["assignments"],
        "additionalProperties": false
    })
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Turns string literals into the owned category names the builder takes.
    fn owned(names: &[&str]) -> Vec<String> {
        names.iter().map(|name| name.to_string()).collect()
    }

    /// Reads a JSON array of strings as a vector of string slices.
    fn string_items(value: &Value) -> Vec<&str> {
        value
            .as_array()
            .unwrap()
            .iter()
            .map(|entry| entry.as_str().unwrap())
            .collect()
    }

    /// Lists the keys of a JSON object.
    fn object_keys(value: &Value) -> Vec<&str> {
        value
            .as_object()
            .unwrap()
            .keys()
            .map(|key| key.as_str())
            .collect()
    }

    #[test]
    fn schema_with_one_category() {
        let schema = build_category_schema(&owned(&["AI News"]), 5);
        assert_eq!(schema["type"], "object");

        // Exactly one property, keyed by position.
        let props = schema["properties"].as_object().unwrap();
        assert_eq!(props.len(), 1);
        assert!(props.contains_key("category_0"));

        // The category name is carried as the description of an array property.
        let category = &props["category_0"];
        assert_eq!(category["description"], "AI News");
        assert_eq!(category["type"], "array");

        // Array elements are objects exposing the three news item fields.
        let items = &category["items"];
        assert_eq!(items["type"], "object");
        for field in ["title", "url", "summary"] {
            assert!(items["properties"].get(field).is_some());
        }

        // The single category is required.
        assert_eq!(string_items(&schema["required"]), ["category_0"]);

        // OpenAI strict mode: additionalProperties must be false on all objects.
        assert_eq!(schema["additionalProperties"], false);
        assert_eq!(items["additionalProperties"], false);
    }

    #[test]
    fn schema_with_three_categories() {
        let names = ["Annonces majeures", "Recherche", "Secteur public"];
        let schema = build_category_schema(&owned(&names), 5);

        let props = schema["properties"].as_object().unwrap();
        assert_eq!(props.len(), 3);
        for (i, name) in names.iter().enumerate() {
            let key = format!("category_{}", i);
            assert_eq!(props[&key]["description"], *name);
        }

        assert_eq!(schema["required"].as_array().unwrap().len(), 3);
    }

    #[test]
    fn schema_with_five_categories() {
        let mut categories = Vec::new();
        for i in 0..5 {
            categories.push(format!("Category {}", i));
        }
        let schema = build_category_schema(&categories, 5);

        let props = schema["properties"].as_object().unwrap();
        assert_eq!(props.len(), 5);
        for (i, expected) in categories.iter().enumerate() {
            let key = format!("category_{}", i);
            assert!(props.contains_key(&key));
            assert_eq!(props[&key]["description"].as_str(), Some(expected.as_str()));
        }

        assert_eq!(schema["required"].as_array().unwrap().len(), 5);
    }

    #[test]
    fn schema_with_empty_categories() {
        let no_categories: Vec<String> = Vec::new();
        let schema = build_category_schema(&no_categories, 5);

        assert!(schema["properties"].as_object().unwrap().is_empty());
        assert!(schema["required"].as_array().unwrap().is_empty());
    }

    #[test]
    fn schema_news_item_has_required_fields() {
        let schema = build_category_schema(&owned(&["Test"]), 5);

        let item_required = string_items(&schema["properties"]["category_0"]["items"]["required"]);
        for field in ["title", "url", "summary"] {
            assert!(item_required.contains(&field));
        }
    }

    #[test]
    fn schema_meets_openai_strict_mode_requirements() {
        let schema = build_category_schema(&owned(&["Test", "Other"]), 5);
        let items = &schema["properties"]["category_0"]["items"];

        // Every "type": "object" must have "additionalProperties": false.
        assert_eq!(
            schema["additionalProperties"], false,
            "Root object must have additionalProperties: false"
        );
        assert_eq!(
            items["additionalProperties"], false,
            "News item object must have additionalProperties: false"
        );

        // All root properties must be listed in required.
        let required = string_items(&schema["required"]);
        for prop in object_keys(&schema["properties"]) {
            assert!(
                required.contains(&prop),
                "Property '{}' must be in required array",
                prop
            );
        }

        // News item required fields must match its properties.
        let item_required = string_items(&items["required"]);
        for prop in object_keys(&items["properties"]) {
            assert!(
                item_required.contains(&prop),
                "News item property '{}' must be in required array",
                prop
            );
        }
    }

    #[test]
    fn schema_with_special_characters_in_category_name() {
        let names = ["AI & Machine Learning", "R&D / Innovation"];
        let schema = build_category_schema(&owned(&names), 5);

        let props = schema["properties"].as_object().unwrap();
        assert_eq!(props["category_0"]["description"], "AI & Machine Learning");
        assert_eq!(props["category_1"]["description"], "R&D / Innovation");
    }

    #[test]
    fn classification_schema_has_assignments_array() {
        let schema = build_classification_schema();
        assert_eq!(schema["type"], "object");

        let assignments = &schema["properties"]["assignments"];
        assert_eq!(assignments["type"], "array");

        let item = &assignments["items"];
        for field in ["index", "category"] {
            assert!(item["properties"].get(field).is_some());
        }

        assert_eq!(item["additionalProperties"], false);
        assert_eq!(schema["additionalProperties"], false);
    }
}