You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
303 lines
9.9 KiB
Rust
303 lines
9.9 KiB
Rust
//! JSON Schema builder for structured LLM output.
|
|
//!
|
|
//! Constructs the JSON Schema that is passed to the LLM provider
|
|
//! to enforce structured output matching the user's categories.
|
|
|
|
use serde_json::Value;
|
|
|
|
/// Build a JSON Schema for structured output based on user categories.
|
|
///
|
|
/// Each category is mapped to a property named `category_0`, `category_1`, etc.
|
|
/// Each property is an array of news items with `title`, `url`, and `summary` fields.
|
|
///
|
|
/// # Example
|
|
///
|
|
/// For categories `["Major Announcements", "Research"]`, produces:
|
|
/// ```json
|
|
/// {
|
|
/// "type": "object",
|
|
/// "properties": {
|
|
/// "category_0": {
|
|
/// "type": "array",
|
|
/// "description": "Major Announcements",
|
|
/// "items": {
|
|
/// "type": "object",
|
|
/// "properties": {
|
|
/// "title": { "type": "string" },
|
|
/// "url": { "type": "string" },
|
|
/// "summary": { "type": "string" }
|
|
/// },
|
|
/// "required": ["title", "url", "summary"]
|
|
/// }
|
|
/// },
|
|
/// "category_1": { ... }
|
|
/// },
|
|
/// "required": ["category_0", "category_1"]
|
|
/// }
|
|
/// ```
|
|
pub fn build_category_schema(categories: &[String], max_items_per_category: i32) -> Value {
|
|
let news_item_schema = serde_json::json!({
|
|
"type": "object",
|
|
"properties": {
|
|
"title": {
|
|
"type": "string",
|
|
"description": "The title of the news article"
|
|
},
|
|
"url": {
|
|
"type": "string",
|
|
"description": "The URL of the source article"
|
|
},
|
|
"summary": {
|
|
"type": "string",
|
|
"description": "A concise summary of the article"
|
|
}
|
|
},
|
|
"required": ["title", "url", "summary"],
|
|
"additionalProperties": false
|
|
});
|
|
|
|
let mut properties = serde_json::Map::new();
|
|
let mut required = Vec::new();
|
|
|
|
for (i, category_name) in categories.iter().enumerate() {
|
|
let key = format!("category_{}", i);
|
|
properties.insert(
|
|
key.clone(),
|
|
serde_json::json!({
|
|
"type": "array",
|
|
"description": category_name,
|
|
"items": news_item_schema,
|
|
"minItems": max_items_per_category,
|
|
"maxItems": max_items_per_category
|
|
}),
|
|
);
|
|
required.push(Value::String(key));
|
|
}
|
|
|
|
serde_json::json!({
|
|
"type": "object",
|
|
"properties": properties,
|
|
"required": required,
|
|
"additionalProperties": false
|
|
})
|
|
}
|
|
|
|
/// Build a JSON Schema for the article classification response.
|
|
///
|
|
/// The LLM returns an array of assignments mapping article indices to category names.
|
|
pub fn build_classification_schema() -> Value {
|
|
serde_json::json!({
|
|
"type": "object",
|
|
"properties": {
|
|
"assignments": {
|
|
"type": "array",
|
|
"items": {
|
|
"type": "object",
|
|
"properties": {
|
|
"index": { "type": "integer", "description": "Article index from the input list" },
|
|
"category": { "type": "string", "description": "Category name to assign this article to" }
|
|
},
|
|
"required": ["index", "category"],
|
|
"additionalProperties": false
|
|
}
|
|
}
|
|
},
|
|
"required": ["assignments"],
|
|
"additionalProperties": false
|
|
})
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Item fields every news entry is expected to expose.
    const NEWS_ITEM_FIELDS: [&str; 3] = ["title", "url", "summary"];

    /// Returns the entries of a JSON array of strings; panics on any other shape.
    fn string_entries(value: &Value) -> Vec<&str> {
        value
            .as_array()
            .unwrap()
            .iter()
            .map(|entry| entry.as_str().unwrap())
            .collect()
    }

    /// Returns the key names of a JSON object; panics if `value` is not an object.
    fn property_names(value: &Value) -> Vec<&str> {
        value
            .as_object()
            .unwrap()
            .keys()
            .map(String::as_str)
            .collect()
    }

    #[test]
    fn schema_with_one_category() {
        let schema = build_category_schema(&["AI News".to_string()], 5);

        assert_eq!(schema["type"], "object");

        // Exactly one property, keyed by position.
        let properties = schema["properties"].as_object().unwrap();
        assert_eq!(properties.len(), 1);
        assert!(properties.contains_key("category_0"));

        // The category name travels in the description.
        let category = &properties["category_0"];
        assert_eq!(category["description"], "AI News");

        // The property is an array of news-item objects.
        assert_eq!(category["type"], "array");
        let item = &category["items"];
        assert_eq!(item["type"], "object");
        for field in NEWS_ITEM_FIELDS.iter() {
            assert!(item["properties"].get(*field).is_some());
        }

        // The single category is required.
        assert_eq!(schema["required"], serde_json::json!(["category_0"]));

        // OpenAI strict mode: additionalProperties must be false on all objects
        assert_eq!(schema["additionalProperties"], false);
        assert_eq!(item["additionalProperties"], false);
    }

    #[test]
    fn schema_with_three_categories() {
        let names = ["Annonces majeures", "Recherche", "Secteur public"];
        let categories: Vec<String> = names.iter().map(|name| name.to_string()).collect();
        let schema = build_category_schema(&categories, 5);

        let properties = schema["properties"].as_object().unwrap();
        assert_eq!(properties.len(), 3);
        for (i, name) in names.iter().enumerate() {
            let key = format!("category_{}", i);
            assert_eq!(properties[&key]["description"], *name);
        }

        assert_eq!(schema["required"].as_array().unwrap().len(), 3);
    }

    #[test]
    fn schema_with_five_categories() {
        let categories: Vec<String> = (0..5).map(|i| format!("Category {}", i)).collect();
        let schema = build_category_schema(&categories, 5);

        let properties = schema["properties"].as_object().unwrap();
        assert_eq!(properties.len(), 5);

        for (i, name) in categories.iter().enumerate() {
            let key = format!("category_{}", i);
            assert!(properties.contains_key(&key));
            assert_eq!(
                properties[&key]["description"].as_str(),
                Some(name.as_str())
            );
        }

        assert_eq!(schema["required"].as_array().unwrap().len(), 5);
    }

    #[test]
    fn schema_with_empty_categories() {
        let schema = build_category_schema(&[], 5);

        assert!(schema["properties"].as_object().unwrap().is_empty());
        assert!(schema["required"].as_array().unwrap().is_empty());
    }

    #[test]
    fn schema_news_item_has_required_fields() {
        let schema = build_category_schema(&["Test".to_string()], 5);

        let item = &schema["properties"]["category_0"]["items"];
        let required_fields = string_entries(&item["required"]);

        for field in NEWS_ITEM_FIELDS.iter() {
            assert!(required_fields.contains(field));
        }
    }

    #[test]
    fn schema_meets_openai_strict_mode_requirements() {
        let schema = build_category_schema(&["Test".to_string(), "Other".to_string()], 5);
        let item = &schema["properties"]["category_0"]["items"];

        // Every "type": "object" must have "additionalProperties": false
        assert_eq!(
            schema["additionalProperties"], false,
            "Root object must have additionalProperties: false"
        );
        assert_eq!(
            item["additionalProperties"], false,
            "News item object must have additionalProperties: false"
        );

        // All root properties must be listed in required
        let root_properties = property_names(&schema["properties"]);
        let root_required = string_entries(&schema["required"]);
        for name in root_properties {
            assert!(
                root_required.contains(&name),
                "Property '{}' must be in required array",
                name
            );
        }

        // News item required fields must match properties
        let item_properties = property_names(&item["properties"]);
        let item_required = string_entries(&item["required"]);
        for name in item_properties {
            assert!(
                item_required.contains(&name),
                "News item property '{}' must be in required array",
                name
            );
        }
    }

    #[test]
    fn schema_with_special_characters_in_category_name() {
        let names = ["AI & Machine Learning", "R&D / Innovation"];
        let categories: Vec<String> = names.iter().map(|name| name.to_string()).collect();
        let schema = build_category_schema(&categories, 5);

        // Names are carried verbatim, without escaping or normalization.
        let properties = schema["properties"].as_object().unwrap();
        assert_eq!(properties["category_0"]["description"], "AI & Machine Learning");
        assert_eq!(properties["category_1"]["description"], "R&D / Innovation");
    }

    #[test]
    fn classification_schema_has_assignments_array() {
        let schema = build_classification_schema();
        assert_eq!(schema["type"], "object");

        let assignments = &schema["properties"]["assignments"];
        assert_eq!(assignments["type"], "array");

        let entry = &assignments["items"];
        for field in ["index", "category"].iter() {
            assert!(entry["properties"].get(*field).is_some());
        }

        assert_eq!(entry["additionalProperties"], false);
        assert_eq!(schema["additionalProperties"], false);
    }
}
|