use anyhow::{Context, Result}; use serde::{Deserialize, Serialize}; use std::process::Command; pub fn classify(tag_tree: &str, content: String) -> Result { let prompt = format!("You are a resource classifier. Given a hierarchical tag tree and a resource, classify it into 1-3 most specific applicable tags. # RULES: - Each level down = narrower specialization - Assign MOST SPECIFIC tags that fit (prefer leaf nodes when appropriate) - If no good fit exists, suggest new tag(s) with proposed location in tree - Output JSON only # CURRENT TAG TREE: {tag_tree} # RESOURCE INFORMATION: {content} # OUTPUT FORMAT: {{ \"tags\": [\"path/to/tag1\", \"path/to/tag2\"], \"confidence\": [0.95, 0.87], \"new_tags\": [ {{ \"name\": \"suggested_tag\", \"parent\": \"path/to/parent\", \"reason\": \"why this tag is needed\" }} ], \"reasoning\": \"brief explanation of classification\" }}"); let out = Command::new("codex") .arg("e") .arg(prompt) .output() .with_context(|| "Failed to execute classification command")?; println!("Output: {:?}", out); Ok(String::from_utf8_lossy(&out.stdout).to_string()) } pub fn classify_with_retry( tag_tree: &str, content: String, max_attempts: u32, ) -> Result { for attempt in 1..=max_attempts { match classify(tag_tree, content.clone()) { Ok(json) => match ClassificationResult::from_json(&json) { Ok(result) => return Ok(result), Err(e) => { eprintln!( "Attempt {}/{}: Failed to parse: {}", attempt, max_attempts, e ); eprintln!("Raw response: {}", json); if attempt == max_attempts { return Err(e.into()); } } }, Err(e) => { eprintln!( "Attempt {}/{}: LLM call failed: {}", attempt, max_attempts, e ); if attempt == max_attempts { return Err(e); } } } } unreachable!() } // Yeah #[derive(Debug, Serialize, Deserialize)] pub struct ClassificationResult { #[serde(default)] pub tags: Vec, #[serde(default)] pub confidence: Vec, #[serde(default)] pub new_tags: Vec, #[serde(default)] pub reasoning: String, } #[derive(Debug, Serialize, Deserialize)] pub struct NewTagSuggestion { pub name: String, pub parent: String, pub reason: String, } impl ClassificationResult { /// Parse from the JSON string returned by the LLM pub fn from_json(json_str: &str) -> Result { serde_json::from_str(json_str) } /// Get the most confident tag (if any exist) pub fn primary_tag(&self) -> Option<(&str, f32)> { self.tags .iter() .zip(self.confidence.iter()) .max_by(|a, b| a.1.partial_cmp(b.1).unwrap()) .map(|(tag, conf)| (tag.as_str(), *conf)) } /// Check if classification confidence is above threshold pub fn is_confident(&self, threshold: f32) -> bool { self.confidence.iter().any(|&c| c >= threshold) } /// Get tags above confidence threshold pub fn confident_tags(&self, threshold: f32) -> Vec<&str> { self.tags .iter() .zip(self.confidence.iter()) .filter(|&(_, &conf)| conf >= threshold) .map(|(tag, _)| tag.as_str()) .collect() } } // Example usage in your code: #[cfg(test)] mod tests { use super::*; #[test] fn test_parse_example() { let json = r#"{ "tags": ["cs/theory/algorithms/compression"], "confidence": [0.42], "new_tags": [ { "name": "information_theory", "parent": "cs/theory", "reason": "Resource is explicitly about learning information theory concepts (entropy, intuition, applications)." } ], "reasoning": "The content is centered on information theory; the closest existing tag is compression under theory/algorithms, but a dedicated information theory tag would fit better." }"#; let result = ClassificationResult::from_json(json).unwrap(); assert_eq!(result.tags.len(), 1); assert_eq!(result.tags[0], "cs/theory/algorithms/compression"); assert_eq!(result.confidence[0], 0.42); assert_eq!(result.new_tags.len(), 1); assert_eq!(result.new_tags[0].name, "information_theory"); println!("Primary tag: {:?}", result.primary_tag()); println!("Is confident (>0.5): {}", result.is_confident(0.5)); } }