160 lines
4.7 KiB
Rust
160 lines
4.7 KiB
Rust
use anyhow::{Context, Result};
|
|
use serde::{Deserialize, Serialize};
|
|
use std::process::Command;
|
|
|
|
pub fn classify(tag_tree: &str, content: String) -> Result<String> {
|
|
let prompt = format!("You are a resource classifier. Given a hierarchical tag tree and a resource, classify it into 1-3 most specific applicable tags.
|
|
|
|
# RULES:
|
|
- Each level down = narrower specialization
|
|
- Assign MOST SPECIFIC tags that fit (prefer leaf nodes when appropriate)
|
|
- If no good fit exists, suggest new tag(s) with proposed location in tree
|
|
- Output JSON only
|
|
|
|
# CURRENT TAG TREE:
|
|
{tag_tree}
|
|
|
|
# RESOURCE INFORMATION:
|
|
{content}
|
|
|
|
# OUTPUT FORMAT:
|
|
{{
|
|
\"tags\": [\"path/to/tag1\", \"path/to/tag2\"],
|
|
\"confidence\": [0.95, 0.87],
|
|
\"new_tags\": [
|
|
{{
|
|
\"name\": \"suggested_tag\",
|
|
\"parent\": \"path/to/parent\",
|
|
\"reason\": \"why this tag is needed\"
|
|
}}
|
|
],
|
|
\"reasoning\": \"brief explanation of classification\"
|
|
}}");
|
|
|
|
let out = Command::new("codex")
|
|
.arg("e")
|
|
.arg(prompt)
|
|
.output()
|
|
.with_context(|| "Failed to execute classification command")?;
|
|
println!("Output: {:?}", out);
|
|
Ok(String::from_utf8_lossy(&out.stdout).to_string())
|
|
}
|
|
|
|
pub fn classify_with_retry(
|
|
tag_tree: &str,
|
|
content: String,
|
|
max_attempts: u32,
|
|
) -> Result<ClassificationResult> {
|
|
for attempt in 1..=max_attempts {
|
|
match classify(tag_tree, content.clone()) {
|
|
Ok(json) => match ClassificationResult::from_json(&json) {
|
|
Ok(result) => return Ok(result),
|
|
Err(e) => {
|
|
eprintln!(
|
|
"Attempt {}/{}: Failed to parse: {}",
|
|
attempt, max_attempts, e
|
|
);
|
|
eprintln!("Raw response: {}", json);
|
|
if attempt == max_attempts {
|
|
return Err(e.into());
|
|
}
|
|
}
|
|
},
|
|
Err(e) => {
|
|
eprintln!(
|
|
"Attempt {}/{}: LLM call failed: {}",
|
|
attempt, max_attempts, e
|
|
);
|
|
if attempt == max_attempts {
|
|
return Err(e);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
unreachable!()
|
|
}
|
|
|
|
// Data types describing the JSON response requested from the classifier.
|
|
|
|
#[derive(Debug, Serialize, Deserialize)]
|
|
pub struct ClassificationResult {
|
|
#[serde(default)]
|
|
pub tags: Vec<String>,
|
|
#[serde(default)]
|
|
pub confidence: Vec<f32>,
|
|
#[serde(default)]
|
|
pub new_tags: Vec<NewTagSuggestion>,
|
|
#[serde(default)]
|
|
pub reasoning: String,
|
|
}
|
|
|
|
#[derive(Debug, Serialize, Deserialize)]
|
|
pub struct NewTagSuggestion {
|
|
pub name: String,
|
|
pub parent: String,
|
|
pub reason: String,
|
|
}
|
|
|
|
impl ClassificationResult {
|
|
/// Parse from the JSON string returned by the LLM
|
|
pub fn from_json(json_str: &str) -> Result<Self, serde_json::Error> {
|
|
serde_json::from_str(json_str)
|
|
}
|
|
|
|
/// Get the most confident tag (if any exist)
|
|
pub fn primary_tag(&self) -> Option<(&str, f32)> {
|
|
self.tags
|
|
.iter()
|
|
.zip(self.confidence.iter())
|
|
.max_by(|a, b| a.1.partial_cmp(b.1).unwrap())
|
|
.map(|(tag, conf)| (tag.as_str(), *conf))
|
|
}
|
|
|
|
/// Check if classification confidence is above threshold
|
|
pub fn is_confident(&self, threshold: f32) -> bool {
|
|
self.confidence.iter().any(|&c| c >= threshold)
|
|
}
|
|
|
|
/// Get tags above confidence threshold
|
|
pub fn confident_tags(&self, threshold: f32) -> Vec<&str> {
|
|
self.tags
|
|
.iter()
|
|
.zip(self.confidence.iter())
|
|
.filter(|&(_, &conf)| conf >= threshold)
|
|
.map(|(tag, _)| tag.as_str())
|
|
.collect()
|
|
}
|
|
}
|
|
|
|
// Unit tests; they also serve as usage examples for `ClassificationResult`.
|
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Tolerance for comparing `f32` confidences parsed from JSON text.
    const EPS: f32 = 1e-6;

    /// Parses a representative response and checks every field plus the helper methods.
    #[test]
    fn test_parse_example() {
        let json = r#"{
            "tags": ["cs/theory/algorithms/compression"],
            "confidence": [0.42],
            "new_tags": [
                {
                    "name": "information_theory",
                    "parent": "cs/theory",
                    "reason": "Resource is explicitly about learning information theory concepts (entropy, intuition, applications)."
                }
            ],
            "reasoning": "The content is centered on information theory; the closest existing tag is compression under theory/algorithms, but a dedicated information theory tag would fit better."
        }"#;

        let result = ClassificationResult::from_json(json).unwrap();

        assert_eq!(result.tags.len(), 1);
        assert_eq!(result.tags[0], "cs/theory/algorithms/compression");
        assert_eq!(result.confidence.len(), 1);
        assert!((result.confidence[0] - 0.42).abs() < EPS);
        assert_eq!(result.new_tags.len(), 1);
        assert_eq!(result.new_tags[0].name, "information_theory");
        assert_eq!(result.new_tags[0].parent, "cs/theory");

        let (tag, conf) = result.primary_tag().expect("one tag/confidence pair is present");
        assert_eq!(tag, "cs/theory/algorithms/compression");
        assert!((conf - 0.42).abs() < EPS);

        assert!(!result.is_confident(0.5));
        assert!(result.is_confident(0.4));
        assert!(result.confident_tags(0.5).is_empty());
        assert_eq!(
            result.confident_tags(0.4),
            vec!["cs/theory/algorithms/compression"]
        );
    }

    /// Missing keys fall back to their serde defaults instead of failing to parse.
    #[test]
    fn test_missing_fields_use_defaults() {
        let result = ClassificationResult::from_json("{}").unwrap();

        assert!(result.tags.is_empty());
        assert!(result.confidence.is_empty());
        assert!(result.new_tags.is_empty());
        assert!(result.reasoning.is_empty());
        assert!(result.primary_tag().is_none());
        assert!(!result.is_confident(0.0));
        assert!(result.confident_tags(0.0).is_empty());
    }

    /// Text that is not JSON is reported as an error rather than a default value.
    #[test]
    fn test_invalid_json_is_an_error() {
        assert!(ClassificationResult::from_json("not json").is_err());
    }

    /// With several tags, the helpers select by confidence and keep the original order.
    #[test]
    fn test_primary_tag_picks_highest_confidence() {
        let json = r#"{"tags": ["a", "b", "c"], "confidence": [0.2, 0.9, 0.5]}"#;
        let result = ClassificationResult::from_json(json).unwrap();

        let (tag, conf) = result.primary_tag().expect("three pairs are present");
        assert_eq!(tag, "b");
        assert!((conf - 0.9).abs() < EPS);
        assert_eq!(result.confident_tags(0.4), vec!["b", "c"]);
    }
}
|