// facharbeit/src/classifiers.rs

use anyhow::{Context, Result};
use serde::{Deserialize, Serialize};
use std::process::Command;

/// Asks the external `codex` CLI to classify a resource against a tag tree.
///
/// Builds a prompt that embeds `tag_tree` and `content`, runs
/// `codex e <prompt>` and returns whatever the process wrote to stdout, with
/// invalid UTF-8 replaced lossily. The prompt requests JSON only, but the
/// output is returned as-is without validation.
///
/// NOTE(review): `e` is presumably the CLI's non-interactive "exec"
/// subcommand — confirm against the installed `codex` version.
///
/// # Errors
///
/// Returns an error if `codex` cannot be started, or if it exits with a
/// non-zero status. In the latter case the error's source carries the exit
/// status and the process's stderr.
pub fn classify(tag_tree: &str, content: String) -> Result<String> {
    let prompt = format!("You are a resource classifier. Given a hierarchical tag tree and a resource, classify it into 1-3 most specific applicable tags.

# RULES:
- Each level down = narrower specialization
- Assign MOST SPECIFIC tags that fit (prefer leaf nodes when appropriate)
- If no good fit exists, suggest new tag(s) with proposed location in tree
- Output JSON only

# CURRENT TAG TREE:
{tag_tree}

# RESOURCE INFORMATION:
{content}

# OUTPUT FORMAT:
{{
  \"tags\": [\"path/to/tag1\", \"path/to/tag2\"],
  \"confidence\": [0.95, 0.87],
  \"new_tags\": [
    {{
      \"name\": \"suggested_tag\",
      \"parent\": \"path/to/parent\",
      \"reason\": \"why this tag is needed\"
    }}
  ],
  \"reasoning\": \"brief explanation of classification\"
}}");

    let out = Command::new("codex")
        .arg("e")
        .arg(prompt)
        .output()
        .and_then(|out| {
            if out.status.success() {
                Ok(out)
            } else {
                // A non-zero exit means stdout is not a trustworthy answer;
                // report it as a failure instead of handing it to the parser.
                Err(std::io::Error::new(
                    std::io::ErrorKind::Other,
                    format!(
                        "codex exited with {}: {}",
                        out.status,
                        String::from_utf8_lossy(&out.stderr).trim()
                    ),
                ))
            }
        })
        .with_context(|| "Failed to execute classification command")?;

    Ok(String::from_utf8_lossy(&out.stdout).into_owned())
}

/// Runs [`classify`] and parses its output, retrying on failure.
///
/// Each attempt calls `classify` with `tag_tree` and a clone of `content`,
/// then parses the response with `ClassificationResult::from_json`. The first
/// attempt that both succeeds and parses is returned. Failed attempts are
/// logged to stderr, including the raw response when parsing fails.
///
/// `max_attempts` is the upper bound on attempts. A value of `0` is treated
/// as `1`, so at least one attempt is always made and the function never
/// panics on an empty retry budget.
///
/// # Errors
///
/// Returns the error of the last attempt (either the failed LLM call or the
/// JSON parse error) once all attempts are used up.
pub fn classify_with_retry(
    tag_tree: &str,
    content: String,
    max_attempts: u32,
) -> Result<ClassificationResult> {
    // With zero attempts there would be neither a result nor an error to
    // return, so clamp to a single attempt.
    let max_attempts = max_attempts.max(1);
    let mut attempt = 1;

    loop {
        let outcome: Result<ClassificationResult> = match classify(tag_tree, content.clone()) {
            Ok(json) => ClassificationResult::from_json(&json).map_err(|e| {
                eprintln!(
                    "Attempt {}/{}: Failed to parse: {}",
                    attempt, max_attempts, e
                );
                eprintln!("Raw response: {}", json);
                e.into()
            }),
            Err(e) => {
                eprintln!(
                    "Attempt {}/{}: LLM call failed: {}",
                    attempt, max_attempts, e
                );
                Err(e)
            }
        };

        match outcome {
            Ok(result) => return Ok(result),
            // Out of attempts: surface the most recent failure to the caller.
            Err(e) if attempt >= max_attempts => return Err(e),
            Err(_) => attempt += 1,
        }
    }
}

// Data model for the JSON document the classification prompt asks the LLM to return.

/// Structured classification answer deserialized from the LLM's JSON output.
///
/// Every field is marked `#[serde(default)]`, so a key that is missing from
/// the JSON yields an empty vector / empty string instead of a parse error.
#[derive(Debug, Serialize, Deserialize)]
pub struct ClassificationResult {
    /// Tags assigned to the resource; the prompt's example writes them as
    /// slash-separated paths such as `path/to/tag1`.
    #[serde(default)]
    pub tags: Vec<String>,
    /// Confidence scores. The methods on this type pair them with `tags` by
    /// position (index `i` belongs to `tags[i]`).
    /// NOTE(review): presumably in `0.0..=1.0` as in the prompt's example —
    /// nothing here validates the range or that the lengths match.
    #[serde(default)]
    pub confidence: Vec<f32>,
    /// Tags the LLM proposes adding to the tree when nothing existing fits.
    #[serde(default)]
    pub new_tags: Vec<NewTagSuggestion>,
    /// Free-text explanation of the classification.
    #[serde(default)]
    pub reasoning: String,
}

/// A tag the LLM suggests adding to the tag tree.
///
/// None of the fields has a serde default, so each suggestion in the JSON
/// must provide all three keys.
#[derive(Debug, Serialize, Deserialize)]
pub struct NewTagSuggestion {
    /// Name of the proposed tag.
    pub name: String,
    /// Where the tag should be attached; the prompt's example uses a path
    /// such as `path/to/parent`.
    pub parent: String,
    /// The LLM's justification for adding the tag.
    pub reason: String,
}

impl ClassificationResult {
    /// Parses a classification from the JSON string returned by the LLM.
    ///
    /// # Errors
    ///
    /// Returns the `serde_json` error when `json_str` cannot be deserialized
    /// into this type.
    pub fn from_json(json_str: &str) -> Result<Self, serde_json::Error> {
        serde_json::from_str(json_str)
    }

    /// Returns the tag with the highest confidence, together with that score.
    ///
    /// Tags and confidences are paired by position; surplus entries in the
    /// longer of the two lists are ignored. NaN scores are skipped, because
    /// they cannot be ranked and previously made this method panic. Returns
    /// `None` when no rankable pair exists.
    pub fn primary_tag(&self) -> Option<(&str, f32)> {
        self.tags
            .iter()
            .zip(&self.confidence)
            .filter(|(_, conf)| !conf.is_nan())
            // `total_cmp` is a total order, so there is no `unwrap` to fail.
            .max_by(|a, b| a.1.total_cmp(b.1))
            .map(|(tag, &conf)| (tag.as_str(), conf))
    }

    /// Reports whether any confidence score is at or above `threshold`.
    ///
    /// Only `confidence` is inspected; the score does not need a matching
    /// entry in `tags`.
    pub fn is_confident(&self, threshold: f32) -> bool {
        self.confidence.iter().any(|&c| c >= threshold)
    }

    /// Returns the tags whose positionally paired confidence is at or above
    /// `threshold`, in their original order.
    pub fn confident_tags(&self, threshold: f32) -> Vec<&str> {
        self.tags
            .iter()
            .zip(&self.confidence)
            .filter(|&(_, &conf)| conf >= threshold)
            .map(|(tag, _)| tag.as_str())
            .collect()
    }
}

// Unit tests: parsing a representative LLM response.
#[cfg(test)]
mod tests {
    use super::*;

    /// A representative LLM answer: one low-confidence tag plus one
    /// suggestion for a new tag.
    const EXAMPLE_RESPONSE: &str = r#"{
  "tags": ["cs/theory/algorithms/compression"],
  "confidence": [0.42],
  "new_tags": [
    {
      "name": "information_theory",
      "parent": "cs/theory",
      "reason": "Resource is explicitly about learning information theory concepts (entropy, intuition, applications)."
    }
  ],
  "reasoning": "The content is centered on information theory; the closest existing tag is compression under theory/algorithms, but a dedicated information theory tag would fit better."
}"#;

    /// The example response deserializes into the expected tags, scores and
    /// new-tag suggestion.
    #[test]
    fn test_parse_example() {
        let parsed = ClassificationResult::from_json(EXAMPLE_RESPONSE).unwrap();

        // Exactly one existing tag, with its paired confidence.
        assert_eq!(parsed.tags.len(), 1);
        assert_eq!(parsed.tags[0], "cs/theory/algorithms/compression");
        assert_eq!(parsed.confidence[0], 0.42);

        // Exactly one suggested new tag.
        assert_eq!(parsed.new_tags.len(), 1);
        let suggestion = &parsed.new_tags[0];
        assert_eq!(suggestion.name, "information_theory");

        let primary = parsed.primary_tag();
        println!("Primary tag: {:?}", primary);
        let confident = parsed.is_confident(0.5);
        println!("Is confident (>0.5): {}", confident);
    }
}