fix typos in .gitignore
This commit is contained in:
parent
35057d7957
commit
6eb3097f3d
6 changed files with 241 additions and 2 deletions
4
.gitignore
vendored
4
.gitignore
vendored
|
|
@ -3,10 +3,10 @@
|
||||||
!.gitignore
|
!.gitignore
|
||||||
|
|
||||||
!docs/
|
!docs/
|
||||||
!docs/**/
|
!docs/**
|
||||||
|
|
||||||
!src/
|
!src/
|
||||||
!src/**/
|
!src/**
|
||||||
|
|
||||||
!Cargo.lock
|
!Cargo.lock
|
||||||
!Cargo.toml
|
!Cargo.toml
|
||||||
|
|
|
||||||
2
docs/README.md
Normal file
2
docs/README.md
Normal file
|
|
@ -0,0 +1,2 @@
|
||||||
|
# Facharbeit
|
||||||
|
Repository for my term paper.
|
||||||
121
src/classifiers.rs
Normal file
121
src/classifiers.rs
Normal file
|
|
@ -0,0 +1,121 @@
|
||||||
|
use std::process::Command;
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
use anyhow::{Context, Result};
|
||||||
|
|
||||||
|
pub fn classify(input: &str, current_tag_tree: String) -> Result<String> {
|
||||||
|
let prompt = format!("You are a resource classifier. Given a hierarchical tag tree and a resource, classify it into 1-3 most specific applicable tags.
|
||||||
|
|
||||||
|
# RULES:
|
||||||
|
- Each level down = narrower specialization
|
||||||
|
- Assign MOST SPECIFIC tags that fit (prefer leaf nodes when appropriate)
|
||||||
|
- If no good fit exists, suggest new tag(s) with proposed location in tree
|
||||||
|
- Output JSON only
|
||||||
|
|
||||||
|
# CURRENT TAG TREE:
|
||||||
|
{current_tag_tree}
|
||||||
|
|
||||||
|
# RESOURCE INFORMATION:
|
||||||
|
{input}
|
||||||
|
|
||||||
|
# OUTPUT FORMAT:
|
||||||
|
{{
|
||||||
|
\"tags\": [\"path/to/tag1\", \"path/to/tag2\"],
|
||||||
|
\"confidence\": [0.95, 0.87],
|
||||||
|
\"new_tags\": [
|
||||||
|
{{
|
||||||
|
\"name\": \"suggested_tag\",
|
||||||
|
\"parent\": \"path/to/parent\",
|
||||||
|
\"reason\": \"why this tag is needed\"
|
||||||
|
}}
|
||||||
|
],
|
||||||
|
\"reasoning\": \"brief explanation of classification\"
|
||||||
|
}}");
|
||||||
|
|
||||||
|
let out = Command::new("codex")
|
||||||
|
.arg("e")
|
||||||
|
.arg(prompt)
|
||||||
|
.output()
|
||||||
|
.with_context(|| "Failed to execute tweet scraping command")?;
|
||||||
|
println!("Output: {:?}", out);
|
||||||
|
Ok(String::from_utf8_lossy(&out.stdout).to_string())
|
||||||
|
}
|
||||||
|
|
||||||
|
// Yeah
|
||||||
|
|
||||||
|
/// Classification output parsed from the LLM's JSON response.
///
/// `tags` and `confidence` are parallel vectors: `confidence[i]` is the
/// model's score for `tags[i]`.
#[derive(Debug, Serialize, Deserialize)]
pub struct ClassificationResult {
    /// Hierarchical tag paths, e.g. "cs/theory/algorithms/compression".
    pub tags: Vec<String>,
    /// Per-tag confidence scores, paired positionally with `tags`.
    pub confidence: Vec<f32>,
    /// Proposed additions to the tag tree; defaults to empty when the
    /// "new_tags" key is absent from the JSON.
    #[serde(default)]
    pub new_tags: Vec<NewTagSuggestion>,
    /// The model's brief explanation of the classification.
    pub reasoning: String,
}
|
||||||
|
|
||||||
|
/// A tag the LLM proposes adding to the tag tree.
#[derive(Debug, Serialize, Deserialize)]
pub struct NewTagSuggestion {
    /// Name of the proposed new tag.
    pub name: String,
    /// Path of the existing tag it should be inserted under.
    pub parent: String,
    /// The model's rationale for why this tag is needed.
    pub reason: String,
}
|
||||||
|
|
||||||
|
impl ClassificationResult {
|
||||||
|
/// Parse from the JSON string returned by the LLM
|
||||||
|
pub fn from_json(json_str: &str) -> Result<Self, serde_json::Error> {
|
||||||
|
serde_json::from_str(json_str)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get the most confident tag (if any exist)
|
||||||
|
pub fn primary_tag(&self) -> Option<(&str, f32)> {
|
||||||
|
self.tags.iter()
|
||||||
|
.zip(self.confidence.iter())
|
||||||
|
.max_by(|a, b| a.1.partial_cmp(b.1).unwrap())
|
||||||
|
.map(|(tag, conf)| (tag.as_str(), *conf))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Check if classification confidence is above threshold
|
||||||
|
pub fn is_confident(&self, threshold: f32) -> bool {
|
||||||
|
self.confidence.iter().any(|&c| c >= threshold)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get tags above confidence threshold
|
||||||
|
pub fn confident_tags(&self, threshold: f32) -> Vec<&str> {
|
||||||
|
self.tags.iter()
|
||||||
|
.zip(self.confidence.iter())
|
||||||
|
.filter(|&(_, &conf)| conf >= threshold)
|
||||||
|
.map(|(tag, _)| tag.as_str())
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Example usage in your code:
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    /// End-to-end parse of a representative LLM response: checks tags,
    /// confidence, and that a new-tag suggestion round-trips through serde.
    #[test]
    fn test_parse_example() {
        // Fixture mirrors the OUTPUT FORMAT described in classify()'s prompt.
        let json = r#"{
            "tags": ["cs/theory/algorithms/compression"],
            "confidence": [0.42],
            "new_tags": [
                {
                    "name": "information_theory",
                    "parent": "cs/theory",
                    "reason": "Resource is explicitly about learning information theory concepts (entropy, intuition, applications)."
                }
            ],
            "reasoning": "The content is centered on information theory; the closest existing tag is compression under theory/algorithms, but a dedicated information theory tag would fit better."
        }"#;

        let result = ClassificationResult::from_json(json).unwrap();

        assert_eq!(result.tags.len(), 1);
        assert_eq!(result.tags[0], "cs/theory/algorithms/compression");
        assert_eq!(result.confidence[0], 0.42);
        assert_eq!(result.new_tags.len(), 1);
        assert_eq!(result.new_tags[0].name, "information_theory");

        // Smoke-check the helper methods on the parsed result.
        println!("Primary tag: {:?}", result.primary_tag());
        println!("Is confident (>0.5): {}", result.is_confident(0.5));
    }
}
|
||||||
91
src/main.rs
Normal file
91
src/main.rs
Normal file
|
|
@ -0,0 +1,91 @@
|
||||||
|
use std::fs;
|
||||||
|
|
||||||
|
use anyhow::{Context, Result};
|
||||||
|
mod classifiers;
|
||||||
|
mod scrapers;
|
||||||
|
|
||||||
|
/// Origin of a resource line; determines which scraper handles it.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Source {
    /// A tweet URL (twitter.com / x.com).
    Twitter,
    /// Anything else — no scraper is available yet.
    Other,
}

/// Classify a resource line by the service it points at.
///
/// NOTE(review): this is a plain substring match, so any line merely
/// *containing* "x.com" (e.g. "netflix.com") is treated as Twitter —
/// confirm this is acceptable before relying on it.
fn determine_resource_source(line: &str) -> Source {
    if line.contains("twitter.com") || line.contains("x.com") {
        Source::Twitter
    } else {
        Source::Other
    }
}
|
||||||
|
|
||||||
|
fn main() -> Result<()> {
|
||||||
|
// Read the file
|
||||||
|
let contents = fs::read_to_string("test-classification-list")
|
||||||
|
.expect("Something went wrong reading the file");
|
||||||
|
let current_tag_tree =
|
||||||
|
fs::read_to_string("tag-tree").expect("Something went wrong reading the tag tree file");
|
||||||
|
|
||||||
|
// Determine source
|
||||||
|
for line in contents.lines() {
|
||||||
|
let source = determine_resource_source(line);
|
||||||
|
|
||||||
|
match source {
|
||||||
|
Source::Twitter => {
|
||||||
|
println!("Classifying Twitter resource: {}", line);
|
||||||
|
|
||||||
|
// Scrape the Tweet
|
||||||
|
let tweet_file = scrapers::twitter::scrape(line);
|
||||||
|
let tweet_scrape_contents = match fs::read_to_string(tweet_file.unwrap())
|
||||||
|
.with_context(|| "Something went wrong reading the scraped tweet file")
|
||||||
|
{
|
||||||
|
Err(e) => {
|
||||||
|
eprintln!("Error reading scraped tweet file: {:?}", e);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
Ok(contents) => contents,
|
||||||
|
};
|
||||||
|
|
||||||
|
let classifier_output =
|
||||||
|
classifiers::classify(¤t_tag_tree, tweet_scrape_contents);
|
||||||
|
|
||||||
|
match classifier_output {
|
||||||
|
Ok(json_string) => {
|
||||||
|
match classifiers::ClassificationResult::from_json(&json_string) {
|
||||||
|
Ok(result) => {
|
||||||
|
println!("Tags: {:?}", result.tags);
|
||||||
|
println!("Confidence: {:?}", result.confidence);
|
||||||
|
println!("Reasoning: {}", result.reasoning);
|
||||||
|
|
||||||
|
// Check if we need to review new tags
|
||||||
|
if !result.new_tags.is_empty() {
|
||||||
|
println!("\n🆕 New tag suggestions:");
|
||||||
|
for suggestion in &result.new_tags {
|
||||||
|
println!(
|
||||||
|
" - {} (under {})",
|
||||||
|
suggestion.name, suggestion.parent
|
||||||
|
);
|
||||||
|
println!(" Reason: {}", suggestion.reason);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Only use high-confidence tags
|
||||||
|
let confident = result.confident_tags(0.5);
|
||||||
|
if confident.is_empty() {
|
||||||
|
println!("⚠️ Low confidence classification - review needed");
|
||||||
|
} else {
|
||||||
|
println!("✅ Confident tags: {:?}", confident);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(e) => eprintln!("Failed to parse classification: {}", e),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(e) => eprintln!("Classification failed: {}", e),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Source::Other => {
|
||||||
|
eprintln!("Classification of this source/website is not covered yet!");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
1
src/scrapers/mod.rs
Normal file
1
src/scrapers/mod.rs
Normal file
|
|
@ -0,0 +1 @@
|
||||||
|
pub mod twitter;
|
||||||
24
src/scrapers/twitter.rs
Normal file
24
src/scrapers/twitter.rs
Normal file
|
|
@ -0,0 +1,24 @@
|
||||||
|
use anyhow::{Context, Result, bail};
|
||||||
|
use std::{path::PathBuf, process::Command};
|
||||||
|
|
||||||
|
pub fn scrape(url: &str) -> Result<PathBuf> {
|
||||||
|
let tweet_id = url.split('/').next_back().unwrap();
|
||||||
|
println!("Scraping tweet ID: {}", tweet_id);
|
||||||
|
|
||||||
|
let out = Command::new("python")
|
||||||
|
.arg("scrape_user_tweet_contents.py")
|
||||||
|
.arg("--tweet-ids")
|
||||||
|
.arg(tweet_id)
|
||||||
|
.output()
|
||||||
|
.with_context(|| "Failed to execute tweet scraping command")?;
|
||||||
|
println!("Output command: {:?}", out);
|
||||||
|
|
||||||
|
if PathBuf::from("scraped-tweets")
|
||||||
|
.join(format!("tweet-{}.toml", tweet_id))
|
||||||
|
.exists()
|
||||||
|
{
|
||||||
|
return Ok(PathBuf::from("scraped-tweets").join(format!("tweet-{}.toml", tweet_id)));
|
||||||
|
}
|
||||||
|
|
||||||
|
bail!("Scraping failed for tweet: {}", url)
|
||||||
|
}
|
||||||
Loading…
Add table
Add a link
Reference in a new issue