From 6eb3097f3d3f96c77911af61d766ae852f12ba9d Mon Sep 17 00:00:00 2001
From: TheGeneralist <180094941+thegeneralist01@users.noreply.github.com>
Date: Wed, 14 Jan 2026 23:36:31 +0100
Subject: [PATCH] fix typos in `.gitignore`

---
 .gitignore              |   4 +-
 docs/README.md          |   2 +
 src/classifiers.rs      | 121 ++++++++++++++++++++++++++++++++++++++++
 src/main.rs             |  91 ++++++++++++++++++++++++++++++
 src/scrapers/mod.rs     |   1 +
 src/scrapers/twitter.rs |  24 ++++++++
 6 files changed, 241 insertions(+), 2 deletions(-)
 create mode 100644 docs/README.md
 create mode 100644 src/classifiers.rs
 create mode 100644 src/main.rs
 create mode 100644 src/scrapers/mod.rs
 create mode 100644 src/scrapers/twitter.rs

diff --git a/.gitignore b/.gitignore
index 78167b3..fdff2b5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,10 +3,10 @@
 !.gitignore
 
 !docs/
-!docs/**/
+!docs/**
 
 !src/
-!src/**/
+!src/**
 
 !Cargo.lock
 !Cargo.toml

diff --git a/docs/README.md b/docs/README.md
new file mode 100644
index 0000000..4ef1ebe
--- /dev/null
+++ b/docs/README.md
@@ -0,0 +1,2 @@
+# Facharbeit
+Repository for my term paper.

diff --git a/src/classifiers.rs b/src/classifiers.rs
new file mode 100644
index 0000000..3510872
--- /dev/null
+++ b/src/classifiers.rs
@@ -0,0 +1,121 @@
+use std::process::Command;
+use serde::{Deserialize, Serialize};
+use anyhow::{Context, Result};
+
+pub fn classify(input: &str, current_tag_tree: String) -> Result<String> {
+    let prompt = format!("You are a resource classifier. Given a hierarchical tag tree and a resource, classify it into 1-3 most specific applicable tags.
+
+# RULES:
+- Each level down = narrower specialization
+- Assign MOST SPECIFIC tags that fit (prefer leaf nodes when appropriate)
+- If no good fit exists, suggest new tag(s) with proposed location in tree
+- Output JSON only
+
+# CURRENT TAG TREE:
+{current_tag_tree}
+
+# RESOURCE INFORMATION:
+{input}
+
+# OUTPUT FORMAT:
+{{
+  \"tags\": [\"path/to/tag1\", \"path/to/tag2\"],
+  \"confidence\": [0.95, 0.87],
+  \"new_tags\": [
+    {{
+      \"name\": \"suggested_tag\",
+      \"parent\": \"path/to/parent\",
+      \"reason\": \"why this tag is needed\"
+    }}
+  ],
+  \"reasoning\": \"brief explanation of classification\"
+}}");
+
+    let out = Command::new("codex")
+        .arg("e")
+        .arg(prompt)
+        .output()
+        .with_context(|| "Failed to execute classification command")?;
+    println!("Output: {:?}", out);
+    Ok(String::from_utf8_lossy(&out.stdout).to_string())
+}
+
+// Yeah
+
+#[derive(Debug, Serialize, Deserialize)]
+pub struct ClassificationResult {
+    pub tags: Vec<String>,
+    pub confidence: Vec<f32>,
+    #[serde(default)]
+    pub new_tags: Vec<NewTagSuggestion>,
+    pub reasoning: String,
+}
+
+#[derive(Debug, Serialize, Deserialize)]
+pub struct NewTagSuggestion {
+    pub name: String,
+    pub parent: String,
+    pub reason: String,
+}
+
+impl ClassificationResult {
+    /// Parse from the JSON string returned by the LLM
+    pub fn from_json(json_str: &str) -> Result<Self, serde_json::Error> {
+        serde_json::from_str(json_str)
+    }
+
+    /// Get the most confident tag (if any exist)
+    pub fn primary_tag(&self) -> Option<(&str, f32)> {
+        self.tags.iter()
+            .zip(self.confidence.iter())
+            .max_by(|a, b| a.1.partial_cmp(b.1).unwrap())
+            .map(|(tag, conf)| (tag.as_str(), *conf))
+    }
+
+    /// Check if classification confidence is above threshold
+    pub fn is_confident(&self, threshold: f32) -> bool {
+        self.confidence.iter().any(|&c| c >= threshold)
+    }
+
+    /// Get tags above confidence threshold
+    pub fn confident_tags(&self, threshold: f32) -> Vec<&str> {
+        self.tags.iter()
+            .zip(self.confidence.iter())
+            .filter(|&(_, &conf)| conf >= threshold)
+            .map(|(tag, _)|
tag.as_str()) + .collect() + } +} + +// Example usage in your code: +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_parse_example() { + let json = r#"{ + "tags": ["cs/theory/algorithms/compression"], + "confidence": [0.42], + "new_tags": [ + { + "name": "information_theory", + "parent": "cs/theory", + "reason": "Resource is explicitly about learning information theory concepts (entropy, intuition, applications)." + } + ], + "reasoning": "The content is centered on information theory; the closest existing tag is compression under theory/algorithms, but a dedicated information theory tag would fit better." +}"#; + + let result = ClassificationResult::from_json(json).unwrap(); + + assert_eq!(result.tags.len(), 1); + assert_eq!(result.tags[0], "cs/theory/algorithms/compression"); + assert_eq!(result.confidence[0], 0.42); + assert_eq!(result.new_tags.len(), 1); + assert_eq!(result.new_tags[0].name, "information_theory"); + + println!("Primary tag: {:?}", result.primary_tag()); + println!("Is confident (>0.5): {}", result.is_confident(0.5)); + } +} diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 0000000..1f20a98 --- /dev/null +++ b/src/main.rs @@ -0,0 +1,91 @@ +use std::fs; + +use anyhow::{Context, Result}; +mod classifiers; +mod scrapers; + +enum Source { + Twitter, + Other, +} + +fn determine_resource_source(line: &str) -> Source { + if line.contains("twitter.com") || line.contains("x.com") { + Source::Twitter + } else { + Source::Other + } +} + +fn main() -> Result<()> { + // Read the file + let contents = fs::read_to_string("test-classification-list") + .expect("Something went wrong reading the file"); + let current_tag_tree = + fs::read_to_string("tag-tree").expect("Something went wrong reading the tag tree file"); + + // Determine source + for line in contents.lines() { + let source = determine_resource_source(line); + + match source { + Source::Twitter => { + println!("Classifying Twitter resource: {}", line); + + // Scrape the 
Tweet
+                let tweet_file = scrapers::twitter::scrape(line);
+                let tweet_scrape_contents = match fs::read_to_string(tweet_file.unwrap())
+                    .with_context(|| "Something went wrong reading the scraped tweet file")
+                {
+                    Err(e) => {
+                        eprintln!("Error reading scraped tweet file: {:?}", e);
+                        continue;
+                    }
+                    Ok(contents) => contents,
+                };
+
+                let classifier_output =
+                    classifiers::classify(&tweet_scrape_contents, current_tag_tree.clone());
+
+                match classifier_output {
+                    Ok(json_string) => {
+                        match classifiers::ClassificationResult::from_json(&json_string) {
+                            Ok(result) => {
+                                println!("Tags: {:?}", result.tags);
+                                println!("Confidence: {:?}", result.confidence);
+                                println!("Reasoning: {}", result.reasoning);
+
+                                // Check if we need to review new tags
+                                if !result.new_tags.is_empty() {
+                                    println!("\n🆕 New tag suggestions:");
+                                    for suggestion in &result.new_tags {
+                                        println!(
+                                            "  - {} (under {})",
+                                            suggestion.name, suggestion.parent
+                                        );
+                                        println!("    Reason: {}", suggestion.reason);
+                                    }
+                                }
+
+                                // Only use high-confidence tags
+                                let confident = result.confident_tags(0.5);
+                                if confident.is_empty() {
+                                    println!("⚠️ Low confidence classification - review needed");
+                                } else {
+                                    println!("✅ Confident tags: {:?}", confident);
+                                }
+                            }
+                            Err(e) => eprintln!("Failed to parse classification: {}", e),
+                        }
+                    }
+                    Err(e) => eprintln!("Classification failed: {}", e),
+                }
+            }
+            Source::Other => {
+                eprintln!("Classification of this source/website is not covered yet!");
+            }
+        }
+    }
+
+    Ok(())
+}

diff --git a/src/scrapers/mod.rs b/src/scrapers/mod.rs
new file mode 100644
index 0000000..2271156
--- /dev/null
+++ b/src/scrapers/mod.rs
@@ -0,0 +1 @@
+pub mod twitter;

diff --git a/src/scrapers/twitter.rs b/src/scrapers/twitter.rs
new file mode 100644
index 0000000..d0f4e68
--- /dev/null
+++ b/src/scrapers/twitter.rs
@@ -0,0 +1,24 @@
+use anyhow::{Context, Result, bail};
+use std::{path::PathBuf, process::Command};
+
+pub fn scrape(url: &str) -> Result<PathBuf> {
+    let tweet_id =
url.split('/').next_back().unwrap(); + println!("Scraping tweet ID: {}", tweet_id); + + let out = Command::new("python") + .arg("scrape_user_tweet_contents.py") + .arg("--tweet-ids") + .arg(tweet_id) + .output() + .with_context(|| "Failed to execute tweet scraping command")?; + println!("Output command: {:?}", out); + + if PathBuf::from("scraped-tweets") + .join(format!("tweet-{}.toml", tweet_id)) + .exists() + { + return Ok(PathBuf::from("scraped-tweets").join(format!("tweet-{}.toml", tweet_id))); + } + + bail!("Scraping failed for tweet: {}", url) +}