fix typos in .gitignore

This commit is contained in:
TheGeneralist 2026-01-14 23:36:31 +01:00
parent 35057d7957
commit 6eb3097f3d
Signed by: thegeneralist01
SSH key fingerprint: SHA256:pp9qddbCNmVNoSjevdvQvM5z0DHN7LTa8qBMbcMq/R4
6 changed files with 241 additions and 2 deletions

4
.gitignore vendored
View file

@ -3,10 +3,10 @@
!.gitignore !.gitignore
!docs/ !docs/
!docs/**/ !docs/**
!src/ !src/
!src/**/ !src/**
!Cargo.lock !Cargo.lock
!Cargo.toml !Cargo.toml

2
docs/README.md Normal file
View file

@ -0,0 +1,2 @@
# Facharbeit
Repository for my term paper.

121
src/classifiers.rs Normal file
View file

@ -0,0 +1,121 @@
use std::process::Command;
use serde::{Deserialize, Serialize};
use anyhow::{Context, Result};
pub fn classify(input: &str, current_tag_tree: String) -> Result<String> {
let prompt = format!("You are a resource classifier. Given a hierarchical tag tree and a resource, classify it into 1-3 most specific applicable tags.
# RULES:
- Each level down = narrower specialization
- Assign MOST SPECIFIC tags that fit (prefer leaf nodes when appropriate)
- If no good fit exists, suggest new tag(s) with proposed location in tree
- Output JSON only
# CURRENT TAG TREE:
{current_tag_tree}
# RESOURCE INFORMATION:
{input}
# OUTPUT FORMAT:
{{
\"tags\": [\"path/to/tag1\", \"path/to/tag2\"],
\"confidence\": [0.95, 0.87],
\"new_tags\": [
{{
\"name\": \"suggested_tag\",
\"parent\": \"path/to/parent\",
\"reason\": \"why this tag is needed\"
}}
],
\"reasoning\": \"brief explanation of classification\"
}}");
let out = Command::new("codex")
.arg("e")
.arg(prompt)
.output()
.with_context(|| "Failed to execute tweet scraping command")?;
println!("Output: {:?}", out);
Ok(String::from_utf8_lossy(&out.stdout).to_string())
}
/// Classification answer with the same keys as the JSON object requested in
/// the `classify` prompt's "OUTPUT FORMAT" section.
///
/// `tags` and `confidence` are separate lists; the methods on this type pair
/// them up by position.
#[derive(Debug, Serialize, Deserialize)]
pub struct ClassificationResult {
/// Tag paths chosen for the resource (the prompt's example uses the form `path/to/tag`).
pub tags: Vec<String>,
/// Confidence scores, paired with `tags` by index.
pub confidence: Vec<f32>,
/// Suggested tags to add to the tree; an absent `new_tags` key
/// deserializes to an empty list because of `serde(default)`.
#[serde(default)]
pub new_tags: Vec<NewTagSuggestion>,
/// Explanation of the classification, as a free-form string.
pub reasoning: String,
}
/// One entry of the `new_tags` list in a [`ClassificationResult`]: a tag the
/// classifier proposes adding to the tag tree.
#[derive(Debug, Serialize, Deserialize)]
pub struct NewTagSuggestion {
/// Name of the proposed tag.
pub name: String,
/// Path of the tag under which the new tag is proposed (e.g. `path/to/parent` in the prompt).
pub parent: String,
/// Explanation of why the tag is needed.
pub reason: String,
}
impl ClassificationResult {
    /// Parses a classification from the JSON string returned by the LLM.
    ///
    /// # Errors
    ///
    /// Returns the `serde_json` error if `json_str` is not valid JSON or does
    /// not match the expected shape.
    pub fn from_json(json_str: &str) -> Result<Self, serde_json::Error> {
        serde_json::from_str(json_str)
    }

    /// Returns the tag with the highest confidence together with that
    /// confidence, or `None` if there is no usable tag.
    ///
    /// Tags and confidences are paired by position; surplus entries in the
    /// longer list are ignored. Entries whose confidence is NaN are skipped
    /// so that a malformed score can neither panic the comparison nor win it.
    pub fn primary_tag(&self) -> Option<(&str, f32)> {
        self.tags
            .iter()
            .zip(self.confidence.iter())
            .filter(|(_, conf)| !conf.is_nan())
            // `total_cmp` is a total order on floats, so no `unwrap` is needed.
            .max_by(|a, b| a.1.total_cmp(b.1))
            .map(|(tag, conf)| (tag.as_str(), *conf))
    }

    /// Returns `true` if at least one confidence value is `>= threshold`.
    pub fn is_confident(&self, threshold: f32) -> bool {
        self.confidence.iter().any(|&c| c >= threshold)
    }

    /// Returns the tags whose paired confidence is `>= threshold`, in their
    /// original order.
    pub fn confident_tags(&self, threshold: f32) -> Vec<&str> {
        self.tags
            .iter()
            .zip(self.confidence.iter())
            .filter(|&(_, &conf)| conf >= threshold)
            .map(|(tag, _)| tag.as_str())
            .collect()
    }
}
// Unit tests: parse a sample LLM response and check the query helpers on it.
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses a representative response and verifies both the deserialized
    /// fields and the confidence helpers built on top of them.
    #[test]
    fn test_parse_example() {
        // The JSON text is kept flush-left; whitespace is insignificant to the parser.
        let json = r#"{
"tags": ["cs/theory/algorithms/compression"],
"confidence": [0.42],
"new_tags": [
{
"name": "information_theory",
"parent": "cs/theory",
"reason": "Resource is explicitly about learning information theory concepts (entropy, intuition, applications)."
}
],
"reasoning": "The content is centered on information theory; the closest existing tag is compression under theory/algorithms, but a dedicated information theory tag would fit better."
}"#;
        let result = ClassificationResult::from_json(json).unwrap();

        assert_eq!(result.tags.len(), 1);
        assert_eq!(result.tags[0], "cs/theory/algorithms/compression");
        assert_eq!(result.confidence[0], 0.42);
        assert_eq!(result.new_tags.len(), 1);
        assert_eq!(result.new_tags[0].name, "information_theory");

        // Assert the helpers instead of printing them, so regressions fail the test.
        assert_eq!(
            result.primary_tag(),
            Some(("cs/theory/algorithms/compression", 0.42))
        );
        assert!(!result.is_confident(0.5));
        assert!(result.is_confident(0.4));
        assert!(result.confident_tags(0.5).is_empty());
        assert_eq!(
            result.confident_tags(0.4),
            vec!["cs/theory/algorithms/compression"]
        );
    }
}

91
src/main.rs Normal file
View file

@ -0,0 +1,91 @@
use std::fs;
use anyhow::{Context, Result};
mod classifiers;
mod scrapers;
/// Kind of website a resource line points at.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Source {
    /// A link to twitter.com / x.com (or one of their subdomains).
    Twitter,
    /// Anything else; not handled yet.
    Other,
}

/// Decides which [`Source`] a resource line belongs to by inspecting the
/// host part of the URL.
///
/// The scheme (`https://`), user info, port, path, query and fragment are
/// ignored, and the comparison is case-insensitive. Matching on the host
/// rather than on a substring of the whole line avoids classifying e.g.
/// `netflix.com` or `example.com/?ref=twitter.com` as Twitter.
fn determine_resource_source(line: &str) -> Source {
    const TWITTER_DOMAINS: [&str; 2] = ["twitter.com", "x.com"];

    let trimmed = line.trim();

    // Only treat "://" as a scheme separator when it comes before any path,
    // query or fragment; otherwise it belongs to an embedded URL.
    let rest = match trimmed.split_once("://") {
        Some((scheme, rest)) if !scheme.contains(['/', '?', '#']) => rest,
        _ => trimmed,
    };

    // authority = [userinfo@]host[:port], terminated by path/query/fragment.
    let authority = rest.split(['/', '?', '#']).next().unwrap_or("");
    let host_and_port = authority.rsplit_once('@').map_or(authority, |(_, h)| h);
    let host = host_and_port
        .split(':')
        .next()
        .unwrap_or("")
        .to_ascii_lowercase();

    // Accept the bare domain or any subdomain of it ("mobile.twitter.com").
    let is_twitter = TWITTER_DOMAINS.iter().any(|domain| {
        host == *domain
            || host
                .strip_suffix(domain)
                .is_some_and(|prefix| prefix.ends_with('.'))
    });

    if is_twitter {
        Source::Twitter
    } else {
        Source::Other
    }
}
/// Entry point: reads the resource list and the tag tree from the working
/// directory and classifies every line of the list.
///
/// Twitter links are scraped and classified; any other line is reported as
/// unsupported. A failure on a single resource is logged to stderr and the
/// loop moves on to the next line.
///
/// # Errors
///
/// Returns an error if `test-classification-list` or `tag-tree` cannot be read.
fn main() -> Result<()> {
    // Read the inputs; without them nothing can be classified.
    let contents = fs::read_to_string("test-classification-list")
        .with_context(|| "Something went wrong reading the file")?;
    let current_tag_tree = fs::read_to_string("tag-tree")
        .with_context(|| "Something went wrong reading the tag tree file")?;

    for line in contents.lines() {
        match determine_resource_source(line) {
            Source::Twitter => {
                println!("Classifying Twitter resource: {}", line);
                classify_tweet(line, &current_tag_tree);
            }
            Source::Other => {
                eprintln!("Classification of this source/website is not covered yet!");
            }
        }
    }
    Ok(())
}

/// Scrapes one tweet, classifies its contents against `tag_tree` and prints
/// the outcome. Every failure is logged to stderr and ends this tweet only.
fn classify_tweet(url: &str, tag_tree: &str) {
    // Scrape the tweet; a failed scrape must not abort the whole run.
    let tweet_file = match scrapers::twitter::scrape(url) {
        Ok(path) => path,
        Err(e) => {
            eprintln!("Error scraping tweet: {:?}", e);
            return;
        }
    };

    let tweet_scrape_contents = match fs::read_to_string(&tweet_file)
        .with_context(|| "Something went wrong reading the scraped tweet file")
    {
        Ok(contents) => contents,
        Err(e) => {
            eprintln!("Error reading scraped tweet file: {:?}", e);
            return;
        }
    };

    // `classify` takes the resource first and the tag tree second.
    let json_string = match classifiers::classify(&tweet_scrape_contents, tag_tree.to_owned()) {
        Ok(json_string) => json_string,
        Err(e) => {
            eprintln!("Classification failed: {}", e);
            return;
        }
    };

    match classifiers::ClassificationResult::from_json(&json_string) {
        Ok(result) => report_classification(&result),
        Err(e) => eprintln!("Failed to parse classification: {}", e),
    }
}

/// Prints a parsed classification: tags, confidences, reasoning, any newly
/// suggested tags, and which tags clear the 0.5 confidence threshold.
fn report_classification(result: &classifiers::ClassificationResult) {
    println!("Tags: {:?}", result.tags);
    println!("Confidence: {:?}", result.confidence);
    println!("Reasoning: {}", result.reasoning);

    // Check if we need to review new tags
    if !result.new_tags.is_empty() {
        println!("\n🆕 New tag suggestions:");
        for suggestion in &result.new_tags {
            println!(" - {} (under {})", suggestion.name, suggestion.parent);
            println!(" Reason: {}", suggestion.reason);
        }
    }

    // Only use high-confidence tags
    let confident = result.confident_tags(0.5);
    if confident.is_empty() {
        println!("⚠️ Low confidence classification - review needed");
    } else {
        println!("✅ Confident tags: {:?}", confident);
    }
}

1
src/scrapers/mod.rs Normal file
View file

@ -0,0 +1 @@
pub mod twitter;

24
src/scrapers/twitter.rs Normal file
View file

@ -0,0 +1,24 @@
use anyhow::{Context, Result, bail};
use std::{path::PathBuf, process::Command};
pub fn scrape(url: &str) -> Result<PathBuf> {
let tweet_id = url.split('/').next_back().unwrap();
println!("Scraping tweet ID: {}", tweet_id);
let out = Command::new("python")
.arg("scrape_user_tweet_contents.py")
.arg("--tweet-ids")
.arg(tweet_id)
.output()
.with_context(|| "Failed to execute tweet scraping command")?;
println!("Output command: {:?}", out);
if PathBuf::from("scraped-tweets")
.join(format!("tweet-{}.toml", tweet_id))
.exists()
{
return Ok(PathBuf::from("scraped-tweets").join(format!("tweet-{}.toml", tweet_id)));
}
bail!("Scraping failed for tweet: {}", url)
}