1
Fork 0
mirror of https://github.com/thegeneralist01/archivr synced 2026-05-30 08:36:47 +02:00

refactor: simplify archive source parsing

This commit is contained in:
TheGeneralist 2026-04-02 14:05:01 +02:00
parent cb0abbb760
commit 514a5e99c7
Signed by: thegeneralist01
SSH key fingerprint: SHA256:pp9qddbCNmVNoSjevdvQvM5z0DHN7LTa8qBMbcMq/R4
4 changed files with 205 additions and 283 deletions

View file

@ -1,4 +1,4 @@
use anyhow::Result;
use anyhow::{Result, bail};
use chrono::Local;
use clap::{Parser, Subcommand};
use std::{
@ -10,12 +10,6 @@ use std::{
mod downloader;
mod hash;
/// An archive request written in explicit colon-separated shorthand, as recognised by
/// `parse_explicit_archive_request`.
#[derive(Debug, Clone, PartialEq, Eq)]
enum ExplicitArchiveRequest {
    /// Archive a tweet (or thread) through the tweet downloader; the wrapped request
    /// carries the tweet id and the archive mode.
    Tweet(downloader::tweets::TweetArchiveRequest),
    /// Archive only the media attached to the tweet with this id.
    TweetMedia { tweet_id: String },
}
#[derive(Parser, Debug)]
#[command(version, about, long_about = None)]
struct Args {
@ -72,8 +66,10 @@ fn get_archive_path() -> Option<PathBuf> {
None
}
#[derive(Debug, PartialEq)]
#[derive(Debug, Clone, PartialEq, Eq)]
enum Source {
Tweet(downloader::tweets::TweetArchiveRequest),
TweetMedia { tweet_id: String },
YouTubeVideo,
YouTubePlaylist,
YouTubeChannel,
@ -95,37 +91,6 @@ fn parse_tweet_id(id: &str) -> Option<String> {
}
}
/// Parses the explicit colon-separated shorthand for tweet archive requests.
///
/// Recognised forms (`<id>` must be accepted by `parse_tweet_id`):
/// - `tweet:<id>`, `x:tweet:<id>`, `x:x:<id>`, `twitter:x:<id>`, `twitter:tweet:<id>`
///   yield a `Tweet` request in `Tweet` mode;
/// - `x:thread:<id>`, `twitter:thread:<id>` yield a `Tweet` request in `Thread` mode;
/// - `tweet:media:<id>` yields `TweetMedia`.
///
/// Anything else, or an id that `parse_tweet_id` rejects, yields `None`.
fn parse_explicit_archive_request(path: &str) -> Option<ExplicitArchiveRequest> {
    let segments: Vec<&str> = path.split(':').collect();

    // Work out which id segment to validate and which mode it implies; the media form
    // produces a different variant, so it returns straight away.
    let (raw_id, mode) = match segments.as_slice() {
        ["tweet", "media", raw_id] => {
            return parse_tweet_id(raw_id)
                .map(|tweet_id| ExplicitArchiveRequest::TweetMedia { tweet_id });
        }
        ["tweet", raw_id] | ["x" | "twitter", "tweet" | "x", raw_id] => {
            (raw_id, downloader::tweets::TweetArchiveMode::Tweet)
        }
        ["x" | "twitter", "thread", raw_id] => {
            (raw_id, downloader::tweets::TweetArchiveMode::Thread)
        }
        _ => return None,
    };

    let tweet_id = parse_tweet_id(raw_id)?;
    Some(ExplicitArchiveRequest::Tweet(
        downloader::tweets::TweetArchiveRequest { tweet_id, mode },
    ))
}
/// Builds the `x.com` status URL for the tweet with the given id.
fn tweet_media_path(tweet_id: &str) -> String {
    const STATUS_PREFIX: &str = "https://x.com/i/status/";

    let mut url = String::with_capacity(STATUS_PREFIX.len() + tweet_id.len());
    url.push_str(STATUS_PREFIX);
    url.push_str(tweet_id);
    url
}
@ -165,6 +130,40 @@ fn determine_source(path: &str) -> Source {
}
}
let parts: Vec<&str> = path.split(':').collect();
match parts.as_slice() {
["tweet", id] => {
if let Some(tweet_id) = parse_tweet_id(id) {
return Source::Tweet(downloader::tweets::TweetArchiveRequest {
tweet_id,
mode: downloader::tweets::TweetArchiveMode::Tweet,
});
}
}
["tweet", "media", id] => {
if let Some(tweet_id) = parse_tweet_id(id) {
return Source::TweetMedia { tweet_id };
}
}
["x", "tweet", id] | ["x", "x", id] | ["twitter", "x", id] | ["twitter", "tweet", id] => {
if let Some(tweet_id) = parse_tweet_id(id) {
return Source::Tweet(downloader::tweets::TweetArchiveRequest {
tweet_id,
mode: downloader::tweets::TweetArchiveMode::Tweet,
});
}
}
["x", "thread", id] | ["twitter", "thread", id] => {
if let Some(tweet_id) = parse_tweet_id(id) {
return Source::Tweet(downloader::tweets::TweetArchiveRequest {
tweet_id,
mode: downloader::tweets::TweetArchiveMode::Thread,
});
}
}
_ => {}
}
// Shorthand schemes: x: or twitter:
if path.starts_with("x:") || path.starts_with("twitter:") {
return Source::X;
@ -261,56 +260,6 @@ fn determine_source(path: &str) -> Source {
Source::Other
}
/// Reports whether `filename` is already present in the raw store.
///
/// The file is looked up at `<store_path>/raw/<c1>/<c2>/<filename>`, where `c1` and `c2`
/// are the first two characters of `filename`. A name shorter than two characters cannot
/// be placed in that layout, so it is reported as absent instead of panicking.
fn hash_exists(filename: String, store_path: &Path) -> bool {
    let mut chars = filename.chars();
    let (first_letter, second_letter) = match (chars.next(), chars.next()) {
        (Some(first), Some(second)) => (first, second),
        // Too short to be sharded: nothing can be stored under such a name.
        _ => return false,
    };

    let path = store_path
        .join("raw")
        .join(first_letter.to_string())
        .join(second_letter.to_string())
        .join(filename);

    println!("Checking {}", path.display());
    path.exists()
}
/// Moves `file` into the raw store under a name derived from `hash`.
///
/// The destination is `<store_path>/raw/<c1>/<c2>/<hash><.ext>`, where `c1` and `c2` are the
/// first two characters of `hash` and `.ext` is the extension of `file` (omitted when the
/// file has none). The shard directory is created if missing.
///
/// # Errors
/// Returns any error from creating the shard directory or renaming the file.
///
/// # Panics
/// Panics if `hash` has fewer than two characters.
fn move_temp_to_raw(file: &Path, hash: &String, store_path: &Path) -> Result<()> {
    let mut hash_chars = hash.chars();
    let shard_outer = hash_chars.next().unwrap().to_string();
    let shard_inner = hash_chars.next().unwrap().to_string();

    // Keep the original extension (with its dot); an empty suffix when there is none.
    let suffix = match file.extension() {
        Some(ext) => format!(".{}", ext.to_string_lossy()),
        None => String::new(),
    };

    let shard_dir = store_path
        .join("raw")
        .join(&shard_outer)
        .join(&shard_inner);
    fs::create_dir_all(&shard_dir)?;

    let destination = shard_dir.join(format!("{hash}{suffix}"));
    fs::rename(file, destination)?;

    Ok(())
}
fn initialize_store_directories(store_path: &Path) -> Result<()> {
fs::create_dir_all(store_path.join("raw"))?;
fs::create_dir_all(store_path.join("raw_tweets"))?;
@ -319,6 +268,33 @@ fn initialize_store_directories(store_path: &Path) -> Result<()> {
Ok(())
}
/// Obtains a staged file for a non-tweet `source` and hands it to
/// `downloader::local::archive_staged_file`.
///
/// - `Source::Local` is staged with `downloader::local::save`.
/// - `Source::TweetMedia` is downloaded with `downloader::ytdlp::download` from the URL
///   built by `tweet_media_path`.
/// - The remaining downloadable sources are passed to `downloader::ytdlp::download` using
///   `path` as given.
///
/// # Errors
/// Propagates staging and archiving errors, and fails for `Source::YouTubePlaylist` and
/// `Source::YouTubeChannel`, which are not implemented.
///
/// # Panics
/// Panics if called with `Source::Tweet` or `Source::Other`; the caller must handle those.
fn archive_non_tweet_source(
    source: &Source,
    path: &str,
    store_path: &Path,
    timestamp: &str,
) -> Result<downloader::local::RawArchiveResult> {
    let staged = match source {
        Source::Local => downloader::local::save(path.to_string(), store_path, timestamp)?,
        Source::TweetMedia { tweet_id } => {
            let media_url = tweet_media_path(tweet_id);
            downloader::ytdlp::download(media_url, store_path, timestamp)?
        }
        Source::YouTubeVideo
        | Source::X
        | Source::Instagram
        | Source::Facebook
        | Source::TikTok
        | Source::Reddit
        | Source::Snapchat => {
            downloader::ytdlp::download(path.to_string(), store_path, timestamp)?
        }
        Source::YouTubePlaylist | Source::YouTubeChannel => {
            bail!("Archiving from this source is not yet implemented.")
        }
        Source::Tweet(_) | Source::Other => unreachable!(),
    };

    downloader::local::archive_staged_file(&staged, store_path)
}
fn main() -> Result<()> {
let args = Args::parse();
@ -344,118 +320,51 @@ fn main() -> Result<()> {
}
};
if let Some(ExplicitArchiveRequest::Tweet(request)) =
parse_explicit_archive_request(path)
{
match downloader::tweets::archive(&request, &store_path, &timestamp) {
Ok(downloader::tweets::TweetArchiveResult::Archived(output_dir)) => {
println!("Tweet archived successfully to {}", output_dir.display());
return Ok(());
}
Ok(downloader::tweets::TweetArchiveResult::Skipped(output_dir)) => {
println!("Tweet already archived in {}", output_dir.display());
return Ok(());
}
Err(e) => {
eprintln!("Failed to archive tweet: {e}");
process::exit(1);
}
let source = determine_source(path);
match source {
Source::Other => {
eprintln!("Archiving from this source is not yet implemented.");
process::exit(1);
}
}
let (resolved_path, source) = match parse_explicit_archive_request(path) {
Some(ExplicitArchiveRequest::TweetMedia { tweet_id }) => {
(tweet_media_path(&tweet_id), Source::X)
}
None => {
let source = determine_source(path);
if let Source::Other = source {
eprintln!("Archiving from this source is not yet implemented.");
process::exit(1);
}
(path.clone(), source)
}
Some(ExplicitArchiveRequest::Tweet(_)) => unreachable!(),
};
let hash = match source {
Source::YouTubeVideo
| Source::X
| Source::Instagram
| Source::Facebook
| Source::TikTok
| Source::Reddit
| Source::Snapchat => {
match downloader::ytdlp::download(
resolved_path.clone(),
&store_path,
&timestamp,
) {
Ok(h) => h,
Source::Tweet(request) => {
match downloader::tweets::archive(&request, &store_path, &timestamp) {
Ok(downloader::tweets::TweetArchiveResult::Archived(output_dir)) => {
println!("Tweet archived successfully to {}", output_dir.display());
return Ok(());
}
Ok(downloader::tweets::TweetArchiveResult::Skipped(output_dir)) => {
println!("Tweet already archived in {}", output_dir.display());
return Ok(());
}
Err(e) => {
eprintln!("Failed to download from YouTube: {e}");
eprintln!("Failed to archive tweet: {e}");
process::exit(1);
}
}
}
Source::Local => {
match downloader::local::save(resolved_path.clone(), &store_path, &timestamp) {
Ok(h) => h,
Err(e) => {
eprintln!("Failed to archive local file: {e}");
process::exit(1);
source => {
let result =
match archive_non_tweet_source(&source, path, &store_path, &timestamp) {
Ok(result) => result,
Err(e) => {
match source {
Source::Local => eprintln!("Failed to archive local file: {e}"),
_ => eprintln!("Failed to archive source: {e}"),
}
process::exit(1);
}
};
let _ = fs::remove_dir_all(store_path.join("temp").join(&timestamp));
match result {
downloader::local::RawArchiveResult::Archived(_) => {
println!("File archived successfully.");
}
downloader::local::RawArchiveResult::AlreadyArchived(_) => {
println!("File already archived.");
}
}
}
_ => unreachable!(),
};
let file_extension = match source {
Source::YouTubeVideo
| Source::X
| Source::Instagram
| Source::Facebook
| Source::TikTok
| Source::Reddit
| Source::Snapchat => ".mp4",
Source::Local => {
let p = Path::new(resolved_path.trim_start_matches("file://"));
&p.extension()
.map_or(String::new(), |ext| format!(".{}", ext.to_string_lossy()))
}
_ => "",
};
let hash_exists = hash_exists(format!("{hash}{file_extension}"), &store_path);
// TODO: check for repeated archives?
// There could be one of the following:
// - We are literally archiving the same path over again.
// - We are archiving a different path, which had this file. E.g.: we archived a
// website before which had this YouTube video, and while recursively archiving
// everything, we also archived the YouTube video although it wasn't our main
// target. This means that we should archive again; whereas with the first case...
// Not sure. Need to think about this.
// ----
// Thinking about it a day later...
// If we are specifically archiving a YouTube video, it could also be two of the
// above. So yeah, just create a new DB entry and symlink the Raw to the Structured
// Dir or whatever. it's midnight and my brain ain't wording/braining.
if hash_exists {
println!("File already archived.");
let _ = fs::remove_dir_all(store_path.join("temp").join(&timestamp));
} else {
move_temp_to_raw(
&store_path
.join("temp")
.join(&timestamp)
.join(format!("{timestamp}{file_extension}")),
&hash,
&store_path,
)?;
let _ = fs::remove_dir_all(store_path.join("temp").join(&timestamp));
println!("File archived successfully.");
}
// TODO: DB INSERT, inserting a record
@ -529,89 +438,83 @@ mod tests {
}
#[test]
fn test_explicit_tweet_archive_parsing() {
fn test_tweet_and_thread_sources() {
let cases = [
(
"tweet:1234567890",
Some(ExplicitArchiveRequest::Tweet(
downloader::tweets::TweetArchiveRequest {
tweet_id: "1234567890".to_string(),
mode: downloader::tweets::TweetArchiveMode::Tweet,
},
)),
),
(
"x:tweet:1234567890",
Some(ExplicitArchiveRequest::Tweet(
downloader::tweets::TweetArchiveRequest {
tweet_id: "1234567890".to_string(),
mode: downloader::tweets::TweetArchiveMode::Tweet,
},
)),
),
(
"x:x:1234567890",
Some(ExplicitArchiveRequest::Tweet(
downloader::tweets::TweetArchiveRequest {
tweet_id: "1234567890".to_string(),
mode: downloader::tweets::TweetArchiveMode::Tweet,
},
)),
),
(
"twitter:x:1234567890",
Some(ExplicitArchiveRequest::Tweet(
downloader::tweets::TweetArchiveRequest {
tweet_id: "1234567890".to_string(),
mode: downloader::tweets::TweetArchiveMode::Tweet,
},
)),
),
(
"twitter:tweet:1234567890",
Some(ExplicitArchiveRequest::Tweet(
downloader::tweets::TweetArchiveRequest {
tweet_id: "1234567890".to_string(),
mode: downloader::tweets::TweetArchiveMode::Tweet,
},
)),
),
(
"tweet:media:1234567890",
Some(ExplicitArchiveRequest::TweetMedia {
TestCase {
url: "tweet:1234567890",
expected: Source::Tweet(downloader::tweets::TweetArchiveRequest {
tweet_id: "1234567890".to_string(),
mode: downloader::tweets::TweetArchiveMode::Tweet,
}),
),
(
"x:thread:1234567890",
Some(ExplicitArchiveRequest::Tweet(
downloader::tweets::TweetArchiveRequest {
tweet_id: "1234567890".to_string(),
mode: downloader::tweets::TweetArchiveMode::Thread,
},
)),
),
(
"twitter:thread:1234567890",
Some(ExplicitArchiveRequest::Tweet(
downloader::tweets::TweetArchiveRequest {
tweet_id: "1234567890".to_string(),
mode: downloader::tweets::TweetArchiveMode::Thread,
},
)),
),
("tweet:thread:1234567890", None),
("x:media:1234567890", None),
("tweet:not-a-number", None),
("tweet:media:not-a-number", None),
},
TestCase {
url: "x:tweet:1234567890",
expected: Source::Tweet(downloader::tweets::TweetArchiveRequest {
tweet_id: "1234567890".to_string(),
mode: downloader::tweets::TweetArchiveMode::Tweet,
}),
},
TestCase {
url: "x:x:1234567890",
expected: Source::Tweet(downloader::tweets::TweetArchiveRequest {
tweet_id: "1234567890".to_string(),
mode: downloader::tweets::TweetArchiveMode::Tweet,
}),
},
TestCase {
url: "twitter:x:1234567890",
expected: Source::Tweet(downloader::tweets::TweetArchiveRequest {
tweet_id: "1234567890".to_string(),
mode: downloader::tweets::TweetArchiveMode::Tweet,
}),
},
TestCase {
url: "twitter:tweet:1234567890",
expected: Source::Tweet(downloader::tweets::TweetArchiveRequest {
tweet_id: "1234567890".to_string(),
mode: downloader::tweets::TweetArchiveMode::Tweet,
}),
},
TestCase {
url: "tweet:media:1234567890",
expected: Source::TweetMedia {
tweet_id: "1234567890".to_string(),
},
},
TestCase {
url: "x:thread:1234567890",
expected: Source::Tweet(downloader::tweets::TweetArchiveRequest {
tweet_id: "1234567890".to_string(),
mode: downloader::tweets::TweetArchiveMode::Thread,
}),
},
TestCase {
url: "twitter:thread:1234567890",
expected: Source::Tweet(downloader::tweets::TweetArchiveRequest {
tweet_id: "1234567890".to_string(),
mode: downloader::tweets::TweetArchiveMode::Thread,
}),
},
TestCase {
url: "tweet:thread:1234567890",
expected: Source::Other,
},
TestCase {
url: "tweet:not-a-number",
expected: Source::Other,
},
TestCase {
url: "tweet:media:not-a-number",
expected: Source::Other,
},
];
for (input, expected) in cases {
for case in &cases {
assert_eq!(
parse_explicit_archive_request(input),
expected,
"Failed for input: {}",
input
determine_source(case.url),
case.expected,
"Failed for URL: {}",
case.url
);
}
}