1
Fork 0
mirror of https://github.com/thegeneralist01/archivr synced 2026-05-30 08:36:47 +02:00

Refactor tweet archive source handling

This commit is contained in:
TheGeneralist 2026-04-02 14:31:04 +02:00
parent 514a5e99c7
commit 26d94a8289
Signed by: thegeneralist01
SSH key fingerprint: SHA256:pp9qddbCNmVNoSjevdvQvM5z0DHN7LTa8qBMbcMq/R4
4 changed files with 288 additions and 222 deletions

View file

@ -1,4 +1,4 @@
use anyhow::{Result, bail};
use anyhow::Result;
use chrono::Local;
use clap::{Parser, Subcommand};
use std::{
@ -66,14 +66,14 @@ fn get_archive_path() -> Option<PathBuf> {
None
}
#[derive(Debug, Clone, PartialEq, Eq)]
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
enum Source {
Tweet(downloader::tweets::TweetArchiveRequest),
TweetMedia { tweet_id: String },
YouTubeVideo,
YouTubePlaylist,
YouTubeChannel,
X,
Tweet,
TweetThread,
Instagram,
Facebook,
TikTok,
@ -91,8 +91,19 @@ fn parse_tweet_id(id: &str) -> Option<String> {
}
}
fn tweet_media_path(tweet_id: &str) -> String {
format!("https://x.com/i/status/{tweet_id}")
/// Extracts the tweet id from a `:`-separated shorthand path.
///
/// Only the text after the last `:` is considered (the whole string when it
/// contains no `:`); that segment is handed to `parse_tweet_id`, whose result
/// is returned as-is.
fn tweet_id_from_path(path: &str) -> Option<String> {
    // Splitting from the right makes the first yielded item the final segment.
    let last_segment = path.rsplit(':').next();
    last_segment.and_then(parse_tweet_id)
}
/// Resolves the user-supplied `path` into the location handed to the downloader.
///
/// A `tweet:media:<id>` shorthand classified as `Source::X` is expanded to
/// `https://x.com/i/status/<id>`. Every other combination of `path` and
/// `source` is returned unchanged.
///
/// If the shorthand prefix matches but no tweet id can be extracted from the
/// path, the path is returned unchanged as well instead of panicking.
fn resolve_source_path(path: &str, source: &Source) -> String {
    let is_media_shorthand = *source == Source::X && path.starts_with("tweet:media:");
    // Only attempt id extraction for the media shorthand; `None` means "pass through".
    let media_tweet_id = is_media_shorthand
        .then(|| tweet_id_from_path(path))
        .flatten();
    match media_tweet_id {
        Some(tweet_id) => format!("https://x.com/i/status/{tweet_id}"),
        None => path.to_string(),
    }
}
// INFO: yt-dlp supports a lot of sites; so, when archiving (for example) a website, the user
@ -130,42 +141,43 @@ fn determine_source(path: &str) -> Source {
}
}
let parts: Vec<&str> = path.split(':').collect();
match parts.as_slice() {
["tweet", id] => {
if let Some(tweet_id) = parse_tweet_id(id) {
return Source::Tweet(downloader::tweets::TweetArchiveRequest {
tweet_id,
mode: downloader::tweets::TweetArchiveMode::Tweet,
});
}
// Shorthand schemes: tweet:, x:, or twitter:
if let Some(after_scheme) = path.strip_prefix("tweet:") {
if after_scheme.starts_with("media:")
&& after_scheme
.strip_prefix("media:")
.and_then(parse_tweet_id)
.is_some()
{
return Source::X;
}
["tweet", "media", id] => {
if let Some(tweet_id) = parse_tweet_id(id) {
return Source::TweetMedia { tweet_id };
}
if parse_tweet_id(after_scheme).is_some() {
return Source::Tweet;
}
["x", "tweet", id] | ["x", "x", id] | ["twitter", "x", id] | ["twitter", "tweet", id] => {
if let Some(tweet_id) = parse_tweet_id(id) {
return Source::Tweet(downloader::tweets::TweetArchiveRequest {
tweet_id,
mode: downloader::tweets::TweetArchiveMode::Tweet,
});
}
}
["x", "thread", id] | ["twitter", "thread", id] => {
if let Some(tweet_id) = parse_tweet_id(id) {
return Source::Tweet(downloader::tweets::TweetArchiveRequest {
tweet_id,
mode: downloader::tweets::TweetArchiveMode::Thread,
});
}
}
_ => {}
}
// Shorthand schemes: x: or twitter:
if path.starts_with("x:") || path.starts_with("twitter:") {
if let Some(after_scheme) = path
.strip_prefix("x:")
.or_else(|| path.strip_prefix("twitter:"))
{
if after_scheme
.strip_prefix("thread:")
.and_then(parse_tweet_id)
.is_some()
{
return Source::TweetThread;
}
if after_scheme
.strip_prefix("tweet:")
.or_else(|| after_scheme.strip_prefix("x:"))
.and_then(parse_tweet_id)
.is_some()
{
return Source::Tweet;
}
return Source::X;
}
@ -260,6 +272,56 @@ fn determine_source(path: &str) -> Source {
Source::Other
}
/// Reports whether `filename` is already present in the raw store.
///
/// The probed location is `<store_path>/raw/<c1>/<c2>/<filename>`, where `c1`
/// and `c2` are the first two characters of `filename`. The probed path is
/// printed to stdout before the check.
///
/// A name with fewer than two characters cannot be mapped to a shard
/// directory, so it is reported as absent (`false`) rather than panicking.
fn hash_exists(filename: String, store_path: &Path) -> bool {
    let mut chars = filename.chars();
    let (first_letter, second_letter) = match (chars.next(), chars.next()) {
        (Some(first), Some(second)) => (first, second),
        // Too short to have a shard directory: nothing can be stored under it.
        _ => return false,
    };
    let path = store_path
        .join("raw")
        .join(first_letter.to_string())
        .join(second_letter.to_string())
        .join(filename);
    println!("Checking {}", path.display());
    path.exists()
}
/// Moves `file` into the raw store under a name derived from `hash`.
///
/// The destination is `<store_path>/raw/<c1>/<c2>/<hash><.ext>`, where `c1`
/// and `c2` are the first two characters of `hash` and `.ext` is the extension
/// of `file` (omitted when `file` has none). The shard directory is created if
/// it does not exist yet.
///
/// # Errors
///
/// Returns an error when `hash` has fewer than two characters, when the shard
/// directory cannot be created, or when the rename fails.
fn move_temp_to_raw(file: &Path, hash: &str, store_path: &Path) -> Result<()> {
    let mut chars = hash.chars();
    let (first_letter, second_letter) = match (chars.next(), chars.next()) {
        (Some(first), Some(second)) => (first.to_string(), second.to_string()),
        _ => {
            // A hash this short cannot be mapped to a shard directory.
            return Err(std::io::Error::new(
                std::io::ErrorKind::InvalidInput,
                format!("hash {hash:?} is too short to be placed in the raw store"),
            )
            .into());
        }
    };

    // Empty when the staged file has no extension, so it can be appended unconditionally.
    let file_extension = file
        .extension()
        .map_or_else(String::new, |ext| format!(".{}", ext.to_string_lossy()));

    let shard_dir = store_path
        .join("raw")
        .join(first_letter)
        .join(second_letter);
    fs::create_dir_all(&shard_dir)?;
    fs::rename(file, shard_dir.join(format!("{hash}{file_extension}")))?;

    Ok(())
}
fn initialize_store_directories(store_path: &Path) -> Result<()> {
fs::create_dir_all(store_path.join("raw"))?;
fs::create_dir_all(store_path.join("raw_tweets"))?;
@ -268,33 +330,6 @@ fn initialize_store_directories(store_path: &Path) -> Result<()> {
Ok(())
}
/// Obtains a staged file for a non-tweet `source` and archives it into the store.
///
/// - `Source::Local` is staged through `downloader::local::save`.
/// - `Source::TweetMedia` is downloaded through `downloader::ytdlp::download`
///   from the URL built by `tweet_media_path`.
/// - The remaining single-item sources are downloaded from `path` through
///   `downloader::ytdlp::download`.
/// - `Source::YouTubePlaylist` and `Source::YouTubeChannel` return an error.
///
/// The staged file is then passed to `downloader::local::archive_staged_file`,
/// whose result is returned.
///
/// # Panics
///
/// Hits `unreachable!()` for `Source::Tweet` and `Source::Other`; callers are
/// expected to handle those variants before calling this function.
fn archive_non_tweet_source(
    source: &Source,
    path: &str,
    store_path: &Path,
    timestamp: &str,
) -> Result<downloader::local::RawArchiveResult> {
    let staged = match source {
        Source::Local => downloader::local::save(path.to_string(), store_path, timestamp)?,
        Source::TweetMedia { tweet_id } => {
            downloader::ytdlp::download(tweet_media_path(tweet_id), store_path, timestamp)?
        }
        Source::YouTubeVideo
        | Source::X
        | Source::Instagram
        | Source::Facebook
        | Source::TikTok
        | Source::Reddit
        | Source::Snapchat => downloader::ytdlp::download(path.to_string(), store_path, timestamp)?,
        Source::YouTubePlaylist | Source::YouTubeChannel => {
            bail!("Archiving from this source is not yet implemented.")
        }
        Source::Tweet(_) | Source::Other => unreachable!(),
    };

    downloader::local::archive_staged_file(&staged, store_path)
}
fn main() -> Result<()> {
let args = Args::parse();
@ -321,19 +356,32 @@ fn main() -> Result<()> {
};
let source = determine_source(path);
let resolved_path = resolve_source_path(path, &source);
match source {
Source::Other => {
eprintln!("Archiving from this source is not yet implemented.");
process::exit(1);
}
Source::Tweet(request) => {
match downloader::tweets::archive(&request, &store_path, &timestamp) {
Ok(downloader::tweets::TweetArchiveResult::Archived(output_dir)) => {
println!("Tweet archived successfully to {}", output_dir.display());
Source::Tweet | Source::TweetThread => {
match downloader::tweets::archive(
path,
source == Source::TweetThread,
&store_path,
&timestamp,
) {
Ok(true) => {
println!(
"Tweet archived successfully to {}",
store_path.join("raw_tweets").display()
);
return Ok(());
}
Ok(downloader::tweets::TweetArchiveResult::Skipped(output_dir)) => {
println!("Tweet already archived in {}", output_dir.display());
Ok(false) => {
println!(
"Tweet already archived in {}",
store_path.join("raw_tweets").display()
);
return Ok(());
}
Err(e) => {
@ -342,29 +390,88 @@ fn main() -> Result<()> {
}
}
}
source => {
let result =
match archive_non_tweet_source(&source, path, &store_path, &timestamp) {
Ok(result) => result,
Err(e) => {
match source {
Source::Local => eprintln!("Failed to archive local file: {e}"),
_ => eprintln!("Failed to archive source: {e}"),
}
process::exit(1);
}
};
_ => {}
}
let _ = fs::remove_dir_all(store_path.join("temp").join(&timestamp));
match result {
downloader::local::RawArchiveResult::Archived(_) => {
println!("File archived successfully.");
}
downloader::local::RawArchiveResult::AlreadyArchived(_) => {
println!("File already archived.");
// Other sources
let hash = match source {
Source::YouTubeVideo
| Source::X
| Source::Instagram
| Source::Facebook
| Source::TikTok
| Source::Reddit
| Source::Snapchat => {
match downloader::ytdlp::download(
resolved_path.clone(),
&store_path,
&timestamp,
) {
Ok(h) => h,
Err(e) => {
eprintln!("Failed to download from YouTube: {e}");
process::exit(1);
}
}
}
Source::Local => {
match downloader::local::save(resolved_path.clone(), &store_path, &timestamp) {
Ok(h) => h,
Err(e) => {
eprintln!("Failed to archive local file: {e}");
process::exit(1);
}
}
}
_ => unreachable!(),
};
let file_extension = match source {
Source::YouTubeVideo
| Source::X
| Source::Instagram
| Source::Facebook
| Source::TikTok
| Source::Reddit
| Source::Snapchat => ".mp4",
Source::Local => {
let p = Path::new(resolved_path.trim_start_matches("file://"));
&p.extension()
.map_or(String::new(), |ext| format!(".{}", ext.to_string_lossy()))
}
_ => "",
};
let hash_exists = hash_exists(format!("{hash}{file_extension}"), &store_path);
// TODO: decide how to handle repeated archives of the same content.
// When the hash already exists, one of the following could be true:
// - We are literally archiving the same path over again.
// - We are archiving a different path that contains this file. E.g.: we archived a
//   website before which had this YouTube video, and while recursively archiving
//   everything, we also archived the YouTube video although it wasn't our main
//   target. This means that we should archive again; for the first case it is
//   less clear what to do. Needs more thought.
// ----
// Follow-up, a day later:
// If we are specifically archiving a YouTube video, it could also be either of the
// two cases above. So the tentative plan is: always create a new DB entry and symlink
// the raw file into the structured directory (exact layout still to be decided).
if hash_exists {
println!("File already archived.");
let _ = fs::remove_dir_all(store_path.join("temp").join(&timestamp));
} else {
move_temp_to_raw(
&store_path
.join("temp")
.join(&timestamp)
.join(format!("{timestamp}{file_extension}")),
&hash,
&store_path,
)?;
let _ = fs::remove_dir_all(store_path.join("temp").join(&timestamp));
println!("File archived successfully.");
}
// TODO: DB INSERT, inserting a record
@ -431,6 +538,7 @@ fn main() -> Result<()> {
#[cfg(test)]
mod tests {
use super::*;
use std::fs;
struct TestCase<'a> {
url: &'a str,
@ -438,62 +546,39 @@ mod tests {
}
#[test]
fn test_tweet_and_thread_sources() {
fn test_tweet_sources() {
let cases = [
TestCase {
url: "tweet:1234567890",
expected: Source::Tweet(downloader::tweets::TweetArchiveRequest {
tweet_id: "1234567890".to_string(),
mode: downloader::tweets::TweetArchiveMode::Tweet,
}),
expected: Source::Tweet,
},
TestCase {
url: "x:tweet:1234567890",
expected: Source::Tweet(downloader::tweets::TweetArchiveRequest {
tweet_id: "1234567890".to_string(),
mode: downloader::tweets::TweetArchiveMode::Tweet,
}),
expected: Source::Tweet,
},
TestCase {
url: "x:x:1234567890",
expected: Source::Tweet(downloader::tweets::TweetArchiveRequest {
tweet_id: "1234567890".to_string(),
mode: downloader::tweets::TweetArchiveMode::Tweet,
}),
expected: Source::Tweet,
},
TestCase {
url: "twitter:x:1234567890",
expected: Source::Tweet(downloader::tweets::TweetArchiveRequest {
tweet_id: "1234567890".to_string(),
mode: downloader::tweets::TweetArchiveMode::Tweet,
}),
expected: Source::Tweet,
},
TestCase {
url: "twitter:tweet:1234567890",
expected: Source::Tweet(downloader::tweets::TweetArchiveRequest {
tweet_id: "1234567890".to_string(),
mode: downloader::tweets::TweetArchiveMode::Tweet,
}),
expected: Source::Tweet,
},
TestCase {
url: "tweet:media:1234567890",
expected: Source::TweetMedia {
tweet_id: "1234567890".to_string(),
},
expected: Source::X,
},
TestCase {
url: "x:thread:1234567890",
expected: Source::Tweet(downloader::tweets::TweetArchiveRequest {
tweet_id: "1234567890".to_string(),
mode: downloader::tweets::TweetArchiveMode::Thread,
}),
expected: Source::TweetThread,
},
TestCase {
url: "twitter:thread:1234567890",
expected: Source::Tweet(downloader::tweets::TweetArchiveRequest {
tweet_id: "1234567890".to_string(),
mode: downloader::tweets::TweetArchiveMode::Thread,
}),
expected: Source::TweetThread,
},
TestCase {
url: "tweet:thread:1234567890",
@ -519,6 +604,35 @@ mod tests {
}
}
#[test]
fn test_tweet_id_from_path() {
    // Each of these shorthands carries the id in its final `:`-separated segment.
    let expected_id = Some("1234567890".to_string());
    let shorthands = [
        "tweet:1234567890",
        "tweet:media:1234567890",
        "x:thread:1234567890",
    ];
    for shorthand in shorthands {
        assert_eq!(tweet_id_from_path(shorthand), expected_id);
    }

    // A final segment that is not a tweet id yields nothing.
    assert_eq!(tweet_id_from_path("tweet:not-a-number"), None);
}
#[test]
fn test_resolve_source_path() {
    // (input path, classified source, expected resolved path)
    let cases = [
        (
            "tweet:media:1234567890",
            Source::X,
            "https://x.com/i/status/1234567890",
        ),
        ("tweet:1234567890", Source::Tweet, "tweet:1234567890"),
    ];

    for (input, source, expected) in cases {
        assert_eq!(resolve_source_path(input, &source), expected);
    }
}
#[test]
fn test_youtube_sources() {
// --- YouTube Video URLs ---