1
Fork 0
mirror of https://github.com/thegeneralist01/archivr synced 2026-05-30 08:36:47 +02:00

refactor: simplify archive source parsing

This commit is contained in:
TheGeneralist 2026-04-02 14:05:01 +02:00
parent cb0abbb760
commit 514a5e99c7
Signed by: thegeneralist01
SSH key fingerprint: SHA256:pp9qddbCNmVNoSjevdvQvM5z0DHN7LTa8qBMbcMq/R4
4 changed files with 205 additions and 283 deletions

View file

@ -7,7 +7,21 @@ use std::{
use crate::hash::hash_file; use crate::hash::hash_file;
pub fn save(path: String, store_path: &Path, timestamp: &String) -> Result<String> { #[derive(Debug, Clone, PartialEq, Eq)]
/// Outcome of handing a staged file to `archive_staged_file`.
///
/// Either variant carries the destination path that `archive_staged_file`
/// joins onto the store path.
pub enum RawArchiveResult {
    /// The staged file was renamed into place at this path.
    Archived(PathBuf),
    /// A file already existed at this path, so the staged copy was removed.
    AlreadyArchived(PathBuf),
}

impl RawArchiveResult {
    /// Returns the carried destination path, whichever variant this is.
    pub fn relative_path(&self) -> &Path {
        // Both variants wrap exactly one `PathBuf`, so the or-pattern is irrefutable.
        let (Self::Archived(stored) | Self::AlreadyArchived(stored)) = self;
        stored.as_path()
    }
}
pub fn save(path: String, store_path: &Path, timestamp: &str) -> Result<PathBuf> {
println!("Saving path: {path}"); println!("Saving path: {path}");
let temp_dir = store_path.join("temp").join(timestamp); let temp_dir = store_path.join("temp").join(timestamp);
@ -28,10 +42,10 @@ pub fn save(path: String, store_path: &Path, timestamp: &String) -> Result<Strin
bail!("yt-dlp failed: {stderr}"); bail!("yt-dlp failed: {stderr}");
} }
hash_file(&out_file) Ok(out_file)
} }
pub fn archive_staged_file(file: &Path, store_path: &Path) -> Result<PathBuf> { pub fn archive_staged_file(file: &Path, store_path: &Path) -> Result<RawArchiveResult> {
let hash = hash_file(file)?; let hash = hash_file(file)?;
let destination = raw_relative_path(file, &hash)?; let destination = raw_relative_path(file, &hash)?;
let absolute_destination = store_path.join(&destination); let absolute_destination = store_path.join(&destination);
@ -42,11 +56,11 @@ pub fn archive_staged_file(file: &Path, store_path: &Path) -> Result<PathBuf> {
if absolute_destination.exists() { if absolute_destination.exists() {
fs::remove_file(file)?; fs::remove_file(file)?;
Ok(RawArchiveResult::AlreadyArchived(destination))
} else { } else {
fs::rename(file, &absolute_destination)?; fs::rename(file, &absolute_destination)?;
Ok(RawArchiveResult::Archived(destination))
} }
Ok(destination)
} }
fn raw_relative_path(file: &Path, hash: &str) -> Result<PathBuf> { fn raw_relative_path(file: &Path, hash: &str) -> Result<PathBuf> {
@ -79,12 +93,12 @@ mod tests {
let staged = root.join("temp").join("photo.jpg"); let staged = root.join("temp").join("photo.jpg");
fs::write(&staged, b"image-bytes").unwrap(); fs::write(&staged, b"image-bytes").unwrap();
let relative = archive_staged_file(&staged, &root).unwrap(); let result = archive_staged_file(&staged, &root).unwrap();
let absolute = root.join(&relative); let absolute = root.join(result.relative_path());
assert!(absolute.is_file()); assert!(absolute.is_file());
assert!(!staged.exists()); assert!(!staged.exists());
assert!(relative.starts_with("raw")); assert!(result.relative_path().starts_with("raw"));
let _ = fs::remove_dir_all(&root); let _ = fs::remove_dir_all(&root);
} }

View file

@ -277,7 +277,10 @@ fn archive_asset_reference(
} }
let relative_path = local::archive_staged_file(&absolute_path, store_path)?; let relative_path = local::archive_staged_file(&absolute_path, store_path)?;
let relative_path = relative_path.to_string_lossy().replace('\\', "/"); let relative_path = relative_path
.relative_path()
.to_string_lossy()
.replace('\\', "/");
archived_assets.insert(key, relative_path.clone()); archived_assets.insert(key, relative_path.clone());
Ok(relative_path) Ok(relative_path)

View file

@ -1,9 +1,11 @@
use anyhow::{Context, Result, bail}; use anyhow::{Context, Result, bail};
use std::{env, path::Path, process::Command}; use std::{
env,
path::{Path, PathBuf},
process::Command,
};
use crate::hash::hash_file; pub fn download(path: String, store_path: &Path, timestamp: &str) -> Result<PathBuf> {
pub fn download(path: String, store_path: &Path, timestamp: &String) -> Result<String> {
println!("Downloading with yt-dlp: {path}"); println!("Downloading with yt-dlp: {path}");
let ytdlp = env::var("ARCHIVR_YT_DLP").unwrap_or_else(|_| "yt-dlp".to_string()); let ytdlp = env::var("ARCHIVR_YT_DLP").unwrap_or_else(|_| "yt-dlp".to_string());
@ -29,5 +31,5 @@ pub fn download(path: String, store_path: &Path, timestamp: &String) -> Result<S
bail!("yt-dlp failed: {stderr}"); bail!("yt-dlp failed: {stderr}");
} }
hash_file(&out_file) Ok(out_file)
} }

View file

@ -1,4 +1,4 @@
use anyhow::Result; use anyhow::{Result, bail};
use chrono::Local; use chrono::Local;
use clap::{Parser, Subcommand}; use clap::{Parser, Subcommand};
use std::{ use std::{
@ -10,12 +10,6 @@ use std::{
mod downloader; mod downloader;
mod hash; mod hash;
/// A request recognised by `parse_explicit_archive_request` from the
/// colon-separated shorthand (e.g. `tweet:<id>`, `tweet:media:<id>`).
#[derive(Debug, Clone, PartialEq, Eq)]
enum ExplicitArchiveRequest {
/// Produced by the tweet and thread shorthands; the wrapped request holds
/// the parsed tweet id and the archive mode (tweet or thread).
Tweet(downloader::tweets::TweetArchiveRequest),
/// Produced by the `tweet:media:<id>` shorthand; holds the parsed tweet id.
TweetMedia { tweet_id: String },
}
#[derive(Parser, Debug)] #[derive(Parser, Debug)]
#[command(version, about, long_about = None)] #[command(version, about, long_about = None)]
struct Args { struct Args {
@ -72,8 +66,10 @@ fn get_archive_path() -> Option<PathBuf> {
None None
} }
#[derive(Debug, PartialEq)] #[derive(Debug, Clone, PartialEq, Eq)]
enum Source { enum Source {
Tweet(downloader::tweets::TweetArchiveRequest),
TweetMedia { tweet_id: String },
YouTubeVideo, YouTubeVideo,
YouTubePlaylist, YouTubePlaylist,
YouTubeChannel, YouTubeChannel,
@ -95,37 +91,6 @@ fn parse_tweet_id(id: &str) -> Option<String> {
} }
} }
/// Parses the explicit colon-separated shorthand for tweet archive requests.
///
/// Recognised forms (the `<id>` segment must be accepted by `parse_tweet_id`):
/// - `tweet:<id>`, `x:tweet:<id>`, `x:x:<id>`, `twitter:x:<id>`,
///   `twitter:tweet:<id>`: a `Tweet` request in `Tweet` mode
/// - `x:thread:<id>`, `twitter:thread:<id>`: a `Tweet` request in `Thread` mode
/// - `tweet:media:<id>`: a `TweetMedia` request
///
/// Returns `None` for any other shape, or when the id is rejected.
fn parse_explicit_archive_request(path: &str) -> Option<ExplicitArchiveRequest> {
    let segments: Vec<&str> = path.split(':').collect();

    // Decide the archive mode first; the media form is the only one that does
    // not build a `TweetArchiveRequest`, so it returns straight away.
    let (mode, raw_id) = match segments.as_slice() {
        ["tweet", "media", raw_id] => {
            let tweet_id = parse_tweet_id(raw_id)?;
            return Some(ExplicitArchiveRequest::TweetMedia { tweet_id });
        }
        ["tweet", raw_id] | ["x" | "twitter", "tweet" | "x", raw_id] => {
            (downloader::tweets::TweetArchiveMode::Tweet, raw_id)
        }
        ["x" | "twitter", "thread", raw_id] => {
            (downloader::tweets::TweetArchiveMode::Thread, raw_id)
        }
        _ => return None,
    };

    let tweet_id = parse_tweet_id(raw_id)?;
    Some(ExplicitArchiveRequest::Tweet(
        downloader::tweets::TweetArchiveRequest { tweet_id, mode },
    ))
}
fn tweet_media_path(tweet_id: &str) -> String { fn tweet_media_path(tweet_id: &str) -> String {
format!("https://x.com/i/status/{tweet_id}") format!("https://x.com/i/status/{tweet_id}")
} }
@ -165,6 +130,40 @@ fn determine_source(path: &str) -> Source {
} }
} }
let parts: Vec<&str> = path.split(':').collect();
match parts.as_slice() {
["tweet", id] => {
if let Some(tweet_id) = parse_tweet_id(id) {
return Source::Tweet(downloader::tweets::TweetArchiveRequest {
tweet_id,
mode: downloader::tweets::TweetArchiveMode::Tweet,
});
}
}
["tweet", "media", id] => {
if let Some(tweet_id) = parse_tweet_id(id) {
return Source::TweetMedia { tweet_id };
}
}
["x", "tweet", id] | ["x", "x", id] | ["twitter", "x", id] | ["twitter", "tweet", id] => {
if let Some(tweet_id) = parse_tweet_id(id) {
return Source::Tweet(downloader::tweets::TweetArchiveRequest {
tweet_id,
mode: downloader::tweets::TweetArchiveMode::Tweet,
});
}
}
["x", "thread", id] | ["twitter", "thread", id] => {
if let Some(tweet_id) = parse_tweet_id(id) {
return Source::Tweet(downloader::tweets::TweetArchiveRequest {
tweet_id,
mode: downloader::tweets::TweetArchiveMode::Thread,
});
}
}
_ => {}
}
// Shorthand schemes: x: or twitter: // Shorthand schemes: x: or twitter:
if path.starts_with("x:") || path.starts_with("twitter:") { if path.starts_with("x:") || path.starts_with("twitter:") {
return Source::X; return Source::X;
@ -261,56 +260,6 @@ fn determine_source(path: &str) -> Source {
Source::Other Source::Other
} }
/// Reports whether `filename` is already present in the raw store.
///
/// The looked-up path is `<store_path>/raw/<c1>/<c2>/<filename>`, where `c1`
/// and `c2` are the first two characters of `filename`.
///
/// Returns `false` when `filename` has fewer than two characters, since no
/// such path can be formed (this case previously panicked).
fn hash_exists(filename: String, store_path: &Path) -> bool {
    let mut chars = filename.chars();
    let (first_letter, second_letter) = match (chars.next(), chars.next()) {
        (Some(first), Some(second)) => (first, second),
        // Too short to shard into raw/<c1>/<c2>/, so it cannot be archived.
        _ => return false,
    };

    let path = store_path
        .join("raw")
        .join(first_letter.to_string())
        .join(second_letter.to_string())
        .join(filename);

    println!("Checking {}", path.display());

    path.exists()
}
/// Moves a staged file to `<store_path>/raw/<c1>/<c2>/<hash>[.<ext>]`.
///
/// `c1` and `c2` are the first two characters of `hash`; the extension, if
/// any, is taken from `file`. The shard directory is created when missing.
///
/// # Errors
///
/// Returns an error when `hash` has fewer than two characters (this case
/// previously panicked), or when creating the directory or renaming the file
/// fails.
fn move_temp_to_raw(file: &Path, hash: &String, store_path: &Path) -> Result<()> {
    let mut chars = hash.chars();
    let (first_letter, second_letter) = match (chars.next(), chars.next()) {
        (Some(first), Some(second)) => (first, second),
        _ => {
            return Err(std::io::Error::new(
                std::io::ErrorKind::InvalidInput,
                format!("hash {hash:?} is too short to derive a raw storage path"),
            )
            .into());
        }
    };

    // Build the shard directory once and reuse it for both calls below.
    let shard_dir = store_path
        .join("raw")
        .join(first_letter.to_string())
        .join(second_letter.to_string());
    fs::create_dir_all(&shard_dir)?;

    let file_name = match file.extension() {
        Some(ext) => format!("{hash}.{}", ext.to_string_lossy()),
        None => hash.clone(),
    };
    fs::rename(file, shard_dir.join(file_name))?;

    Ok(())
}
fn initialize_store_directories(store_path: &Path) -> Result<()> { fn initialize_store_directories(store_path: &Path) -> Result<()> {
fs::create_dir_all(store_path.join("raw"))?; fs::create_dir_all(store_path.join("raw"))?;
fs::create_dir_all(store_path.join("raw_tweets"))?; fs::create_dir_all(store_path.join("raw_tweets"))?;
@ -319,6 +268,33 @@ fn initialize_store_directories(store_path: &Path) -> Result<()> {
Ok(()) Ok(())
} }
/// Stages the file for `source`, then hands it to
/// `downloader::local::archive_staged_file`.
///
/// - `Source::Local` is staged with `downloader::local::save`.
/// - `Source::TweetMedia` is downloaded with yt-dlp from `tweet_media_path`.
/// - The other downloadable sources are fetched with yt-dlp directly from `path`.
///
/// # Errors
///
/// Fails for playlists and channels (not yet implemented), and propagates any
/// error from staging or archiving.
///
/// # Panics
///
/// Panics on `Source::Tweet` and `Source::Other`; callers must handle those
/// before calling this function.
fn archive_non_tweet_source(
    source: &Source,
    path: &str,
    store_path: &Path,
    timestamp: &str,
) -> Result<downloader::local::RawArchiveResult> {
    // Every yt-dlp backed source differs only in the URL it hands over.
    let fetch = |url: String| downloader::ytdlp::download(url, store_path, timestamp);

    let staged_file = match source {
        Source::Local => downloader::local::save(path.to_string(), store_path, timestamp)?,
        Source::TweetMedia { tweet_id } => fetch(tweet_media_path(tweet_id))?,
        Source::YouTubeVideo
        | Source::X
        | Source::Instagram
        | Source::Facebook
        | Source::TikTok
        | Source::Reddit
        | Source::Snapchat => fetch(path.to_string())?,
        Source::YouTubePlaylist | Source::YouTubeChannel => {
            bail!("Archiving from this source is not yet implemented.")
        }
        Source::Tweet(_) | Source::Other => unreachable!(),
    };

    downloader::local::archive_staged_file(&staged_file, store_path)
}
fn main() -> Result<()> { fn main() -> Result<()> {
let args = Args::parse(); let args = Args::parse();
@ -344,118 +320,51 @@ fn main() -> Result<()> {
} }
}; };
if let Some(ExplicitArchiveRequest::Tweet(request)) = let source = determine_source(path);
parse_explicit_archive_request(path) match source {
{ Source::Other => {
match downloader::tweets::archive(&request, &store_path, &timestamp) { eprintln!("Archiving from this source is not yet implemented.");
Ok(downloader::tweets::TweetArchiveResult::Archived(output_dir)) => { process::exit(1);
println!("Tweet archived successfully to {}", output_dir.display());
return Ok(());
}
Ok(downloader::tweets::TweetArchiveResult::Skipped(output_dir)) => {
println!("Tweet already archived in {}", output_dir.display());
return Ok(());
}
Err(e) => {
eprintln!("Failed to archive tweet: {e}");
process::exit(1);
}
} }
} Source::Tweet(request) => {
match downloader::tweets::archive(&request, &store_path, &timestamp) {
let (resolved_path, source) = match parse_explicit_archive_request(path) { Ok(downloader::tweets::TweetArchiveResult::Archived(output_dir)) => {
Some(ExplicitArchiveRequest::TweetMedia { tweet_id }) => { println!("Tweet archived successfully to {}", output_dir.display());
(tweet_media_path(&tweet_id), Source::X) return Ok(());
} }
None => { Ok(downloader::tweets::TweetArchiveResult::Skipped(output_dir)) => {
let source = determine_source(path); println!("Tweet already archived in {}", output_dir.display());
if let Source::Other = source { return Ok(());
eprintln!("Archiving from this source is not yet implemented."); }
process::exit(1);
}
(path.clone(), source)
}
Some(ExplicitArchiveRequest::Tweet(_)) => unreachable!(),
};
let hash = match source {
Source::YouTubeVideo
| Source::X
| Source::Instagram
| Source::Facebook
| Source::TikTok
| Source::Reddit
| Source::Snapchat => {
match downloader::ytdlp::download(
resolved_path.clone(),
&store_path,
&timestamp,
) {
Ok(h) => h,
Err(e) => { Err(e) => {
eprintln!("Failed to download from YouTube: {e}"); eprintln!("Failed to archive tweet: {e}");
process::exit(1); process::exit(1);
} }
} }
} }
Source::Local => { source => {
match downloader::local::save(resolved_path.clone(), &store_path, &timestamp) { let result =
Ok(h) => h, match archive_non_tweet_source(&source, path, &store_path, &timestamp) {
Err(e) => { Ok(result) => result,
eprintln!("Failed to archive local file: {e}"); Err(e) => {
process::exit(1); match source {
Source::Local => eprintln!("Failed to archive local file: {e}"),
_ => eprintln!("Failed to archive source: {e}"),
}
process::exit(1);
}
};
let _ = fs::remove_dir_all(store_path.join("temp").join(&timestamp));
match result {
downloader::local::RawArchiveResult::Archived(_) => {
println!("File archived successfully.");
}
downloader::local::RawArchiveResult::AlreadyArchived(_) => {
println!("File already archived.");
} }
} }
} }
_ => unreachable!(),
};
let file_extension = match source {
Source::YouTubeVideo
| Source::X
| Source::Instagram
| Source::Facebook
| Source::TikTok
| Source::Reddit
| Source::Snapchat => ".mp4",
Source::Local => {
let p = Path::new(resolved_path.trim_start_matches("file://"));
&p.extension()
.map_or(String::new(), |ext| format!(".{}", ext.to_string_lossy()))
}
_ => "",
};
let hash_exists = hash_exists(format!("{hash}{file_extension}"), &store_path);
// TODO: check for repeated archives?
// There could be one of the following:
// - We are literally archiving the same path over again.
// - We are archiving a different path, which had this file. E.g.: we archived a
// website before which had this YouTube video, and while recursively archiving
// everything, we also archived the YouTube video although it wasn't our main
// target. This means that we should archive again; whereas with the first case...
// Not sure. Need to think about this.
// ----
// Thinking about it a day later...
// If we are specifically archiving a YouTube video, it could also be two of the
// above. So yeah, just create a new DB entry and symlink the Raw to the Structured
// Dir or whatever. it's midnight and my brain ain't wording/braining.
if hash_exists {
println!("File already archived.");
let _ = fs::remove_dir_all(store_path.join("temp").join(&timestamp));
} else {
move_temp_to_raw(
&store_path
.join("temp")
.join(&timestamp)
.join(format!("{timestamp}{file_extension}")),
&hash,
&store_path,
)?;
let _ = fs::remove_dir_all(store_path.join("temp").join(&timestamp));
println!("File archived successfully.");
} }
// TODO: DB INSERT, inserting a record // TODO: DB INSERT, inserting a record
@ -529,89 +438,83 @@ mod tests {
} }
#[test] #[test]
fn test_explicit_tweet_archive_parsing() { fn test_tweet_and_thread_sources() {
let cases = [ let cases = [
( TestCase {
"tweet:1234567890", url: "tweet:1234567890",
Some(ExplicitArchiveRequest::Tweet( expected: Source::Tweet(downloader::tweets::TweetArchiveRequest {
downloader::tweets::TweetArchiveRequest {
tweet_id: "1234567890".to_string(),
mode: downloader::tweets::TweetArchiveMode::Tweet,
},
)),
),
(
"x:tweet:1234567890",
Some(ExplicitArchiveRequest::Tweet(
downloader::tweets::TweetArchiveRequest {
tweet_id: "1234567890".to_string(),
mode: downloader::tweets::TweetArchiveMode::Tweet,
},
)),
),
(
"x:x:1234567890",
Some(ExplicitArchiveRequest::Tweet(
downloader::tweets::TweetArchiveRequest {
tweet_id: "1234567890".to_string(),
mode: downloader::tweets::TweetArchiveMode::Tweet,
},
)),
),
(
"twitter:x:1234567890",
Some(ExplicitArchiveRequest::Tweet(
downloader::tweets::TweetArchiveRequest {
tweet_id: "1234567890".to_string(),
mode: downloader::tweets::TweetArchiveMode::Tweet,
},
)),
),
(
"twitter:tweet:1234567890",
Some(ExplicitArchiveRequest::Tweet(
downloader::tweets::TweetArchiveRequest {
tweet_id: "1234567890".to_string(),
mode: downloader::tweets::TweetArchiveMode::Tweet,
},
)),
),
(
"tweet:media:1234567890",
Some(ExplicitArchiveRequest::TweetMedia {
tweet_id: "1234567890".to_string(), tweet_id: "1234567890".to_string(),
mode: downloader::tweets::TweetArchiveMode::Tweet,
}), }),
), },
( TestCase {
"x:thread:1234567890", url: "x:tweet:1234567890",
Some(ExplicitArchiveRequest::Tweet( expected: Source::Tweet(downloader::tweets::TweetArchiveRequest {
downloader::tweets::TweetArchiveRequest { tweet_id: "1234567890".to_string(),
tweet_id: "1234567890".to_string(), mode: downloader::tweets::TweetArchiveMode::Tweet,
mode: downloader::tweets::TweetArchiveMode::Thread, }),
}, },
)), TestCase {
), url: "x:x:1234567890",
( expected: Source::Tweet(downloader::tweets::TweetArchiveRequest {
"twitter:thread:1234567890", tweet_id: "1234567890".to_string(),
Some(ExplicitArchiveRequest::Tweet( mode: downloader::tweets::TweetArchiveMode::Tweet,
downloader::tweets::TweetArchiveRequest { }),
tweet_id: "1234567890".to_string(), },
mode: downloader::tweets::TweetArchiveMode::Thread, TestCase {
}, url: "twitter:x:1234567890",
)), expected: Source::Tweet(downloader::tweets::TweetArchiveRequest {
), tweet_id: "1234567890".to_string(),
("tweet:thread:1234567890", None), mode: downloader::tweets::TweetArchiveMode::Tweet,
("x:media:1234567890", None), }),
("tweet:not-a-number", None), },
("tweet:media:not-a-number", None), TestCase {
url: "twitter:tweet:1234567890",
expected: Source::Tweet(downloader::tweets::TweetArchiveRequest {
tweet_id: "1234567890".to_string(),
mode: downloader::tweets::TweetArchiveMode::Tweet,
}),
},
TestCase {
url: "tweet:media:1234567890",
expected: Source::TweetMedia {
tweet_id: "1234567890".to_string(),
},
},
TestCase {
url: "x:thread:1234567890",
expected: Source::Tweet(downloader::tweets::TweetArchiveRequest {
tweet_id: "1234567890".to_string(),
mode: downloader::tweets::TweetArchiveMode::Thread,
}),
},
TestCase {
url: "twitter:thread:1234567890",
expected: Source::Tweet(downloader::tweets::TweetArchiveRequest {
tweet_id: "1234567890".to_string(),
mode: downloader::tweets::TweetArchiveMode::Thread,
}),
},
TestCase {
url: "tweet:thread:1234567890",
expected: Source::Other,
},
TestCase {
url: "tweet:not-a-number",
expected: Source::Other,
},
TestCase {
url: "tweet:media:not-a-number",
expected: Source::Other,
},
]; ];
for (input, expected) in cases { for case in &cases {
assert_eq!( assert_eq!(
parse_explicit_archive_request(input), determine_source(case.url),
expected, case.expected,
"Failed for input: {}", "Failed for URL: {}",
input case.url
); );
} }
} }