1
Fork 0
mirror of https://github.com/thegeneralist01/archivr synced 2026-05-30 08:36:47 +02:00

Refactor tweet archive source handling

This commit is contained in:
TheGeneralist 2026-04-02 14:31:04 +02:00
parent 514a5e99c7
commit 26d94a8289
Signed by: thegeneralist01
SSH key fingerprint: SHA256:pp9qddbCNmVNoSjevdvQvM5z0DHN7LTa8qBMbcMq/R4
4 changed files with 288 additions and 222 deletions

View file

@ -7,21 +7,7 @@ use std::{
use crate::hash::hash_file; use crate::hash::hash_file;
#[derive(Debug, Clone, PartialEq, Eq)] pub fn save(path: String, store_path: &Path, timestamp: &String) -> Result<String> {
pub enum RawArchiveResult {
Archived(PathBuf),
AlreadyArchived(PathBuf),
}
impl RawArchiveResult {
pub fn relative_path(&self) -> &Path {
match self {
Self::Archived(path) | Self::AlreadyArchived(path) => path,
}
}
}
pub fn save(path: String, store_path: &Path, timestamp: &str) -> Result<PathBuf> {
println!("Saving path: {path}"); println!("Saving path: {path}");
let temp_dir = store_path.join("temp").join(timestamp); let temp_dir = store_path.join("temp").join(timestamp);
@ -42,10 +28,10 @@ pub fn save(path: String, store_path: &Path, timestamp: &str) -> Result<PathBuf>
bail!("yt-dlp failed: {stderr}"); bail!("yt-dlp failed: {stderr}");
} }
Ok(out_file) hash_file(&out_file)
} }
pub fn archive_staged_file(file: &Path, store_path: &Path) -> Result<RawArchiveResult> { pub fn archive_staged_file(file: &Path, store_path: &Path) -> Result<PathBuf> {
let hash = hash_file(file)?; let hash = hash_file(file)?;
let destination = raw_relative_path(file, &hash)?; let destination = raw_relative_path(file, &hash)?;
let absolute_destination = store_path.join(&destination); let absolute_destination = store_path.join(&destination);
@ -56,11 +42,11 @@ pub fn archive_staged_file(file: &Path, store_path: &Path) -> Result<RawArchiveR
if absolute_destination.exists() { if absolute_destination.exists() {
fs::remove_file(file)?; fs::remove_file(file)?;
Ok(RawArchiveResult::AlreadyArchived(destination))
} else { } else {
fs::rename(file, &absolute_destination)?; fs::rename(file, &absolute_destination)?;
Ok(RawArchiveResult::Archived(destination))
} }
Ok(destination)
} }
fn raw_relative_path(file: &Path, hash: &str) -> Result<PathBuf> { fn raw_relative_path(file: &Path, hash: &str) -> Result<PathBuf> {
@ -93,12 +79,12 @@ mod tests {
let staged = root.join("temp").join("photo.jpg"); let staged = root.join("temp").join("photo.jpg");
fs::write(&staged, b"image-bytes").unwrap(); fs::write(&staged, b"image-bytes").unwrap();
let result = archive_staged_file(&staged, &root).unwrap(); let relative = archive_staged_file(&staged, &root).unwrap();
let absolute = root.join(result.relative_path()); let absolute = root.join(&relative);
assert!(absolute.is_file()); assert!(absolute.is_file());
assert!(!staged.exists()); assert!(!staged.exists());
assert!(result.relative_path().starts_with("raw")); assert!(relative.starts_with("raw"));
let _ = fs::remove_dir_all(&root); let _ = fs::remove_dir_all(&root);
} }

View file

@ -12,22 +12,16 @@ use std::{
use super::local; use super::local;
#[derive(Debug, Clone, PartialEq, Eq)] fn parse_tweet_id(id: &str) -> Option<String> {
pub enum TweetArchiveMode { if !id.is_empty() && id.chars().all(|char| char.is_ascii_digit()) {
Tweet, Some(id.to_string())
Thread, } else {
None
}
} }
#[derive(Debug, Clone, PartialEq, Eq)] fn tweet_id_from_path(path: &str) -> Option<String> {
pub struct TweetArchiveRequest { path.split(':').next_back().and_then(parse_tweet_id)
pub tweet_id: String,
pub mode: TweetArchiveMode,
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum TweetArchiveResult {
Archived(PathBuf),
Skipped(PathBuf),
} }
fn resolve_from_cwd(path: PathBuf, cwd: &Path) -> PathBuf { fn resolve_from_cwd(path: PathBuf, cwd: &Path) -> PathBuf {
@ -39,14 +33,15 @@ fn resolve_from_cwd(path: PathBuf, cwd: &Path) -> PathBuf {
} }
fn build_scraper_args( fn build_scraper_args(
request: &TweetArchiveRequest, tweet_id: &str,
thread: bool,
output_dir: &Path, output_dir: &Path,
temp_dir: &Path, temp_dir: &Path,
credentials_file: &Path, credentials_file: &Path,
) -> Vec<String> { ) -> Vec<String> {
let mut args = vec![ let mut args = vec![
"--tweet-ids".to_string(), "--tweet-ids".to_string(),
request.tweet_id.clone(), tweet_id.to_string(),
"--output-dir".to_string(), "--output-dir".to_string(),
output_dir.display().to_string(), output_dir.display().to_string(),
"--media-dir".to_string(), "--media-dir".to_string(),
@ -56,34 +51,29 @@ fn build_scraper_args(
credentials_file.display().to_string(), credentials_file.display().to_string(),
]; ];
match request.mode { if thread {
TweetArchiveMode::Tweet => {
args.push("--no-recursive".to_string());
}
TweetArchiveMode::Thread => {
args.push("--recursive-replied-to-tweets".to_string()); args.push("--recursive-replied-to-tweets".to_string());
args.push("--recursive-replied-to-tweets-quotes-retweets".to_string()); args.push("--recursive-replied-to-tweets-quotes-retweets".to_string());
args.push("--download-replied-to-tweets-media".to_string()); args.push("--download-replied-to-tweets-media".to_string());
} } else {
args.push("--no-recursive".to_string());
} }
args args
} }
pub fn archive( pub fn archive(path: &str, thread: bool, store_path: &Path, timestamp: &str) -> Result<bool> {
request: &TweetArchiveRequest,
store_path: &Path,
timestamp: &str,
) -> Result<TweetArchiveResult> {
let invocation_cwd = env::current_dir().context("Failed to read current working directory")?; let invocation_cwd = env::current_dir().context("Failed to read current working directory")?;
let output_dir = store_path.join("raw_tweets"); let output_dir = store_path.join("raw_tweets");
let temp_dir = store_path.join("temp").join(timestamp).join("tweets"); let temp_dir = store_path.join("temp").join(timestamp).join("tweets");
let tweet_id = tweet_id_from_path(path).context("Invalid tweet ID")?;
fs::create_dir_all(&output_dir)?; fs::create_dir_all(&output_dir)?;
fs::create_dir_all(&temp_dir)?; fs::create_dir_all(&temp_dir)?;
let root_toml = output_dir.join(format!("tweet-{}.toml", request.tweet_id)); let root_toml = output_dir.join(format!("tweet-{tweet_id}.toml"));
if request.mode == TweetArchiveMode::Tweet && root_toml.exists() { if !thread && root_toml.exists() {
return Ok(TweetArchiveResult::Skipped(output_dir)); return Ok(false);
} }
let before = tweet_toml_files(&output_dir)?; let before = tweet_toml_files(&output_dir)?;
@ -113,7 +103,7 @@ pub fn archive(
let mut cmd = Command::new(&python); let mut cmd = Command::new(&python);
cmd.current_dir(&temp_dir).arg(&scraper_path); cmd.current_dir(&temp_dir).arg(&scraper_path);
for arg in build_scraper_args(request, &output_dir, &temp_dir, &credentials_file) { for arg in build_scraper_args(&tweet_id, thread, &output_dir, &temp_dir, &credentials_file) {
cmd.arg(arg); cmd.arg(arg);
} }
@ -151,7 +141,7 @@ pub fn archive(
rewrite_tweet_outputs(&new_tomls, &output_dir, &temp_dir, store_path)?; rewrite_tweet_outputs(&new_tomls, &output_dir, &temp_dir, store_path)?;
let _ = fs::remove_dir_all(store_path.join("temp").join(timestamp)); let _ = fs::remove_dir_all(store_path.join("temp").join(timestamp));
Ok(TweetArchiveResult::Archived(output_dir)) Ok(true)
} }
fn cleanup_summary(output_dir: &Path) -> Result<()> { fn cleanup_summary(output_dir: &Path) -> Result<()> {
@ -164,9 +154,11 @@ fn cleanup_summary(output_dir: &Path) -> Result<()> {
fn tweet_toml_files(output_dir: &Path) -> Result<HashSet<PathBuf>> { fn tweet_toml_files(output_dir: &Path) -> Result<HashSet<PathBuf>> {
let mut files = HashSet::new(); let mut files = HashSet::new();
for entry in fs::read_dir(output_dir)? { for entry in fs::read_dir(output_dir)? {
let entry = entry?; let entry = entry?;
let path = entry.path(); let path = entry.path();
if path.is_file() if path.is_file()
&& path && path
.file_name() .file_name()
@ -176,6 +168,7 @@ fn tweet_toml_files(output_dir: &Path) -> Result<HashSet<PathBuf>> {
files.insert(path); files.insert(path);
} }
} }
Ok(files) Ok(files)
} }
@ -212,6 +205,7 @@ fn rewrite_tweet_outputs(
store_path, store_path,
&mut archived_assets, &mut archived_assets,
)?; )?;
if rewritten != contents { if rewritten != contents {
fs::write(path, rewritten)?; fs::write(path, rewritten)?;
} }
@ -277,10 +271,7 @@ fn archive_asset_reference(
} }
let relative_path = local::archive_staged_file(&absolute_path, store_path)?; let relative_path = local::archive_staged_file(&absolute_path, store_path)?;
let relative_path = relative_path let relative_path = relative_path.to_string_lossy().replace('\\', "/");
.relative_path()
.to_string_lossy()
.replace('\\', "/");
archived_assets.insert(key, relative_path.clone()); archived_assets.insert(key, relative_path.clone());
Ok(relative_path) Ok(relative_path)
@ -290,7 +281,6 @@ fn archive_asset_reference(
mod tests { mod tests {
use super::*; use super::*;
use std::{ use std::{
env, fs,
sync::MutexGuard, sync::MutexGuard,
time::{SystemTime, UNIX_EPOCH}, time::{SystemTime, UNIX_EPOCH},
}; };
@ -323,10 +313,8 @@ mod tests {
#[test] #[test]
fn test_build_scraper_args_for_single_tweet() { fn test_build_scraper_args_for_single_tweet() {
let args = build_scraper_args( let args = build_scraper_args(
&TweetArchiveRequest { "1234567890",
tweet_id: "1234567890".to_string(), false,
mode: TweetArchiveMode::Tweet,
},
Path::new("/tmp/raw_tweets"), Path::new("/tmp/raw_tweets"),
Path::new("/tmp/temp/tweets"), Path::new("/tmp/temp/tweets"),
Path::new("/tmp/twitter-creds.txt"), Path::new("/tmp/twitter-creds.txt"),
@ -338,7 +326,6 @@ mod tests {
assert!(args.contains(&"--download-media".to_string())); assert!(args.contains(&"--download-media".to_string()));
assert!(args.contains(&"--credentials-file".to_string())); assert!(args.contains(&"--credentials-file".to_string()));
assert!(args.contains(&"--no-recursive".to_string())); assert!(args.contains(&"--no-recursive".to_string()));
assert!(!args.contains(&"--no-download-avatars".to_string()));
assert!(!args.contains(&"--recursive-replied-to-tweets".to_string())); assert!(!args.contains(&"--recursive-replied-to-tweets".to_string()));
assert!(!args.contains(&"--recursive-replied-to-tweets-quotes-retweets".to_string())); assert!(!args.contains(&"--recursive-replied-to-tweets-quotes-retweets".to_string()));
assert!(!args.contains(&"--download-replied-to-tweets-media".to_string())); assert!(!args.contains(&"--download-replied-to-tweets-media".to_string()));
@ -347,10 +334,8 @@ mod tests {
#[test] #[test]
fn test_build_scraper_args_for_thread() { fn test_build_scraper_args_for_thread() {
let args = build_scraper_args( let args = build_scraper_args(
&TweetArchiveRequest { "1234567890",
tweet_id: "1234567890".to_string(), true,
mode: TweetArchiveMode::Thread,
},
Path::new("/tmp/raw_tweets"), Path::new("/tmp/raw_tweets"),
Path::new("/tmp/temp/tweets"), Path::new("/tmp/temp/tweets"),
Path::new("/tmp/twitter-creds.txt"), Path::new("/tmp/twitter-creds.txt"),
@ -459,17 +444,9 @@ avatar_local_path = "../temp/ts/tweets/media/avatars/avatar.jpg"
fs::write(&credentials, "ct0=test;auth_token=test").unwrap(); fs::write(&credentials, "ct0=test;auth_token=test").unwrap();
set_test_env("ARCHIVR_TWITTER_CREDENTIALS_FILE", &credentials); set_test_env("ARCHIVR_TWITTER_CREDENTIALS_FILE", &credentials);
let result = archive( let archived = archive("tweet:123", false, &store_path, "ts").unwrap();
&TweetArchiveRequest {
tweet_id: "123".to_string(),
mode: TweetArchiveMode::Tweet,
},
&store_path,
"ts",
)
.unwrap();
assert_eq!(result, TweetArchiveResult::Skipped(output_dir)); assert!(!archived);
remove_test_env("ARCHIVR_TWITTER_CREDENTIALS_FILE"); remove_test_env("ARCHIVR_TWITTER_CREDENTIALS_FILE");
let _ = fs::remove_dir_all(store_path); let _ = fs::remove_dir_all(store_path);
@ -532,7 +509,7 @@ EOF
"#, "#,
) )
.unwrap(); .unwrap();
std::process::Command::new("chmod") Command::new("chmod")
.arg("+x") .arg("+x")
.arg(&script) .arg(&script)
.status() .status()
@ -542,20 +519,11 @@ EOF
set_test_env("ARCHIVR_TWEET_SCRAPER", &script); set_test_env("ARCHIVR_TWEET_SCRAPER", &script);
set_test_env("ARCHIVR_TWEET_PYTHON", "/bin/sh"); set_test_env("ARCHIVR_TWEET_PYTHON", "/bin/sh");
let result = archive( let archived = archive("tweet:123", false, &store_path, "ts").unwrap();
&TweetArchiveRequest {
tweet_id: "123".to_string(),
mode: TweetArchiveMode::Tweet,
},
&store_path,
"ts",
)
.unwrap();
let tweet_file = output_dir.join("tweet-123.toml"); let tweet_file = output_dir.join("tweet-123.toml");
let contents = fs::read_to_string(&tweet_file).unwrap(); let contents = fs::read_to_string(&tweet_file).unwrap();
assert_eq!(result, TweetArchiveResult::Archived(output_dir.clone())); assert!(archived);
assert!(tweet_file.exists()); assert!(tweet_file.exists());
assert!(!output_dir.join("scraping_summary.toml").exists()); assert!(!output_dir.join("scraping_summary.toml").exists());
assert!(contents.contains(r#"avatar_local_path = "raw/"#)); assert!(contents.contains(r#"avatar_local_path = "raw/"#));

View file

@ -1,11 +1,9 @@
use anyhow::{Context, Result, bail}; use anyhow::{Context, Result, bail};
use std::{ use std::{env, path::Path, process::Command};
env,
path::{Path, PathBuf},
process::Command,
};
pub fn download(path: String, store_path: &Path, timestamp: &str) -> Result<PathBuf> { use crate::hash::hash_file;
pub fn download(path: String, store_path: &Path, timestamp: &String) -> Result<String> {
println!("Downloading with yt-dlp: {path}"); println!("Downloading with yt-dlp: {path}");
let ytdlp = env::var("ARCHIVR_YT_DLP").unwrap_or_else(|_| "yt-dlp".to_string()); let ytdlp = env::var("ARCHIVR_YT_DLP").unwrap_or_else(|_| "yt-dlp".to_string());
@ -31,5 +29,5 @@ pub fn download(path: String, store_path: &Path, timestamp: &str) -> Result<Path
bail!("yt-dlp failed: {stderr}"); bail!("yt-dlp failed: {stderr}");
} }
Ok(out_file) hash_file(&out_file)
} }

View file

@ -1,4 +1,4 @@
use anyhow::{Result, bail}; use anyhow::Result;
use chrono::Local; use chrono::Local;
use clap::{Parser, Subcommand}; use clap::{Parser, Subcommand};
use std::{ use std::{
@ -66,14 +66,14 @@ fn get_archive_path() -> Option<PathBuf> {
None None
} }
#[derive(Debug, Clone, PartialEq, Eq)] #[derive(Debug, PartialEq, Eq, Clone, Copy)]
enum Source { enum Source {
Tweet(downloader::tweets::TweetArchiveRequest),
TweetMedia { tweet_id: String },
YouTubeVideo, YouTubeVideo,
YouTubePlaylist, YouTubePlaylist,
YouTubeChannel, YouTubeChannel,
X, X,
Tweet,
TweetThread,
Instagram, Instagram,
Facebook, Facebook,
TikTok, TikTok,
@ -91,8 +91,19 @@ fn parse_tweet_id(id: &str) -> Option<String> {
} }
} }
fn tweet_media_path(tweet_id: &str) -> String { fn tweet_id_from_path(path: &str) -> Option<String> {
format!("https://x.com/i/status/{tweet_id}") path.split(':').next_back().and_then(parse_tweet_id)
}
fn resolve_source_path(path: &str, source: &Source) -> String {
if *source == Source::X && path.starts_with("tweet:media:") {
format!(
"https://x.com/i/status/{}",
tweet_id_from_path(path).unwrap()
)
} else {
path.to_string()
}
} }
// INFO: yt-dlp supports a lot of sites; so, when archiving (for example) a website, the user // INFO: yt-dlp supports a lot of sites; so, when archiving (for example) a website, the user
@ -130,42 +141,43 @@ fn determine_source(path: &str) -> Source {
} }
} }
let parts: Vec<&str> = path.split(':').collect(); // Shorthand schemes: tweet:, x:, or twitter:
match parts.as_slice() { if let Some(after_scheme) = path.strip_prefix("tweet:") {
["tweet", id] => { if after_scheme.starts_with("media:")
if let Some(tweet_id) = parse_tweet_id(id) { && after_scheme
return Source::Tweet(downloader::tweets::TweetArchiveRequest { .strip_prefix("media:")
tweet_id, .and_then(parse_tweet_id)
mode: downloader::tweets::TweetArchiveMode::Tweet, .is_some()
}); {
} return Source::X;
} }
["tweet", "media", id] => {
if let Some(tweet_id) = parse_tweet_id(id) { if parse_tweet_id(after_scheme).is_some() {
return Source::TweetMedia { tweet_id }; return Source::Tweet;
} }
} }
["x", "tweet", id] | ["x", "x", id] | ["twitter", "x", id] | ["twitter", "tweet", id] => {
if let Some(tweet_id) = parse_tweet_id(id) { if let Some(after_scheme) = path
return Source::Tweet(downloader::tweets::TweetArchiveRequest { .strip_prefix("x:")
tweet_id, .or_else(|| path.strip_prefix("twitter:"))
mode: downloader::tweets::TweetArchiveMode::Tweet, {
}); if after_scheme
} .strip_prefix("thread:")
} .and_then(parse_tweet_id)
["x", "thread", id] | ["twitter", "thread", id] => { .is_some()
if let Some(tweet_id) = parse_tweet_id(id) { {
return Source::Tweet(downloader::tweets::TweetArchiveRequest { return Source::TweetThread;
tweet_id, }
mode: downloader::tweets::TweetArchiveMode::Thread,
}); if after_scheme
} .strip_prefix("tweet:")
} .or_else(|| after_scheme.strip_prefix("x:"))
_ => {} .and_then(parse_tweet_id)
.is_some()
{
return Source::Tweet;
} }
// Shorthand schemes: x: or twitter:
if path.starts_with("x:") || path.starts_with("twitter:") {
return Source::X; return Source::X;
} }
@ -260,6 +272,56 @@ fn determine_source(path: &str) -> Source {
Source::Other Source::Other
} }
/// Reports whether `filename` is already present in the raw store.
///
/// Files are looked up at `<store_path>/raw/<c1>/<c2>/<filename>`, where `c1` and `c2` are the
/// first two characters of `filename`. A name shorter than two characters cannot be placed in
/// that layout, so it is reported as absent rather than panicking.
fn hash_exists(filename: String, store_path: &Path) -> bool {
    let mut chars = filename.chars();
    let (first_letter, second_letter) = match (chars.next(), chars.next()) {
        (Some(first), Some(second)) => (first, second),
        _ => return false,
    };

    let path = store_path
        .join("raw")
        .join(first_letter.to_string())
        .join(second_letter.to_string())
        .join(&filename);

    println!("Checking {}", path.display());
    path.exists()
}
/// Moves a staged file from the temp area into the sharded raw store.
///
/// The destination is `<store_path>/raw/<h1>/<h2>/<hash><.ext>`, where `h1` and `h2` are the
/// first two characters of `hash` and `.ext` is `file`'s extension (omitted when it has none).
/// Missing shard directories are created first.
///
/// # Errors
///
/// Returns an error when `hash` has fewer than two characters, or when creating the shard
/// directory or renaming the file fails.
fn move_temp_to_raw(file: &Path, hash: &str, store_path: &Path) -> Result<()> {
    let mut chars = hash.chars();
    let (first_letter, second_letter) = match (chars.next(), chars.next()) {
        (Some(first), Some(second)) => (first.to_string(), second.to_string()),
        _ => {
            return Err(std::io::Error::new(
                std::io::ErrorKind::InvalidInput,
                format!("hash {hash:?} is too short to shard into the raw store"),
            )
            .into());
        }
    };

    // Build the shard directory once and reuse it for both the mkdir and the rename target.
    let shard_dir = store_path.join("raw").join(first_letter).join(second_letter);
    fs::create_dir_all(&shard_dir)?;

    let file_name = match file.extension() {
        Some(ext) => format!("{hash}.{}", ext.to_string_lossy()),
        None => hash.to_string(),
    };
    fs::rename(file, shard_dir.join(file_name))?;

    Ok(())
}
fn initialize_store_directories(store_path: &Path) -> Result<()> { fn initialize_store_directories(store_path: &Path) -> Result<()> {
fs::create_dir_all(store_path.join("raw"))?; fs::create_dir_all(store_path.join("raw"))?;
fs::create_dir_all(store_path.join("raw_tweets"))?; fs::create_dir_all(store_path.join("raw_tweets"))?;
@ -268,33 +330,6 @@ fn initialize_store_directories(store_path: &Path) -> Result<()> {
Ok(()) Ok(())
} }
/// Stages a non-tweet `source` into the temp area and moves the result into the raw store.
///
/// Media-capable sources are fetched through `downloader::ytdlp::download`, local paths
/// through `downloader::local::save`; the staged file is then passed to
/// `downloader::local::archive_staged_file`.
///
/// # Errors
///
/// Fails for YouTube playlists and channels (not implemented), and propagates any error from
/// the downloader or from archiving the staged file.
///
/// # Panics
///
/// Panics when called with `Source::Tweet` or `Source::Other`.
fn archive_non_tweet_source(
    source: &Source,
    path: &str,
    store_path: &Path,
    timestamp: &str,
) -> Result<downloader::local::RawArchiveResult> {
    let staged = match source {
        Source::Local => downloader::local::save(path.to_string(), store_path, timestamp)?,
        Source::TweetMedia { tweet_id } => {
            downloader::ytdlp::download(tweet_media_path(tweet_id), store_path, timestamp)?
        }
        Source::YouTubeVideo
        | Source::X
        | Source::Instagram
        | Source::Facebook
        | Source::TikTok
        | Source::Reddit
        | Source::Snapchat => downloader::ytdlp::download(path.to_string(), store_path, timestamp)?,
        Source::YouTubePlaylist | Source::YouTubeChannel => {
            bail!("Archiving from this source is not yet implemented.")
        }
        Source::Tweet(_) | Source::Other => unreachable!(),
    };

    downloader::local::archive_staged_file(&staged, store_path)
}
fn main() -> Result<()> { fn main() -> Result<()> {
let args = Args::parse(); let args = Args::parse();
@ -321,19 +356,32 @@ fn main() -> Result<()> {
}; };
let source = determine_source(path); let source = determine_source(path);
let resolved_path = resolve_source_path(path, &source);
match source { match source {
Source::Other => { Source::Other => {
eprintln!("Archiving from this source is not yet implemented."); eprintln!("Archiving from this source is not yet implemented.");
process::exit(1); process::exit(1);
} }
Source::Tweet(request) => { Source::Tweet | Source::TweetThread => {
match downloader::tweets::archive(&request, &store_path, &timestamp) { match downloader::tweets::archive(
Ok(downloader::tweets::TweetArchiveResult::Archived(output_dir)) => { path,
println!("Tweet archived successfully to {}", output_dir.display()); source == Source::TweetThread,
&store_path,
&timestamp,
) {
Ok(true) => {
println!(
"Tweet archived successfully to {}",
store_path.join("raw_tweets").display()
);
return Ok(()); return Ok(());
} }
Ok(downloader::tweets::TweetArchiveResult::Skipped(output_dir)) => { Ok(false) => {
println!("Tweet already archived in {}", output_dir.display()); println!(
"Tweet already archived in {}",
store_path.join("raw_tweets").display()
);
return Ok(()); return Ok(());
} }
Err(e) => { Err(e) => {
@ -342,29 +390,88 @@ fn main() -> Result<()> {
} }
} }
} }
source => { _ => {}
let result =
match archive_non_tweet_source(&source, path, &store_path, &timestamp) {
Ok(result) => result,
Err(e) => {
match source {
Source::Local => eprintln!("Failed to archive local file: {e}"),
_ => eprintln!("Failed to archive source: {e}"),
} }
// Other sources
let hash = match source {
Source::YouTubeVideo
| Source::X
| Source::Instagram
| Source::Facebook
| Source::TikTok
| Source::Reddit
| Source::Snapchat => {
match downloader::ytdlp::download(
resolved_path.clone(),
&store_path,
&timestamp,
) {
Ok(h) => h,
Err(e) => {
eprintln!("Failed to download from YouTube: {e}");
process::exit(1); process::exit(1);
} }
}
}
Source::Local => {
match downloader::local::save(resolved_path.clone(), &store_path, &timestamp) {
Ok(h) => h,
Err(e) => {
eprintln!("Failed to archive local file: {e}");
process::exit(1);
}
}
}
_ => unreachable!(),
}; };
let _ = fs::remove_dir_all(store_path.join("temp").join(&timestamp)); let file_extension = match source {
match result { Source::YouTubeVideo
downloader::local::RawArchiveResult::Archived(_) => { | Source::X
println!("File archived successfully."); | Source::Instagram
| Source::Facebook
| Source::TikTok
| Source::Reddit
| Source::Snapchat => ".mp4",
Source::Local => {
let p = Path::new(resolved_path.trim_start_matches("file://"));
&p.extension()
.map_or(String::new(), |ext| format!(".{}", ext.to_string_lossy()))
} }
downloader::local::RawArchiveResult::AlreadyArchived(_) => { _ => "",
};
let hash_exists = hash_exists(format!("{hash}{file_extension}"), &store_path);
// TODO: check for repeated archives?
// There could be one of the following:
// - We are literally archiving the same path over again.
// - We are archiving a different path, which had this file. E.g.: we archived a
// website before which had this YouTube video, and while recursively archiving
// everything, we also archived the YouTube video although it wasn't our main
// target. This means that we should archive again; whereas with the first case...
// Not sure. Need to think about this.
// ----
// Follow-up, a day later:
// Even when we are specifically archiving a YouTube video, either of the two cases
// above can apply. So the plan is: always create a new DB entry and symlink the
// existing Raw file into the Structured dir (exact layout still to be decided).
if hash_exists {
println!("File already archived."); println!("File already archived.");
} let _ = fs::remove_dir_all(store_path.join("temp").join(&timestamp));
} } else {
} move_temp_to_raw(
&store_path
.join("temp")
.join(&timestamp)
.join(format!("{timestamp}{file_extension}")),
&hash,
&store_path,
)?;
let _ = fs::remove_dir_all(store_path.join("temp").join(&timestamp));
println!("File archived successfully.");
} }
// TODO: DB INSERT, inserting a record // TODO: DB INSERT, inserting a record
@ -431,6 +538,7 @@ fn main() -> Result<()> {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;
use std::fs;
struct TestCase<'a> { struct TestCase<'a> {
url: &'a str, url: &'a str,
@ -438,62 +546,39 @@ mod tests {
} }
#[test] #[test]
fn test_tweet_and_thread_sources() { fn test_tweet_sources() {
let cases = [ let cases = [
TestCase { TestCase {
url: "tweet:1234567890", url: "tweet:1234567890",
expected: Source::Tweet(downloader::tweets::TweetArchiveRequest { expected: Source::Tweet,
tweet_id: "1234567890".to_string(),
mode: downloader::tweets::TweetArchiveMode::Tweet,
}),
}, },
TestCase { TestCase {
url: "x:tweet:1234567890", url: "x:tweet:1234567890",
expected: Source::Tweet(downloader::tweets::TweetArchiveRequest { expected: Source::Tweet,
tweet_id: "1234567890".to_string(),
mode: downloader::tweets::TweetArchiveMode::Tweet,
}),
}, },
TestCase { TestCase {
url: "x:x:1234567890", url: "x:x:1234567890",
expected: Source::Tweet(downloader::tweets::TweetArchiveRequest { expected: Source::Tweet,
tweet_id: "1234567890".to_string(),
mode: downloader::tweets::TweetArchiveMode::Tweet,
}),
}, },
TestCase { TestCase {
url: "twitter:x:1234567890", url: "twitter:x:1234567890",
expected: Source::Tweet(downloader::tweets::TweetArchiveRequest { expected: Source::Tweet,
tweet_id: "1234567890".to_string(),
mode: downloader::tweets::TweetArchiveMode::Tweet,
}),
}, },
TestCase { TestCase {
url: "twitter:tweet:1234567890", url: "twitter:tweet:1234567890",
expected: Source::Tweet(downloader::tweets::TweetArchiveRequest { expected: Source::Tweet,
tweet_id: "1234567890".to_string(),
mode: downloader::tweets::TweetArchiveMode::Tweet,
}),
}, },
TestCase { TestCase {
url: "tweet:media:1234567890", url: "tweet:media:1234567890",
expected: Source::TweetMedia { expected: Source::X,
tweet_id: "1234567890".to_string(),
},
}, },
TestCase { TestCase {
url: "x:thread:1234567890", url: "x:thread:1234567890",
expected: Source::Tweet(downloader::tweets::TweetArchiveRequest { expected: Source::TweetThread,
tweet_id: "1234567890".to_string(),
mode: downloader::tweets::TweetArchiveMode::Thread,
}),
}, },
TestCase { TestCase {
url: "twitter:thread:1234567890", url: "twitter:thread:1234567890",
expected: Source::Tweet(downloader::tweets::TweetArchiveRequest { expected: Source::TweetThread,
tweet_id: "1234567890".to_string(),
mode: downloader::tweets::TweetArchiveMode::Thread,
}),
}, },
TestCase { TestCase {
url: "tweet:thread:1234567890", url: "tweet:thread:1234567890",
@ -519,6 +604,35 @@ mod tests {
} }
} }
#[test]
fn test_tweet_id_from_path() {
    // Every shorthand form should yield the trailing numeric ID.
    for path in [
        "tweet:1234567890",
        "tweet:media:1234567890",
        "x:thread:1234567890",
    ] {
        assert_eq!(tweet_id_from_path(path), Some("1234567890".to_string()));
    }

    // A non-numeric trailing segment is rejected.
    assert_eq!(tweet_id_from_path("tweet:not-a-number"), None);
}
#[test]
fn test_resolve_source_path() {
    // The media shorthand expands to the status URL.
    let media = resolve_source_path("tweet:media:1234567890", &Source::X);
    assert_eq!(media, "https://x.com/i/status/1234567890");

    // Any other input passes through untouched.
    let plain = resolve_source_path("tweet:1234567890", &Source::Tweet);
    assert_eq!(plain, "tweet:1234567890");
}
#[test] #[test]
fn test_youtube_sources() { fn test_youtube_sources() {
// --- YouTube Video URLs --- // --- YouTube Video URLs ---