diff --git a/src/hash.rs b/src/hash.rs index 4bb1da9..cbf1194 100644 --- a/src/hash.rs +++ b/src/hash.rs @@ -1,6 +1,6 @@ +use anyhow::Result; use sha3::{Digest, Sha3_256}; use std::{fs::File, io::Read, path::Path}; -use anyhow::Result; pub fn hash_file(path: &Path) -> Result<String> { let mut file = File::open(path)?; diff --git a/src/main.rs b/src/main.rs index 4deb3b8..f2e6de1 100644 --- a/src/main.rs +++ b/src/main.rs @@ -32,6 +32,8 @@ enum Command { /// Store path - path to store the archived files in. /// Structure will be: /// store_path/ + /// temp/ + /// ... /// raw/ /// ... /// structured/ @@ -42,6 +44,10 @@ enum Command { /// Name of the archive #[arg(short, long)] name: String, + + /// Wipe existing .archivr repository data + #[arg(long = "force-with-info-removal")] + force_with_info_removal: bool, }, } @@ -58,15 +64,80 @@ fn get_archive_path() -> Option<PathBuf> { None } -#[derive(Debug)] +#[derive(Debug, PartialEq)] enum Source { - YouTube, + YouTubeVideo, + YouTubePlaylist, + YouTubeChannel, + X, + Local, Other, } +// INFO: yt-dlp supports a lot of sites; so, when archiving (for example) a website, the user +// -> should be asked whether they want to archive the whole website or just the video(s) on it. fn determine_source(path: &str) -> Source { - if path.starts_with("http://") || path.starts_with("https://") { - return Source::YouTube; + // INFO: Extractors' URLs can be found here: + // -> https://github.com/yt-dlp/yt-dlp/tree/dfc0a84c192a7357dd1768cc345d590253a14fe5/yt_dlp/extractor + // TEST: X posts can have multiple videos. + + // Shorthand schemes: yt: or youtube: + if path.starts_with("yt:") || path.starts_with("youtube:") { + let after_scheme = if path.starts_with("yt:") { + &path[3..] + } else { + &path[8..] 
+ }; + + // video/ID, short/ID, shorts/ID + if after_scheme.starts_with("video/") + || after_scheme.starts_with("short/") + || after_scheme.starts_with("shorts/") + { + return Source::YouTubeVideo; + } + + // playlist/ID + if after_scheme.starts_with("playlist/") { + return Source::YouTubePlaylist; + } + + // channel/ID, c/ID, user/ID, @handle + if after_scheme.starts_with("channel/") + || after_scheme.starts_with("c/") + || after_scheme.starts_with("user/") + || after_scheme.starts_with("@") + { + return Source::YouTubeChannel; + } + } + + if path.starts_with("file://") { + return Source::Local; + } else if path.starts_with("http://") || path.starts_with("https://") { + // Video URLs (watch, youtu.be, shorts) + let video_re = regex::Regex::new(r"^https?://(?:www\.)?(?:youtu\.be/[0-9A-Za-z_-]+|youtube\.com/watch\?v=[0-9A-Za-z_-]+|youtube\.com/shorts/[0-9A-Za-z_-]+)").unwrap(); + if video_re.is_match(path) { + return Source::YouTubeVideo; + } + + // Playlist URLs + let playlist_re = + regex::Regex::new(r"^https?://(?:www\.)?youtube\.com/playlist\?list=[0-9A-Za-z_-]+") + .unwrap(); + if playlist_re.is_match(path) { + return Source::YouTubePlaylist; + } + + // Channel or user URLs (channel IDs, /c/, /user/, or @handles) + let channel_re = regex::Regex::new(r"^https?://(?:www\.)?youtube\.com/(?:channel/[0-9A-Za-z_-]+|c/[0-9A-Za-z_-]+|user/[0-9A-Za-z_-]+|@[0-9A-Za-z_-]+)").unwrap(); + if channel_re.is_match(path) { + return Source::YouTubeChannel; + } + + if path.starts_with("https://x.com/") { + return Source::X; + } } Source::Other } @@ -136,54 +207,92 @@ fn main() -> Result<()> { let timestamp = Local::now().format("%Y-%m-%dT%H-%M-%S%.3f").to_string(); let source = determine_source(path); - if let Source::YouTube = source { - let store_path_string_file = archive_path.unwrap().join("store_path"); - let store_path = match fs::read_to_string(store_path_string_file) { - Ok(p) => PathBuf::from(p.trim()), - Err(e) => { - eprintln!("Failed to read store path: {e}"); - 
process::exit(1); - } - }; + if let Source::Other = source { + eprintln!("Archiving from this source is not yet implemented."); + process::exit(1); + } - let hash = - match downloader::youtube::download(path.clone(), &store_path, &timestamp) { + let store_path_string_file = archive_path.unwrap().join("store_path"); + let store_path = match fs::read_to_string(store_path_string_file) { + Ok(p) => PathBuf::from(p.trim()), + Err(e) => { + eprintln!("Failed to read store path: {e}"); + process::exit(1); + } + }; + + let hash = match source { + Source::YouTubeVideo | Source::X => { + match downloader::ytdlp::download(path.clone(), &store_path, &timestamp) { Ok(h) => h, Err(e) => { eprintln!("Failed to download from YouTube: {e}"); process::exit(1); } - }; - - let hash_exists = hash_exists(format!("{hash}.mp4"), &store_path); - // TODO: check for repeated archives? - // There could be one of the following: - // - We are literally archiving the same path over again. - // - We are archiving a different path, which had this file. E.g.: we archived a - // website before which had this YouTube video, and while recursively archiving - // everything, we also archived the YouTube video although it wasn't our main - // target. This means that we should archive again; whereas with the first case... - // Not sure. Need to think about this. - // ---- - // Thinking about it a day later... - // If we are specifically archiving a YouTube video, it could also be two of the - // above. So yeah, just create a new DB entry and symlink the Raw to the Structured - // Dir or whatever. it's midnight and my brain ain't wording/braining. 
- if hash_exists { - println!("File already archived."); - process::exit(0); - } else { - move_temp_to_raw( - &store_path.join("temp").join(format!("{timestamp}.mp4")), - &hash, - &store_path, - )?; - - println!("File archived successfully."); + } } + Source::Local => { + match downloader::local::save(path.clone(), &store_path, &timestamp) { + Ok(h) => h, + Err(e) => { + eprintln!("Failed to archive local file: {e}"); + process::exit(1); + } + } + } + _ => unreachable!(), + }; + + let file_extension = match source { + Source::YouTubeVideo | Source::X => ".mp4", + Source::Local => { + let p = Path::new(path.trim_start_matches("file://")); + &p.extension() + .map_or(String::new(), |ext| format!(".{}", ext.to_string_lossy())) + } + _ => "", + }; + + let hash_exists = hash_exists(format!("{hash}{file_extension}"), &store_path); + + // TODO: check for repeated archives? + // There could be one of the following: + // - We are literally archiving the same path over again. + // - We are archiving a different path, which had this file. E.g.: we archived a + // website before which had this YouTube video, and while recursively archiving + // everything, we also archived the YouTube video although it wasn't our main + // target. This means that we should archive again; whereas with the first case... + // Not sure. Need to think about this. + // ---- + // Thinking about it a day later... + // If we are specifically archiving a YouTube video, it could also be two of the + // above. So yeah, just create a new DB entry and symlink the Raw to the Structured + // Dir or whatever. it's midnight and my brain ain't wording/braining. 
+ if hash_exists { + println!("File already archived."); + let _ = fs::remove_file( + store_path + .join("temp") + .join(&timestamp) + .join(format!("{timestamp}{file_extension}")), + ); + process::exit(0); + } else { + move_temp_to_raw( + &store_path + .join("temp") + .join(&timestamp) + .join(format!("{timestamp}{file_extension}")), + &hash, + &store_path, + )?; + + println!("File archived successfully."); } // TODO: DB INSERT, inserting a record + // https://github.com/rusqlite/rusqlite + // Think of the DB schema Ok(()) } @@ -192,6 +301,7 @@ fn main() -> Result<()> { path: ref archive_path_string, store_path: ref store_path_string, name: ref archive_name, + force_with_info_removal, } => { let archive_path = Path::new(&archive_path_string).join(".archivr"); let store_path = if Path::new(&store_path_string).is_relative() { @@ -201,16 +311,26 @@ }; if archive_path.exists() { - // TODO: check if there is nothing inside. if there is nothing inside, use it - eprintln!("Archive already exists at {}", archive_path.display()); - if store_path.exists() { - eprintln!("Store path already exists at {}", store_path.display()); + if !archive_path.is_dir() { + eprintln!( + "Archive path exists and is not a directory: {}", + archive_path.display() + ); + process::exit(1); + } + + if force_with_info_removal { + fs::remove_dir_all(&archive_path)?; + } else if fs::read_dir(&archive_path)?.next().is_some() { + eprintln!( + "Archive already exists at {} and is not empty. Use --force-with-info-removal to reinitialize.", + archive_path.display() + ); process::exit(1); } - process::exit(1); } - if store_path.exists() { - // TODO: check if the structure is correct. If so, use it. 
+ + if store_path.exists() && !force_with_info_removal { eprintln!("Store path already exists at {}", store_path.display()); process::exit(1); } @@ -232,3 +352,176 @@ fn main() -> Result<()> { } // _ => eprintln!("Unknown command: {:?}", args.command), } } + +#[cfg(test)] +mod tests { + use super::*; + + struct TestCase<'a> { + url: &'a str, + expected: Source, + } + + #[test] + fn test_youtube_sources() { + // --- YouTube Video URLs --- + let video_cases = [ + TestCase { + url: "https://www.youtube.com/watch?v=UHxw-L2WyyY", + expected: Source::YouTubeVideo, + }, + TestCase { + url: "https://youtu.be/UHxw-L2WyyY", + expected: Source::YouTubeVideo, + }, + TestCase { + url: "https://www.youtube.com/shorts/EtC99eWiwRI", + expected: Source::YouTubeVideo, + }, + ]; + + for case in &video_cases { + assert_eq!( + determine_source(case.url), + case.expected, + "Failed for URL: {}", + case.url + ); + } + + // --- YouTube Playlist URLs --- + let playlist_cases = [TestCase { + url: "https://www.youtube.com/playlist?list=PL9vTTBa7QaQOoMfpP3ztvgyQkPWDPfJez", + expected: Source::YouTubePlaylist, + }]; + + for case in &playlist_cases { + assert_eq!( + determine_source(case.url), + case.expected, + "Failed for URL: {}", + case.url + ); + } + + // --- YouTube Channel URLs --- + let channel_cases = [ + TestCase { + url: "https://www.youtube.com/channel/CoreDumpped", + expected: Source::YouTubeChannel, + }, + TestCase { + url: "https://www.youtube.com/@CoreDumpped", + expected: Source::YouTubeChannel, + }, + TestCase { + url: "https://www.youtube.com/c/YouTubeCreators", + expected: Source::YouTubeChannel, + }, + TestCase { + url: "https://www.youtube.com/user/pewdiepie", + expected: Source::YouTubeChannel, + }, + TestCase { + url: "https://youtube.com/@pewdiepie?si=KOcLN_KPYNpe5f_8", + expected: Source::YouTubeChannel, + }, + ]; + + for case in &channel_cases { + assert_eq!( + determine_source(case.url), + case.expected, + "Failed for URL: {}", + case.url + ); + } + + // --- 
Shorthand scheme URLs --- + let shorthand_cases = [ + // Videos + TestCase { + url: "yt:video/UHxw-L2WyyY", + expected: Source::YouTubeVideo, + }, + TestCase { + url: "youtube:video/UHxw-L2WyyY", + expected: Source::YouTubeVideo, + }, + TestCase { + url: "yt:short/EtC99eWiwRI", + expected: Source::YouTubeVideo, + }, + TestCase { + url: "yt:shorts/EtC99eWiwRI", + expected: Source::YouTubeVideo, + }, + TestCase { + url: "youtube:shorts/EtC99eWiwRI", + expected: Source::YouTubeVideo, + }, + // Playlists + TestCase { + url: "yt:playlist/PL9vTTBa7QaQOoMfpP3ztvgyQkPWDPfJez", + expected: Source::YouTubePlaylist, + }, + TestCase { + url: "youtube:playlist/PL9vTTBa7QaQOoMfpP3ztvgyQkPWDPfJez", + expected: Source::YouTubePlaylist, + }, + // Channels + TestCase { + url: "yt:channel/UCxyz123", + expected: Source::YouTubeChannel, + }, + TestCase { + url: "yt:c/YouTubeCreators", + expected: Source::YouTubeChannel, + }, + TestCase { + url: "yt:user/pewdiepie", + expected: Source::YouTubeChannel, + }, + TestCase { + url: "youtube:@CoreDumpped", + expected: Source::YouTubeChannel, + }, + ]; + + for case in &shorthand_cases { + assert_eq!( + determine_source(case.url), + case.expected, + "Failed for URL: {}", + case.url + ); + } + } + + #[test] + fn test_non_youtube_sources() { + let other_cases = [ + TestCase { + url: "file:///local/path/file.mp4", + expected: Source::Local, + }, + TestCase { + url: "https://x.com/some_post", + expected: Source::X, + }, + TestCase { + url: "https://example.com/", + expected: Source::Other, + }, + ]; + + for case in &other_cases { + assert_eq!( + determine_source(case.url), + case.expected, + "Failed for URL: {}", + case.url + ); + } + } +}