diff --git a/.gitignore b/.gitignore index 5bf848c..c8ea956 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,9 @@ * !.gitignore -!*.md + +!docs +!docs/** !src !src/** diff --git a/Cargo.lock b/Cargo.lock index 8678d20..155a9fc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,15 @@ # It is not intended for manual editing. version = 4 +[[package]] +name = "aho-corasick" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301" +dependencies = [ + "memchr", +] + [[package]] name = "android_system_properties" version = "0.1.5" @@ -75,6 +84,7 @@ dependencies = [ "chrono", "clap", "hex", + "regex", "sha3", "uuid", ] @@ -311,6 +321,12 @@ version = "0.4.28" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34080505efa8e45a4b816c349525ebe327ceaa8559756f0356cba97ef3bf7432" +[[package]] +name = "memchr" +version = "2.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273" + [[package]] name = "num-traits" version = "0.2.19" @@ -356,6 +372,35 @@ version = "5.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" +[[package]] +name = "regex" +version = "1.12.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "843bc0191f75f3e22651ae5f1e72939ab2f72a4bc30fa80a066bd66edefc24d4" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5276caf25ac86c8d810222b3dbb938e512c55c6831a10f3e6ed1c93b84041f1c" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.8" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58" + [[package]] name = "rustversion" version = "1.0.22" diff --git a/Cargo.toml b/Cargo.toml index de61350..f40ba88 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,5 +8,6 @@ anyhow = "1.0.100" chrono = "0.4.42" clap = { version = "4.5.48", features = ["derive"] } hex = "0.4.3" +regex = "1.12.2" sha3 = "0.10.8" uuid = { version = "1.18.1", features = ["v4"] } diff --git a/LICENSE.md b/docs/LICENSE.md similarity index 100% rename from LICENSE.md rename to docs/LICENSE.md diff --git a/README.md b/docs/README.md similarity index 83% rename from README.md rename to docs/README.md index f59bfab..12c6af6 100644 --- a/README.md +++ b/docs/README.md @@ -1,29 +1,26 @@ # archivr -An open-source self-hosted archiving solution. Work in progress. +An open-source self-hosted archiving tool. Work in progress. ## Milestones - [ ] Archiving - [ ] Archiving media files from social media platforms - - [ ] YouTube - - [ ] Twitter + - [X] YouTube Videos + - [X] Twitter Videos - [ ] Instagram - [ ] Facebook - [ ] TikTok - [ ] Reddit - [ ] Snapchat + - [ ] YouTube Posts (?) - (Some of these could be postponed for later.) - - [ ] Archiving local files - - [ ] Archive videos (MP4, WebM) - - [ ] Archive audio files (MP3, WAV) - - [ ] Archive documents (DOCX, XLSX, PPTX) - - [ ] Archive PDFs - - [ ] Archive images (JPEG, PNG, GIF) + - [X] Archiving local files - [ ] Archiving files from cloud storage services (Google Drive, Dropbox, OneDrive) and from URLs - [ ] URLs - [ ] Google Drive - [ ] Dropbox - [ ] OneDrive + - [ ] Archiving Twitter threads - [ ] Archive web pages (HTML, CSS, JS, images) - [ ] Archiving emails (???) 
   - [ ] Gmail
diff --git a/flake.lock b/flake.lock
index fd86e53..d406848 100644
--- a/flake.lock
+++ b/flake.lock
@@ -2,11 +2,11 @@
   "nodes": {
     "nixpkgs": {
       "locked": {
-        "lastModified": 1760284886,
-        "narHash": "sha256-TK9Kr0BYBQ/1P5kAsnNQhmWWKgmZXwUQr4ZMjCzWf2c=",
+        "lastModified": 1761672384,
+        "narHash": "sha256-o9KF3DJL7g7iYMZq9SWgfS1BFlNbsm6xplRjVlOCkXI=",
         "owner": "nixos",
         "repo": "nixpkgs",
-        "rev": "cf3f5c4def3c7b5f1fc012b3d839575dbe552d43",
+        "rev": "08dacfca559e1d7da38f3cf05f1f45ee9bfd213c",
         "type": "github"
       },
       "original": {
diff --git a/src/downloader/local.rs b/src/downloader/local.rs
new file mode 100644
index 0000000..f946a2e
--- /dev/null
+++ b/src/downloader/local.rs
@@ -0,0 +1,28 @@
+use anyhow::{Context, Result, bail};
+use std::{path::Path, process::Command};
+
+use crate::hash::hash_file;
+
+pub fn save(path: String, store_path: &Path, timestamp: &String) -> Result<String> {
+    println!("Saving path: {path}");
+
+    let temp_dir = store_path.join("temp").join(timestamp);
+    std::fs::create_dir_all(&temp_dir)?;
+
+    let in_file = Path::new(path.trim_start_matches("file://"));
+    let extension = in_file
+        .extension()
+        .map_or(String::new(), |ext| format!(".{}", ext.to_string_lossy()));
+    let out_file = temp_dir.join(format!("{timestamp}{extension}"));
+
+    let mut binding = Command::new("cp");
+    let cmd = binding.arg(in_file).arg(&out_file);
+    let out = cmd.output().with_context(|| "failed to spawn cp process")?;
+
+    if !out.status.success() {
+        let stderr = String::from_utf8_lossy(&out.stderr);
+        bail!("cp failed: {stderr}");
+    }
+
+    hash_file(&out_file)
+}
diff --git a/src/downloader/mod.rs b/src/downloader/mod.rs
index 5e12721..e896201 100644
--- a/src/downloader/mod.rs
+++ b/src/downloader/mod.rs
@@ -1 +1,2 @@
-pub mod youtube;
+pub mod local;
+pub mod ytdlp;
diff --git a/src/downloader/youtube.rs b/src/downloader/ytdlp.rs
similarity index 88%
rename from src/downloader/youtube.rs
rename to src/downloader/ytdlp.rs
index 3af75fa..6ecd7b8 100644
---
a/src/downloader/youtube.rs +++ b/src/downloader/ytdlp.rs @@ -4,12 +4,13 @@ use std::{env, path::Path, process::Command}; use crate::hash::hash_file; pub fn download(path: String, store_path: &Path, timestamp: &String) -> Result { - println!("Downloading from YouTube: {path}"); + println!("Downloading with yt-dlp: {path}"); let ytdlp = env::var("ARCHIVR_YT_DLP").unwrap_or_else(|_| "yt-dlp".to_string()); - let temp_dir = store_path.join("temp"); + let temp_dir = store_path.join("temp").join(timestamp); std::fs::create_dir_all(&temp_dir)?; + let out_file = temp_dir.join(format!("{timestamp}.mp4")); let out = Command::new(&ytdlp) diff --git a/src/hash.rs b/src/hash.rs index 4bb1da9..cbf1194 100644 --- a/src/hash.rs +++ b/src/hash.rs @@ -1,6 +1,6 @@ +use anyhow::Result; use sha3::{Digest, Sha3_256}; use std::{fs::File, io::Read, path::Path}; -use anyhow::Result; pub fn hash_file(path: &Path) -> Result { let mut file = File::open(path)?; diff --git a/src/main.rs b/src/main.rs index 4deb3b8..240ea55 100644 --- a/src/main.rs +++ b/src/main.rs @@ -32,6 +32,8 @@ enum Command { /// Store path - path to store the archived files in. /// Structure will be: /// store_path/ + /// temp/ + /// ... /// raw/ /// ... /// structured/ @@ -42,6 +44,10 @@ enum Command { /// Name of the archive #[arg(short, long)] name: String, + + /// Wipe existing .archivr repository data + #[arg(long = "force-with-info-removal")] + force_with_info_removal: bool, }, } @@ -58,15 +64,82 @@ fn get_archive_path() -> Option { None } -#[derive(Debug)] +#[derive(Debug, PartialEq)] enum Source { - YouTube, + YouTubeVideo, + YouTubePlaylist, + YouTubeChannel, + X, + Local, Other, } +// INFO: yt-dlp supports a lot of sites; so, when archiving (for example) a website, the user +// -> should be asked whether they want to archive the whole website or just the video(s) on it. 
fn determine_source(path: &str) -> Source { - if path.starts_with("http://") || path.starts_with("https://") { - return Source::YouTube; + // INFO: Extractors' URLs can be found here: + // -> https://github.com/yt-dlp/yt-dlp/tree/dfc0a84c192a7357dd1768cc345d590253a14fe5/yt_dlp/extractor + // TEST: X posts can have multiple videos. + + // Shorthand schemes: yt: or youtube: + if let Some(after_scheme) = path + .strip_prefix("yt:") + .or_else(|| path.strip_prefix("youtube:")) + { + // video/ID, short/ID, shorts/ID + if after_scheme.starts_with("video/") + || after_scheme.starts_with("short/") + || after_scheme.starts_with("shorts/") + { + return Source::YouTubeVideo; + } + + // playlist/ID + if after_scheme.starts_with("playlist/") { + return Source::YouTubePlaylist; + } + + // channel/ID, c/ID, user/ID, @handle + if after_scheme.starts_with("channel/") + || after_scheme.starts_with("c/") + || after_scheme.starts_with("user/") + || after_scheme.starts_with("@") + { + return Source::YouTubeChannel; + } + } + + // Shorthand schemes: x: or twitter: + if path.starts_with("x:") || path.starts_with("twitter:") { + return Source::X; + } + + if path.starts_with("file://") { + return Source::Local; + } else if path.starts_with("http://") || path.starts_with("https://") { + // Video URLs (watch, youtu.be, shorts) + let video_re = regex::Regex::new(r"^https?://(?:www\.)?(?:youtu\.be/[0-9A-Za-z_-]+|youtube\.com/watch\?v=[0-9A-Za-z_-]+|youtube\.com/shorts/[0-9A-Za-z_-]+)").unwrap(); + if video_re.is_match(path) { + return Source::YouTubeVideo; + } + + // Playlist URLs + let playlist_re = + regex::Regex::new(r"^https?://(?:www\.)?youtube\.com/playlist\?list=[0-9A-Za-z_-]+") + .unwrap(); + if playlist_re.is_match(path) { + return Source::YouTubePlaylist; + } + + // Channel or user URLs (channel IDs, /c/, /user/, or @handles) + let channel_re = 
regex::Regex::new(r"^https?://(?:www\.)?youtube\.com/(?:channel/[0-9A-Za-z_-]+|c/[0-9A-Za-z_-]+|user/[0-9A-Za-z_-]+|@[0-9A-Za-z_-]+)").unwrap();
+            if channel_re.is_match(path) {
+                return Source::YouTubeChannel;
+            }
+
+            if path.starts_with("https://x.com/") {
+                return Source::X;
+            }
     }
     Source::Other
 }
@@ -136,54 +209,92 @@ fn main() -> Result<()> {
             let timestamp = Local::now().format("%Y-%m-%dT%H-%M-%S%.3f").to_string();
             let source = determine_source(path);
 
-            if let Source::YouTube = source {
-                let store_path_string_file = archive_path.unwrap().join("store_path");
-                let store_path = match fs::read_to_string(store_path_string_file) {
-                    Ok(p) => PathBuf::from(p.trim()),
-                    Err(e) => {
-                        eprintln!("Failed to read store path: {e}");
-                        process::exit(1);
-                    }
-                };
+            if let Source::Other = source {
+                eprintln!("Archiving from this source is not yet implemented.");
+                process::exit(1);
+            }
 
-            let hash =
-                match downloader::youtube::download(path.clone(), &store_path, &timestamp) {
+            let store_path_string_file = archive_path.unwrap().join("store_path");
+            let store_path = match fs::read_to_string(store_path_string_file) {
+                Ok(p) => PathBuf::from(p.trim()),
+                Err(e) => {
+                    eprintln!("Failed to read store path: {e}");
+                    process::exit(1);
+                }
+            };
+
+            let hash = match source {
+                Source::YouTubeVideo | Source::X => {
+                    match downloader::ytdlp::download(path.clone(), &store_path, &timestamp) {
                     Ok(h) => h,
                     Err(e) => {
-                        eprintln!("Failed to download from YouTube: {e}");
+                        eprintln!("Failed to download with yt-dlp: {e}");
                         process::exit(1);
                     }
-                };
-
-            let hash_exists = hash_exists(format!("{hash}.mp4"), &store_path);
-            // TODO: check for repeated archives?
-            // There could be one of the following:
-            // - We are literally archiving the same path over again.
-            // - We are archiving a different path, which had this file. E.g.: we archived a
-            //   website before which had this YouTube video, and while recursively archiving
-            //   everything, we also archived the YouTube video although it wasn't our main
This means that we should archive again; whereas with the first case... - // Not sure. Need to think about this. - // ---- - // Thinking about it a day later... - // If we are specifically archiving a YouTube video, it could also be two of the - // above. So yeah, just create a new DB entry and symlink the Raw to the Structured - // Dir or whatever. it's midnight and my brain ain't wording/braining. - if hash_exists { - println!("File already archived."); - process::exit(0); - } else { - move_temp_to_raw( - &store_path.join("temp").join(format!("{timestamp}.mp4")), - &hash, - &store_path, - )?; - - println!("File archived successfully."); + } } + Source::Local => { + match downloader::local::save(path.clone(), &store_path, ×tamp) { + Ok(h) => h, + Err(e) => { + eprintln!("Failed to archive local file: {e}"); + process::exit(1); + } + } + } + _ => unreachable!(), + }; + + let file_extension = match source { + Source::YouTubeVideo | Source::X => ".mp4", + Source::Local => { + let p = Path::new(path.trim_start_matches("file://")); + &p.extension() + .map_or(String::new(), |ext| format!(".{}", ext.to_string_lossy())) + } + _ => "", + }; + + let hash_exists = hash_exists(format!("{hash}{file_extension}"), &store_path); + + // TODO: check for repeated archives? + // There could be one of the following: + // - We are literally archiving the same path over again. + // - We are archiving a different path, which had this file. E.g.: we archived a + // website before which had this YouTube video, and while recursively archiving + // everything, we also archived the YouTube video although it wasn't our main + // target. This means that we should archive again; whereas with the first case... + // Not sure. Need to think about this. + // ---- + // Thinking about it a day later... + // If we are specifically archiving a YouTube video, it could also be two of the + // above. So yeah, just create a new DB entry and symlink the Raw to the Structured + // Dir or whatever. 
it's midnight and my brain ain't wording/braining. + if hash_exists { + println!("File already archived."); + let _ = fs::remove_file( + store_path + .join("temp") + .join(×tamp) + .join(format!("{timestamp}{file_extension}")), + ); + process::exit(0); + } else { + move_temp_to_raw( + &store_path + .join("temp") + .join(×tamp) + .join(format!("{timestamp}{file_extension}")), + &hash, + &store_path, + )?; + + println!("File archived successfully."); } // TODO: DB INSERT, inserting a record + // https://github.com/rusqlite/rusqlite + // Think of the DB schema Ok(()) } @@ -192,6 +303,7 @@ fn main() -> Result<()> { path: ref archive_path_string, store_path: ref store_path_string, name: ref archive_name, + force_with_info_removal, } => { let archive_path = Path::new(&archive_path_string).join(".archivr"); let store_path = if Path::new(&store_path_string).is_relative() { @@ -201,16 +313,26 @@ fn main() -> Result<()> { }; if archive_path.exists() { - // TODO: check if there is nothing inside. if there is nothing inside, use it - eprintln!("Archive already exists at {}", archive_path.display()); - if store_path.exists() { - eprintln!("Store path already exists at {}", store_path.display()); + if !archive_path.is_dir() { + eprintln!( + "Archive path exists and is not a directory: {}", + archive_path.display() + ); + process::exit(1); + } + + if force_with_info_removal { + fs::remove_dir_all(&archive_path)?; + } else if fs::read_dir(&archive_path)?.next().is_some() { + eprintln!( + "Archive already exists at {} and is not empty. Use --force-with-info-removal to reinitialize.", + archive_path.display() + ); process::exit(1); } - process::exit(1); } - if store_path.exists() { - // TODO: check if the structure is correct. If so, use it. 
+ + if store_path.exists() && !force_with_info_removal { eprintln!("Store path already exists at {}", store_path.display()); process::exit(1); } @@ -232,3 +354,199 @@ fn main() -> Result<()> { } // _ => eprintln!("Unknown command: {:?}", args.command), } } + +#[cfg(test)] +mod tests { + use super::*; + + struct TestCase<'a> { + url: &'a str, + expected: Source, + } + + #[test] + fn test_youtube_sources() { + // --- YouTube Video URLs --- + let video_cases = [ + TestCase { + url: "https://www.youtube.com/watch?v=UHxw-L2WyyY", + expected: Source::YouTubeVideo, + }, + TestCase { + url: "https://youtu.be/UHxw-L2WyyY", + expected: Source::YouTubeVideo, + }, + TestCase { + url: "https://www.youtube.com/shorts/EtC99eWiwRI", + expected: Source::YouTubeVideo, + }, + ]; + + for case in &video_cases { + assert_eq!( + determine_source(case.url), + case.expected, + "Failed for URL: {}", + case.url + ); + } + + // --- YouTube Playlist URLs --- + let playlist_cases = [TestCase { + url: "https://www.youtube.com/playlist?list=PL9vTTBa7QaQOoMfpP3ztvgyQkPWDPfJez", + expected: Source::YouTubePlaylist, + }]; + + for case in &playlist_cases { + assert_eq!( + determine_source(case.url), + case.expected, + "Failed for URL: {}", + case.url + ); + } + + // --- YouTube Channel URLs --- + let channel_cases = [ + TestCase { + url: "https://www.youtube.com/channel/CoreDumpped", + expected: Source::YouTubeChannel, + }, + TestCase { + url: "https://www.youtube.com/@CoreDumpped", + expected: Source::YouTubeChannel, + }, + TestCase { + url: "https://www.youtube.com/c/YouTubeCreators", + expected: Source::YouTubeChannel, + }, + TestCase { + url: "https://www.youtube.com/user/pewdiepie", + expected: Source::YouTubeChannel, + }, + TestCase { + url: "https://youtube.com/@pewdiepie?si=KOcLN_KPYNpe5f_8", + expected: Source::YouTubeChannel, + }, + ]; + + for case in &channel_cases { + assert_eq!( + determine_source(case.url), + case.expected, + "Failed for URL: {}", + case.url + ); + } + + // --- 
Shorthand scheme URLs --- + let shorthand_cases = [ + // Videos + TestCase { + url: "yt:video/UHxw-L2WyyY", + expected: Source::YouTubeVideo, + }, + TestCase { + url: "youtube:video/UHxw-L2WyyY", + expected: Source::YouTubeVideo, + }, + TestCase { + url: "yt:short/EtC99eWiwRI", + expected: Source::YouTubeVideo, + }, + TestCase { + url: "yt:shorts/EtC99eWiwRI", + expected: Source::YouTubeVideo, + }, + TestCase { + url: "youtube:shorts/EtC99eWiwRI", + expected: Source::YouTubeVideo, + }, + // Playlists + TestCase { + url: "yt:playlist/PL9vTTBa7QaQOoMfpP3ztvgyQkPWDPfJez", + expected: Source::YouTubePlaylist, + }, + TestCase { + url: "youtube:playlist/PL9vTTBa7QaQOoMfpP3ztvgyQkPWDPfJez", + expected: Source::YouTubePlaylist, + }, + // Channels + TestCase { + url: "yt:channel/UCxyz123", + expected: Source::YouTubeChannel, + }, + TestCase { + url: "yt:c/YouTubeCreators", + expected: Source::YouTubeChannel, + }, + TestCase { + url: "yt:user/pewdiepie", + expected: Source::YouTubeChannel, + }, + TestCase { + url: "youtube:@CoreDumpped", + expected: Source::YouTubeChannel, + }, + ]; + + for case in &shorthand_cases { + assert_eq!( + determine_source(case.url), + case.expected, + "Failed for URL: {}", + case.url + ); + } + } + + #[test] + fn test_x_sources() { + let x_cases = [ + TestCase { + url: "https://x.com/some_post", + expected: Source::X, + }, + TestCase { + url: "x:1234567890", + expected: Source::X, + }, + TestCase { + url: "twitter:1234567890", + expected: Source::X, + }, + ]; + + for case in &x_cases { + assert_eq!( + determine_source(case.url), + case.expected, + "Failed for URL: {}", + case.url + ); + } + } + + #[test] + fn test_non_youtube_sources() { + let other_cases = [ + TestCase { + url: "file:///local/path/file.mp4", + expected: Source::Local, + }, + TestCase { + url: "https://example.com/", + expected: Source::Other, + }, + ]; + + for case in &other_cases { + assert_eq!( + determine_source(case.url), + case.expected, + "Failed for URL: {}", + case.url + 
); + } + } +}