1
Fork 0
mirror of https://github.com/thegeneralist01/archivr synced 2026-03-07 11:39:55 +01:00
This commit is contained in:
TheGeneralist 2026-02-05 14:01:28 +01:00 committed by GitHub
commit 51933e608f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
11 changed files with 459 additions and 66 deletions

4
.gitignore vendored
View file

@ -1,7 +1,9 @@
*
!.gitignore
!*.md
!docs
!docs/**
!src
!src/**

45
Cargo.lock generated
View file

@ -2,6 +2,15 @@
# It is not intended for manual editing.
version = 4
[[package]]
name = "aho-corasick"
version = "1.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301"
dependencies = [
"memchr",
]
[[package]]
name = "android_system_properties"
version = "0.1.5"
@ -75,6 +84,7 @@ dependencies = [
"chrono",
"clap",
"hex",
"regex",
"sha3",
"uuid",
]
@ -311,6 +321,12 @@ version = "0.4.28"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "34080505efa8e45a4b816c349525ebe327ceaa8559756f0356cba97ef3bf7432"
[[package]]
name = "memchr"
version = "2.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273"
[[package]]
name = "num-traits"
version = "0.2.19"
@ -356,6 +372,35 @@ version = "5.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f"
[[package]]
name = "regex"
version = "1.12.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "843bc0191f75f3e22651ae5f1e72939ab2f72a4bc30fa80a066bd66edefc24d4"
dependencies = [
"aho-corasick",
"memchr",
"regex-automata",
"regex-syntax",
]
[[package]]
name = "regex-automata"
version = "0.4.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5276caf25ac86c8d810222b3dbb938e512c55c6831a10f3e6ed1c93b84041f1c"
dependencies = [
"aho-corasick",
"memchr",
"regex-syntax",
]
[[package]]
name = "regex-syntax"
version = "0.8.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58"
[[package]]
name = "rustversion"
version = "1.0.22"

View file

@ -8,5 +8,6 @@ anyhow = "1.0.100"
chrono = "0.4.42"
clap = { version = "4.5.48", features = ["derive"] }
hex = "0.4.3"
regex = "1.12.2"
sha3 = "0.10.8"
uuid = { version = "1.18.1", features = ["v4"] }

View file

@ -1,29 +1,26 @@
# archivr
An open-source self-hosted archiving solution. Work in progress.
An open-source self-hosted archiving tool. Work in progress.
## Milestones
- [ ] Archiving
- [ ] Archiving media files from social media platforms
- [ ] YouTube
- [ ] Twitter
- [X] YouTube Videos
- [X] Twitter Videos
- [ ] Instagram
- [ ] Facebook
- [ ] TikTok
- [ ] Reddit
- [ ] Snapchat
- [ ] YouTube Posts (?)
- (Some of these could be postponed until later.)
- [ ] Archiving local files
- [ ] Archive videos (MP4, WebM)
- [ ] Archive audio files (MP3, WAV)
- [ ] Archive documents (DOCX, XLSX, PPTX)
- [ ] Archive PDFs
- [ ] Archive images (JPEG, PNG, GIF)
- [X] Archiving local files
- [ ] Archiving files from cloud storage services (Google Drive, Dropbox, OneDrive) and from URLs
- [ ] URLs
- [ ] Google Drive
- [ ] Dropbox
- [ ] OneDrive
- [ ] Archiving Twitter threads
- [ ] Archive web pages (HTML, CSS, JS, images)
- [ ] Archiving emails (???)
- [ ] Gmail

6
flake.lock generated
View file

@ -2,11 +2,11 @@
"nodes": {
"nixpkgs": {
"locked": {
"lastModified": 1760284886,
"narHash": "sha256-TK9Kr0BYBQ/1P5kAsnNQhmWWKgmZXwUQr4ZMjCzWf2c=",
"lastModified": 1761672384,
"narHash": "sha256-o9KF3DJL7g7iYMZq9SWgfS1BFlNbsm6xplRjVlOCkXI=",
"owner": "nixos",
"repo": "nixpkgs",
"rev": "cf3f5c4def3c7b5f1fc012b3d839575dbe552d43",
"rev": "08dacfca559e1d7da38f3cf05f1f45ee9bfd213c",
"type": "github"
},
"original": {

28
src/downloader/local.rs Normal file
View file

@ -0,0 +1,28 @@
use anyhow::{Context, Result, bail};
use std::{path::Path, process::Command};
use crate::hash::hash_file;
/// Copies a local file (optionally given as a `file://` URL) into the store's
/// per-timestamp temp directory and returns the content hash of the copy.
///
/// The copy is written as `store_path/temp/<timestamp>/<timestamp><ext>`,
/// where `<ext>` is the source file's extension (empty if it has none).
///
/// # Errors
/// Fails if the temp directory cannot be created, the `cp` process cannot be
/// spawned, or `cp` exits with a non-zero status.
pub fn save(path: String, store_path: &Path, timestamp: &String) -> Result<String> {
    println!("Saving path: {path}");

    // Each archive run gets its own temp subdirectory keyed by timestamp.
    let temp_dir = store_path.join("temp").join(timestamp);
    std::fs::create_dir_all(&temp_dir)?;

    // Accept both plain paths and file:// URLs.
    let in_file = Path::new(path.trim_start_matches("file://"));
    // Preserve the source extension (with its leading dot) so the copy keeps its type.
    let extension = in_file
        .extension()
        .map_or(String::new(), |ext| format!(".{}", ext.to_string_lossy()));
    let out_file = temp_dir.join(format!("{timestamp}{extension}"));

    let mut binding = Command::new("cp");
    let cmd = binding.arg(in_file).arg(&out_file);
    let out = cmd.output().with_context(|| "failed to spawn cp process")?;
    if !out.status.success() {
        let stderr = String::from_utf8_lossy(&out.stderr);
        // BUG FIX: this previously said "yt-dlp failed" (copied from the yt-dlp
        // downloader), but the failing process here is `cp`.
        bail!("cp failed: {stderr}");
    }

    hash_file(&out_file)
}

View file

@ -1 +1,2 @@
pub mod youtube;
pub mod local;
pub mod ytdlp;

View file

@ -4,12 +4,13 @@ use std::{env, path::Path, process::Command};
use crate::hash::hash_file;
pub fn download(path: String, store_path: &Path, timestamp: &String) -> Result<String> {
println!("Downloading from YouTube: {path}");
println!("Downloading with yt-dlp: {path}");
let ytdlp = env::var("ARCHIVR_YT_DLP").unwrap_or_else(|_| "yt-dlp".to_string());
let temp_dir = store_path.join("temp");
let temp_dir = store_path.join("temp").join(timestamp);
std::fs::create_dir_all(&temp_dir)?;
let out_file = temp_dir.join(format!("{timestamp}.mp4"));
let out = Command::new(&ytdlp)

View file

@ -1,6 +1,6 @@
use anyhow::Result;
use sha3::{Digest, Sha3_256};
use std::{fs::File, io::Read, path::Path};
use anyhow::Result;
pub fn hash_file(path: &Path) -> Result<String> {
let mut file = File::open(path)?;

View file

@ -32,6 +32,8 @@ enum Command {
/// Store path - path to store the archived files in.
/// Structure will be:
/// store_path/
/// temp/
/// ...
/// raw/
/// ...
/// structured/
@ -42,6 +44,10 @@ enum Command {
/// Name of the archive
#[arg(short, long)]
name: String,
/// Wipe existing .archivr repository data
#[arg(long = "force-with-info-removal")]
force_with_info_removal: bool,
},
}
@ -58,15 +64,82 @@ fn get_archive_path() -> Option<PathBuf> {
None
}
#[derive(Debug)]
#[derive(Debug, PartialEq)]
enum Source {
YouTube,
YouTubeVideo,
YouTubePlaylist,
YouTubeChannel,
X,
Local,
Other,
}
// INFO: yt-dlp supports a lot of sites; so, when archiving (for example) a website, the user
// -> should be asked whether they want to archive the whole website or just the video(s) on it.
fn determine_source(path: &str) -> Source {
if path.starts_with("http://") || path.starts_with("https://") {
return Source::YouTube;
// INFO: Extractors' URLs can be found here:
// -> https://github.com/yt-dlp/yt-dlp/tree/dfc0a84c192a7357dd1768cc345d590253a14fe5/yt_dlp/extractor
// TEST: X posts can have multiple videos.
// Shorthand schemes: yt: or youtube:
if let Some(after_scheme) = path
.strip_prefix("yt:")
.or_else(|| path.strip_prefix("youtube:"))
{
// video/ID, short/ID, shorts/ID
if after_scheme.starts_with("video/")
|| after_scheme.starts_with("short/")
|| after_scheme.starts_with("shorts/")
{
return Source::YouTubeVideo;
}
// playlist/ID
if after_scheme.starts_with("playlist/") {
return Source::YouTubePlaylist;
}
// channel/ID, c/ID, user/ID, @handle
if after_scheme.starts_with("channel/")
|| after_scheme.starts_with("c/")
|| after_scheme.starts_with("user/")
|| after_scheme.starts_with("@")
{
return Source::YouTubeChannel;
}
}
// Shorthand schemes: x: or twitter:
if path.starts_with("x:") || path.starts_with("twitter:") {
return Source::X;
}
if path.starts_with("file://") {
return Source::Local;
} else if path.starts_with("http://") || path.starts_with("https://") {
// Video URLs (watch, youtu.be, shorts)
let video_re = regex::Regex::new(r"^https?://(?:www\.)?(?:youtu\.be/[0-9A-Za-z_-]+|youtube\.com/watch\?v=[0-9A-Za-z_-]+|youtube\.com/shorts/[0-9A-Za-z_-]+)").unwrap();
if video_re.is_match(path) {
return Source::YouTubeVideo;
}
// Playlist URLs
let playlist_re =
regex::Regex::new(r"^https?://(?:www\.)?youtube\.com/playlist\?list=[0-9A-Za-z_-]+")
.unwrap();
if playlist_re.is_match(path) {
return Source::YouTubePlaylist;
}
// Channel or user URLs (channel IDs, /c/, /user/, or @handles)
let channel_re = regex::Regex::new(r"^https?://(?:www\.)?youtube\.com/(?:channel/[0-9A-Za-z_-]+|c/[0-9A-Za-z_-]+|user/[0-9A-Za-z_-]+|@[0-9A-Za-z_-]+)").unwrap();
if channel_re.is_match(path) {
return Source::YouTubeChannel;
}
if path.starts_with("https://x.com/") {
return Source::X;
}
}
Source::Other
}
@ -136,54 +209,92 @@ fn main() -> Result<()> {
let timestamp = Local::now().format("%Y-%m-%dT%H-%M-%S%.3f").to_string();
let source = determine_source(path);
if let Source::YouTube = source {
let store_path_string_file = archive_path.unwrap().join("store_path");
let store_path = match fs::read_to_string(store_path_string_file) {
Ok(p) => PathBuf::from(p.trim()),
Err(e) => {
eprintln!("Failed to read store path: {e}");
process::exit(1);
}
};
if let Source::Other = source {
eprintln!("Archiving from this source is not yet implemented.");
process::exit(1);
}
let hash =
match downloader::youtube::download(path.clone(), &store_path, &timestamp) {
let store_path_string_file = archive_path.unwrap().join("store_path");
let store_path = match fs::read_to_string(store_path_string_file) {
Ok(p) => PathBuf::from(p.trim()),
Err(e) => {
eprintln!("Failed to read store path: {e}");
process::exit(1);
}
};
let hash = match source {
Source::YouTubeVideo | Source::X => {
match downloader::ytdlp::download(path.clone(), &store_path, &timestamp) {
Ok(h) => h,
Err(e) => {
eprintln!("Failed to download from YouTube: {e}");
process::exit(1);
}
};
let hash_exists = hash_exists(format!("{hash}.mp4"), &store_path);
// TODO: check for repeated archives?
// There could be one of the following:
// - We are literally archiving the same path over again.
// - We are archiving a different path, which had this file. E.g.: we archived a
// website before which had this YouTube video, and while recursively archiving
// everything, we also archived the YouTube video although it wasn't our main
// target. This means that we should archive again; whereas with the first case...
// Not sure. Need to think about this.
// ----
// Thinking about it a day later...
// If we are specifically archiving a YouTube video, it could also be two of the
// above. So yeah, just create a new DB entry and symlink the Raw to the Structured
// Dir or whatever. it's midnight and my brain ain't wording/braining.
if hash_exists {
println!("File already archived.");
process::exit(0);
} else {
move_temp_to_raw(
&store_path.join("temp").join(format!("{timestamp}.mp4")),
&hash,
&store_path,
)?;
println!("File archived successfully.");
}
}
Source::Local => {
match downloader::local::save(path.clone(), &store_path, &timestamp) {
Ok(h) => h,
Err(e) => {
eprintln!("Failed to archive local file: {e}");
process::exit(1);
}
}
}
_ => unreachable!(),
};
let file_extension = match source {
Source::YouTubeVideo | Source::X => ".mp4",
Source::Local => {
let p = Path::new(path.trim_start_matches("file://"));
&p.extension()
.map_or(String::new(), |ext| format!(".{}", ext.to_string_lossy()))
}
_ => "",
};
let hash_exists = hash_exists(format!("{hash}{file_extension}"), &store_path);
// TODO: check for repeated archives?
// There could be one of the following:
// - We are literally archiving the same path over again.
// - We are archiving a different path, which had this file. E.g.: we archived a
// website before which had this YouTube video, and while recursively archiving
// everything, we also archived the YouTube video although it wasn't our main
// target. This means that we should archive again; whereas with the first case...
// Not sure. Need to think about this.
// ----
// Thinking about it a day later...
// If we are specifically archiving a YouTube video, it could also be two of the
// above. So yeah, just create a new DB entry and symlink the Raw to the Structured
// Dir or whatever. it's midnight and my brain ain't wording/braining.
if hash_exists {
println!("File already archived.");
let _ = fs::remove_file(
store_path
.join("temp")
.join(&timestamp)
.join(format!("{timestamp}{file_extension}")),
);
process::exit(0);
} else {
move_temp_to_raw(
&store_path
.join("temp")
.join(&timestamp)
.join(format!("{timestamp}{file_extension}")),
&hash,
&store_path,
)?;
println!("File archived successfully.");
}
// TODO: DB INSERT, inserting a record
// https://github.com/rusqlite/rusqlite
// Think of the DB schema
Ok(())
}
@ -192,6 +303,7 @@ fn main() -> Result<()> {
path: ref archive_path_string,
store_path: ref store_path_string,
name: ref archive_name,
force_with_info_removal,
} => {
let archive_path = Path::new(&archive_path_string).join(".archivr");
let store_path = if Path::new(&store_path_string).is_relative() {
@ -201,16 +313,26 @@ fn main() -> Result<()> {
};
if archive_path.exists() {
// TODO: check if there is nothing inside. if there is nothing inside, use it
eprintln!("Archive already exists at {}", archive_path.display());
if store_path.exists() {
eprintln!("Store path already exists at {}", store_path.display());
if !archive_path.is_dir() {
eprintln!(
"Archive path exists and is not a directory: {}",
archive_path.display()
);
process::exit(1);
}
if force_with_info_removal {
fs::remove_dir_all(&archive_path)?;
} else if fs::read_dir(&archive_path)?.next().is_some() {
eprintln!(
"Archive already exists at {} and is not empty. Use --force-with-info-removal to reinitialize.",
archive_path.display()
);
process::exit(1);
}
process::exit(1);
}
if store_path.exists() {
// TODO: check if the structure is correct. If so, use it.
if store_path.exists() && !force_with_info_removal {
eprintln!("Store path already exists at {}", store_path.display());
process::exit(1);
}
@ -232,3 +354,199 @@ fn main() -> Result<()> {
} // _ => eprintln!("Unknown command: {:?}", args.command),
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs `determine_source` over each (url, expected) pair and asserts the
    /// classification, reporting the offending URL on failure.
    fn check(cases: Vec<(&str, Source)>) {
        for (url, expected) in cases {
            assert_eq!(determine_source(url), expected, "Failed for URL: {}", url);
        }
    }

    #[test]
    fn test_youtube_sources() {
        // --- YouTube Video URLs ---
        check(vec![
            (
                "https://www.youtube.com/watch?v=UHxw-L2WyyY",
                Source::YouTubeVideo,
            ),
            ("https://youtu.be/UHxw-L2WyyY", Source::YouTubeVideo),
            (
                "https://www.youtube.com/shorts/EtC99eWiwRI",
                Source::YouTubeVideo,
            ),
        ]);

        // --- YouTube Playlist URLs ---
        check(vec![(
            "https://www.youtube.com/playlist?list=PL9vTTBa7QaQOoMfpP3ztvgyQkPWDPfJez",
            Source::YouTubePlaylist,
        )]);

        // --- YouTube Channel URLs ---
        check(vec![
            (
                "https://www.youtube.com/channel/CoreDumpped",
                Source::YouTubeChannel,
            ),
            (
                "https://www.youtube.com/@CoreDumpped",
                Source::YouTubeChannel,
            ),
            (
                "https://www.youtube.com/c/YouTubeCreators",
                Source::YouTubeChannel,
            ),
            (
                "https://www.youtube.com/user/pewdiepie",
                Source::YouTubeChannel,
            ),
            (
                "https://youtube.com/@pewdiepie?si=KOcLN_KPYNpe5f_8",
                Source::YouTubeChannel,
            ),
        ]);

        // --- Shorthand scheme URLs ---
        check(vec![
            // Videos
            ("yt:video/UHxw-L2WyyY", Source::YouTubeVideo),
            ("youtube:video/UHxw-L2WyyY", Source::YouTubeVideo),
            ("yt:short/EtC99eWiwRI", Source::YouTubeVideo),
            ("yt:shorts/EtC99eWiwRI", Source::YouTubeVideo),
            ("youtube:shorts/EtC99eWiwRI", Source::YouTubeVideo),
            // Playlists
            (
                "yt:playlist/PL9vTTBa7QaQOoMfpP3ztvgyQkPWDPfJez",
                Source::YouTubePlaylist,
            ),
            (
                "youtube:playlist/PL9vTTBa7QaQOoMfpP3ztvgyQkPWDPfJez",
                Source::YouTubePlaylist,
            ),
            // Channels
            ("yt:channel/UCxyz123", Source::YouTubeChannel),
            ("yt:c/YouTubeCreators", Source::YouTubeChannel),
            ("yt:user/pewdiepie", Source::YouTubeChannel),
            ("youtube:@CoreDumpped", Source::YouTubeChannel),
        ]);
    }

    #[test]
    fn test_x_sources() {
        check(vec![
            ("https://x.com/some_post", Source::X),
            ("x:1234567890", Source::X),
            ("twitter:1234567890", Source::X),
        ]);
    }

    #[test]
    fn test_non_youtube_sources() {
        check(vec![
            ("file:///local/path/file.mp4", Source::Local),
            ("https://example.com/", Source::Other),
        ]);
    }
}