feat: expand source detection with granular YouTube types

- Split Source::YouTube into YouTubeVideo, YouTubePlaylist, YouTubeChannel
- Add Source::X for Twitter/X posts
- Add Source::Local for file:// URLs
- Add regex-based URL pattern matching for YouTube URLs
- Add shorthand schemes (yt:video/ID, youtube:playlist/ID, etc.)
- Add comprehensive tests for all URL patterns
2026-07-22 03:05:32 +02:00 · 2026-01-21 20:29:59 +01:00 · 2026-01-21 20:29:59 +01:00 · 56fa1eaeb9
commit 56fa1eaeb9
parent e455f18932
2 changed files with 343 additions and 50 deletions
--- a/src/hash.rs
+++ b/src/hash.rs
@@ -1,6 +1,6 @@
+use anyhow::Result;
 use sha3::{Digest, Sha3_256};
 use std::{fs::File, io::Read, path::Path};
-use anyhow::Result;

 pub fn hash_file(path: &Path) -> Result<String> {
    let mut file = File::open(path)?;
--- a/src/main.rs
+++ b/src/main.rs
@@ -32,6 +32,8 @@ enum Command {
        /// Store path - path to store the archived files in.
        /// Structure will be:
        /// store_path/
+        ///   temp/
+        ///     ...
        ///   raw/
        ///     ...
        ///   structured/
@@ -42,6 +44,10 @@ enum Command {
        /// Name of the archive
        #[arg(short, long)]
        name: String,
+
+        /// Wipe existing .archivr repository data
+        #[arg(long = "force-with-info-removal")]
+        force_with_info_removal: bool,
    },
 }

@@ -58,15 +64,80 @@ fn get_archive_path() -> Option<PathBuf> {
    None
 }

-#[derive(Debug)]
+#[derive(Debug, PartialEq)]
 enum Source {
-    YouTube,
+    YouTubeVideo,
+    YouTubePlaylist,
+    YouTubeChannel,
+    X,
+    Local,
    Other,
 }

+// INFO: yt-dlp supports a lot of sites; so, when archiving (for example) a website, the user
+// -> should be asked whether they want to archive the whole website or just the video(s) on it.
 fn determine_source(path: &str) -> Source {
-    if path.starts_with("http://") || path.starts_with("https://") {
-        return Source::YouTube;
+    // INFO: Extractors' URLs can be found here:
+    // -> https://github.com/yt-dlp/yt-dlp/tree/dfc0a84c192a7357dd1768cc345d590253a14fe5/yt_dlp/extractor
+    // TEST: X posts can have multiple videos.
+
+    // Shorthand schemes: yt: or youtube:
+    if path.starts_with("yt:") || path.starts_with("youtube:") {
+        let after_scheme = if path.starts_with("yt:") {
+            &path[3..]
+        } else {
+            &path[8..]
+        };
+
+        // video/ID, short/ID, shorts/ID
+        if after_scheme.starts_with("video/")
+            || after_scheme.starts_with("short/")
+            || after_scheme.starts_with("shorts/")
+        {
+            return Source::YouTubeVideo;
+        }
+
+        // playlist/ID
+        if after_scheme.starts_with("playlist/") {
+            return Source::YouTubePlaylist;
+        }
+
+        // channel/ID, c/ID, user/ID, @handle
+        if after_scheme.starts_with("channel/")
+            || after_scheme.starts_with("c/")
+            || after_scheme.starts_with("user/")
+            || after_scheme.starts_with("@")
+        {
+            return Source::YouTubeChannel;
+        }
+    }
+
+    if path.starts_with("file://") {
+        return Source::Local;
+    } else if path.starts_with("http://") || path.starts_with("https://") {
+        // Video URLs (watch, youtu.be, shorts)
+        let video_re = regex::Regex::new(r"^https?://(?:www\.)?(?:youtu\.be/[0-9A-Za-z_-]+|youtube\.com/watch\?v=[0-9A-Za-z_-]+|youtube\.com/shorts/[0-9A-Za-z_-]+)").unwrap();
+        if video_re.is_match(path) {
+            return Source::YouTubeVideo;
+        }
+
+        // Playlist URLs
+        let playlist_re =
+            regex::Regex::new(r"^https?://(?:www\.)?youtube\.com/playlist\?list=[0-9A-Za-z_-]+")
+                .unwrap();
+        if playlist_re.is_match(path) {
+            return Source::YouTubePlaylist;
+        }
+
+        // Channel or user URLs (channel IDs, /c/, /user/, or @handles)
+        let channel_re = regex::Regex::new(r"^https?://(?:www\.)?youtube\.com/(?:channel/[0-9A-Za-z_-]+|c/[0-9A-Za-z_-]+|user/[0-9A-Za-z_-]+|@[0-9A-Za-z_-]+)").unwrap();
+        if channel_re.is_match(path) {
+            return Source::YouTubeChannel;
+        }
+
+        if path.starts_with("https://x.com/") {
+            return Source::X;
+        }
    }
    Source::Other
 }
@@ -136,54 +207,92 @@ fn main() -> Result<()> {
            let timestamp = Local::now().format("%Y-%m-%dT%H-%M-%S%.3f").to_string();

            let source = determine_source(path);
-            if let Source::YouTube = source {
-                let store_path_string_file = archive_path.unwrap().join("store_path");
-                let store_path = match fs::read_to_string(store_path_string_file) {
-                    Ok(p) => PathBuf::from(p.trim()),
-                    Err(e) => {
-                        eprintln!("Failed to read store path: {e}");
-                        process::exit(1);
-                    }
-                };
+            if let Source::Other = source {
+                eprintln!("Archiving from this source is not yet implemented.");
+                process::exit(1);
+            }

-                let hash =
-                    match downloader::youtube::download(path.clone(), &store_path, &timestamp) {
+            let store_path_string_file = archive_path.unwrap().join("store_path");
+            let store_path = match fs::read_to_string(store_path_string_file) {
+                Ok(p) => PathBuf::from(p.trim()),
+                Err(e) => {
+                    eprintln!("Failed to read store path: {e}");
+                    process::exit(1);
+                }
+            };
+
+            let hash = match source {
+                Source::YouTubeVideo | Source::X => {
+                    match downloader::ytdlp::download(path.clone(), &store_path, &timestamp) {
                        Ok(h) => h,
                        Err(e) => {
                            eprintln!("Failed to download from YouTube: {e}");
                            process::exit(1);
                        }
-                    };
-
-                let hash_exists = hash_exists(format!("{hash}.mp4"), &store_path);
-                // TODO: check for repeated archives?
-                // There could be one of the following:
-                // - We are literally archiving the same path over again.
-                // - We are archiving a different path, which had this file. E.g.: we archived a
-                // website before which had this YouTube video, and while recursively archiving
-                // everything, we also archived the YouTube video although it wasn't our main
-                // target. This means that we should archive again; whereas with the first case...
-                // Not sure. Need to think about this.
-                // ----
-                // Thinking about it a day later...
-                // If we are specifically archiving a YouTube video, it could also be two of the
-                // above. So yeah, just create a new DB entry and symlink the Raw to the Structured
-                // Dir or whatever. it's midnight and my brain ain't wording/braining.
-                if hash_exists {
-                    println!("File already archived.");
-                    process::exit(0);
-                } else {
-                    move_temp_to_raw(
-                        &store_path.join("temp").join(format!("{timestamp}.mp4")),
-                        &hash,
-                        &store_path,
-                    )?;
-
-                    println!("File archived successfully.");
+                    }
                }
+                Source::Local => {
+                    match downloader::local::save(path.clone(), &store_path, &timestamp) {
+                        Ok(h) => h,
+                        Err(e) => {
+                            eprintln!("Failed to archive local file: {e}");
+                            process::exit(1);
+                        }
+                    }
+                }
+                _ => unreachable!(),
+            };
+
+            let file_extension = match source {
+                Source::YouTubeVideo | Source::X => ".mp4",
+                Source::Local => {
+                    let p = Path::new(path.trim_start_matches("file://"));
+                    &p.extension()
+                        .map_or(String::new(), |ext| format!(".{}", ext.to_string_lossy()))
+                }
+                _ => "",
+            };
+
+            let hash_exists = hash_exists(format!("{hash}{file_extension}"), &store_path);
+
+            // TODO: check for repeated archives?
+            // There could be one of the following:
+            // - We are literally archiving the same path over again.
+            // - We are archiving a different path, which had this file. E.g.: we archived a
+            // website before which had this YouTube video, and while recursively archiving
+            // everything, we also archived the YouTube video although it wasn't our main
+            // target. This means that we should archive again; whereas with the first case...
+            // Not sure. Need to think about this.
+            // ----
+            // Thinking about it a day later...
+            // If we are specifically archiving a YouTube video, it could also be two of the
+            // above. So yeah, just create a new DB entry and symlink the Raw to the Structured
+            // Dir or whatever. it's midnight and my brain ain't wording/braining.
+            if hash_exists {
+                println!("File already archived.");
+                let _ = fs::remove_file(
+                    store_path
+                        .join("temp")
+                        .join(&timestamp)
+                        .join(format!("{timestamp}{file_extension}")),
+                );
+                process::exit(0);
+            } else {
+                move_temp_to_raw(
+                    &store_path
+                        .join("temp")
+                        .join(&timestamp)
+                        .join(format!("{timestamp}{file_extension}")),
+                    &hash,
+                    &store_path,
+                )?;
+
+                println!("File archived successfully.");
            }

            // TODO: DB INSERT, inserting a record
+            // https://github.com/rusqlite/rusqlite
+            // Think of the DB schema

            Ok(())
        }
@@ -192,6 +301,7 @@ fn main() -> Result<()> {
            path: ref archive_path_string,
            store_path: ref store_path_string,
            name: ref archive_name,
+            force_with_info_removal,
        } => {
            let archive_path = Path::new(&archive_path_string).join(".archivr");
            let store_path = if Path::new(&store_path_string).is_relative() {
@@ -201,16 +311,26 @@ fn main() -> Result<()> {
            };

            if archive_path.exists() {
-                // TODO: check if there is nothing inside. if there is nothing inside, use it
-                eprintln!("Archive already exists at {}", archive_path.display());
-                if store_path.exists() {
-                    eprintln!("Store path already exists at {}", store_path.display());
+                if !archive_path.is_dir() {
+                    eprintln!(
+                        "Archive path exists and is not a directory: {}",
+                        archive_path.display()
+                    );
+                    process::exit(1);
+                }
+
+                if force_with_info_removal {
+                    fs::remove_dir_all(&archive_path)?;
+                } else if fs::read_dir(&archive_path)?.next().is_some() {
+                    eprintln!(
+                        "Archive already exists at {} and is not empty. Use --force-with-info-removal to reinitialize.",
+                        archive_path.display()
+                    );
                    process::exit(1);
                }
-                process::exit(1);
            }
-            if store_path.exists() {
-                // TODO: check if the structure is correct. If so, use it.
+
+            if store_path.exists() && !force_with_info_removal {
                eprintln!("Store path already exists at {}", store_path.display());
                process::exit(1);
            }
@@ -232,3 +352,176 @@ fn main() -> Result<()> {
        } // _ => eprintln!("Unknown command: {:?}", args.command),
    }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    struct TestCase<'a> {
+        url: &'a str,
+        expected: Source,
+    }
+
+    #[test]
+    fn test_youtube_sources() {
+        // --- YouTube Video URLs ---
+        let video_cases = [
+            TestCase {
+                url: "https://www.youtube.com/watch?v=UHxw-L2WyyY",
+                expected: Source::YouTubeVideo,
+            },
+            TestCase {
+                url: "https://youtu.be/UHxw-L2WyyY",
+                expected: Source::YouTubeVideo,
+            },
+            TestCase {
+                url: "https://www.youtube.com/shorts/EtC99eWiwRI",
+                expected: Source::YouTubeVideo,
+            },
+        ];
+
+        for case in &video_cases {
+            assert_eq!(
+                determine_source(case.url),
+                case.expected,
+                "Failed for URL: {}",
+                case.url
+            );
+        }
+
+        // --- YouTube Playlist URLs ---
+        let playlist_cases = [TestCase {
+            url: "https://www.youtube.com/playlist?list=PL9vTTBa7QaQOoMfpP3ztvgyQkPWDPfJez",
+            expected: Source::YouTubePlaylist,
+        }];
+
+        for case in &playlist_cases {
+            assert_eq!(
+                determine_source(case.url),
+                case.expected,
+                "Failed for URL: {}",
+                case.url
+            );
+        }
+
+        // --- YouTube Channel URLs ---
+        let channel_cases = [
+            TestCase {
+                url: "https://www.youtube.com/channel/CoreDumpped",
+                expected: Source::YouTubeChannel,
+            },
+            TestCase {
+                url: "https://www.youtube.com/@CoreDumpped",
+                expected: Source::YouTubeChannel,
+            },
+            TestCase {
+                url: "https://www.youtube.com/c/YouTubeCreators",
+                expected: Source::YouTubeChannel,
+            },
+            TestCase {
+                url: "https://www.youtube.com/user/pewdiepie",
+                expected: Source::YouTubeChannel,
+            },
+            TestCase {
+                url: "https://youtube.com/@pewdiepie?si=KOcLN_KPYNpe5f_8",
+                expected: Source::YouTubeChannel,
+            },
+        ];
+
+        for case in &channel_cases {
+            assert_eq!(
+                determine_source(case.url),
+                case.expected,
+                "Failed for URL: {}",
+                case.url
+            );
+        }
+
+        // --- Shorthand scheme URLs ---
+        let shorthand_cases = [
+            // Videos
+            TestCase {
+                url: "yt:video/UHxw-L2WyyY",
+                expected: Source::YouTubeVideo,
+            },
+            TestCase {
+                url: "youtube:video/UHxw-L2WyyY",
+                expected: Source::YouTubeVideo,
+            },
+            TestCase {
+                url: "yt:short/EtC99eWiwRI",
+                expected: Source::YouTubeVideo,
+            },
+            TestCase {
+                url: "yt:shorts/EtC99eWiwRI",
+                expected: Source::YouTubeVideo,
+            },
+            TestCase {
+                url: "youtube:shorts/EtC99eWiwRI",
+                expected: Source::YouTubeVideo,
+            },
+            // Playlists
+            TestCase {
+                url: "yt:playlist/PL9vTTBa7QaQOoMfpP3ztvgyQkPWDPfJez",
+                expected: Source::YouTubePlaylist,
+            },
+            TestCase {
+                url: "youtube:playlist/PL9vTTBa7QaQOoMfpP3ztvgyQkPWDPfJez",
+                expected: Source::YouTubePlaylist,
+            },
+            // Channels
+            TestCase {
+                url: "yt:channel/UCxyz123",
+                expected: Source::YouTubeChannel,
+            },
+            TestCase {
+                url: "yt:c/YouTubeCreators",
+                expected: Source::YouTubeChannel,
+            },
+            TestCase {
+                url: "yt:user/pewdiepie",
+                expected: Source::YouTubeChannel,
+            },
+            TestCase {
+                url: "youtube:@CoreDumpped",
+                expected: Source::YouTubeChannel,
+            },
+        ];
+
+        for case in &shorthand_cases {
+            assert_eq!(
+                determine_source(case.url),
+                case.expected,
+                "Failed for URL: {}",
+                case.url
+            );
+        }
+    }
+
+    #[test]
+    fn test_non_youtube_sources() {
+        let other_cases = [
+            TestCase {
+                url: "file:///local/path/file.mp4",
+                expected: Source::Local,
+            },
+            TestCase {
+                url: "https://x.com/some_post",
+                expected: Source::X,
+            },
+            TestCase {
+                url: "https://example.com/",
+                expected: Source::Other,
+            },
+        ];
+
+        for case in &other_cases {
+            assert_eq!(
+                determine_source(case.url),
+                case.expected,
+                "Failed for URL: {}",
+                case.url
+            );
+        }
+    }
+}