1
Fork 0
mirror of https://github.com/thegeneralist01/archivr synced 2026-05-30 08:36:47 +02:00

Implement social shorthand URL expansion and tweet alias parsing

This commit is contained in:
TheGeneralist 2026-04-03 14:48:51 +02:00
parent 63897f6a63
commit 423883d96f
Signed by: thegeneralist01
SSH key fingerprint: SHA256:pp9qddbCNmVNoSjevdvQvM5z0DHN7LTa8qBMbcMq/R4
2 changed files with 82 additions and 28 deletions

View file

@ -15,13 +15,14 @@ An open-source self-hosted archiving tool. Work in progress.
- [x] Snapchat - [x] Snapchat
- [ ] YouTube Posts (postponed) - [ ] YouTube Posts (postponed)
- [x] Archiving local files - [x] Archiving local files
- [x] Archiving Twitter Tweets & Threads
- [ ] Archiving files from cloud storage services (Google Drive, Dropbox, OneDrive) and from URLs - [ ] Archiving files from cloud storage services (Google Drive, Dropbox, OneDrive) and from URLs
- [ ] URLs - [ ] URLs
- [ ] Google Drive - [ ] Google Drive
- [ ] Dropbox - [ ] Dropbox
- [ ] OneDrive - [ ] OneDrive
- (Some of these could be postponed for later.) - (Some of these could be postponed for later.)
- [x] Archiving Twitter threads - [ ] Archiving Twitter articles
- [ ] Archive web pages (HTML, CSS, JS, images) - [ ] Archive web pages (HTML, CSS, JS, images)
- [ ] Archiving emails (???) - [ ] Archiving emails (???)
- [ ] Gmail - [ ] Gmail

View file

@ -95,17 +95,36 @@ fn parse_tweet_id(id: &str) -> Option<String> {
// TODO: Get rid of this somehow, probably encoding the ID logic into a struct. // TODO: Get rid of this somehow, probably encoding the ID logic into a struct.
// TODO: Error handling for inputs? // TODO: Error handling for inputs?
fn expand_shorthand_to_url(path: &str, source: &Source) -> String { fn expand_shorthand_to_url(path: &str, source: &Source) -> String {
if *source == Source::X && path.starts_with("tweet:media:") { if *source == Source::X && (path.starts_with("tweet:media:") || path.starts_with("x:media:")) {
format!( return format!(
"https://x.com/i/status/{}", "https://x.com/i/status/{}",
path.split(':') path.split(':')
.next_back() .next_back()
.and_then(parse_tweet_id) .and_then(parse_tweet_id)
.unwrap() .unwrap()
) );
} else {
path.to_string()
} }
if let Some(path) = path.strip_prefix("instagram:") {
if let Some(id) = path.strip_prefix("reel:") {
return format!("https://www.instagram.com/reel/{id}");
}
return format!("https://www.instagram.com/{path}");
}
if let Some(path) = path.strip_prefix("facebook:") {
return format!("https://www.facebook.com/{path}");
}
if let Some(path) = path.strip_prefix("tiktok:") {
return format!("https://www.tiktok.com/{path}");
}
if let Some(path) = path.strip_prefix("reddit:") {
return format!("https://www.reddit.com/{path}");
}
if let Some(path) = path.strip_prefix("snapchat:") {
return format!("https://www.snapchat.com/{path}");
}
path.to_string()
} }
// INFO: yt-dlp supports a lot of sites; so, when archiving (for example) a website, the user // INFO: yt-dlp supports a lot of sites; so, when archiving (for example) a website, the user
@ -144,7 +163,14 @@ fn determine_source(path: &str) -> Source {
} }
// Shorthand schemes: tweet:, x:, or twitter: // Shorthand schemes: tweet:, x:, or twitter:
if let Some(after_scheme) = path.strip_prefix("tweet:") { if let Some(after_scheme) = path
.strip_prefix("x:")
.or_else(|| path.strip_prefix("twitter:"))
.or_else(|| path.strip_prefix("tweet:"))
{
// In the comments within this scope, N stands for any one of the scheme prefixes 'twitter', 'x', or 'tweet'.
// N:media:id
if after_scheme.starts_with("media:") if after_scheme.starts_with("media:")
&& after_scheme && after_scheme
.strip_prefix("media:") .strip_prefix("media:")
@ -154,23 +180,7 @@ fn determine_source(path: &str) -> Source {
return Source::X; return Source::X;
} }
if parse_tweet_id(after_scheme).is_some() { // N:tweet:id or N:x:id
return Source::Tweet;
}
}
if let Some(after_scheme) = path
.strip_prefix("x:")
.or_else(|| path.strip_prefix("twitter:"))
{
if after_scheme
.strip_prefix("thread:")
.and_then(parse_tweet_id)
.is_some()
{
return Source::TweetThread;
}
if after_scheme if after_scheme
.strip_prefix("tweet:") .strip_prefix("tweet:")
.or_else(|| after_scheme.strip_prefix("x:")) .or_else(|| after_scheme.strip_prefix("x:"))
@ -180,7 +190,22 @@ fn determine_source(path: &str) -> Source {
return Source::Tweet; return Source::Tweet;
} }
return Source::X; // N:thread:id
if after_scheme
.strip_prefix("thread:")
.and_then(parse_tweet_id)
.is_some()
{
return Source::TweetThread;
}
// N:id
if parse_tweet_id(after_scheme).is_some() {
return Source::Tweet;
}
// N:<anything else> (no valid tweet ID in any of the recognized forms above)
return Source::Other;
} }
// Shorthand schemes for other yt-dlp extractors // Shorthand schemes for other yt-dlp extractors
@ -571,6 +596,10 @@ mod tests {
url: "tweet:media:1234567890", url: "tweet:media:1234567890",
expected: Source::X, expected: Source::X,
}, },
TestCase {
url: "x:media:1234567890",
expected: Source::X,
},
TestCase { TestCase {
url: "x:thread:1234567890", url: "x:thread:1234567890",
expected: Source::TweetThread, expected: Source::TweetThread,
@ -581,7 +610,7 @@ mod tests {
}, },
TestCase { TestCase {
url: "tweet:thread:1234567890", url: "tweet:thread:1234567890",
expected: Source::Other, expected: Source::TweetThread,
}, },
TestCase { TestCase {
url: "tweet:not-a-number", url: "tweet:not-a-number",
@ -591,6 +620,10 @@ mod tests {
url: "tweet:media:not-a-number", url: "tweet:media:not-a-number",
expected: Source::Other, expected: Source::Other,
}, },
TestCase {
url: "x:media:not-a-number",
expected: Source::Other,
},
]; ];
for case in &cases { for case in &cases {
@ -609,6 +642,26 @@ mod tests {
expand_shorthand_to_url("tweet:media:1234567890", &Source::X), expand_shorthand_to_url("tweet:media:1234567890", &Source::X),
"https://x.com/i/status/1234567890" "https://x.com/i/status/1234567890"
); );
assert_eq!(
expand_shorthand_to_url("instagram:reel/ABC123", &Source::Instagram),
"https://www.instagram.com/reel/ABC123"
);
assert_eq!(
expand_shorthand_to_url("facebook:watch?v=123456", &Source::Facebook),
"https://www.facebook.com/watch?v=123456"
);
assert_eq!(
expand_shorthand_to_url("tiktok:@someone/video/123456789", &Source::TikTok),
"https://www.tiktok.com/@someone/video/123456789"
);
assert_eq!(
expand_shorthand_to_url("reddit:r/videos/comments/abc123/example", &Source::Reddit),
"https://www.reddit.com/r/videos/comments/abc123/example"
);
assert_eq!(
expand_shorthand_to_url("snapchat:discover/some-story/1234567890", &Source::Snapchat),
"https://www.snapchat.com/discover/some-story/1234567890"
);
assert_eq!( assert_eq!(
expand_shorthand_to_url("tweet:1234567890", &Source::Tweet), expand_shorthand_to_url("tweet:1234567890", &Source::Tweet),
"tweet:1234567890" "tweet:1234567890"
@ -760,11 +813,11 @@ mod tests {
}, },
TestCase { TestCase {
url: "x:1234567890", url: "x:1234567890",
expected: Source::X, expected: Source::Tweet,
}, },
TestCase { TestCase {
url: "twitter:1234567890", url: "twitter:1234567890",
expected: Source::X, expected: Source::Tweet,
}, },
]; ];