mirror of
https://github.com/thegeneralist01/archivr
synced 2026-05-30 08:36:47 +02:00
feat: add X Article archiving (#6)
* Add X Article archiving * All TOML formats changed to JSON
This commit is contained in:
parent
51e986b6b2
commit
5552591f4f
4 changed files with 895 additions and 654 deletions
|
|
@ -15,14 +15,13 @@ An open-source self-hosted archiving tool. Work in progress.
|
||||||
- [x] Snapchat
|
- [x] Snapchat
|
||||||
- [ ] YouTube Posts (postponed)
|
- [ ] YouTube Posts (postponed)
|
||||||
- [x] Archiving local files
|
- [x] Archiving local files
|
||||||
- [x] Archiving Twitter Tweets & Threads
|
- [x] Archiving Twitter Tweets, Threads, and Articles
|
||||||
- [ ] Archiving files from cloud storage services (Google Drive, Dropbox, OneDrive) and from URLs
|
- [ ] Archiving files from cloud storage services (Google Drive, Dropbox, OneDrive) and from URLs
|
||||||
- [ ] URLs
|
- [ ] URLs
|
||||||
- [ ] Google Drive
|
- [ ] Google Drive
|
||||||
- [ ] Dropbox
|
- [ ] Dropbox
|
||||||
- [ ] OneDrive
|
- [ ] OneDrive
|
||||||
- (Some of these could be postponed for later.)
|
- (Some of these could be postponed for later.)
|
||||||
- [ ] Archiving Twitter articles
|
|
||||||
- [ ] Archive web pages (HTML, CSS, JS, images)
|
- [ ] Archive web pages (HTML, CSS, JS, images)
|
||||||
- [ ] Archiving emails (???)
|
- [ ] Archiving emails (???)
|
||||||
- [ ] Gmail
|
- [ ] Gmail
|
||||||
|
|
@ -62,7 +61,7 @@ This project aims to provide a reliable solution for archiving important data fr
|
||||||
- Local files: `file:///absolute/path/to/file.ext`
|
- Local files: `file:///absolute/path/to/file.ext`
|
||||||
- YouTube media: standard video/short URLs, plus [shorthand video inputs](#supported-shorthand-inputs)
|
- YouTube media: standard video/short URLs, plus [shorthand video inputs](#supported-shorthand-inputs)
|
||||||
- X/Twitter media from Tweets: normal Tweet URLs or the `tweet:media:ID` shorthand
|
- X/Twitter media from Tweets: normal Tweet URLs or the `tweet:media:ID` shorthand
|
||||||
- X/Twitter Tweet content scrape: [Tweet and Thread shorthands](#supported-shorthand-inputs). (These are saved as TOML files in `raw_tweets/`)
|
- X/Twitter Tweet content scrape: [Tweet and Thread shorthands](#supported-shorthand-inputs). (These are saved as JSON files in `raw_tweets/`)
|
||||||
- Instagram, Facebook, TikTok, Reddit, Snapchat: direct URLs or platform-prefixed shorthand passed through to `yt-dlp`
|
- Instagram, Facebook, TikTok, Reddit, Snapchat: direct URLs or platform-prefixed shorthand passed through to `yt-dlp`
|
||||||
|
|
||||||
### Supported Shorthand Inputs
|
### Supported Shorthand Inputs
|
||||||
|
|
@ -73,7 +72,7 @@ This project aims to provide a reliable solution for archiving important data fr
|
||||||
- `yt:short/ID`
|
- `yt:short/ID`
|
||||||
- `yt:shorts/ID`
|
- `yt:shorts/ID`
|
||||||
- `youtube:shorts/ID`
|
- `youtube:shorts/ID`
|
||||||
- X/Twitter tweet TOML content:
|
- X/Twitter tweet JSON content:
|
||||||
- `tweet:ID`
|
- `tweet:ID`
|
||||||
- `x:tweet:ID`
|
- `x:tweet:ID`
|
||||||
- `x:x:ID`
|
- `x:x:ID`
|
||||||
|
|
@ -81,7 +80,7 @@ This project aims to provide a reliable solution for archiving important data fr
|
||||||
- `twitter:tweet:ID`
|
- `twitter:tweet:ID`
|
||||||
- X/Twitter media/video download:
|
- X/Twitter media/video download:
|
||||||
- `tweet:media:ID`
|
- `tweet:media:ID`
|
||||||
- X/Twitter thread TOML content:
|
- X/Twitter thread JSON content:
|
||||||
- `x:thread:ID`
|
- `x:thread:ID`
|
||||||
- `twitter:thread:ID`
|
- `twitter:thread:ID`
|
||||||
- Other platform shorthands:
|
- Other platform shorthands:
|
||||||
|
|
|
||||||
26
flake.nix
26
flake.nix
|
|
@ -39,7 +39,10 @@
|
||||||
inherit version;
|
inherit version;
|
||||||
hash = "sha256-S5KzQRDIQroc2bJsPLaKR9xocHKniqd9Z055CsC5rbQ=";
|
hash = "sha256-S5KzQRDIQroc2bJsPLaKR9xocHKniqd9Z055CsC5rbQ=";
|
||||||
};
|
};
|
||||||
nativeBuildInputs = [ pyPkgs.setuptools pyPkgs.wheel ];
|
nativeBuildInputs = [
|
||||||
|
pyPkgs.setuptools
|
||||||
|
pyPkgs.wheel
|
||||||
|
];
|
||||||
propagatedBuildInputs = [
|
propagatedBuildInputs = [
|
||||||
pyPkgs.aiofiles
|
pyPkgs.aiofiles
|
||||||
pyPkgs."nest-asyncio"
|
pyPkgs."nest-asyncio"
|
||||||
|
|
@ -53,13 +56,9 @@
|
||||||
pythonImportsCheck = [ "twitter" ];
|
pythonImportsCheck = [ "twitter" ];
|
||||||
doCheck = false;
|
doCheck = false;
|
||||||
};
|
};
|
||||||
tweetPython = pkgs.python312.withPackages (
|
tweetPython = pkgs.python312.withPackages (ps: [
|
||||||
ps: [
|
|
||||||
ps.tomlkit
|
|
||||||
ps."tomli-w"
|
|
||||||
twitterApiClient
|
twitterApiClient
|
||||||
]
|
]);
|
||||||
);
|
|
||||||
archivr_unwrapped = pkgs.rustPlatform.buildRustPackage {
|
archivr_unwrapped = pkgs.rustPlatform.buildRustPackage {
|
||||||
pname = "archivr";
|
pname = "archivr";
|
||||||
version = "0.1.0";
|
version = "0.1.0";
|
||||||
|
|
@ -118,7 +117,10 @@
|
||||||
inherit version;
|
inherit version;
|
||||||
hash = "sha256-S5KzQRDIQroc2bJsPLaKR9xocHKniqd9Z055CsC5rbQ=";
|
hash = "sha256-S5KzQRDIQroc2bJsPLaKR9xocHKniqd9Z055CsC5rbQ=";
|
||||||
};
|
};
|
||||||
nativeBuildInputs = [ pyPkgs.setuptools pyPkgs.wheel ];
|
nativeBuildInputs = [
|
||||||
|
pyPkgs.setuptools
|
||||||
|
pyPkgs.wheel
|
||||||
|
];
|
||||||
propagatedBuildInputs = [
|
propagatedBuildInputs = [
|
||||||
pyPkgs.aiofiles
|
pyPkgs.aiofiles
|
||||||
pyPkgs."nest-asyncio"
|
pyPkgs."nest-asyncio"
|
||||||
|
|
@ -132,13 +134,9 @@
|
||||||
pythonImportsCheck = [ "twitter" ];
|
pythonImportsCheck = [ "twitter" ];
|
||||||
doCheck = false;
|
doCheck = false;
|
||||||
};
|
};
|
||||||
tweetPython = pkgs.python312.withPackages (
|
tweetPython = pkgs.python312.withPackages (ps: [
|
||||||
ps: [
|
|
||||||
ps.tomlkit
|
|
||||||
ps."tomli-w"
|
|
||||||
twitterApiClient
|
twitterApiClient
|
||||||
]
|
]);
|
||||||
);
|
|
||||||
in
|
in
|
||||||
{
|
{
|
||||||
default = pkgs.mkShell {
|
default = pkgs.mkShell {
|
||||||
|
|
|
||||||
|
|
@ -64,7 +64,7 @@ fn build_scraper_args(
|
||||||
/// Archives a tweet (or full thread) identified by `path` (e.g. `"tweet:123"`).
|
/// Archives a tweet (or full thread) identified by `path` (e.g. `"tweet:123"`).
|
||||||
///
|
///
|
||||||
/// Invokes the Python scraper, then moves all produced media assets into the
|
/// Invokes the Python scraper, then moves all produced media assets into the
|
||||||
/// content-addressed raw store and rewrites the TOML output to use the new
|
/// content-addressed raw store and rewrites the JSON output to use the new
|
||||||
/// store-relative paths. Returns `true` if new content was archived, `false`
|
/// store-relative paths. Returns `true` if new content was archived, `false`
|
||||||
/// if the tweet was already present and `thread` is `false`.
|
/// if the tweet was already present and `thread` is `false`.
|
||||||
///
|
///
|
||||||
|
|
@ -72,7 +72,7 @@ fn build_scraper_args(
|
||||||
/// can be overridden via `ARCHIVR_TWEET_SCRAPER` and `ARCHIVR_TWEET_PYTHON`.
|
/// can be overridden via `ARCHIVR_TWEET_SCRAPER` and `ARCHIVR_TWEET_PYTHON`.
|
||||||
pub fn archive(path: &str, thread: bool, store_path: &Path, timestamp: &str) -> Result<bool> {
|
pub fn archive(path: &str, thread: bool, store_path: &Path, timestamp: &str) -> Result<bool> {
|
||||||
let invocation_cwd = env::current_dir().context("Failed to read current working directory")?;
|
let invocation_cwd = env::current_dir().context("Failed to read current working directory")?;
|
||||||
// Output directory for Tweet TOML files.
|
// Output directory for Tweet JSON files.
|
||||||
let output_dir = store_path.join("raw_tweets");
|
let output_dir = store_path.join("raw_tweets");
|
||||||
// Temporary directory for media assets downloaded by the scraper in `temp/...`.
|
// Temporary directory for media assets downloaded by the scraper in `temp/...`.
|
||||||
let temp_dir = store_path.join("temp").join(timestamp).join("tweets");
|
let temp_dir = store_path.join("temp").join(timestamp).join("tweets");
|
||||||
|
|
@ -81,13 +81,13 @@ pub fn archive(path: &str, thread: bool, store_path: &Path, timestamp: &str) ->
|
||||||
fs::create_dir_all(&output_dir)?;
|
fs::create_dir_all(&output_dir)?;
|
||||||
fs::create_dir_all(&temp_dir)?;
|
fs::create_dir_all(&temp_dir)?;
|
||||||
|
|
||||||
// Path to the root - the to-be-archived tweet's TOML file.
|
// Path to the root - the to-be-archived tweet's JSON file.
|
||||||
let root_toml = output_dir.join(format!("tweet-{tweet_id}.toml"));
|
let root_json = output_dir.join(format!("tweet-{tweet_id}.json"));
|
||||||
if !thread && root_toml.exists() {
|
if !thread && root_json.exists() {
|
||||||
return Ok(false);
|
return Ok(false);
|
||||||
}
|
}
|
||||||
|
|
||||||
let before = tweet_toml_files(&output_dir)?;
|
let before = tweet_json_files(&output_dir)?;
|
||||||
|
|
||||||
let python = env::var_os("ARCHIVR_TWEET_PYTHON").unwrap_or_else(|| OsString::from("python3"));
|
let python = env::var_os("ARCHIVR_TWEET_PYTHON").unwrap_or_else(|| OsString::from("python3"));
|
||||||
let scraper_path = env::var_os("ARCHIVR_TWEET_SCRAPER")
|
let scraper_path = env::var_os("ARCHIVR_TWEET_SCRAPER")
|
||||||
|
|
@ -135,37 +135,37 @@ pub fn archive(path: &str, thread: bool, store_path: &Path, timestamp: &str) ->
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
if !root_toml.exists() {
|
if !root_json.exists() {
|
||||||
let stderr = String::from_utf8_lossy(&output.stderr);
|
let stderr = String::from_utf8_lossy(&output.stderr);
|
||||||
let stdout = String::from_utf8_lossy(&output.stdout);
|
let stdout = String::from_utf8_lossy(&output.stdout);
|
||||||
bail!(
|
bail!(
|
||||||
"Tweet scraper completed but did not create expected TOML file: {}\nstdout:\n{}\nstderr:\n{}",
|
"Tweet scraper completed but did not create expected JSON file: {}\nstdout:\n{}\nstderr:\n{}",
|
||||||
root_toml.display(),
|
root_json.display(),
|
||||||
stdout.trim(),
|
stdout.trim(),
|
||||||
stderr.trim()
|
stderr.trim()
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
cleanup_summary(&output_dir)?;
|
cleanup_summary(&output_dir)?;
|
||||||
let after = tweet_toml_files(&output_dir)?;
|
let after = tweet_json_files(&output_dir)?;
|
||||||
let new_tomls = new_tweet_tomls(&before, &after);
|
let new_jsons = new_tweet_jsons(&before, &after);
|
||||||
rewrite_tweet_outputs(&new_tomls, &output_dir, &temp_dir, store_path)?;
|
rewrite_tweet_outputs(&new_jsons, &output_dir, &temp_dir, store_path)?;
|
||||||
let _ = fs::remove_dir_all(store_path.join("temp").join(timestamp));
|
let _ = fs::remove_dir_all(store_path.join("temp").join(timestamp));
|
||||||
|
|
||||||
Ok(true)
|
Ok(true)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Removes the `scraping_summary.toml` file left by the scraper, if present.
|
/// Removes the `scraping_summary.json` file left by the scraper, if present.
|
||||||
fn cleanup_summary(output_dir: &Path) -> Result<()> {
|
fn cleanup_summary(output_dir: &Path) -> Result<()> {
|
||||||
let summary_path = output_dir.join("scraping_summary.toml");
|
let summary_path = output_dir.join("scraping_summary.json");
|
||||||
if summary_path.exists() {
|
if summary_path.exists() {
|
||||||
fs::remove_file(summary_path)?;
|
fs::remove_file(summary_path)?;
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns the set of `tweet-*.toml` files present in `output_dir`.
|
/// Returns the set of `tweet-*.json` files present in `output_dir`.
|
||||||
fn tweet_toml_files(output_dir: &Path) -> Result<HashSet<PathBuf>> {
|
fn tweet_json_files(output_dir: &Path) -> Result<HashSet<PathBuf>> {
|
||||||
let mut files = HashSet::new();
|
let mut files = HashSet::new();
|
||||||
|
|
||||||
for entry in fs::read_dir(output_dir)? {
|
for entry in fs::read_dir(output_dir)? {
|
||||||
|
|
@ -176,7 +176,7 @@ fn tweet_toml_files(output_dir: &Path) -> Result<HashSet<PathBuf>> {
|
||||||
&& path
|
&& path
|
||||||
.file_name()
|
.file_name()
|
||||||
.and_then(|name| name.to_str())
|
.and_then(|name| name.to_str())
|
||||||
.is_some_and(|name| name.starts_with("tweet-") && name.ends_with(".toml"))
|
.is_some_and(|name| name.starts_with("tweet-") && name.ends_with(".json"))
|
||||||
{
|
{
|
||||||
files.insert(path);
|
files.insert(path);
|
||||||
}
|
}
|
||||||
|
|
@ -185,38 +185,38 @@ fn tweet_toml_files(output_dir: &Path) -> Result<HashSet<PathBuf>> {
|
||||||
Ok(files)
|
Ok(files)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns the sorted list of TOML files present in `after` but not in `before`.
|
/// Returns the sorted list of JSON files present in `after` but not in `before`.
|
||||||
fn new_tweet_tomls(before: &HashSet<PathBuf>, after: &HashSet<PathBuf>) -> Vec<PathBuf> {
|
fn new_tweet_jsons(before: &HashSet<PathBuf>, after: &HashSet<PathBuf>) -> Vec<PathBuf> {
|
||||||
let mut files = after.difference(before).cloned().collect::<Vec<_>>();
|
let mut files = after.difference(before).cloned().collect::<Vec<_>>();
|
||||||
files.sort();
|
files.sort();
|
||||||
files
|
files
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns a lazily-compiled regex matching `avatar_local_path = "..."` in TOML.
|
/// Returns a lazily-compiled regex matching `"avatar_local_path": "..."` in JSON.
|
||||||
fn avatar_regex() -> &'static Regex {
|
fn avatar_regex() -> &'static Regex {
|
||||||
static REGEX: OnceLock<Regex> = OnceLock::new();
|
static REGEX: OnceLock<Regex> = OnceLock::new();
|
||||||
REGEX.get_or_init(|| Regex::new(r#"avatar_local_path = "([^"\n]+)""#).unwrap())
|
REGEX.get_or_init(|| Regex::new(r#""avatar_local_path": "([^"\n]+)""#).unwrap())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns a lazily-compiled regex matching `local_path = "..."` in TOML.
|
/// Returns a lazily-compiled regex matching `"local_path": "..."` in JSON.
|
||||||
fn media_regex() -> &'static Regex {
|
fn media_regex() -> &'static Regex {
|
||||||
static REGEX: OnceLock<Regex> = OnceLock::new();
|
static REGEX: OnceLock<Regex> = OnceLock::new();
|
||||||
REGEX.get_or_init(|| Regex::new(r#"(?m)\blocal_path = "([^"\n]+)""#).unwrap())
|
REGEX.get_or_init(|| Regex::new(r#"(?m)"local_path": "([^"\n]+)""#).unwrap())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Rewrites asset paths in each newly-created TOML file, moving assets into
|
/// Rewrites asset paths in each newly-created JSON file, moving assets into
|
||||||
/// the content-addressed store. Files are written back only if content changed.
|
/// the content-addressed store. Files are written back only if content changed.
|
||||||
fn rewrite_tweet_outputs(
|
fn rewrite_tweet_outputs(
|
||||||
tweet_tomls: &[PathBuf],
|
tweet_jsons: &[PathBuf],
|
||||||
output_dir: &Path,
|
output_dir: &Path,
|
||||||
temp_dir: &Path,
|
temp_dir: &Path,
|
||||||
store_path: &Path,
|
store_path: &Path,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
let mut archived_assets = HashMap::new();
|
let mut archived_assets = HashMap::new();
|
||||||
|
|
||||||
for path in tweet_tomls {
|
for path in tweet_jsons {
|
||||||
let contents = fs::read_to_string(path)?;
|
let contents = fs::read_to_string(path)?;
|
||||||
let rewritten = rewrite_toml_asset_paths(
|
let rewritten = rewrite_json_asset_paths(
|
||||||
&contents,
|
&contents,
|
||||||
output_dir,
|
output_dir,
|
||||||
temp_dir,
|
temp_dir,
|
||||||
|
|
@ -234,9 +234,9 @@ fn rewrite_tweet_outputs(
|
||||||
|
|
||||||
/// Rewrites all `avatar_local_path` and `local_path` references in `contents`,
|
/// Rewrites all `avatar_local_path` and `local_path` references in `contents`,
|
||||||
/// archiving each referenced file into the raw store and returning the updated
|
/// archiving each referenced file into the raw store and returning the updated
|
||||||
/// TOML string. `archived_assets` is a cache to avoid re-archiving the same
|
/// JSON string. `archived_assets` is a cache to avoid re-archiving the same
|
||||||
/// file when it is referenced by multiple tweets.
|
/// file when it is referenced by multiple tweets.
|
||||||
fn rewrite_toml_asset_paths(
|
fn rewrite_json_asset_paths(
|
||||||
contents: &str,
|
contents: &str,
|
||||||
output_dir: &Path,
|
output_dir: &Path,
|
||||||
temp_dir: &Path,
|
temp_dir: &Path,
|
||||||
|
|
@ -250,8 +250,8 @@ fn rewrite_toml_asset_paths(
|
||||||
let new_path =
|
let new_path =
|
||||||
archive_asset_reference(&old_path, output_dir, store_path, "avatar", archived_assets)?;
|
archive_asset_reference(&old_path, output_dir, store_path, "avatar", archived_assets)?;
|
||||||
rewritten = rewritten.replace(
|
rewritten = rewritten.replace(
|
||||||
&format!(r#"avatar_local_path = "{old_path}""#),
|
&format!(r#""avatar_local_path": "{old_path}""#),
|
||||||
&format!(r#"avatar_local_path = "{new_path}""#),
|
&format!(r#""avatar_local_path": "{new_path}""#),
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -260,8 +260,8 @@ fn rewrite_toml_asset_paths(
|
||||||
let new_path =
|
let new_path =
|
||||||
archive_asset_reference(&old_path, temp_dir, store_path, "media", archived_assets)?;
|
archive_asset_reference(&old_path, temp_dir, store_path, "media", archived_assets)?;
|
||||||
rewritten = rewritten.replace(
|
rewritten = rewritten.replace(
|
||||||
&format!(r#"local_path = "{old_path}""#),
|
&format!(r#""local_path": "{old_path}""#),
|
||||||
&format!(r#"local_path = "{new_path}""#),
|
&format!(r#""local_path": "{new_path}""#),
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -377,19 +377,19 @@ mod tests {
|
||||||
fn test_cleanup_summary_removes_summary_only() {
|
fn test_cleanup_summary_removes_summary_only() {
|
||||||
let output_dir = unique_path("archivr-tweet-summary");
|
let output_dir = unique_path("archivr-tweet-summary");
|
||||||
fs::create_dir_all(&output_dir).unwrap();
|
fs::create_dir_all(&output_dir).unwrap();
|
||||||
fs::write(output_dir.join("scraping_summary.toml"), "summary").unwrap();
|
fs::write(output_dir.join("scraping_summary.json"), "summary").unwrap();
|
||||||
fs::write(output_dir.join("tweet-1.toml"), "tweet").unwrap();
|
fs::write(output_dir.join("tweet-1.json"), "tweet").unwrap();
|
||||||
|
|
||||||
cleanup_summary(&output_dir).unwrap();
|
cleanup_summary(&output_dir).unwrap();
|
||||||
|
|
||||||
assert!(!output_dir.join("scraping_summary.toml").exists());
|
assert!(!output_dir.join("scraping_summary.json").exists());
|
||||||
assert!(output_dir.join("tweet-1.toml").exists());
|
assert!(output_dir.join("tweet-1.json").exists());
|
||||||
|
|
||||||
let _ = fs::remove_dir_all(output_dir);
|
let _ = fs::remove_dir_all(output_dir);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_rewrite_toml_asset_paths_rearchives_assets() {
|
fn test_rewrite_json_asset_paths_rearchives_assets() {
|
||||||
let store_path = unique_path("archivr-tweet-store");
|
let store_path = unique_path("archivr-tweet-store");
|
||||||
let output_dir = store_path.join("raw_tweets");
|
let output_dir = store_path.join("raw_tweets");
|
||||||
let temp_dir = store_path.join("temp").join("ts").join("tweets");
|
let temp_dir = store_path.join("temp").join("ts").join("tweets");
|
||||||
|
|
@ -408,15 +408,12 @@ mod tests {
|
||||||
)
|
)
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
let contents = r#"
|
let contents = r#"{
|
||||||
[entities]
|
"entities": { "media": [{ "local_path": "media/123/media_1.jpg" }] },
|
||||||
media = [{ local_path = "media/123/media_1.jpg" }]
|
"author": { "avatar_local_path": "../temp/ts/tweets/media/avatars/avatar.jpg" }
|
||||||
|
}"#;
|
||||||
|
|
||||||
[author]
|
let rewritten = rewrite_json_asset_paths(
|
||||||
avatar_local_path = "../temp/ts/tweets/media/avatars/avatar.jpg"
|
|
||||||
"#;
|
|
||||||
|
|
||||||
let rewritten = rewrite_toml_asset_paths(
|
|
||||||
contents,
|
contents,
|
||||||
&output_dir,
|
&output_dir,
|
||||||
&temp_dir,
|
&temp_dir,
|
||||||
|
|
@ -425,8 +422,8 @@ avatar_local_path = "../temp/ts/tweets/media/avatars/avatar.jpg"
|
||||||
)
|
)
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
assert!(rewritten.contains(r#"avatar_local_path = "raw/"#));
|
assert!(rewritten.contains(r#""avatar_local_path": "raw/"#));
|
||||||
assert!(rewritten.contains(r#"local_path = "raw/"#));
|
assert!(rewritten.contains(r#""local_path": "raw/"#));
|
||||||
assert!(
|
assert!(
|
||||||
!temp_dir
|
!temp_dir
|
||||||
.join("media")
|
.join("media")
|
||||||
|
|
@ -464,7 +461,7 @@ avatar_local_path = "../temp/ts/tweets/media/avatars/avatar.jpg"
|
||||||
let output_dir = store_path.join("raw_tweets");
|
let output_dir = store_path.join("raw_tweets");
|
||||||
fs::create_dir_all(&output_dir).unwrap();
|
fs::create_dir_all(&output_dir).unwrap();
|
||||||
fs::create_dir_all(store_path.join("temp")).unwrap();
|
fs::create_dir_all(store_path.join("temp")).unwrap();
|
||||||
fs::write(output_dir.join("tweet-123.toml"), "id = \"123\"").unwrap();
|
fs::write(output_dir.join("tweet-123.json"), r#"{"id":"123"}"#).unwrap();
|
||||||
|
|
||||||
let credentials = store_path.join("creds.txt");
|
let credentials = store_path.join("creds.txt");
|
||||||
fs::write(&credentials, "ct0=test;auth_token=test").unwrap();
|
fs::write(&credentials, "ct0=test;auth_token=test").unwrap();
|
||||||
|
|
@ -522,15 +519,13 @@ done
|
||||||
mkdir -p "$output_dir" "$media_dir/avatars" "$media_dir/$tweet_id"
|
mkdir -p "$output_dir" "$media_dir/avatars" "$media_dir/$tweet_id"
|
||||||
printf 'avatar' > "$media_dir/avatars/author.jpg"
|
printf 'avatar' > "$media_dir/avatars/author.jpg"
|
||||||
printf 'media' > "$media_dir/$tweet_id/media_1.jpg"
|
printf 'media' > "$media_dir/$tweet_id/media_1.jpg"
|
||||||
printf 'summary = true\n' > "$output_dir/scraping_summary.toml"
|
printf '{"summary":true}\n' > "$output_dir/scraping_summary.json"
|
||||||
cat > "$output_dir/tweet-$tweet_id.toml" <<EOF
|
cat > "$output_dir/tweet-$tweet_id.json" <<EOF
|
||||||
id = "$tweet_id"
|
{
|
||||||
|
"id": "$tweet_id",
|
||||||
[entities]
|
"entities": { "media": [{ "local_path": "media/$tweet_id/media_1.jpg" }] },
|
||||||
media = [{ local_path = "media/$tweet_id/media_1.jpg" }]
|
"author": { "avatar_local_path": "../temp/ts/tweets/media/avatars/author.jpg" }
|
||||||
|
}
|
||||||
[author]
|
|
||||||
avatar_local_path = "../temp/ts/tweets/media/avatars/author.jpg"
|
|
||||||
EOF
|
EOF
|
||||||
"#,
|
"#,
|
||||||
)
|
)
|
||||||
|
|
@ -546,14 +541,14 @@ EOF
|
||||||
set_test_env("ARCHIVR_TWEET_PYTHON", "/bin/sh");
|
set_test_env("ARCHIVR_TWEET_PYTHON", "/bin/sh");
|
||||||
|
|
||||||
let archived = archive("tweet:123", false, &store_path, "ts").unwrap();
|
let archived = archive("tweet:123", false, &store_path, "ts").unwrap();
|
||||||
let tweet_file = output_dir.join("tweet-123.toml");
|
let tweet_file = output_dir.join("tweet-123.json");
|
||||||
let contents = fs::read_to_string(&tweet_file).unwrap();
|
let contents = fs::read_to_string(&tweet_file).unwrap();
|
||||||
|
|
||||||
assert!(archived);
|
assert!(archived);
|
||||||
assert!(tweet_file.exists());
|
assert!(tweet_file.exists());
|
||||||
assert!(!output_dir.join("scraping_summary.toml").exists());
|
assert!(!output_dir.join("scraping_summary.json").exists());
|
||||||
assert!(contents.contains(r#"avatar_local_path = "raw/"#));
|
assert!(contents.contains(r#""avatar_local_path": "raw/"#));
|
||||||
assert!(contents.contains(r#"local_path = "raw/"#));
|
assert!(contents.contains(r#""local_path": "raw/"#));
|
||||||
assert!(!store_path.join("temp").join("ts").exists());
|
assert!(!store_path.join("temp").join("ts").exists());
|
||||||
|
|
||||||
remove_test_env("ARCHIVR_TWITTER_CREDENTIALS_FILE");
|
remove_test_env("ARCHIVR_TWITTER_CREDENTIALS_FILE");
|
||||||
|
|
|
||||||
1159
vendor/twitter/scrape_user_tweet_contents.py
vendored
1159
vendor/twitter/scrape_user_tweet_contents.py
vendored
File diff suppressed because it is too large
Load diff
Loading…
Add table
Add a link
Reference in a new issue