diff --git a/README.md b/README.md
index 61c143e9..5ded344a 100644
--- a/README.md
+++ b/README.md
@@ -23,39 +23,28 @@ curl -sSL 'https://get.archivebox.io' | sh    # (or see pip/brew/Docker instruct
Without active preservation effort, everything on the internet eventually disappears or degrades. Archive.org does a great job as a free central archive, but they require all archives to be public, and they can't save every type of content.
-*ArchiveBox is an open source tool that helps you archive web content on your own (or privately within an organization): save copies of browser bookmarks, preserve evidence for legal cases, backup photos from FB / Insta / Flickr, download your media from YT / Soundcloud / etc., snapshot research papers & academic citations, and more...*
+*ArchiveBox is an open source tool that helps organizations and individuals archive web content and retain control over their data: save copies of browser bookmarks, preserve evidence for legal cases, backup photos from FB / Insta / Flickr, download your media from YT / Soundcloud / etc., snapshot research papers & academic citations, and more...*
-> *Use ArchiveBox as a [command-line package](#quickstart) and/or [self-hosted web app](#quickstart) on Linux, macOS, or in [Docker](#quickstart).*
+> *Use ArchiveBox on [Linux](#quickstart)/[macOS](#quickstart)/[Windows](#quickstart)/[Docker](#quickstart) as a [CLI tool](#usage), [self-hosted Web App](https://github.com/ArchiveBox/ArchiveBox/wiki/Publishing-Your-Archive), [`pip` library](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#python-shell-usage), or [one-off command](#static-archive-exporting).*
mkdir ~/archivebox; cd ~/archivebox # create a dir somewhere for your archivebox data
+
+# Option A: Get ArchiveBox with Docker Compose (recommended):
+curl -sSL 'https://docker-compose.archivebox.io' > docker-compose.yml # edit options in this file as-needed
+docker compose run archivebox init --setup
+# docker compose run archivebox add 'https://example.com'
+# docker compose run archivebox help
+# docker compose up
+
+
+# Option B: Or use it as a plain Docker container:
+docker run -it -v $PWD:/data archivebox/archivebox init --setup
+# docker run -it -v $PWD:/data archivebox/archivebox add 'https://example.com'
+# docker run -it -v $PWD:/data archivebox/archivebox help
+# docker run -it -v $PWD:/data -p 8000:8000 archivebox/archivebox
+
+
+# Option C: Or install it with your preferred pkg manager (see Quickstart below for apt, brew, and more)
pip install archivebox
-
-# Or use the optional auto setup script to install it
+archivebox init --setup
+# archivebox add 'https://example.com'
+# archivebox help
+# archivebox server 0.0.0.0:8000
+
+
+# Option D: Or use the optional auto setup script to install it
curl -sSL 'https://get.archivebox.io' | sh
-```
+
+Open http://localhost:8000 to see your server's Web UI.
+docker-compose (macOS/Linux/Windows), recommended (click to expand)
Download the docker-compose.yml file into a new empty directory (can be anywhere).
mkdir ~/archivebox && cd ~/archivebox
-curl -O 'https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/dev/docker-compose.yml'
+# Read and edit docker-compose.yml options as-needed after downloading
+curl -sSL 'https://docker-compose.archivebox.io' > docker-compose.yml
docker compose run archivebox init --setup
docker run -v $PWD:/data -p 8000:8000 archivebox/archivebox
# completely optional, CLI can always be used without running a server
# docker run -v $PWD:/data -it [subcommand] [--args]
+docker run -v $PWD:/data -it archivebox/archivebox help
apt (Ubuntu/Debian)

pip (macOS/Linux/BSD): requires pip3 (or pipx).
+pip3 install archivebox
+
+mkdir ~/archivebox && cd ~/archivebox
+archivebox init --setup
+# install any missing extras like wget/git/ripgrep/etc. manually as needed
+
+archivebox server 0.0.0.0:8000
+# completely optional, CLI can always be used without running a server
+# archivebox [subcommand] [--args]
+archivebox help
+
+See the pip-archivebox repo for more details about this distribution.
+apt (Ubuntu/Debian/etc.)
archivebox server 0.0.0.0:8000
# completely optional, CLI can always be used without running a server
# archivebox [subcommand] [--args]
+archivebox help
debian-a
brew (macOS only)
archivebox server 0.0.0.0:8000
# completely optional, CLI can always be used without running a server
# archivebox [subcommand] [--args]
+archivebox help
homebr
pip (macOS/Linux/BSD): requires pip3.
-pip3 install archivebox
-
-mkdir ~/archivebox && cd ~/archivebox
-archivebox init --setup
-# install any missing extras like wget/git/ripgrep/etc. manually as needed
-
-archivebox server 0.0.0.0:8000
-# completely optional, CLI can always be used without running a server
-# archivebox [subcommand] [--args]
-
-pip-archivebox repo for more details about this distribution.
-pacman / pkg / nix (Arch/FreeBSD/NixOS/more)
-docker compose up -d # start the Web UI server in the background
-docker compose run archivebox add 'https://example.com' # add a test URL to snapshot w/ Docker Compose
-
-archivebox list 'https://example.com' # fetch it with pip-installed archivebox on the host
-docker compose run archivebox list 'https://example.com' # or w/ Docker Compose
-docker run -it -v $PWD:/data archivebox/archivebox list 'https://example.com' # or w/ Docker, all equivalent
-
-
-
archivebox init --setup # safe to run init multiple times (also how you update versions)
-archivebox version # get archivebox version info and more
+archivebox version # get archivebox version info + check dependencies
+archivebox help # get list of archivebox subcommands that can be run
archivebox add --depth=1 'https://news.ycombinator.com'
-
# make sure you have `docker-compose.yml` from the Quickstart instructions first
docker compose run archivebox init --setup
docker compose run archivebox version
+docker compose run archivebox help
docker compose run archivebox add --depth=1 'https://news.ycombinator.com'
+# to start webserver: docker compose up
-
docker run -v $PWD:/data -it archivebox/archivebox init --setup
docker run -v $PWD:/data -it archivebox/archivebox version
+docker run -v $PWD:/data -it archivebox/archivebox help
+docker run -v $PWD:/data -it archivebox/archivebox add --depth=1 'https://news.ycombinator.com'
+# to start webserver: docker run -v $PWD:/data -it -p 8000:8000 archivebox/archivebox
+
+
+archivebox shell # explore the Python library API in a REPL
+sqlite3 ./index.sqlite3 # run SQL queries directly on your index
+ls ./archive/*/index.html # or inspect snapshot data directly on the filesystem
+
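# for example, a rough sketch of querying the index directly:
# (the core_snapshot table/column names below are assumptions based on ArchiveBox's Django models;
#  verify with the .tables and .schema dot-commands first)
sqlite3 ./index.sqlite3 '.tables'
sqlite3 ./index.sqlite3 'SELECT timestamp, url, title FROM core_snapshot ORDER BY added DESC LIMIT 5;'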
+
+# Start the server on bare metal (pip/apt/brew/etc):
+archivebox manage createsuperuser # create a new admin user via CLI
+archivebox server 0.0.0.0:8000 # start the server
+
+# Or with Docker Compose:
+nano docker-compose.yml # setup initial ADMIN_USERNAME & ADMIN_PASSWORD
+docker compose up # start the server
+
+# Or with a Docker container:
+docker run -v $PWD:/data -it archivebox/archivebox archivebox manage createsuperuser
+docker run -v $PWD:/data -it -p 8000:8000 archivebox/archivebox
+
+
+Open http://localhost:8000 to see your server's Web UI.
+
+archivebox config --set PUBLIC_ADD_VIEW=True # allow guests to submit URLs
+archivebox config --set PUBLIC_SNAPSHOTS=True # allow guests to see snapshot content
+archivebox config --set PUBLIC_INDEX=True # allow guests to see list of all snapshots
+# or
+docker compose run archivebox config --set ...
+
+# restart the server to apply any config changes
+
+
+archivebox add --depth=1 'https://example.com' # add a URL with pip-installed archivebox on the host
+docker compose run archivebox add --depth=1 'https://example.com' # or w/ Docker Compose
+docker run -it -v $PWD:/data archivebox/archivebox add --depth=1 'https://example.com' # or w/ Docker, all equivalent
./archive/{Snapshot.id}/
- `index.html` & `index.json`: HTML and JSON index files containing metadata and details
- `singlefile.html`: HTML snapshot rendered with headless Chrome using SingleFile
- `example.com/page-name.html`: wget clone of the site, with `warc/TIMESTAMP.gz`
- `output.pdf`: Printed PDF of site using headless Chrome
- `screenshot.png`: 1440x900 screenshot of site using headless Chrome
- `output.html`: DOM dump of the HTML after rendering using headless Chrome
- `article.html`/`article.json`: Article text extraction using Readability & Mercury
- `archive.org.txt`: A link to the saved site on archive.org
- `media/`: all audio/video files + playlists, including subtitles & metadata with youtube-dl (or yt-dlp)
- `git/`: clone of any repository found on GitHub, Bitbucket, or GitLab links

archivebox config                     # view the entire config
archivebox config --get CHROME_BINARY # view a specific value
-
+
archivebox config --set CHROME_BINARY=chromium # persist a config using CLI
# OR
echo CHROME_BINARY=chromium >> ArchiveBox.conf # persist a config using file
# OR
env CHROME_BINARY=chromium archivebox ... # run with a one-off config
-```
+
+These methods also work the same way when run inside Docker; see the Docker Configuration wiki page for details.
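For example, a rough sketch of those same three approaches under Docker (assuming the default compose file mounts `./data` as the data volume; exact paths and flags may differ, so check the wiki page):

```bash
docker compose run archivebox config --set CHROME_BINARY=chromium                      # persist via CLI
echo 'CHROME_BINARY=chromium' >> ./data/ArchiveBox.conf                                # persist via the config file inside the data volume
docker run -it -v $PWD:/data -e CHROME_BINARY=chromium archivebox/archivebox version   # run with a one-off env var
```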
+
# e.g. archivebox config --set TIMEOUT=120
-
+# or docker compose run archivebox config --set TIMEOUT=120
+
TIMEOUT=120 # default: 60 add more seconds on slower networks
CHECK_SSL_VALIDITY=False # default: True False = allow saving URLs w/ bad SSL
SAVE_ARCHIVE_DOT_ORG=False # default: True False = disable Archive.org saving
MAX_MEDIA_SIZE=1500m # default: 750m raise/lower youtubedl output size
-
+
PUBLIC_INDEX=True # default: True whether anon users can view index
PUBLIC_SNAPSHOTS=True # default: True whether anon users can view pages
PUBLIC_ADD_VIEW=False # default: False whether anon users can add new URLs
-
+
CHROME_USER_AGENT="Mozilla/5.0 ..." # change these to get around bot blocking
WGET_USER_AGENT="Mozilla/5.0 ..."
CURL_USER_AGENT="Mozilla/5.0 ..."
-```
-
+
+`archivebox` CLI commands are designed to be run from inside an ArchiveBox data folder, starting with `archivebox init` to initialize a new collection inside an empty directory.
mkdir ~/archivebox && cd ~/archivebox # just an example, can be anywhere
@@ -774,7 +823,7 @@ Each snapshot subfolder ./archive/TIMESTAMP/ includes a static
@@ -783,14 +832,17 @@ You can export the main index to browse it statically as plain HTML files in a f
> *NOTE: These exports are not paginated; exporting many URLs or the entire archive at once may be slow. Use the filtering CLI flags on the `archivebox list` command to export specific Snapshots or ranges.*
-```bash
+```bash
+# do a one-off single-URL archive without needing an initialized data dir
+archivebox oneshot 'https://example.com'
+
# archivebox list --help
archivebox list --html --with-headers > index.html # export to static html table
archivebox list --json --with-headers > index.json # export to json blob
archivebox list --csv=timestamp,url,title > index.csv # export to csv spreadsheet
# (if using Docker Compose, add the -T flag when piping)
-# docker compose run -T archivebox list --html --filter-type=search snozzberries > index.json
+# docker compose run -T archivebox list --html 'https://example.com' > index.html
```
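For example, a rough sketch of the filtering flags mentioned in the note above (see `archivebox list --help`; the exact `--filter-type` choices and the `--before`/`--after` timestamp format may vary by version):

```bash
# export only snapshots whose URL contains a substring, as CSV
archivebox list --filter-type=substring 'example.com' --csv=timestamp,url,title > example-com.csv

# export snapshots added before a given unix timestamp, as JSON
archivebox list --json --with-headers --before=1609459200 > pre-2021.json
```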
The paths in the static exports are relative; make sure to keep them next to your `./archive` folder when backing them up or viewing them.
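For example, a minimal backup sketch that keeps the exports next to the `./archive` folder they link into (plain `rsync`, nothing ArchiveBox-specific; the destination path is a placeholder):

```bash
# copy the static index files alongside the ./archive folder so the relative links keep working
rsync -av ./index.html ./index.json ./index.csv ./archive /mnt/backup/archivebox-export/
```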
@@ -806,8 +858,6 @@ The paths in the static exports are relative, make sure to keep them next to you
----
-
@@ -823,7 +873,7 @@ If you're importing pages with private content or URLs containing secret tokens
-Click to expand...
+Expand to learn about privacy, permissions, and user accounts...
```bash
@@ -838,6 +888,7 @@ archivebox config --set SAVE_ARCHIVE_DOT_ORG=False # disable saving all URLs in
archivebox config --set PUBLIC_INDEX=False
archivebox config --set PUBLIC_SNAPSHOTS=False
archivebox config --set PUBLIC_ADD_VIEW=False
+archivebox manage createsuperuser
# if extra paranoid or anti-Google:
archivebox config --set SAVE_FAVICON=False # disable favicon fetching (it calls a Google API passing the URL's domain part only)
@@ -867,7 +918,7 @@ Be aware that malicious archived JS can access the contents of other pages in yo
-Click to expand...
+Expand to see risks and mitigations...
```bash
@@ -903,7 +954,7 @@ For various reasons, many large sites (Reddit, Twitter, Cloudflare, etc.) active
-Click to expand...
+Click to learn how to set up user agents, cookies, and site logins...
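For example, a rough sketch of the relevant config options (`COOKIES_FILE` and `CHROME_USER_DATA_DIR` are ArchiveBox config names, but treat the exact keys and paths as assumptions to verify against the wiki):

```bash
# spoof browser user agents for the extractors that support them
archivebox config --set CHROME_USER_AGENT="Mozilla/5.0 ..."
archivebox config --set WGET_USER_AGENT="Mozilla/5.0 ..."

# reuse cookies / a logged-in browser profile for sites that require auth
archivebox config --set COOKIES_FILE=/path/to/cookies.txt
archivebox config --set CHROME_USER_DATA_DIR=/path/to/chrome/profile
```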
@@ -926,7 +977,7 @@ ArchiveBox appends a hash with the current date `https://example.com#2020-10-24`
-Click to expand...
+Click to learn how the `Re-Snapshot` feature works...
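In practice that looks something like the sketch below (the date hash is just a convention to make the URL unique so a fresh Snapshot is created):

```bash
# re-snapshot a URL that is already in the archive by appending a date hash
archivebox add 'https://example.com#2020-10-24'
```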
@@ -954,12 +1005,11 @@ Improved support for saving multiple snapshots of a single URL without this hash
### Storage Requirements
-Because ArchiveBox is designed to ingest a large volume of URLs with multiple copies of each URL stored by different 3rd-party tools, it can be quite disk-space intensive.
-There also also some special requirements when using filesystems like NFS/SMB/FUSE.
+Because ArchiveBox is designed to ingest a large volume of URLs with multiple copies of each URL stored by different 3rd-party tools, it can be quite disk-space intensive. There are also some special requirements when using filesystems like NFS/SMB/FUSE.
-Click to expand...
+Click to learn more about ArchiveBox's filesystem and hosting requirements...
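A quick way to gauge this on an existing collection (plain coreutils, not ArchiveBox-specific; `sort -h` needs GNU coreutils or a recent BSD):

```bash
# total size of all snapshot output vs. the SQLite index
du -sh ./archive ./index.sqlite3

# ten largest snapshot folders
du -sh ./archive/* | sort -rh | head -n 10
```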
@@ -1030,10 +1080,6 @@ If using Docker or NFS/SMB/FUSE for the `data/archive/` folder, you may need to
EIN: 81-2908499
)