diff --git a/.dockerignore b/.dockerignore index 08408d22..27ad7a81 100644 --- a/.dockerignore +++ b/.dockerignore @@ -28,4 +28,5 @@ assets/ docker/ data/ +data*/ output/ diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml index ff0edb0f..d3fbf26a 100644 --- a/.github/FUNDING.yml +++ b/.github/FUNDING.yml @@ -1,3 +1,3 @@ github: pirate patreon: theSquashSH -custom: ["https://twitter.com/ArchiveBoxApp", "https://paypal.me/NicholasSweeting", "https://www.blockchain.com/eth/address/0x5D4c34D4a121Fe08d1dDB7969F07550f2dB9f471", "https://www.blockchain.com/btc/address/1HuxXriPE2Bbnag3jJrqa3bkNHrs297dYH"] +custom: ["https://hcb.hackclub.com/donations/start/archivebox", "https://paypal.me/NicholasSweeting"] diff --git a/.github/ISSUE_TEMPLATE/documentation_change.md b/.github/ISSUE_TEMPLATE/documentation_change.md index a02e9374..99b8775f 100644 --- a/.github/ISSUE_TEMPLATE/documentation_change.md +++ b/.github/ISSUE_TEMPLATE/documentation_change.md @@ -6,6 +6,7 @@ labels: '' assignees: '' --- + ## Wiki Page URL diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 00000000..269438fa --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,12 @@ +# To get started with Dependabot version updates, you'll need to specify which +# package ecosystems to update and where the package manifests are located. 
+# Please see the documentation for all configuration options: +# https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file + +version: 2 +updates: + - package-ecosystem: "pip" # See documentation for possible values + directory: "/" + target-branch: "dev" + schedule: + interval: "weekly" diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml new file mode 100644 index 00000000..a6d4e276 --- /dev/null +++ b/.github/workflows/codeql.yml @@ -0,0 +1,92 @@ +# For most projects, this workflow file will not need changing; you simply need +# to commit it to your repository. +# +# You may wish to alter this file to override the set of languages analyzed, +# or to provide custom queries or build logic. +# +# ******** NOTE ******** +# We have attempted to detect the languages in your repository. Please check +# the `language` matrix defined below to confirm you have the correct set of +# supported CodeQL languages. +# +name: "CodeQL" + +on: + push: + branches: [ "dev" ] + pull_request: + branches: [ "dev" ] + schedule: + - cron: '33 17 * * 6' + +jobs: + analyze: + name: Analyze (${{ matrix.language }}) + # Runner size impacts CodeQL analysis time. To learn more, please see: + # - https://gh.io/recommended-hardware-resources-for-running-codeql + # - https://gh.io/supported-runners-and-hardware-resources + # - https://gh.io/using-larger-runners (GitHub.com only) + # Consider using larger runners or machines with greater resources for possible analysis time improvements. 
+ runs-on: ${{ (matrix.language == 'swift' && 'macos-latest') || 'ubuntu-latest' }} + timeout-minutes: ${{ (matrix.language == 'swift' && 120) || 360 }} + permissions: + # required for all workflows + security-events: write + + # required to fetch internal or private CodeQL packs + packages: read + + # only required for workflows in private repositories + actions: read + contents: read + + strategy: + fail-fast: false + matrix: + include: + - language: python + build-mode: none + # CodeQL supports the following values keywords for 'language': 'c-cpp', 'csharp', 'go', 'java-kotlin', 'javascript-typescript', 'python', 'ruby', 'swift' + # Use `c-cpp` to analyze code written in C, C++ or both + # Use 'java-kotlin' to analyze code written in Java, Kotlin or both + # Use 'javascript-typescript' to analyze code written in JavaScript, TypeScript or both + # To learn more about changing the languages that are analyzed or customizing the build mode for your analysis, + # see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/customizing-your-advanced-setup-for-code-scanning. + # If you are analyzing a compiled language, you can modify the 'build-mode' for that language to customize how + # your codebase is analyzed, see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/codeql-code-scanning-for-compiled-languages + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + # Initializes the CodeQL tools for scanning. + - name: Initialize CodeQL + uses: github/codeql-action/init@v3 + with: + languages: ${{ matrix.language }} + build-mode: ${{ matrix.build-mode }} + # If you wish to specify custom queries, you can do so here or in a config file. + # By default, queries listed here will override any specified in a config file. + # Prefix the list here with "+" to use these queries and those in the config file. 
+ + # For more details on CodeQL's query packs, refer to: https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs + # queries: security-extended,security-and-quality + + # If the analyze step fails for one of the languages you are analyzing with + # "We were unable to automatically build your code", modify the matrix above + # to set the build mode to "manual" for that language. Then modify this step + # to build your code. + # ℹ️ Command-line programs to run using the OS shell. + # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun + - if: matrix.build-mode == 'manual' + run: | + echo 'If you are using a "manual" build mode for one or more of the' \ + 'languages you are analyzing, replace this with the commands to build' \ + 'your code, for example:' + echo ' make bootstrap' + echo ' make release' + exit 1 + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v3 + with: + category: "/language:${{matrix.language}}" diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 75c7658c..871f0260 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -81,6 +81,13 @@ jobs: - name: Image digest run: echo ${{ steps.docker_build.outputs.digest }} + + - name: Update README + uses: peter-evans/dockerhub-description@v4 + with: + username: ${{ secrets.DOCKER_USERNAME }} + password: ${{ secrets.DOCKER_PASSWORD }} + repository: archivebox/archivebox # This ugly bit is necessary if you don't want your cache to grow forever # until it hits GitHub's limit of 5GB. 
diff --git a/.gitignore b/.gitignore index 22cad1c0..27d833f0 100644 --- a/.gitignore +++ b/.gitignore @@ -12,6 +12,10 @@ venv/ .docker-venv/ node_modules/ +# Ignore dev lockfiles (should always be built fresh) +requirements-dev.txt +pdm.dev.lock + # Packaging artifacts .pdm-python .pdm-build @@ -25,6 +29,7 @@ data/ data1/ data2/ data3/ +data*/ output/ # vim diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 7224eee9..d90ccf6c 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -30,5 +30,4 @@ formats: # See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html python: install: - - requirements: requirements.txt - - requirements: docs/requirements.txt \ No newline at end of file + - requirements: docs/requirements.txt diff --git a/Dockerfile b/Dockerfile index 454effe8..fb6f302c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -10,7 +10,7 @@ # docker run -v "$PWD/data":/data -p 8000:8000 archivebox server # Multi-arch build: # docker buildx create --use -# docker buildx build . --platform=linux/amd64,linux/arm64,linux/arm/v7 --push -t archivebox/archivebox:latest -t archivebox/archivebox:dev +# docker buildx build . --platform=linux/amd64,linux/arm64 --push -t archivebox/archivebox:latest -t archivebox/archivebox:dev # # Read more about [developing Archivebox](https://github.com/ArchiveBox/ArchiveBox#archivebox-development). 
@@ -20,10 +20,24 @@ FROM python:3.11-slim-bookworm LABEL name="archivebox" \ maintainer="Nick Sweeting " \ - description="All-in-one personal internet archiving container" \ + description="All-in-one self-hosted internet archiving solution" \ homepage="https://github.com/ArchiveBox/ArchiveBox" \ - documentation="https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#docker" - + documentation="https://github.com/ArchiveBox/ArchiveBox/wiki/Docker" \ + org.opencontainers.image.title="ArchiveBox" \ + org.opencontainers.image.vendor="ArchiveBox" \ + org.opencontainers.image.description="All-in-one self-hosted internet archiving solution" \ + org.opencontainers.image.source="https://github.com/ArchiveBox/ArchiveBox" \ + com.docker.image.source.entrypoint="Dockerfile" \ + # TODO: release ArchiveBox as a Docker Desktop extension (requires these labels): + # https://docs.docker.com/desktop/extensions-sdk/architecture/metadata/ + com.docker.desktop.extension.api.version=">= 1.4.7" \ + com.docker.desktop.extension.icon="https://archivebox.io/icon.png" \ + com.docker.extension.publisher-url="https://archivebox.io" \ + com.docker.extension.screenshots='[{"alt": "Screenshot of Admin UI", "url": "https://github.com/ArchiveBox/ArchiveBox/assets/511499/e8e0b6f8-8fdf-4b7f-8124-c10d8699bdb2"}]' \ + com.docker.extension.detailed-description='See here for detailed documentation: https://wiki.archivebox.io' \ + com.docker.extension.changelog='See here for release notes: https://github.com/ArchiveBox/ArchiveBox/releases' \ + com.docker.extension.categories='database,utility-tools' + ARG TARGETPLATFORM ARG TARGETOS ARG TARGETARCH @@ -194,10 +208,12 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T && playwright install --with-deps chromium \ && export CHROME_BINARY="$(python -c 'from playwright.sync_api import sync_playwright; print(sync_playwright().start().chromium.executable_path)')"; \ else \ - # fall back to installing Chromium via apt-get on platforms 
not supported by playwright (e.g. risc, ARMv7, etc.) - apt-get install -qq -y -t bookworm-backports --no-install-recommends \ - chromium \ - && export CHROME_BINARY="$(which chromium)"; \ + # fall back to installing Chromium via apt-get on platforms not supported by playwright (e.g. risc, ARMv7, etc.) + # apt-get install -qq -y -t bookworm-backports --no-install-recommends \ + # chromium \ + # && export CHROME_BINARY="$(which chromium)"; \ + echo 'armv7 no longer supported in versions after v0.7.3' \ + && exit 1; \ fi \ && rm -rf /var/lib/apt/lists/* \ && ln -s "$CHROME_BINARY" /usr/bin/chromium-browser \ @@ -266,9 +282,15 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T # Setup ArchiveBox runtime config WORKDIR "$DATA_DIR" -ENV IN_DOCKER=True +ENV IN_DOCKER=True \ + DISPLAY=novnc:0.0 \ + CUSTOM_TEMPLATES_DIR=/data/templates \ + CHROME_USER_DATA_DIR=/data/personas/Default/chromium \ + GOOGLE_API_KEY=no \ + GOOGLE_DEFAULT_CLIENT_ID=no \ + GOOGLE_DEFAULT_CLIENT_SECRET=no \ + ALLOWED_HOSTS=* ## No need to set explicitly, these values will be autodetected by archivebox in docker: - # CHROME_SANDBOX=False \ # WGET_BINARY="wget" \ # YOUTUBEDL_BINARY="yt-dlp" \ # CHROME_BINARY="/usr/bin/chromium-browser" \ @@ -293,9 +315,8 @@ WORKDIR "$DATA_DIR" VOLUME "$DATA_DIR" EXPOSE 8000 -# Optional: -# HEALTHCHECK --interval=30s --timeout=20s --retries=15 \ -# CMD curl --silent 'http://localhost:8000/admin/login/' || exit 1 +HEALTHCHECK --interval=30s --timeout=20s --retries=15 \ + CMD curl --silent 'http://localhost:8000/health/' | grep -q 'OK' ENTRYPOINT ["dumb-init", "--", "/app/bin/docker_entrypoint.sh"] CMD ["archivebox", "server", "--quick-init", "0.0.0.0:8000"] diff --git a/README.md b/README.md index eae35350..3b2372a2 100644 --- a/README.md +++ b/README.md @@ -10,54 +10,103 @@ -     +     -
+
+
**ArchiveBox is a powerful, self-hosted internet archiving solution to collect, save, and view websites offline.** -Without active preservation effort, everything on the internet eventually dissapears or degrades. Archive.org does a great job as a free central archive, but they require all archives to be public, and they can't save every type of content. +Without active preservation effort, everything on the internet eventually disappears or degrades. Archive.org does a great job as a centralized service, but saved URLs have to be public, and they can't save every type of content. -*ArchiveBox is an open source tool that helps you archive web content on your own (or privately within an organization): save copies of browser bookmarks, preserve evidence for legal cases, backup photos from FB / Insta / Flickr, download your media from YT / Soundcloud / etc., snapshot research papers & academic citations, and more...* +*ArchiveBox is an open source tool that lets organizations & individuals archive both public & private web content while retaining control over their data. It can be used to save copies of bookmarks, preserve evidence for legal cases, backup photos from FB/Insta/Flickr or media from YT/Soundcloud/etc., save research papers, and more...* +
-> ➡️ *Use ArchiveBox as a [command-line package](#quickstart) and/or [self-hosted web app](#quickstart) on Linux, macOS, or in [Docker](#quickstart).* +> ➡️ Get ArchiveBox with `pip install archivebox` on [Linux](#quickstart), [macOS](#quickstart), and [Windows](#quickstart) (WSL2), or via **[Docker](#quickstart)** ⭐️. +*Once installed, it can be used as a [CLI tool](#usage), [self-hosted Web App](https://github.com/ArchiveBox/ArchiveBox/wiki/Publishing-Your-Archive), [Python library](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#python-shell-usage), or [one-off command](#static-archive-exporting).* + +

+
-📥 **You can feed ArchiveBox URLs one at a time, or schedule regular imports** from browser bookmarks or history, feeds like RSS, bookmark services like Pocket/Pinboard, and more. See input formats for a full list. +📥 **You can feed ArchiveBox URLs one at a time, or schedule regular imports** from your bookmarks or history, social media feeds or RSS, link-saving services like Pocket/Pinboard, our [Browser Extension](https://github.com/ArchiveBox/archivebox-browser-extension), and more. +See Input Formats for a full list of supported input formats... + +
snapshot detail page -💾 **It saves snapshots of the URLs you feed it in several redundant formats.** -It also detects any content featured *inside* each webpage & extracts it out into a folder: -- `HTML/Generic websites -> HTML, PDF, PNG, WARC, Singlefile` -- `YouTube/SoundCloud/etc. -> MP3/MP4 + subtitles, description, thumbnail` -- `News articles -> article body TXT + title, author, featured images` -- `Github/Gitlab/etc. links -> git cloned source code` -- *[and more...](#output-formats)* +**It saves snapshots of the URLs you feed it in several redundant formats.** +It also detects any content featured *inside* pages & extracts it out into a folder: +- 🌐 **HTML**/**Any websites** ➡️ `original HTML+CSS+JS`, `singlefile HTML`, `screenshot PNG`, `PDF`, `WARC`, `title`, `article text`, `favicon`, `headers`, ... +- 🎥 **Social Media**/**News** ➡️ `post content TXT`, `comments`, `title`, `author`, `images`, ... +- 🎬 **YouTube**/**SoundCloud**/etc. ➡️ `MP3/MP4`s, `subtitles`, `metadata`, `thumbnail`, ... +- 💾 **Github**/**Gitlab**/etc. links ➡️ `clone of GIT source code`, `README`, `images`, ... +- ✨ *and more, see [Output Formats](#output-formats) below...* -It uses normal filesystem folders to organize archives (no complicated proprietary formats), and offers a CLI + web UI. +You can run ArchiveBox as a Docker web app to manage these snapshots, or continue accessing the same collection using the `pip`-installed CLI, Python API, and SQLite3 APIs. +All the ways of using it are equivalent, and provide matching features like adding tags, scheduling regular crawls, viewing logs, and more... ---- +
+
-🏛️ ArchiveBox is used by many *[professionals](https://zulip.archivebox.io/#narrow/stream/167-enterprise/topic/welcome/near/1191102) and [hobbyists](https://zulip.archivebox.io/#narrow/stream/158-development)* who save content off the web, for example: - -- **Individuals:** - `backing up browser bookmarks/history`, `saving FB/Insta/etc. content`, `shopping lists` -- **Journalists:** - `crawling and collecting research`, `preserving quoted material`, `fact-checking and review` -- **Lawyers:** - `evidence collection`, `hashing & integrity verifying`, `search, tagging, & review` -- **Researchers:** - `collecting AI training sets`, `feeding analysis / web crawling pipelines` +🛠️ ArchiveBox uses [standard tools](#dependencies) like Chrome, [`wget`](https://www.gnu.org/software/wget/), & [`yt-dlp`](https://github.com/yt-dlp/yt-dlp), and stores data in [ordinary files & folders](#archive-layout). +*(no complex proprietary formats, all data is readable without needing to run ArchiveBox)* The goal is to sleep soundly knowing the part of the internet you care about will be automatically preserved in durable, easily accessible formats [for decades](#background--motivation) after it goes down. + +
+
+ + +**📦  Install ArchiveBox using your preferred method: `docker` / `pip` / `apt` / etc. ([see full Quickstart below](#quickstart)).** + + +
Expand for quick copy-pastable install commands...   ⤵️ +
+
# Option A: Get ArchiveBox with Docker Compose (recommended):
+mkdir -p ~/archivebox/data && cd ~/archivebox
+curl -fsSL 'https://docker-compose.archivebox.io' > docker-compose.yml   # edit options in this file as-needed
+docker compose run archivebox init --setup
+# docker compose run archivebox add 'https://example.com'
+# docker compose run archivebox help
+# docker compose up
+
+
+# Option B: Or use it as a plain Docker container: +mkdir -p ~/archivebox/data && cd ~/archivebox/data +docker run -it -v $PWD:/data archivebox/archivebox init --setup +# docker run -it -v $PWD:/data archivebox/archivebox add 'https://example.com' +# docker run -it -v $PWD:/data archivebox/archivebox help +# docker run -it -v $PWD:/data -p 8000:8000 archivebox/archivebox +
+
+# Option C: Or install it with your preferred pkg manager (see Quickstart below for apt, brew, and more) +pip install archivebox +mkdir -p ~/archivebox/data && cd ~/archivebox/data +archivebox init --setup +# archivebox add 'https://example.com' +# archivebox help +# archivebox server 0.0.0.0:8000 +
+
+# Option D: Or use the optional auto setup script to install it +curl -fsSL 'https://get.archivebox.io' | sh +
+
+Open http://localhost:8000 to see your server's Web UI ➡️ +
+
+ +


bookshelf graphic   logo   bookshelf graphic @@ -66,38 +115,6 @@ The goal is to sleep soundly knowing the part of the internet you care about wil
. . . . . . . . . . . . . . . . . . . . . . . . . . . .

-
- -
- -**📦  Get ArchiveBox with `docker` / `apt` / `brew` / `pip3` / `nix` / etc. ([see Quickstart below](#quickstart)).** - -```bash -# Get ArchiveBox with Docker or Docker Compose (recommended) -docker run -v $PWD/data:/data -p 8000:8000 -it archivebox/archivebox - -# Or install with your preferred package manager (see Quickstart below for apt, brew, and more) -pip install archivebox - -# Or use the optional auto setup script to install it -curl -sSL 'https://get.archivebox.io' | sh -``` - -**🔢 Example usage: adding links to archive.** -```bash -archivebox add 'https://example.com' # add URLs one at a time -archivebox add < ~/Downloads/bookmarks.json # or pipe in URLs in any text-based format -archivebox schedule --every=day --depth=1 https://example.com/rss.xml # or auto-import URLs regularly on a schedule -``` -**🔢 Example usage: viewing the archived content.** -```bash -archivebox server 0.0.0.0:8000 # use the interactive web UI -archivebox list 'https://example.com' # use the CLI commands (--help for more) -ls ./archive/*/index.json # or browse directly via the filesystem -``` - -
-

cli init screenshot cli init screenshot server snapshot admin screenshot @@ -107,13 +124,13 @@ ls ./archive/*/index.json # or browse directly via the filesyste ## Key Features -- [**Free & open source**](https://github.com/ArchiveBox/ArchiveBox/blob/dev/LICENSE), doesn't require signing up online, stores all data locally -- [**Powerful, intuitive command line interface**](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#CLI-Usage) with [modular optional dependencies](#dependencies) +- [**Free & open source**](https://github.com/ArchiveBox/ArchiveBox/blob/dev/LICENSE), own your own data & maintain your privacy by self-hosting +- [**Powerful command line interface**](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#CLI-Usage) with [modular dependencies](#dependencies) and [support for Google Drive/NFS/SMB/S3/B2/etc.](https://github.com/ArchiveBox/ArchiveBox/wiki/Setting-Up-Storage) - [**Comprehensive documentation**](https://github.com/ArchiveBox/ArchiveBox/wiki), [active development](https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap), and [rich community](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community) - [**Extracts a wide variety of content out-of-the-box**](https://github.com/ArchiveBox/ArchiveBox/issues/51): [media (yt-dlp), articles (readability), code (git), etc.](#output-formats) - [**Supports scheduled/realtime importing**](https://github.com/ArchiveBox/ArchiveBox/wiki/Scheduled-Archiving) from [many types of sources](#input-formats) -- [**Uses standard, durable, long-term formats**](#saves-lots-of-useful-stuff-for-each-imported-link) like HTML, JSON, PDF, PNG, MP4, TXT, and WARC -- [**Usable as a oneshot CLI**](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#CLI-Usage), [**self-hosted web UI**](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#UI-Usage), [Python API](https://docs.archivebox.io/en/latest/modules.html) (BETA), [REST API](https://github.com/ArchiveBox/ArchiveBox/issues/496) (ALPHA), or [desktop 
app](https://github.com/ArchiveBox/electron-archivebox) (ALPHA) +- [**Uses standard, durable, long-term formats**](#output-formats) like HTML, JSON, PDF, PNG, MP4, TXT, and WARC +- [**Usable as a oneshot CLI**](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#CLI-Usage), [**self-hosted web UI**](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#UI-Usage), [Python API](https://docs.archivebox.io/en/latest/modules.html) (BETA), [REST API](https://github.com/ArchiveBox/ArchiveBox/issues/496) (ALPHA), or [desktop app](https://github.com/ArchiveBox/electron-archivebox) - [**Saves all pages to archive.org as well**](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#save_archive_dot_org) by default for redundancy (can be [disabled](https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#stealth-mode) for local-only mode) - Advanced users: support for archiving [content requiring login/paywall/cookies](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#chrome_user_data_dir) (see wiki security caveats!) 
- Planned: support for running [JS during archiving](https://github.com/ArchiveBox/ArchiveBox/issues/51) to adblock, [autoscroll](https://github.com/ArchiveBox/ArchiveBox/issues/80), [modal-hide](https://github.com/ArchiveBox/ArchiveBox/issues/175), [thread-expand](https://github.com/ArchiveBox/ArchiveBox/issues/345) @@ -122,12 +139,22 @@ ls ./archive/*/index.json # or browse directly via the filesyste ## 🤝 Professional Integration -*[Contact us](https://zulip.archivebox.io/#narrow/stream/167-enterprise/topic/welcome/near/1191102) if your institution/org wants to use ArchiveBox professionally.* +ArchiveBox is free for everyone to self-host, but we also provide support, security review, and custom integrations to help NGOs, governments, and other organizations [run ArchiveBox professionally](https://zulip.archivebox.io/#narrow/stream/167-enterprise/topic/welcome/near/1191102): -- setup & support, team permissioning, hashing, audit logging, backups, custom archiving etc. -- for **individuals**, **NGOs**, **academia**, **governments**, **journalism**, **law**, and more... +- **Journalists:** + `crawling during research`, `preserving cited pages`, `fact-checking & review` +- **Lawyers:** + `collecting & preserving evidence`, `detecting changes`, `tagging & review` +- **Researchers:** + `analyzing social media trends`, `getting LLM training data`, `crawling pipelines` +- **Individuals:** + `saving bookmarks`, `preserving portfolio content`, `legacy / memoirs archival` +- **Governments:** + `snapshotting public service sites`, `recordkeeping compliance` -*We are a 501(c)(3) nonprofit and all our work goes towards supporting open-source development.* +> ***[Contact us](https://zulip.archivebox.io/#narrow/stream/167-enterprise/topic/welcome/near/1191102)** if your org wants help using ArchiveBox professionally.* +> We offer: setup & support, hosting, custom features, security, hashing & audit logging/chain-of-custody, etc. 
+> *ArchiveBox has 🏛️ 501(c)(3) [nonprofit status](https://hackclub.com/hcb/) and all our work supports open-source development.*
@@ -136,38 +163,43 @@ ls ./archive/*/index.json # or browse directly via the filesyste grassgrass
+ + # Quickstart -**🖥  Supported OSs:** Linux/BSD, macOS, Windows (Docker)   **👾  CPUs:** `amd64` (`x86_64`), `arm64` (`arm8`), `arm7` (raspi>=3)
-Note: On `arm7` the `playwright` package is not available, so `chromium` must be installed manually if needed. +**🖥  [Supported OSs](https://github.com/ArchiveBox/ArchiveBox/wiki/Install#supported-systems):** Linux/BSD, macOS, Windows (Docker)   **👾  CPUs:** `amd64` (`x86_64`), `arm64`, `arm7` (raspi>=3)

#### ✳️  Easy Setup -
+
Docker docker-compose (macOS/Linux/Windows)   👈  recommended   (click to expand)
-👍 Docker Compose is recommended for the easiest install/update UX + best security + all the extras out-of-the-box. +👍 Docker Compose is recommended for the easiest install/update UX + best security + all extras out-of-the-box.

  1. Install Docker on your system (if not already installed).
  2. Download the docker-compose.yml file into a new empty directory (can be anywhere). -
    mkdir ~/archivebox && cd ~/archivebox
    -curl -O 'https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/dev/docker-compose.yml'
    +
    mkdir -p ~/archivebox/data && cd ~/archivebox
    +# Read and edit docker-compose.yml options as-needed after downloading
    +curl -fsSL 'https://docker-compose.archivebox.io' > docker-compose.yml
     
  3. -
  4. Run the initial setup and create an admin user. +
  5. Run the initial setup to create an admin user (or set ADMIN_USER/PASS in docker-compose.yml)
    docker compose run archivebox init --setup
     
  6. Next steps: Start the server then login to the Web UI http://127.0.0.1:8000 ⇢ Admin.
    docker compose up
     # completely optional, CLI can always be used without running a server
    -# docker compose run [-T] archivebox [subcommand] [--args]
    +# docker compose run [-T] archivebox [subcommand] [--help]
     docker compose run archivebox add 'https://example.com'
    -
  7. +docker compose run archivebox help +
    +For more info, see Install: Docker Compose in the Wiki. ➡️ +
-See below for more usage examples using the CLI, Web UI, or filesystem/SQL/Python to manage your archive. +See below for more usage examples using the CLI, Web UI, or filesystem/SQL/Python to manage your archive.

@@ -177,15 +209,17 @@ See below for more usage examples using the C
  1. Install Docker on your system (if not already installed).
  2. Create a new empty directory and initialize your collection (can be anywhere). -
    mkdir ~/archivebox && cd ~/archivebox
    +
    mkdir -p ~/archivebox/data && cd ~/archivebox/data
     docker run -v $PWD:/data -it archivebox/archivebox init --setup
     
  3. Optional: Start the server then login to the Web UI http://127.0.0.1:8000 ⇢ Admin.
    docker run -v $PWD:/data -p 8000:8000 archivebox/archivebox
     # completely optional, CLI can always be used without running a server
    -# docker run -v $PWD:/data -it [subcommand] [--args]
    +# docker run -v $PWD:/data -it [subcommand] [--help]
    +docker run -v $PWD:/data -it archivebox/archivebox help
     
    +For more info, see Install: Docker Compose in the Wiki. ➡️
@@ -199,7 +233,8 @@ See below for more usage examples using the C
  1. Install Docker on your system (optional, highly recommended but not required).
  2. Run the automatic setup script. -
    curl -sSL 'https://get.archivebox.io' | sh
    +
    curl -fsSL 'https://get.archivebox.io' | sh
    +For more info, see Install: Bare Metal in the Wiki. ➡️
@@ -214,8 +249,47 @@ See "Against curl | sh as a #### 🛠  Package Manager Setup + +
-aptitude apt (Ubuntu/Debian) +Pip pip (macOS/Linux/BSD) +
+
    + +
  1. Install Python >= v3.10 and Node >= v18 on your system (if not already installed).
  2. +
  3. Install the ArchiveBox package using pip3 (or pipx). +
    pip3 install --upgrade archivebox yt-dlp playwright
    +playwright install --with-deps chromium
    +archivebox version
    +# install any missing extras shown using apt/brew/pkg/etc. see Wiki for instructions
    +#    python@3.10 node curl wget git ripgrep ...
    +
    +See the Install: Bare Metal Wiki for full install instructions for each OS... +
  4. +
  5. Create a new empty directory and initialize your collection (can be anywhere). +
    mkdir -p ~/archivebox/data && cd ~/archivebox/data   # for example
+archivebox init --setup   # initialize a new collection
+# (--setup auto-installs and links JS dependencies: singlefile, readability, mercury, etc.)
    +
    +
  6. +
  7. Optional: Start the server then login to the Web UI http://127.0.0.1:8000 ⇢ Admin. +
    archivebox server 0.0.0.0:8000
    +# completely optional, CLI can always be used without running a server
    +# archivebox [subcommand] [--help]
    +archivebox help
    +
    +
  8. +
+ +See below for more usage examples using the CLI, Web UI, or filesystem/SQL/Python to manage your archive.
+
+See the pip-archivebox repo for more details about this distribution. +

+
+ + +
+aptitude apt (Ubuntu/Debian/etc.)
  1. Add the ArchiveBox repository to your sources.
    @@ -226,82 +300,63 @@ sudo apt update
  2. Install the ArchiveBox package using apt.
    sudo apt install archivebox
    -sudo python3 -m pip install --upgrade --ignore-installed archivebox   # pip needed because apt only provides a broken older version of Django
    +# update to newest version with pip (sometimes apt package is outdated)
    +pip install --upgrade --ignore-installed archivebox yt-dlp playwright
    +playwright install --with-deps chromium    # install chromium and its system dependencies
    +archivebox version                         # make sure all dependencies are installed
     
  3. Create a new empty directory and initialize your collection (can be anywhere). -
    mkdir ~/archivebox && cd ~/archivebox
    -archivebox init --setup           # if any problems, install with pip instead
    +
    mkdir -p ~/archivebox/data && cd ~/archivebox/data
    +archivebox init --setup
     
    -Note: If you encounter issues with NPM/NodeJS, install a more recent version.

    +Note: If you encounter issues or want more granular instructions, see the Install: Bare Metal Wiki.

  4. Optional: Start the server then login to the Web UI http://127.0.0.1:8000 ⇢ Admin.
    archivebox server 0.0.0.0:8000
     # completely optional, CLI can always be used without running a server
    -# archivebox [subcommand] [--args]
    +# archivebox [subcommand] [--help]
    +archivebox help
     
See below for more usage examples using the CLI, Web UI, or filesystem/SQL/Python to manage your archive.
-See the debian-archivebox repo for more details about this distribution. +See the debian-archivebox repo for more details about this distribution.

-homebrew brew (macOS) +homebrew brew (macOS only)
  1. Install Homebrew on your system (if not already installed).
  2. Install the ArchiveBox package using brew.
    brew tap archivebox/archivebox
     brew install archivebox
    +# update to newest version with pip (sometimes brew package is outdated)
    +pip install --upgrade --ignore-installed archivebox yt-dlp playwright
    +playwright install --with-deps chromium    # install chromium and its system dependencies
    +archivebox version                         # make sure all dependencies are installed
     
    +See the Install: Bare Metal Wiki for more granular instructions for macOS... ➡️
  3. Create a new empty directory and initialize your collection (can be anywhere). -
    mkdir ~/archivebox && cd ~/archivebox
    -archivebox init --setup         # if any problems, install with pip instead
    -
    -
  4. -
  5. Optional: Start the server then login to the Web UI http://127.0.0.1:8000 ⇢ Admin. -
    archivebox server 0.0.0.0:8000
    -# completely optional, CLI can always be used without running a server
    -# archivebox [subcommand] [--args]
    -
    -
  6. -
- -See below for more usage examples using the CLI, Web UI, or filesystem/SQL/Python to manage your archive.
-See the homebrew-archivebox repo for more details about this distribution. -

-
- -
-Pip pip (macOS/Linux/BSD) -
-
    - -
  1. Install Python >= v3.9 and Node >= v18 on your system (if not already installed).
  2. -
  3. Install the ArchiveBox package using pip3. -
    pip3 install archivebox
    -
    -
  4. -
  5. Create a new empty directory and initialize your collection (can be anywhere). -
    mkdir ~/archivebox && cd ~/archivebox
    +
    mkdir -p ~/archivebox/data && cd ~/archivebox/data
     archivebox init --setup
    -# install any missing extras like wget/git/ripgrep/etc. manually as needed
     
  6. Optional: Start the server then login to the Web UI http://127.0.0.1:8000 ⇢ Admin.
    archivebox server 0.0.0.0:8000
     # completely optional, CLI can always be used without running a server
    -# archivebox [subcommand] [--args]
    -
    +# archivebox [subcommand] [--help] +archivebox help +

See below for more usage examples using the CLI, Web UI, or filesystem/SQL/Python to manage your archive.
-See the pip-archivebox repo for more details about this distribution. +See the homebrew-archivebox repo for more details about this distribution.

@@ -309,12 +364,11 @@ See the pip-archive Arch pacman / FreeBSD pkg / Nix nix (Arch/FreeBSD/NixOS/more)
-> [!WARNING] -> *These are contributed by external volunteers and may lag behind the official `pip` channel.* +> *Warning: These are contributed by external volunteers and may lag behind the official `pip` channel.*
-Self-hosting PlatformsTrueNAS / YunoHost / Cloudron / UNRAID / etc. (self-hosting solutions) +Self-hosting Platforms TrueNAS / UNRAID / YunoHost / Cloudron / etc. (self-hosting solutions)
-> [!WARNING] -> *These are contributed by external volunteers and may lag behind the official `pip` channel.* +> *Warning: These are contributed by external volunteers and may lag behind the official `pip` channel.*