Merge branch 'dev' into specific-version-banner

This commit is contained in:
Nick Sweeting 2024-01-19 04:01:32 -08:00 committed by GitHub
commit d0cd84a2af
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
18 changed files with 293 additions and 1052 deletions

View file

@ -35,7 +35,7 @@ jobs:
cache: true cache: true
- name: Install dependencies - name: Install dependencies
run: pdm install --fail-fast --no-lock --group :all --no-self run: pdm install --fail-fast --no-lock --dev --group=':all' --no-self
- name: Build package - name: Build package
run: | run: |

View file

@ -167,7 +167,6 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T
curl wget git yt-dlp ffmpeg ripgrep \ curl wget git yt-dlp ffmpeg ripgrep \
# Packages we have also needed in the past: # Packages we have also needed in the past:
# youtube-dl wget2 aria2 python3-pyxattr rtmpdump libfribidi-bin mpv \ # youtube-dl wget2 aria2 python3-pyxattr rtmpdump libfribidi-bin mpv \
# fontconfig fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-kacst fonts-symbola fonts-noto fonts-freefont-ttf \
&& rm -rf /var/lib/apt/lists/* \ && rm -rf /var/lib/apt/lists/* \
# Save version info # Save version info
&& ( \ && ( \
@ -183,6 +182,11 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT --mount=type=cache,target=/root/.cache/pip,sharing=locked,id=pip-$TARGETARCH$TARGETVARIANT --mount=type=cache,target=/root/.cache/ms-playwright,sharing=locked,id=browsers-$TARGETARCH$TARGETVARIANT \ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT --mount=type=cache,target=/root/.cache/pip,sharing=locked,id=pip-$TARGETARCH$TARGETVARIANT --mount=type=cache,target=/root/.cache/ms-playwright,sharing=locked,id=browsers-$TARGETARCH$TARGETVARIANT \
echo "[+] Installing Browser binary dependencies to $PLAYWRIGHT_BROWSERS_PATH..." \ echo "[+] Installing Browser binary dependencies to $PLAYWRIGHT_BROWSERS_PATH..." \
&& apt-get update -qq \ && apt-get update -qq \
&& apt-get install -qq -y -t bookworm-backports --no-install-recommends \
fontconfig fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-khmeros fonts-kacst fonts-symbola fonts-noto fonts-freefont-ttf \
# chrome can run without dbus/upower technically, it complains about missing dbus but should run ok anyway
# libxss1 dbus dbus-x11 upower \
# && service dbus start \
&& if [[ "$TARGETPLATFORM" == *amd64* || "$TARGETPLATFORM" == *arm64* ]]; then \ && if [[ "$TARGETPLATFORM" == *amd64* || "$TARGETPLATFORM" == *arm64* ]]; then \
# install Chromium using playwright # install Chromium using playwright
pip install playwright \ pip install playwright \
@ -192,7 +196,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T
else \ else \
# fall back to installing Chromium via apt-get on platforms not supported by playwright (e.g. risc, ARMv7, etc.) # fall back to installing Chromium via apt-get on platforms not supported by playwright (e.g. risc, ARMv7, etc.)
apt-get install -qq -y -t bookworm-backports --no-install-recommends \ apt-get install -qq -y -t bookworm-backports --no-install-recommends \
chromium fontconfig fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-kacst fonts-symbola fonts-noto fonts-freefont-ttf \ chromium \
&& export CHROME_BINARY="$(which chromium)"; \ && export CHROME_BINARY="$(which chromium)"; \
fi \ fi \
&& rm -rf /var/lib/apt/lists/* \ && rm -rf /var/lib/apt/lists/* \

171
README.md
View file

@ -1,27 +1,16 @@
<div align="center"> <div align="center" style="text-align: center; width: 100%">
<em><img src="https://archivebox.io/icon.png" height="90px"></em> <img src="https://archivebox.io/icon.png" height="90px"/>
<h1>ArchiveBox<br/><sub>Open-source self-hosted web archiving.</sub></h1> <h1>ArchiveBox<br/><sub>Open-source self-hosted web archiving.</sub></h1>
<br/> <br/>
▶️ <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart">Quickstart</a> | ▶️ <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart">Quickstart</a> | <a href="https://demo.archivebox.io">Demo</a> | <a href="https://github.com/ArchiveBox/ArchiveBox">GitHub</a> | <a href="https://github.com/ArchiveBox/ArchiveBox/wiki">Documentation</a> | <a href="#background--motivation">Info & Motivation</a> | <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community">Community</a>
<a href="https://demo.archivebox.io">Demo</a> |
<a href="https://github.com/ArchiveBox/ArchiveBox">GitHub</a> |
<a href="https://github.com/ArchiveBox/ArchiveBox/wiki">Documentation</a> |
<a href="#background--motivation">Info & Motivation</a> |
<a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community">Community</a>
<br/> <br/>
<!--<a href="http://webchat.freenode.net?channels=ArchiveBox&uio=d4"><img src="https://img.shields.io/badge/Community_chat-IRC-%2328A745.svg"/></a>--> <!--<a href="http://webchat.freenode.net?channels=ArchiveBox&uio=d4"><img src="https://img.shields.io/badge/Community_chat-IRC-%2328A745.svg"/></a>-->
<a href="https://github.com/ArchiveBox/ArchiveBox/blob/dev/LICENSE"><img src="https://img.shields.io/badge/Open_source-MIT-green.svg?logo=git&logoColor=green"/></a> <a href="https://github.com/ArchiveBox/ArchiveBox/blob/dev/LICENSE"><img src="https://img.shields.io/badge/Open_source-MIT-green.svg?logo=git&logoColor=green"/></a> <a href="https://github.com/ArchiveBox/ArchiveBox/commits/dev"><img src="https://img.shields.io/github/last-commit/ArchiveBox/ArchiveBox.svg?logo=Sublime+Text&logoColor=green&label=Active"/></a> &nbsp; <a href="https://github.com/ArchiveBox/ArchiveBox"><img src="https://img.shields.io/github/stars/ArchiveBox/ArchiveBox.svg?logo=github&label=Stars&logoColor=blue"/></a> &nbsp; <a href="https://pypi.org/project/archivebox/"><img src="https://img.shields.io/pypi/dm/archivebox?label=PyPI%20Installs&labelColor=orange&color=yellow"/></a> <a href="https://chromewebstore.google.com/detail/archivebox-exporter/habonpimjphpdnmcfkaockjnffodikoj"><img src="https://img.shields.io/chrome-web-store/users/habonpimjphpdnmcfkaockjnffodikoj?label=Chrome%20Web%20Store&color=%231973e8"/></a> <a href="https://hub.docker.com/r/archivebox/archivebox"><img src="https://img.shields.io/docker/pulls/archivebox/archivebox.svg?label=Docker+Pulls"/></a>
<a href="https://github.com/ArchiveBox/ArchiveBox"><img src="https://img.shields.io/github/stars/ArchiveBox/ArchiveBox.svg?logo=github&label=Stars&logoColor=blue"/></a>
<a href="https://github.com/ArchiveBox/ArchiveBox/commits/dev"><img src="https://img.shields.io/github/last-commit/ArchiveBox/ArchiveBox.svg?logo=Sublime+Text&logoColor=green&label=Active"/></a> &nbsp;
<a href="https://pypi.org/project/archivebox/"><img src="https://img.shields.io/badge/Python-yellow.svg?logo=python&logoColor=yellow"/></a>
<a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Install#dependencies"><img src="https://img.shields.io/badge/Chromium-orange.svg?logo=Google+Chrome&logoColor=orange"/></a>
<a href="https://hub.docker.com/r/archivebox/archivebox"><img src="https://img.shields.io/badge/Docker-lightblue.svg?logo=docker&logoColor=lightblue"/></a>
<!--<pre lang="bash" align="left"><code style="white-space: pre-line; text-align: left" align="left"> <!--<pre lang="bash" align="left"><code style="white-space: pre-line; text-align: left" align="left">
curl -sSL 'https://get.archivebox.io' | sh # (or see pip/brew/Docker instructions below) curl -sSL 'https://get.archivebox.io' | sh # (or see pip/brew/Docker instructions below)
@ -42,7 +31,7 @@ Without active preservation effort, everything on the internet eventually dissap
📥 **You can feed ArchiveBox URLs one at a time, or schedule regular imports** from browser bookmarks or history, feeds like RSS, bookmark services like Pocket/Pinboard, and more. See <a href="#input-formats">input formats</a> for a full list. 📥 **You can feed ArchiveBox URLs one at a time, or schedule regular imports** from browser bookmarks or history, feeds like RSS, bookmark services like Pocket/Pinboard, and more. See <a href="#input-formats">input formats</a> for a full list.
<img src="https://github.com/ArchiveBox/ArchiveBox/assets/511499/90f1ce3c-75bb-401d-88ed-6297694b76ae" alt="snapshot detail page" align="right" width="190px"/> <img src="https://github.com/ArchiveBox/ArchiveBox/assets/511499/90f1ce3c-75bb-401d-88ed-6297694b76ae" alt="snapshot detail page" align="right" width="190px" style="float: right"/>
💾 **It saves snapshots of the URLs you feed it in several redundant formats.** 💾 **It saves snapshots of the URLs you feed it in several redundant formats.**
It also detects any content featured *inside* each webpage & extracts it out into a folder: It also detects any content featured *inside* each webpage & extracts it out into a folder:
@ -69,7 +58,7 @@ It uses normal filesystem folders to organize archives (no complicated proprieta
The goal is to sleep soundly knowing the part of the internet you care about will be automatically preserved in durable, easily accessible formats [for decades](#background--motivation) after it goes down. The goal is to sleep soundly knowing the part of the internet you care about will be automatically preserved in durable, easily accessible formats [for decades](#background--motivation) after it goes down.
<div align="center"> <div align="center" style="text-align: center">
<br/><br/> <br/><br/>
<img src="https://github.com/ArchiveBox/ArchiveBox/assets/511499/5a7d95f2-6977-4de6-9f08-42851a1fe1d2" height="70px" alt="bookshelf graphic"> &nbsp; <img src="https://github.com/ArchiveBox/ArchiveBox/assets/511499/b2765a33-0d1e-4019-a1db-920c7e00e20e" height="75px" alt="logo" align="top"/> &nbsp; <img src="https://github.com/ArchiveBox/ArchiveBox/assets/511499/5a7d95f2-6977-4de6-9f08-42851a1fe1d2" height="70px" alt="bookshelf graphic"> <img src="https://github.com/ArchiveBox/ArchiveBox/assets/511499/5a7d95f2-6977-4de6-9f08-42851a1fe1d2" height="70px" alt="bookshelf graphic"> &nbsp; <img src="https://github.com/ArchiveBox/ArchiveBox/assets/511499/b2765a33-0d1e-4019-a1db-920c7e00e20e" height="75px" alt="logo" align="top"/> &nbsp; <img src="https://github.com/ArchiveBox/ArchiveBox/assets/511499/5a7d95f2-6977-4de6-9f08-42851a1fe1d2" height="70px" alt="bookshelf graphic">
<br/><br/> <br/><br/>
@ -85,10 +74,10 @@ The goal is to sleep soundly knowing the part of the internet you care about wil
```bash ```bash
# Get ArchiveBox with Docker or Docker Compose (recommended) # Get ArchiveBox with Docker or Docker Compose (recommended)
docker run -v $PWD/data:/data -it archivebox/archivebox:dev init --setup docker run -v $PWD/data:/data -p 8000:8000 -it archivebox/archivebox
# Or install with your preferred package manager (see Quickstart below for apt, brew, and more) # Or install with your preferred package manager (see Quickstart below for apt, brew, and more)
pip3 install archivebox pip install archivebox
# Or use the optional auto setup script to install it # Or use the optional auto setup script to install it
curl -sSL 'https://get.archivebox.io' | sh curl -sSL 'https://get.archivebox.io' | sh
@ -107,7 +96,7 @@ archivebox list 'https://example.com' # use the CLI commands (--help for mor
ls ./archive/*/index.json # or browse directly via the filesystem ls ./archive/*/index.json # or browse directly via the filesystem
``` ```
<div align="center"> <div align="center" style="text-align: center">
<br/><br/> <br/><br/>
<img src="https://github.com/ArchiveBox/ArchiveBox/assets/511499/8d67382c-e0ce-4286-89f7-7915f09b930c" width="22%" alt="cli init screenshot" align="top"> <img src="https://github.com/ArchiveBox/ArchiveBox/assets/511499/8d67382c-e0ce-4286-89f7-7915f09b930c" width="22%" alt="cli init screenshot" align="top">
<img src="https://github.com/ArchiveBox/ArchiveBox/assets/511499/dad2bc51-e7e5-484e-bb26-f956ed692d16" width="22%" alt="cli init screenshot" align="top"> <img src="https://github.com/ArchiveBox/ArchiveBox/assets/511499/dad2bc51-e7e5-484e-bb26-f956ed692d16" width="22%" alt="cli init screenshot" align="top">
@ -143,7 +132,7 @@ ls ./archive/*/index.json # or browse directly via the filesyste
<br/> <br/>
<div align="center"> <div align="center" style="text-align: center">
<br/> <br/>
<img src="https://github.com/ArchiveBox/ArchiveBox/assets/511499/0db52ea7-4a2c-441d-b47f-5553a5d8fe96" width="49%" alt="grass"/><img src="https://github.com/ArchiveBox/ArchiveBox/assets/511499/0db52ea7-4a2c-441d-b47f-5553a5d8fe96" width="49%" alt="grass"/> <img src="https://github.com/ArchiveBox/ArchiveBox/assets/511499/0db52ea7-4a2c-441d-b47f-5553a5d8fe96" width="49%" alt="grass"/><img src="https://github.com/ArchiveBox/ArchiveBox/assets/511499/0db52ea7-4a2c-441d-b47f-5553a5d8fe96" width="49%" alt="grass"/>
</div> </div>
@ -327,6 +316,7 @@ See the <a href="https://github.com/ArchiveBox/pip-archivebox"><code>pip-archive
<li>Arch: <a href="https://aur.archlinux.org/packages/archivebox/"><code>yay -S archivebox</code></a> (contributed by <a href="https://github.com/imlonghao"><code>@imlonghao</code></a>)</li> <li>Arch: <a href="https://aur.archlinux.org/packages/archivebox/"><code>yay -S archivebox</code></a> (contributed by <a href="https://github.com/imlonghao"><code>@imlonghao</code></a>)</li>
<li>FreeBSD: <a href="https://github.com/ArchiveBox/ArchiveBox#%EF%B8%8F-easy-setup"><code>curl -sSL 'https://get.archivebox.io' | sh</code></a> (uses <code>pkg</code> + <code>pip3</code> under-the-hood)</li> <li>FreeBSD: <a href="https://github.com/ArchiveBox/ArchiveBox#%EF%B8%8F-easy-setup"><code>curl -sSL 'https://get.archivebox.io' | sh</code></a> (uses <code>pkg</code> + <code>pip3</code> under-the-hood)</li>
<li>Nix: <a href="https://github.com/NixOS/nixpkgs/blob/master/pkgs/applications/misc/archivebox/default.nix"><code>nix-env --install archivebox</code></a> (contributed by <a href="https://github.com/siraben"><code>@siraben</code></a>)</li> <li>Nix: <a href="https://github.com/NixOS/nixpkgs/blob/master/pkgs/applications/misc/archivebox/default.nix"><code>nix-env --install archivebox</code></a> (contributed by <a href="https://github.com/siraben"><code>@siraben</code></a>)</li>
<li>Guix: <a href="https://packages.guix.gnu.org/packages/archivebox/"><code>guix install archivebox</code></a> (contributed by <a href="https://github.com/rakino"><code>@rakino</code></a>)</li>
<li>More: <a href="https://github.com/ArchiveBox/ArchiveBox/issues/new"><i>contribute another distribution...!</i></a></li> <li>More: <a href="https://github.com/ArchiveBox/ArchiveBox/issues/new"><i>contribute another distribution...!</i></a></li>
</ul> </ul>
See <a href="#%EF%B8%8F-cli-usage">below</a> for usage examples using the CLI, Web UI, or filesystem/SQL/Python to manage your archive. See <a href="#%EF%B8%8F-cli-usage">below</a> for usage examples using the CLI, Web UI, or filesystem/SQL/Python to manage your archive.
@ -356,6 +346,27 @@ See <a href="#%EF%B8%8F-cli-usage">below</a> for usage examples using the CLI, W
<br/> <br/>
</details> </details>
<details>
<summary><img src="https://github.com/ArchiveBox/ArchiveBox/assets/511499/0c46e949-00fe-49c8-a613-ee14501c014c" alt="Self-hosting Platforms" height="28px" align="top"/><b>TrueNAS / YunoHost / Cloudron / UNRAID / etc.</b> (self-hosting solutions)</summary>
<br/>
> [!WARNING]
> *These are contributed by external volunteers and may lag behind the official `pip` channel.*
<ul>
<li><a href="https://dev.to/finloop/setting-up-archivebox-on-truenas-scale-1788">TrueNAS</a></li>
<li><a href="https://unraid.net/community/apps?q=archivebox#r">UnRaid</a></li>
<li><a href="https://github.com/YunoHost-Apps/archivebox_ynh">YunoHost</a></li>
<li><a href="https://www.cloudron.io/store/io.archivebox.cloudronapp.html">Cloudron</a></li>
<li><a href="https://github.com/ArchiveBox/ArchiveBox/pull/922/files#diff-00f0606e18b2618c3cc1667ca7c2b703b537af690ca71eba1330633587dcb1ee">AppImage</a></li>
<li><a href="https://github.com/ArchiveBox/ArchiveBox/issues/986">Umbrel</a> (need contributors...)</li>
<li>More: <a href="https://github.com/ArchiveBox/ArchiveBox/issues/new"><i>contribute another distribution...!</i></a></li>
</ul>
See <a href="#%EF%B8%8F-cli-usage">below</a> for usage examples using the CLI, Web UI, or filesystem/SQL/Python to manage your archive.
<br/><br/>
</details>
<details> <details>
<summary><img src="https://user-images.githubusercontent.com/511499/117448723-1663b180-af0d-11eb-837f-d43959227810.png" alt="paid" height="27px" align="top"/> Paid hosting solutions (cloud VPS)</summary> <summary><img src="https://user-images.githubusercontent.com/511499/117448723-1663b180-af0d-11eb-837f-d43959227810.png" alt="paid" height="27px" align="top"/> Paid hosting solutions (cloud VPS)</summary>
<br/> <br/>
@ -423,7 +434,7 @@ archivebox help
#### 🖥&nbsp; Web UI Usage #### 🖥&nbsp; Web UI Usage
```bash ```bash
archivebox manage createsuperuser # set an admin password archivebox manage createsuperuser # create admin user via CLI (or use ADMIN_PASSWORD env variable)
archivebox server 0.0.0.0:8000 # open http://127.0.0.1:8000 to view it archivebox server 0.0.0.0:8000 # open http://127.0.0.1:8000 to view it
# you can also configure whether or not login is required for most features # you can also configure whether or not login is required for most features
@ -441,12 +452,12 @@ ls ./archive/*/index.html # or inspect snapshots on the filesystem
``` ```
<br/> <br/>
<div align="center"> <div align="center" style="text-align: center">
<img src="https://github.com/ArchiveBox/ArchiveBox/assets/511499/65f82532-18dd-49c5-86f1-02b1f3100e1e" width="49%" alt="grass"/><img src="https://github.com/ArchiveBox/ArchiveBox/assets/511499/65f82532-18dd-49c5-86f1-02b1f3100e1e" width="49%" alt="grass"/> <img src="https://github.com/ArchiveBox/ArchiveBox/assets/511499/65f82532-18dd-49c5-86f1-02b1f3100e1e" width="49%" alt="grass"/><img src="https://github.com/ArchiveBox/ArchiveBox/assets/511499/65f82532-18dd-49c5-86f1-02b1f3100e1e" width="49%" alt="grass"/>
</div> </div>
<br/> <br/>
<div align="center"> <div align="center" style="text-align: center">
<sub>. . . . . . . . . . . . . . . . . . . . . . . . . . . .</sub> <sub>. . . . . . . . . . . . . . . . . . . . . . . . . . . .</sub>
<br/><br/> <br/><br/>
<a href="https://demo.archivebox.io">DEMO: <code>https://demo.archivebox.io</code></a><br/> <a href="https://demo.archivebox.io">DEMO: <code>https://demo.archivebox.io</code></a><br/>
@ -458,7 +469,7 @@ ls ./archive/*/index.html # or inspect snapshots on the filesystem
--- ---
<div align="center"> <div align="center" style="text-align: center">
<img src="https://github.com/ArchiveBox/ArchiveBox/assets/511499/ac1f897a-8baa-4f8b-8ee8-7443611f258b" width="96%" alt="lego"> <img src="https://github.com/ArchiveBox/ArchiveBox/assets/511499/ac1f897a-8baa-4f8b-8ee8-7443611f258b" width="96%" alt="lego">
</div> </div>
@ -476,9 +487,9 @@ ArchiveBox supports many input formats for URLs, including Pocket & Pinboard exp
- <img src="https://github.com/ArchiveBox/ArchiveBox/assets/511499/64078483-21d7-4eb1-aa6e-9ad55afe45b8" height="22px"/> TXT, RSS, XML, JSON, CSV, SQL, HTML, Markdown, or [any other text-based format...](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#Import-a-list-of-URLs-from-a-text-file) - <img src="https://github.com/ArchiveBox/ArchiveBox/assets/511499/64078483-21d7-4eb1-aa6e-9ad55afe45b8" height="22px"/> TXT, RSS, XML, JSON, CSV, SQL, HTML, Markdown, or [any other text-based format...](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#Import-a-list-of-URLs-from-a-text-file)
- <img src="https://github.com/ArchiveBox/ArchiveBox/assets/511499/32b494e6-4de1-4984-8d88-dc02f18e5c34" height="22px"/> [Browser history](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive) or [browser bookmarks](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive) (see instructions for: [Chrome](https://support.google.com/chrome/answer/96816?hl=en), [Firefox](https://support.mozilla.org/en-US/kb/export-firefox-bookmarks-to-backup-or-transfer), [Safari](https://github.com/ArchiveBox/ArchiveBox/assets/511499/24ad068e-0fa6-41f4-a7ff-4c26fc91f71a), [IE](https://support.microsoft.com/en-us/help/211089/how-to-import-and-export-the-internet-explorer-favorites-folder-to-a-32-bit-version-of-windows), [Opera](https://help.opera.com/en/latest/features/#bookmarks:~:text=Click%20the%20import/-,export%20button,-on%20the%20bottom), [and more...](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive)) - <img src="https://github.com/ArchiveBox/ArchiveBox/assets/511499/32b494e6-4de1-4984-8d88-dc02f18e5c34" height="22px"/> [Browser history](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive) or [browser bookmarks](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive) (see instructions for: [Chrome](https://support.google.com/chrome/answer/96816?hl=en), [Firefox](https://support.mozilla.org/en-US/kb/export-firefox-bookmarks-to-backup-or-transfer), [Safari](https://github.com/ArchiveBox/ArchiveBox/assets/511499/24ad068e-0fa6-41f4-a7ff-4c26fc91f71a), [IE](https://support.microsoft.com/en-us/help/211089/how-to-import-and-export-the-internet-explorer-favorites-folder-to-a-32-bit-version-of-windows), [Opera](https://help.opera.com/en/latest/features/#bookmarks:~:text=Click%20the%20import/-,export%20button,-on%20the%20bottom), [and 
more...](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive))
- <img src="https://github.com/ArchiveBox/ArchiveBox/assets/511499/ff20d251-5347-4b85-ae9b-83037d0ac01e" height="22px"/> Browser extension [`archivebox-exporter`](https://github.com/tjhorner/archivebox-exporter) (realtime archiving from Chrome/Chromium/Firefox) - <img src="https://github.com/ArchiveBox/ArchiveBox/assets/511499/ff20d251-5347-4b85-ae9b-83037d0ac01e" height="22px"/> Browser extension [`archivebox-exporter`](https://github.com/tjhorner/archivebox-exporter) (realtime archiving from Chrome/Chromium/Firefox)
- <img src="https://getpocket.com/favicon.ico" height="22px"/> [Pocket](https://getpocket.com/export), [Pinboard](https://pinboard.in/export/), [Instapaper](https://www.instapaper.com/user), [Shaarli](https://shaarli.readthedocs.io/en/master/Usage/#importexport), [Delicious](https://www.groovypost.com/howto/howto/export-delicious-bookmarks-xml/), [Reddit Saved](https://github.com/csu/export-saved-reddit), [Wallabag](https://doc.wallabag.org/en/user/import/wallabagv2.html), [Unmark.it](http://help.unmark.it/import-export), [OneTab](https://www.addictivetips.com/web/onetab-save-close-all-chrome-tabs-to-restore-export-or-import/), [and more...](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive) - <img src="https://getpocket.com/favicon.ico" height="22px"/> [Pocket](https://getpocket.com/export), [Pinboard](https://pinboard.in/export/), [Instapaper](https://www.instapaper.com/user), [Shaarli](https://shaarli.readthedocs.io/en/master/Usage/#importexport), [Delicious](https://www.groovypost.com/howto/howto/export-delicious-bookmarks-xml/), [Reddit Saved](https://github.com/csu/export-saved-reddit), [Wallabag](https://doc.wallabag.org/en/user/import/wallabagv2.html), [Unmark.it](http://help.unmark.it/import-export), [OneTab](https://www.addictivetips.com/web/onetab-save-close-all-chrome-tabs-to-restore-export-or-import/), [Firefox Sync](https://github.com/ArchiveBox/ArchiveBox/issues/648), [and more...](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive)
<img src="https://github.com/ArchiveBox/ArchiveBox/assets/511499/e1e5bd78-b0b6-45dc-914c-e1046fee4bc4" width="330px" align="right"> <img src="https://github.com/ArchiveBox/ArchiveBox/assets/511499/e1e5bd78-b0b6-45dc-914c-e1046fee4bc4" width="330px" align="right" style="float: right"/>
```bash ```bash
@ -505,14 +516,14 @@ It also includes a built-in scheduled import feature with `archivebox schedule`
Inside each Snapshot folder, ArchiveBox saves these different types of extractor outputs as plain files: Inside each Snapshot folder, ArchiveBox saves these different types of extractor outputs as plain files:
<img src="https://github.com/ArchiveBox/ArchiveBox/assets/511499/ace0954a-ddac-4520-9d18-1c77b1ec50b2" width="330px" align="right"> <img src="https://github.com/ArchiveBox/ArchiveBox/assets/511499/ace0954a-ddac-4520-9d18-1c77b1ec50b2" width="330px" align="right" style="float: right"/>
`./archive/<timestamp>/*` `./archive/TIMESTAMP/*`
- **Index:** `index.html` & `index.json` HTML and JSON index files containing metadata and details - **Index:** `index.html` & `index.json` HTML and JSON index files containing metadata and details
- **Title**, **Favicon**, **Headers** Response headers, site favicon, and parsed site title - **Title**, **Favicon**, **Headers** Response headers, site favicon, and parsed site title
- **SingleFile:** `singlefile.html` HTML snapshot rendered with headless Chrome using SingleFile - **SingleFile:** `singlefile.html` HTML snapshot rendered with headless Chrome using SingleFile
- **Wget Clone:** `example.com/page-name.html` wget clone of the site with `warc/<timestamp>.gz` - **Wget Clone:** `example.com/page-name.html` wget clone of the site with `warc/TIMESTAMP.gz`
- Chrome Headless - Chrome Headless
- **PDF:** `output.pdf` Printed PDF of site using headless chrome - **PDF:** `output.pdf` Printed PDF of site using headless chrome
- **Screenshot:** `screenshot.png` 1440x900 screenshot of site using headless chrome - **Screenshot:** `screenshot.png` 1440x900 screenshot of site using headless chrome
@ -529,7 +540,7 @@ It does everything out-of-the-box by default, but you can disable or tweak [indi
## Configuration ## Configuration
<img src="https://github.com/ArchiveBox/ArchiveBox/assets/511499/ea672e6b-4df5-49d8-b550-7f450951fd27" width="330px" align="right"> <img src="https://github.com/ArchiveBox/ArchiveBox/assets/511499/ea672e6b-4df5-49d8-b550-7f450951fd27" width="330px" align="right" style="float: right"/>
ArchiveBox can be configured via environment variables, by using the `archivebox config` CLI, or by editing `./ArchiveBox.conf` directly. ArchiveBox can be configured via environment variables, by using the `archivebox config` CLI, or by editing `./ArchiveBox.conf` directly.
@ -579,12 +590,11 @@ To achieve high-fidelity archives in as many situations as possible, ArchiveBox
<details> <details>
<summary><i>Expand to learn more about ArchiveBox's dependencies...</i></summary><br/> <summary><i>Expand to learn more about ArchiveBox's dependencies...</i></summary><br/>
> *TIP: For better security, easier updating, and to avoid polluting your host system with extra dependencies,* > *TIP: For better security, easier updating, and to avoid polluting your host system with extra dependencies, **it is strongly recommended to use the [⭐️ official Docker image](https://github.com/ArchiveBox/ArchiveBox/wiki/Docker)** with everything pre-installed for the best experience.*
> ***it is strongly recommended to use the [⭐️ official Docker image](https://github.com/ArchiveBox/ArchiveBox/wiki/Docker)** with everything pre-installed for the best experience.*
These optional dependencies used for archiving sites include: These optional dependencies used for archiving sites include:
<img src="https://github.com/ArchiveBox/ArchiveBox/assets/511499/62a02155-05d7-4f3e-8de5-75a50a145c4f" alt="archivebox --version CLI output screenshot showing dependencies installed" width="330px" align="right"> <img src="https://github.com/ArchiveBox/ArchiveBox/assets/511499/62a02155-05d7-4f3e-8de5-75a50a145c4f" alt="archivebox --version CLI output screenshot showing dependencies installed" width="330px" align="right" style="float: right"/>
- `chromium` / `chrome` (for screenshots, PDF, DOM HTML, and headless JS scripts) - `chromium` / `chrome` (for screenshots, PDF, DOM HTML, and headless JS scripts)
@ -630,24 +640,20 @@ Data folders can be created anywhere (`~/archivebox` or `$PWD/data` as seen in o
<br/> <br/>
<details> <details>
<summary><i>Expand to learn more about the layout of ArchiveBox's data on-disk...</i></summary> <summary><i>Expand to learn more about the layout of ArchiveBox's data on-disk...</i></summary><br/>
<br/>
All `archivebox` CLI commands are designed to be run from inside an ArchiveBox data folder, starting with `archivebox init` to initialize a new collection inside an empty directory. All <code>archivebox</code> CLI commands are designed to be run from inside an ArchiveBox data folder, starting with <code>archivebox init</code> to initialize a new collection inside an empty directory.
```bash <pre lang="bash"><code style="white-space: pre-line">mkdir ~/archivebox && cd ~/archivebox # just an example, can be anywhere
mkdir ~/archivebox && cd ~/archivebox # just an example, can be anywhere archivebox init</code></pre>
archivebox init
```
The on-disk layout is optimized to be easy to browse by hand and durable long-term. The main index is a standard `index.sqlite3` database in the root of the data folder (it can also be [exported as static JSON/HTML](https://github.com/ArchiveBox/ArchiveBox/wiki/Publishing-Your-Archive#2-export-and-host-it-as-static-html)), and the archive snapshots are organized by date-added timestamp in the `./archive/` subfolder. The on-disk layout is optimized to be easy to browse by hand and durable long-term. The main index is a standard <code>index.sqlite3</code> database in the root of the data folder (it can also be <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Publishing-Your-Archive#2-export-and-host-it-as-static-html">exported as static JSON/HTML</a>), and the archive snapshots are organized by date-added timestamp in the <code>./archive/</code> subfolder.
<img src="https://user-images.githubusercontent.com/511499/117453293-c7b91600-af12-11eb-8a3f-aa48b0f9da3c.png" width="400px" align="right"> <img src="https://user-images.githubusercontent.com/511499/117453293-c7b91600-af12-11eb-8a3f-aa48b0f9da3c.png" width="400px" align="right" style="float: right"/>
```bash <pre lang="bash"><code style="white-space: pre-line">/data/
/data/
index.sqlite3 index.sqlite3
ArchiveBox.conf ArchiveBox.conf
archive/ archive/
@ -660,18 +666,18 @@ The on-disk layout is optimized to be easy to browse by hand and durable long-te
warc/1617687755.warc.gz warc/1617687755.warc.gz
git/somerepo.git git/somerepo.git
... ...
``` </code></pre>
Each snapshot subfolder `./archive/<timestamp>/` includes a static `index.json` and `index.html` describing its contents, and the snapshot extractor outputs are plain files within the folder. Each snapshot subfolder <code>./archive/TIMESTAMP/</code> includes a static <code>index.json</code> and <code>index.html</code> describing its contents, and the snapshot extractor outputs are plain files within the folder.
#### Learn More
- https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#Disk-Layout
- https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#large-archives
- https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#output-folder
- https://github.com/ArchiveBox/ArchiveBox/wiki/Publishing-Your-Archive
- https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives
<h4>Learn More</h4>
<ul>
<li>https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#Disk-Layout</li>
<li>https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#large-archives</li>
<li>https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#output-folder</li>
<li>https://github.com/ArchiveBox/ArchiveBox/wiki/Publishing-Your-Archive</li>
<li>https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives</li>
</ul>
</details> </details>
<br/> <br/>
@ -683,12 +689,10 @@ You can export the main index to browse it statically as plain HTML files in a f
<br/> <br/>
<details> <details>
<summary><i>Expand to learn how to export your ArchiveBox collection...</i></summary> <summary><i>Expand to learn how to export your ArchiveBox collection...</i></summary><br/>
<br/>
> *NOTE: These exports are not paginated, exporting many URLs or the entire archive at once may be slow.* > *NOTE: These exports are not paginated, exporting many URLs or the entire archive at once may be slow. Use the filtering CLI flags on the `archivebox list` command to export specific Snapshots or ranges.*
> *Use the filtering CLI flags on the `archivebox list` command to export specific Snapshots or ranges.*
```bash ```bash
# archivebox list --help # archivebox list --help
@ -715,7 +719,7 @@ The paths in the static exports are relative, make sure to keep them next to you
--- ---
<div align="center"> <div align="center" style="text-align: center">
<img src="https://docs.monadical.com/uploads/upload_b6900afc422ae699bfefa2dcda3306f3.png" width="100%" alt="security graphic"/> <img src="https://docs.monadical.com/uploads/upload_b6900afc422ae699bfefa2dcda3306f3.png" width="100%" alt="security graphic"/>
</div> </div>
@ -942,7 +946,7 @@ If using Docker or NFS/SMB/FUSE for the `data/archive/` folder, you may need to
<br/> <br/>
<div align="center"> <div align="center" style="text-align: center">
<img src="https://github.com/ArchiveBox/ArchiveBox/assets/511499/ca85432e-a2df-40c6-968f-51a1ef99b24e" width="100%" alt="paisley graphic"> <img src="https://github.com/ArchiveBox/ArchiveBox/assets/511499/ca85432e-a2df-40c6-968f-51a1ef99b24e" width="100%" alt="paisley graphic">
</div> </div>
@ -962,7 +966,7 @@ Vast treasure troves of knowledge are lost every day on the internet to link rot
Whether it's to resist censorship by saving articles before they get taken down or edited, or just to save a collection of early 2010's flash games you love to play, having the tools to archive internet content enables you to save the stuff you care most about before it disappears. Whether it's to resist censorship by saving articles before they get taken down or edited, or just to save a collection of early 2010's flash games you love to play, having the tools to archive internet content enables you to save the stuff you care most about before it disappears.
<div align="center"> <div align="center" style="text-align: center">
<img src="https://github.com/ArchiveBox/ArchiveBox/assets/511499/71e36bc5-1c94-44e2-92b6-405fa898c734" width="40%"/><br/> <img src="https://github.com/ArchiveBox/ArchiveBox/assets/511499/71e36bc5-1c94-44e2-92b6-405fa898c734" width="40%"/><br/>
<sup><i>Image from <a href="https://perma.cc/">Perma.cc</a>...</i><br/></sup> <sup><i>Image from <a href="https://perma.cc/">Perma.cc</a>...</i><br/></sup>
</div> </div>
@ -980,30 +984,29 @@ ArchiveBox archives the sites in **several different formats** beyond what publi
## Comparison to Other Projects ## Comparison to Other Projects
<img src="https://github.com/ArchiveBox/ArchiveBox/assets/511499/4cac62a9-e8fb-425b-85a3-ca644aa6dd42" width="5%" align="right" alt="comparison"/> <img src="https://github.com/ArchiveBox/ArchiveBox/assets/511499/4cac62a9-e8fb-425b-85a3-ca644aa6dd42" width="5%" align="right" alt="comparison" style="float: right"/>
> [!TIP] > **Check out our [community wiki](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community) for a list of web archiving tools and orgs.**
> **Check out our [community page](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community) for an index of web archiving initiatives and projects.**
A variety of open and closed-source archiving projects exist, but few provide a nice UI and CLI to manage a large, high-fidelity archive collection over time. A variety of open and closed-source archiving projects exist, but few provide a nice UI and CLI to manage a large, high-fidelity archive collection over time.
ArchiveBox tries to be a robust, set-and-forget archiving solution suitable for archiving RSS feeds, bookmarks, or your entire browsing history (beware, it may be too big to store), ~~including private/authenticated content that you wouldn't otherwise share with a centralized service~~ (this is not recommended due to JS replay security concerns). <br/>
<details>
<summary><i>Click to read more...</i></summary><br/>
### Comparison With Centralized Public Archives ArchiveBox tries to be a robust, set-and-forget archiving solution suitable for archiving RSS feeds, bookmarks, or your entire browsing history (beware, it may be too big to store), including private/authenticated content that you wouldn't otherwise share with a centralized service.
<h3>Comparison With Centralized Public Archives</h3>
Not all content is suitable to be archived in a centralized collection, whether because it's private, copyrighted, too large, or too complex. ArchiveBox hopes to fill that gap. Not all content is suitable to be archived in a centralized collection, whether because it's private, copyrighted, too large, or too complex. ArchiveBox hopes to fill that gap.
By having each user store their own content locally, we can save much larger portions of everyone's browsing history than a shared centralized service would be able to handle. The eventual goal is to work towards federated archiving where users can share portions of their collections with each other. By having each user store their own content locally, we can save much larger portions of everyone's browsing history than a shared centralized service would be able to handle. The eventual goal is to work towards federated archiving where users can share portions of their collections with each other.
### Comparison With Other Self-Hosted Archiving Options <h3>Comparison With Other Self-Hosted Archiving Options</h3>
ArchiveBox differentiates itself from [similar self-hosted projects](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community#Web-Archiving-Projects) by providing a comprehensive CLI interface for managing your archive, a Web UI that can be used either independently or together with the CLI, and a simple on-disk data format that can be used without either. ArchiveBox differentiates itself from [similar self-hosted projects](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community#Web-Archiving-Projects) by providing a comprehensive CLI interface for managing your archive, a Web UI that can be used either independently or together with the CLI, and a simple on-disk data format that can be used without either.
<details>
<summary><i>Click to see the <b>⭐️ officially recommended</b> alternatives to ArchiveBox...</i></summary>
<br/>
*If you want better fidelity for very complex interactive pages with heavy JS/streams/API requests, check out [ArchiveWeb.page](https://archiveweb.page) and [ReplayWeb.page](https://replayweb.page).* *If you want better fidelity for very complex interactive pages with heavy JS/streams/API requests, check out [ArchiveWeb.page](https://archiveweb.page) and [ReplayWeb.page](https://replayweb.page).*
@ -1019,17 +1022,23 @@ ArchiveBox is neither the highest fidelity nor the simplest tool available for s
<br/> <br/>
<div align="center"> <div align="center" style="text-align: center">
<br/> <br/>
<img src="https://github.com/ArchiveBox/ArchiveBox/assets/511499/04808ac2-3133-44fd-8703-3387e06dc851" width="100%" alt="dependencies graphic"> <img src="https://github.com/ArchiveBox/ArchiveBox/assets/511499/04808ac2-3133-44fd-8703-3387e06dc851" width="100%" alt="dependencies graphic">
</div> </div>
## Internet Archiving Ecosystem ## Internet Archiving Ecosystem
<img src="https://github.com/ArchiveBox/ArchiveBox/assets/511499/78d8a725-97f4-47f5-b983-1f62843ddc51" width="14%" align="right"/> <img src="https://github.com/ArchiveBox/ArchiveBox/assets/511499/78d8a725-97f4-47f5-b983-1f62843ddc51" width="14%" align="right" style="float: right"/>
Whether you want to learn which organizations are the big players in the web archiving space, want to find a specific open-source tool for your web archiving need, or just want to see where archivists hang out online, our Community Wiki page serves as an index of the broader web archiving community. Check it out to learn about some of the coolest web archiving projects and communities on the web! Our Community Wiki page serves as an index of the broader web archiving community.
<ul>
<li>See where archivists hang out online</li>
<li>Explore other open-source tools for your web archiving needs</li>
<li>Learn which organizations are the big players in the web archiving space</li>
</ul>
<details> <details>
<summary><i>Explore our index of web archiving software, blogs, and communities around the world...</i></summary> <summary><i>Explore our index of web archiving software, blogs, and communities around the world...</i></summary>
@ -1062,13 +1071,13 @@ Whether you want to learn which organizations are the big players in the web arc
--- ---
<div align="center"> <div align="center" style="text-align: center">
<img src="https://github.com/ArchiveBox/ArchiveBox/assets/511499/897f7a88-1265-4aab-b80c-b1640afaad1f" width="100%" alt="documentation graphic"> <img src="https://github.com/ArchiveBox/ArchiveBox/assets/511499/897f7a88-1265-4aab-b80c-b1640afaad1f" width="100%" alt="documentation graphic">
</div> </div>
# Documentation # Documentation
<img src="https://read-the-docs-guidelines.readthedocs-hosted.com/_images/logo-dark.png" width="13%" align="right"/> <img src="https://read-the-docs-guidelines.readthedocs-hosted.com/_images/logo-dark.png" width="13%" align="right" style="float: right"/>
We use the [GitHub wiki system](https://github.com/ArchiveBox/ArchiveBox/wiki) and [Read the Docs](https://archivebox.readthedocs.io/en/latest/) (WIP) for documentation. We use the [GitHub wiki system](https://github.com/ArchiveBox/ArchiveBox/wiki) and [Read the Docs](https://archivebox.readthedocs.io/en/latest/) (WIP) for documentation.
@ -1113,7 +1122,7 @@ You can also access the docs locally by looking in the [`ArchiveBox/docs/`](http
--- ---
<div align="center"> <div align="center" style="text-align: center">
<img src="https://github.com/ArchiveBox/ArchiveBox/assets/511499/e895e79f-5c7d-429b-ad8a-7df2cc183ca3" width="100%" alt="development"> <img src="https://github.com/ArchiveBox/ArchiveBox/assets/511499/e895e79f-5c7d-429b-ad8a-7df2cc183ca3" width="100%" alt="development">
</div> </div>
@ -1285,7 +1294,7 @@ https://stackoverflow.com/questions/1074212/how-can-i-see-the-raw-sql-queries-dj
ArchiveBox [`extractors`](https://github.com/ArchiveBox/ArchiveBox/blob/dev/archivebox/extractors/media.py) are external binaries or Python/Node scripts that ArchiveBox runs to archive content on a page. ArchiveBox [`extractors`](https://github.com/ArchiveBox/ArchiveBox/blob/dev/archivebox/extractors/media.py) are external binaries or Python/Node scripts that ArchiveBox runs to archive content on a page.
Extractors take the URL of a page to archive, write their output to the filesystem `archive/<timestamp>/<extractorname>/...`, and return an [`ArchiveResult`](https://github.com/ArchiveBox/ArchiveBox/blob/dev/archivebox/core/models.py#:~:text=return%20qs-,class%20ArchiveResult,-(models.Model)%3A) entry which is saved to the database (visible on the `Log` page in the UI). Extractors take the URL of a page to archive, write their output to the filesystem `archive/TIMESTAMP/EXTRACTOR/...`, and return an [`ArchiveResult`](https://github.com/ArchiveBox/ArchiveBox/blob/dev/archivebox/core/models.py#:~:text=return%20qs-,class%20ArchiveResult,-(models.Model)%3A) entry which is saved to the database (visible on the `Log` page in the UI).
*Check out how we added **[`archivebox/extractors/singlefile.py`](https://github.com/ArchiveBox/ArchiveBox/blob/dev/archivebox/extractors/singlefile.py)** as an example of the process: [Issue #399](https://github.com/ArchiveBox/ArchiveBox/issues/399) + [PR #403](https://github.com/ArchiveBox/ArchiveBox/pull/403).* *Check out how we added **[`archivebox/extractors/singlefile.py`](https://github.com/ArchiveBox/ArchiveBox/blob/dev/archivebox/extractors/singlefile.py)** as an example of the process: [Issue #399](https://github.com/ArchiveBox/ArchiveBox/issues/399) + [PR #403](https://github.com/ArchiveBox/ArchiveBox/pull/403).*
@ -1297,7 +1306,7 @@ Extractors take the URL of a page to archive, write their output to the filesyst
1. [Open an issue](https://github.com/ArchiveBox/ArchiveBox/issues/new?assignees=&labels=changes%3A+behavior%2Cstatus%3A+idea+phase&template=feature_request.md&title=Feature+Request%3A+...) with your proposed implementation (please link to the pages of any new external dependencies you plan on using) 1. [Open an issue](https://github.com/ArchiveBox/ArchiveBox/issues/new?assignees=&labels=changes%3A+behavior%2Cstatus%3A+idea+phase&template=feature_request.md&title=Feature+Request%3A+...) with your proposed implementation (please link to the pages of any new external dependencies you plan on using)
2. Ensure any dependencies needed are easily installable via a package manager like `apt`, `brew`, `pip3`, `npm` 2. Ensure any dependencies needed are easily installable via a package manager like `apt`, `brew`, `pip3`, `npm`
(Ideally, prefer to use external programs available via `pip3` or `npm`, however we do support using any binary installable via package manager that exposes a CLI/Python API and writes output to stdout or the filesystem.) (Ideally, prefer to use external programs available via `pip3` or `npm`, however we do support using any binary installable via package manager that exposes a CLI/Python API and writes output to stdout or the filesystem.)
3. Create a new file in [`archivebox/extractors/<extractorname>.py`](https://github.com/ArchiveBox/ArchiveBox/blob/dev/archivebox/extractors) (copy an existing extractor like [`singlefile.py`](https://github.com/ArchiveBox/ArchiveBox/blob/dev/archivebox/extractors/singlefile.py) as a template) 3. Create a new file in [`archivebox/extractors/EXTRACTOR.py`](https://github.com/ArchiveBox/ArchiveBox/blob/dev/archivebox/extractors) (copy an existing extractor like [`singlefile.py`](https://github.com/ArchiveBox/ArchiveBox/blob/dev/archivebox/extractors/singlefile.py) as a template)
4. Add config settings to enable/disable any new dependencies and the extractor as a whole, e.g. `USE_DEPENDENCYNAME`, `SAVE_EXTRACTORNAME`, `EXTRACTORNAME_SOMEOTHEROPTION` in [`archivebox/config.py`](https://github.com/ArchiveBox/ArchiveBox/blob/dev/archivebox/config.py) 4. Add config settings to enable/disable any new dependencies and the extractor as a whole, e.g. `USE_DEPENDENCYNAME`, `SAVE_EXTRACTORNAME`, `EXTRACTORNAME_SOMEOTHEROPTION` in [`archivebox/config.py`](https://github.com/ArchiveBox/ArchiveBox/blob/dev/archivebox/config.py)
5. Add a preview section to [`archivebox/templates/core/snapshot.html`](https://github.com/ArchiveBox/ArchiveBox/blob/dev/archivebox/templates/core/snapshot.html) to view the output, and a column to [`archivebox/templates/core/index_row.html`](https://github.com/ArchiveBox/ArchiveBox/blob/dev/archivebox/templates/core/index_row.html) with an icon for your extractor 5. Add a preview section to [`archivebox/templates/core/snapshot.html`](https://github.com/ArchiveBox/ArchiveBox/blob/dev/archivebox/templates/core/snapshot.html) to view the output, and a column to [`archivebox/templates/core/index_row.html`](https://github.com/ArchiveBox/ArchiveBox/blob/dev/archivebox/templates/core/index_row.html) with an icon for your extractor
6. Add an integration test for your extractor in [`tests/test_extractors.py`](https://github.com/ArchiveBox/ArchiveBox/blob/dev/tests/test_extractors.py) 6. Add an integration test for your extractor in [`tests/test_extractors.py`](https://github.com/ArchiveBox/ArchiveBox/blob/dev/tests/test_extractors.py)
@ -1364,7 +1373,7 @@ Extractors take the URL of a page to archive, write their output to the filesyst
--- ---
<div align="center"> <div align="center" style="text-align: center">
<br/><br/> <br/><br/>
<img src="https://raw.githubusercontent.com/Monadical-SAS/redux-time/HEAD/examples/static/jeremy.jpg" height="40px"/> <img src="https://raw.githubusercontent.com/Monadical-SAS/redux-time/HEAD/examples/static/jeremy.jpg" height="40px"/>
<br/> <br/>

View file

@ -6,6 +6,7 @@ from contextlib import redirect_stdout
from datetime import datetime, timezone from datetime import datetime, timezone
from django.contrib import admin from django.contrib import admin
from django.db.models import Count
from django.urls import path from django.urls import path
from django.utils.html import format_html from django.utils.html import format_html
from django.utils.safestring import mark_safe from django.utils.safestring import mark_safe
@ -117,7 +118,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
def get_queryset(self, request): def get_queryset(self, request):
self.request = request self.request = request
return super().get_queryset(request).prefetch_related('tags') return super().get_queryset(request).prefetch_related('tags').annotate(archiveresult_count=Count('archiveresult'))
def tag_list(self, obj): def tag_list(self, obj):
return ', '.join(obj.tags.values_list('name', flat=True)) return ', '.join(obj.tags.values_list('name', flat=True))
@ -199,7 +200,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
def files(self, obj): def files(self, obj):
return snapshot_icons(obj) return snapshot_icons(obj)
files.admin_order_field = 'updated' files.admin_order_field = 'archiveresult_count'
files.short_description = 'Files Saved' files.short_description = 'Files Saved'
def size(self, obj): def size(self, obj):
@ -216,7 +217,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
size_txt, size_txt,
) )
size.admin_order_field = 'archiveresult__count' size.admin_order_field = 'archiveresult_count'
def url_str(self, obj): def url_str(self, obj):
return format_html( return format_html(

View file

@ -202,4 +202,9 @@ def wget_output_path(link: Link) -> Optional[str]:
if search_dir.is_dir(): if search_dir.is_dir():
return domain(link.url).replace(":", "+") return domain(link.url).replace(":", "+")
# fallback to just the domain dir without port
search_dir = Path(link.link_dir) / domain(link.url).split(":", 1)[0]
if search_dir.is_dir():
return domain(link.url).split(":", 1)[0]
return None return None

View file

@ -379,11 +379,15 @@ class Link:
output_paths = ( output_paths = (
domain(self.url), domain(self.url),
'output.html',
'output.pdf', 'output.pdf',
'screenshot.png', 'screenshot.png',
'output.html', 'singlefile.html',
'readability/content.html',
'mercury/content.html',
'htmltotext.txt',
'media', 'media',
'singlefile.html' 'git',
) )
return any( return any(

View file

@ -1,6 +1,6 @@
{ {
"name": "archivebox", "name": "archivebox",
"version": "0.7.2", "version": "0.7.3",
"description": "ArchiveBox: The self-hosted internet archive", "description": "ArchiveBox: The self-hosted internet archive",
"author": "Nick Sweeting <archivebox-npm@sweeting.me>", "author": "Nick Sweeting <archivebox-npm@sweeting.me>",
"repository": "github:ArchiveBox/ArchiveBox", "repository": "github:ArchiveBox/ArchiveBox",

View file

@ -5,7 +5,7 @@
<a href="{% url 'Home' %}">Snapshots</a> | <a href="{% url 'Home' %}">Snapshots</a> |
<a href="/admin/core/tag/">Tags</a> | <a href="/admin/core/tag/">Tags</a> |
<a href="/admin/core/archiveresult/?o=-1">Log</a> &nbsp; &nbsp; <a href="/admin/core/archiveresult/?o=-1">Log</a> &nbsp; &nbsp;
<a href="{% url 'Docs' %}">Docs</a> | <a href="{% url 'Docs' %}" target="_blank" rel="noopener noreferrer">Docs</a> |
<a href="{% url 'public-index' %}">Public</a> | <a href="{% url 'public-index' %}">Public</a> |
<a href="/admin/">Admin</a> <a href="/admin/">Admin</a>
&nbsp; &nbsp; &nbsp; &nbsp;

File diff suppressed because one or more lines are too long

View file

@ -221,6 +221,8 @@ def get_headers(url: str, timeout: int=None) -> str:
def chrome_args(**options) -> List[str]: def chrome_args(**options) -> List[str]:
"""helper to build up a chrome shell command with arguments""" """helper to build up a chrome shell command with arguments"""
# Chrome CLI flag documentation: https://peter.sh/experiments/chromium-command-line-switches/
from .config import CHROME_OPTIONS, CHROME_VERSION from .config import CHROME_OPTIONS, CHROME_VERSION
options = {**CHROME_OPTIONS, **options} options = {**CHROME_OPTIONS, **options}
@ -248,14 +250,19 @@ def chrome_args(**options) -> List[str]:
"--disable-software-rasterizer", "--disable-software-rasterizer",
"--run-all-compositor-stages-before-draw", "--run-all-compositor-stages-before-draw",
"--hide-scrollbars", "--hide-scrollbars",
"--window-size=1440,2000",
"--autoplay-policy=no-user-gesture-required", "--autoplay-policy=no-user-gesture-required",
"--no-first-run", "--no-first-run",
"--use-fake-ui-for-media-stream", "--use-fake-ui-for-media-stream",
"--use-fake-device-for-media-stream", "--use-fake-device-for-media-stream",
"--disable-sync", "--disable-sync",
# "--password-store=basic",
) )
# disable automatic updating when running headless, as there's no user to see the upgrade prompts
cmd_args += ("--simulate-outdated-no-au='Tue, 31 Dec 2099 23:59:59 GMT'",)
# set window size for screenshot/pdf/etc. rendering
cmd_args += ('--window-size={}'.format(options['RESOLUTION']),)
if not options['CHECK_SSL_VALIDITY']: if not options['CHECK_SSL_VALIDITY']:
cmd_args += ('--disable-web-security', '--ignore-certificate-errors') cmd_args += ('--disable-web-security', '--ignore-certificate-errors')
@ -263,9 +270,6 @@ def chrome_args(**options) -> List[str]:
if options['CHROME_USER_AGENT']: if options['CHROME_USER_AGENT']:
cmd_args += ('--user-agent={}'.format(options['CHROME_USER_AGENT']),) cmd_args += ('--user-agent={}'.format(options['CHROME_USER_AGENT']),)
if options['RESOLUTION']:
cmd_args += ('--window-size={}'.format(options['RESOLUTION']),)
if options['CHROME_TIMEOUT']: if options['CHROME_TIMEOUT']:
cmd_args += ('--timeout={}'.format(options['CHROME_TIMEOUT'] * 1000),) cmd_args += ('--timeout={}'.format(options['CHROME_TIMEOUT'] * 1000),)

View file

@ -91,12 +91,16 @@ if ! chown $PUID:$PGID "$DATA_DIR"/* > /dev/null 2>&1; then
fi fi
# also chown BROWSERS_DIR because otherwise 'archivebox setup' won't be able to install chrome at runtime # also chown BROWSERS_DIR because otherwise 'archivebox setup' won't be able to 'playwright install chromium' at runtime
export PLAYWRIGHT_BROWSERS_PATH="${PLAYWRIGHT_BROWSERS_PATH:-/browsers}" export PLAYWRIGHT_BROWSERS_PATH="${PLAYWRIGHT_BROWSERS_PATH:-/browsers}"
mkdir -p "$PLAYWRIGHT_BROWSERS_PATH/permissions_test_safe_to_delete" mkdir -p "$PLAYWRIGHT_BROWSERS_PATH/permissions_test_safe_to_delete"
chown $PUID:$PGID "$PLAYWRIGHT_BROWSERS_PATH"
chown $PUID:$PGID "$PLAYWRIGHT_BROWSERS_PATH"/*
rm -Rf "$PLAYWRIGHT_BROWSERS_PATH/permissions_test_safe_to_delete" rm -Rf "$PLAYWRIGHT_BROWSERS_PATH/permissions_test_safe_to_delete"
chown $PUID:$PGID "$PLAYWRIGHT_BROWSERS_PATH"
if [[ -d "$PLAYWRIGHT_BROWSERS_PATH/.links" ]]; then
chown $PUID:$PGID "$PLAYWRIGHT_BROWSERS_PATH"/*
chown $PUID:$PGID "$PLAYWRIGHT_BROWSERS_PATH"/.*
chown -h $PUID:$PGID "$PLAYWRIGHT_BROWSERS_PATH"/.links/*
fi
# (this check is written in blood in 2023, QEMU silently breaks things in ways that are not obvious) # (this check is written in blood in 2023, QEMU silently breaks things in ways that are not obvious)
@ -107,7 +111,7 @@ if [[ "$IN_QEMU" == "True" ]]; then
echo -e " See here for more info: https://github.com/microsoft/playwright/issues/17395#issuecomment-1250830493\n" > /dev/stderr echo -e " See here for more info: https://github.com/microsoft/playwright/issues/17395#issuecomment-1250830493\n" > /dev/stderr
fi fi
# check disk space free on / and /data, warn on <500Mb free, error on <100Mb free # check disk space free on /, /data, and /data/archive, warn on <500Mb free, error on <100Mb free
export ROOT_USAGE="$(df --output=pcent,avail / | tail -n 1 | xargs)" export ROOT_USAGE="$(df --output=pcent,avail / | tail -n 1 | xargs)"
export ROOT_USED_PCT="${ROOT_USAGE%%%*}" export ROOT_USED_PCT="${ROOT_USAGE%%%*}"
export ROOT_AVAIL_KB="$(echo "$ROOT_USAGE" | awk '{print $2}')" export ROOT_AVAIL_KB="$(echo "$ROOT_USAGE" | awk '{print $2}')"
@ -124,23 +128,48 @@ elif [[ "$ROOT_USED_PCT" -ge 99 ]] || [[ "$ROOT_AVAIL_KB" -lt 500000 ]]; then
df -kh / > /dev/stderr df -kh / > /dev/stderr
fi fi
export DATA_USAGE="$(df --output=pcent,avail /data | tail -n 1 | xargs)" export DATA_USAGE="$(df --output=pcent,avail "$DATA_DIR" | tail -n 1 | xargs)"
export DATA_USED_PCT="${DATA_USAGE%%%*}" export DATA_USED_PCT="${DATA_USAGE%%%*}"
export DATA_AVAIL_KB="$(echo "$DATA_USAGE" | awk '{print $2}')" export DATA_AVAIL_KB="$(echo "$DATA_USAGE" | awk '{print $2}')"
if [[ "$DATA_AVAIL_KB" -lt 100000 ]]; then if [[ "$DATA_AVAIL_KB" -lt 100000 ]]; then
echo -e "\n[!] Warning: Docker data volume is completely out of space! (${DATA_USED_PCT}% used on /data)" > /dev/stderr echo -e "\n[!] Warning: Docker data volume is completely out of space! (${DATA_USED_PCT}% used on $DATA_DIR)" > /dev/stderr
echo -e " you need to free up at least 100Mb on the drive holding your data directory" > /dev/stderr echo -e " you need to free up at least 100Mb on the drive holding your data directory" > /dev/stderr
echo -e " \$ ncdu -x data\n" > /dev/stderr echo -e " \$ ncdu -x data\n" > /dev/stderr
df -kh /data > /dev/stderr df -kh "$DATA_DIR" > /dev/stderr
sleep 5 sleep 5
elif [[ "$DATA_USED_PCT" -ge 99 ]] || [[ "$ROOT_AVAIL_KB" -lt 500000 ]]; then elif [[ "$DATA_USED_PCT" -ge 99 ]] || [[ "$ROOT_AVAIL_KB" -lt 500000 ]]; then
echo -e "\n[!] Warning: Docker data volume is running out of space! (${DATA_USED_PCT}% used on /data)" > /dev/stderr echo -e "\n[!] Warning: Docker data volume is running out of space! (${DATA_USED_PCT}% used on $DATA_DIR)" > /dev/stderr
echo -e " you may need to free up space on the drive holding your data directory soon" > /dev/stderr echo -e " you may need to free up space on the drive holding your data directory soon" > /dev/stderr
echo -e " \$ ncdu -x data\n" > /dev/stderr echo -e " \$ ncdu -x data\n" > /dev/stderr
df -kh /data > /dev/stderr df -kh "$DATA_DIR" > /dev/stderr
else
# data/ has space available, but check data/archive separately, because it might be on a network mount or external drive
if [[ -d "$DATA_DIR/archive" ]]; then
export ARCHIVE_USAGE="$(df --output=pcent,avail "$DATA_DIR/archive" | tail -n 1 | xargs)"
export ARCHIVE_USED_PCT="${ARCHIVE_USAGE%%%*}"
export ARCHIVE_AVAIL_KB="$(echo "$ARCHIVE_USAGE" | awk '{print $2}')"
if [[ "$ARCHIVE_AVAIL_KB" -lt 100000 ]]; then
echo -e "\n[!] Warning: data/archive folder is completely out of space! (${ARCHIVE_USED_PCT}% used on $DATA_DIR/archive)" > /dev/stderr
echo -e " you need to free up at least 100Mb on the drive holding your data/archive directory" > /dev/stderr
echo -e " \$ ncdu -x data/archive\n" > /dev/stderr
df -kh "$DATA_DIR/archive" > /dev/stderr
sleep 5
elif [[ "$ARCHIVE_USED_PCT" -ge 99 ]] || [[ "$ROOT_AVAIL_KB" -lt 500000 ]]; then
echo -e "\n[!] Warning: data/archive folder is running out of space! (${ARCHIVE_USED_PCT}% used on $DATA_DIR/archive)" > /dev/stderr
echo -e " you may need to free up space on the drive holding your data/archive directory soon" > /dev/stderr
echo -e " \$ ncdu -x data/archive\n" > /dev/stderr
df -kh "$DATA_DIR/archive" > /dev/stderr
fi
fi
fi fi
# set DBUS_SYSTEM_BUS_ADDRESS & DBUS_SESSION_BUS_ADDRESS
# (dbus is not actually needed, it makes chrome log fewer warnings but isn't worth making our docker images bigger)
# service dbus start >/dev/null 2>&1 &
# export $(dbus-launch --close-stderr)
export ARCHIVEBOX_BIN_PATH="$(which archivebox)" export ARCHIVEBOX_BIN_PATH="$(which archivebox)"
# Drop permissions to run commands as the archivebox user # Drop permissions to run commands as the archivebox user

View file

@ -26,24 +26,24 @@ if (which docker-compose > /dev/null && docker pull archivebox/archivebox:latest
if [ -f "./index.sqlite3" ]; then if [ -f "./index.sqlite3" ]; then
mv ~/archivebox/* ~/archivebox/data/ mv ~/archivebox/* ~/archivebox/data/
fi fi
curl -O 'https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/master/docker-compose.yml' curl -O 'https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/main/docker-compose.yml'
docker-compose run --rm archivebox init --setup docker compose run --rm archivebox init --setup
echo echo
echo "[+] Starting ArchiveBox server using: docker-compose up -d..." echo "[+] Starting ArchiveBox server using: docker compose up -d..."
docker-compose up -d docker compose up -d
sleep 7 sleep 7
open http://127.0.0.1:8000 || true open http://127.0.0.1:8000 || true
echo echo
echo "[√] Server started on http://0.0.0.0:8000 and data directory initialized in ~/archivebox/data. Usage:" echo "[√] Server started on http://0.0.0.0:8000 and data directory initialized in ~/archivebox/data. Usage:"
echo " cd ~/archivebox" echo " cd ~/archivebox"
echo " docker-compose ps" echo " docker compose ps"
echo " docker-compose down" echo " docker compose down"
echo " docker-compose pull" echo " docker compose pull"
echo " docker-compose up" echo " docker compose up"
echo " docker-compose run archivebox manage createsuperuser" echo " docker compose run archivebox manage createsuperuser"
echo " docker-compose run archivebox add 'https://example.com'" echo " docker compose run archivebox add 'https://example.com'"
echo " docker-compose run archivebox list" echo " docker compose run archivebox list"
echo " docker-compose run archivebox help" echo " docker compose run archivebox help"
exit 0 exit 0
elif (which docker > /dev/null && docker pull archivebox/archivebox:latest); then elif (which docker > /dev/null && docker pull archivebox/archivebox:latest); then
echo "[+] Initializing an ArchiveBox data folder at ~/archivebox using Docker..." echo "[+] Initializing an ArchiveBox data folder at ~/archivebox using Docker..."
@ -189,12 +189,12 @@ which open > /dev/null && open http://127.0.0.1:8000 || true
echo echo
echo "[√] Server started on http://0.0.0.0:8000 and data directory initialized in ~/archivebox. Usage:" echo "[√] Server started on http://0.0.0.0:8000 and data directory initialized in ~/archivebox. Usage:"
echo " cd ~/archivebox" echo " cd ~/archivebox # see your data dir"
echo " ps aux | grep archivebox" echo " ps aux | grep archivebox # see server process pid"
echo " pkill -f archivebox" echo " pkill -f archivebox # stop the server"
echo " python3 -m pip install --upgrade archivebox" echo " archivebox server --quick-init 0.0.0.0:8000 # start server process"
echo " archivebox server --quick-init 0.0.0.0:8000" echo " pip install --upgrade archivebox; archivebox init # update versions"
echo " archivebox manage createsuperuser" echo " archivebox manage createsuperuser # add an admin user+pass"
echo " archivebox add 'https://example.com'" echo " archivebox add 'https://example.com'" # archive a new URL
echo " archivebox list" echo " archivebox list # see URLs archived"
echo " archivebox help" echo " archivebox help # see more help & examples"

View file

@ -6,7 +6,7 @@
[server] [server]
log_level = "warn" log_level = "debug"
[channel] [channel]

4
package-lock.json generated
View file

@ -1,12 +1,12 @@
{ {
"name": "archivebox", "name": "archivebox",
"version": "0.7.2", "version": "0.7.3",
"lockfileVersion": 3, "lockfileVersion": 3,
"requires": true, "requires": true,
"packages": { "packages": {
"": { "": {
"name": "archivebox", "name": "archivebox",
"version": "0.7.2", "version": "0.7.3",
"license": "MIT", "license": "MIT",
"dependencies": { "dependencies": {
"@postlight/parser": "^2.2.3", "@postlight/parser": "^2.2.3",

View file

@ -1,6 +1,6 @@
{ {
"name": "archivebox", "name": "archivebox",
"version": "0.7.2", "version": "0.7.3",
"description": "ArchiveBox: The self-hosted internet archive", "description": "ArchiveBox: The self-hosted internet archive",
"author": "Nick Sweeting <archivebox-npm@sweeting.me>", "author": "Nick Sweeting <archivebox-npm@sweeting.me>",
"repository": "github:ArchiveBox/ArchiveBox", "repository": "github:ArchiveBox/ArchiveBox",

906
pdm.lock generated

File diff suppressed because it is too large Load diff

View file

@ -1,11 +1,16 @@
[project] [project]
name = "archivebox" name = "archivebox"
version = "0.7.2" version = "0.7.3"
description = "Self-hosted internet archiving solution." description = "Self-hosted internet archiving solution."
authors = [ authors = [
{name = "Nick Sweeting", email = "pyproject.toml@archivebox.io"}, {name = "Nick Sweeting", email = "pyproject.toml@archivebox.io"},
] ]
license = {text = "MIT"}
readme = "README.md"
package-dir = "archivebox"
requires-python = ">=3.9,<3.12"
dependencies = [ dependencies = [
# pdm update [--unconstrained]
"croniter>=0.3.34", "croniter>=0.3.34",
"dateparser>=1.0.0", "dateparser>=1.0.0",
"django-extensions>=3.0.3", "django-extensions>=3.0.3",
@ -18,9 +23,6 @@ dependencies = [
"yt-dlp>=2023.10.13", "yt-dlp>=2023.10.13",
# "playwright>=1.39.0; platform_machine != 'armv7l'", # "playwright>=1.39.0; platform_machine != 'armv7l'",
] ]
requires-python = ">=3.9,<3.12"
readme = "README.md"
license = {text = "MIT"}
classifiers = [ classifiers = [
"Development Status :: 4 - Beta", "Development Status :: 4 - Beta",
"Environment :: Console", "Environment :: Console",
@ -54,26 +56,45 @@ classifiers = [
"Typing :: Typed", "Typing :: Typed",
] ]
# pdm lock -G:all [project.optional-dependencies]
# pdm install -G:all # pdm update [--group=':all'] [--unconstrained]
sonic = [
# echo "deb [signed-by=/usr/share/keyrings/valeriansaliou_sonic.gpg] https://packagecloud.io/valeriansaliou/sonic/debian/ bookworm main" > /etc/apt/sources.list.d/valeriansaliou_sonic.list
# curl -fsSL https://packagecloud.io/valeriansaliou/sonic/gpgkey | gpg --dearmor -o /usr/share/keyrings/valeriansaliou_sonic.gpg
"sonic-client>=0.0.5",
]
ldap = [
# apt install libldap2-dev libsasl2-dev
"setuptools>=69.0.3",
"python-ldap>=3.4.3",
"django-auth-ldap>=4.1.0",
]
# playwright = [
# platform_machine isn't respected by pdm export -o requirements.txt, which breaks arm/v7
# "playwright>=1.39.0; platform_machine != 'armv7l'",
# ]
# pdm install -G:all --dev
# pdm update --dev [--unconstrained]
[tool.pdm.dev-dependencies] [tool.pdm.dev-dependencies]
dev = [ dev = [
# build # building
"setuptools>=69.0.3", "setuptools>=69.0.3",
"wheel", "wheel",
"pdm", "pdm",
"homebrew-pypi-poet>=0.10.0", "homebrew-pypi-poet>=0.10.0",
# docs # documentation
"recommonmark", "recommonmark",
"sphinx", "sphinx",
"sphinx-rtd-theme", "sphinx-rtd-theme",
# debug # debugging
"django-debug-toolbar", "django-debug-toolbar",
"djdt_flamegraph", "djdt_flamegraph",
"ipdb", "ipdb",
# test # testing
"pytest", "pytest",
# lint # linting
"flake8", "flake8",
"mypy", "mypy",
"django-stubs", "django-stubs",
@ -84,22 +105,6 @@ lint = "./bin/lint.sh"
test = "./bin/test.sh" test = "./bin/test.sh"
# all = {composite = ["lint mypackage/", "test -v tests/"]} # all = {composite = ["lint mypackage/", "test -v tests/"]}
[project.optional-dependencies]
sonic = [
# echo "deb [signed-by=/usr/share/keyrings/valeriansaliou_sonic.gpg] https://packagecloud.io/valeriansaliou/sonic/debian/ bookworm main" > /etc/apt/sources.list.d/valeriansaliou_sonic.list
# curl -fsSL https://packagecloud.io/valeriansaliou/sonic/gpgkey | gpg --dearmor -o /usr/share/keyrings/valeriansaliou_sonic.gpg
"sonic-client>=0.0.5",
]
ldap = [
# apt install libldap2-dev libsasl2-dev
"python-ldap>=3.4.3",
"django-auth-ldap>=4.1.0",
]
# playwright = [
# platform_machine isn't respected by pdm export -o requirements.txt, which breaks arm/v7
# "playwright>=1.39.0; platform_machine != 'armv7l'",
# ]
[project.scripts] [project.scripts]
archivebox = "archivebox.cli:main" archivebox = "archivebox.cli:main"

View file

@ -31,7 +31,7 @@ pure-eval==0.2.2
pyasn1==0.5.1 pyasn1==0.5.1
pyasn1-modules==0.3.0 pyasn1-modules==0.3.0
pycparser==2.21; implementation_name != "cpython" pycparser==2.21; implementation_name != "cpython"
pycryptodomex==3.19.1 pycryptodomex==3.20.0
pygments==2.17.2 pygments==2.17.2
python-crontab==3.0.0 python-crontab==3.0.0
python-dateutil==2.8.2 python-dateutil==2.8.2
@ -49,6 +49,6 @@ tzdata==2023.4; platform_system == "Windows"
tzlocal==5.2 tzlocal==5.2
urllib3==2.1.0 urllib3==2.1.0
w3lib==2.1.2 w3lib==2.1.2
wcwidth==0.2.12 wcwidth==0.2.13
websockets==12.0 websockets==12.0
yt-dlp==2023.12.30 yt-dlp==2023.12.30