From 5de45dbf30c277fd39c2c4388cdcae1f159efb6b Mon Sep 17 00:00:00 2001 From: Ben Muthalaly Date: Mon, 8 Jan 2024 22:55:30 -0600 Subject: [PATCH 001/227] Show upgrade notification in admin snapshot view --- archivebox/core/admin.py | 14 +++++++++++++- archivebox/core/urls.py | 8 -------- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/archivebox/core/admin.py b/archivebox/core/admin.py index c4974c3a..30aacc90 100644 --- a/archivebox/core/admin.py +++ b/archivebox/core/admin.py @@ -23,8 +23,16 @@ from core.mixins import SearchResultsAdminMixin from index.html import snapshot_icons from logging_util import printable_filesize from main import add, remove -from config import OUTPUT_DIR, SNAPSHOTS_PER_PAGE from extractors import archive_links +from config import ( + OUTPUT_DIR, + SNAPSHOTS_PER_PAGE, + VERSION, + VERSIONS_AVAILABLE, + CAN_UPGRADE +) + +GLOBAL_CONTEXT = {'VERSION': VERSION, 'VERSIONS_AVAILABLE': VERSIONS_AVAILABLE, 'CAN_UPGRADE': CAN_UPGRADE} # Admin URLs # /admin/ @@ -96,6 +104,10 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin): action_form = SnapshotActionForm + def changelist_view(self, request, extra_context=None): + extra_context = extra_context or {} + return super().changelist_view(request, extra_context | GLOBAL_CONTEXT) + def get_urls(self): urls = super().get_urls() custom_urls = [ diff --git a/archivebox/core/urls.py b/archivebox/core/urls.py index f89273ff..1111ead4 100644 --- a/archivebox/core/urls.py +++ b/archivebox/core/urls.py @@ -8,11 +8,6 @@ from django.views.generic.base import RedirectView from core.views import HomepageView, SnapshotView, PublicIndexView, AddView, HealthCheckView -# GLOBAL_CONTEXT doesn't work as-is, disabled for now: https://github.com/ArchiveBox/ArchiveBox/discussions/1306 -# from config import VERSION, VERSIONS_AVAILABLE, CAN_UPGRADE -# GLOBAL_CONTEXT = {'VERSION': VERSION, 'VERSIONS_AVAILABLE': VERSIONS_AVAILABLE, 'CAN_UPGRADE': CAN_UPGRADE} - - # print('DEBUG', settings.DEBUG) 
urlpatterns = [ @@ -36,9 +31,6 @@ urlpatterns = [ path('accounts/', include('django.contrib.auth.urls')), path('admin/', admin.site.urls), - # do not add extra_context like this as not all admin views (e.g. ModelAdmin.autocomplete_view accept extra kwargs) - # path('admin/', admin.site.urls, {'extra_context': GLOBAL_CONTEXT}), - path('health/', HealthCheckView.as_view(), name='healthcheck'), path('error/', lambda _: 1/0), From 5bdcbaeebdfeef1c293c8aba5895388bcb3e9cd1 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 23 Jan 2024 10:45:12 -0800 Subject: [PATCH 002/227] Add link to archivebox-proxy in README --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index e1917b6b..44b38fe6 100644 --- a/README.md +++ b/README.md @@ -476,6 +476,8 @@ ArchiveBox supports many input formats for URLs, including Pocket & Pinboard exp - [Browser history](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive) or [browser bookmarks](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive) (see instructions for: [Chrome](https://support.google.com/chrome/answer/96816?hl=en), [Firefox](https://support.mozilla.org/en-US/kb/export-firefox-bookmarks-to-backup-or-transfer), [Safari](https://github.com/ArchiveBox/ArchiveBox/assets/511499/24ad068e-0fa6-41f4-a7ff-4c26fc91f71a), [IE](https://support.microsoft.com/en-us/help/211089/how-to-import-and-export-the-internet-explorer-favorites-folder-to-a-32-bit-version-of-windows), [Opera](https://help.opera.com/en/latest/features/#bookmarks:~:text=Click%20the%20import/-,export%20button,-on%20the%20bottom), [and more...](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive)) - Browser extension [`archivebox-exporter`](https://github.com/tjhorner/archivebox-exporter) (realtime archiving from Chrome/Chromium/Firefox) - [Pocket](https://getpocket.com/export), 
[Pinboard](https://pinboard.in/export/), [Instapaper](https://www.instapaper.com/user), [Shaarli](https://shaarli.readthedocs.io/en/master/Usage/#importexport), [Delicious](https://www.groovypost.com/howto/howto/export-delicious-bookmarks-xml/), [Reddit Saved](https://github.com/csu/export-saved-reddit), [Wallabag](https://doc.wallabag.org/en/user/import/wallabagv2.html), [Unmark.it](http://help.unmark.it/import-export), [OneTab](https://www.addictivetips.com/web/onetab-save-close-all-chrome-tabs-to-restore-export-or-import/), [and more...](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive) +- Proxy archiving with [`archivebox-proxy`](https://codeberg.org/brunoschroeder/archivebox-proxy) (realtime archiving of all traffic from any browser or device) + @@ -486,7 +488,7 @@ archivebox add 'https://example.com/some/page' archivebox add < ~/Downloads/firefox_bookmarks_export.html archivebox add --depth=1 'https://news.ycombinator.com#2020-12-12' echo 'http://example.com' | archivebox add -echo 'any_text_with [urls](https://example.com) in it' | archivebox add +echo 'any text with urls in it' | archivebox add # if using Docker, add -i when piping stdin: # echo 'https://example.com' | docker run -v $PWD:/data -i archivebox/archivebox add From cf7babebd48e482c5bb2e3e8c1ab3aa76e17123e Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 23 Jan 2024 10:46:20 -0800 Subject: [PATCH 003/227] Update README.md fix image src --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 44b38fe6..b4491e2a 100644 --- a/README.md +++ b/README.md @@ -476,7 +476,7 @@ ArchiveBox supports many input formats for URLs, including Pocket & Pinboard exp - [Browser history](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive) or [browser bookmarks](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive) (see instructions for: 
[Chrome](https://support.google.com/chrome/answer/96816?hl=en), [Firefox](https://support.mozilla.org/en-US/kb/export-firefox-bookmarks-to-backup-or-transfer), [Safari](https://github.com/ArchiveBox/ArchiveBox/assets/511499/24ad068e-0fa6-41f4-a7ff-4c26fc91f71a), [IE](https://support.microsoft.com/en-us/help/211089/how-to-import-and-export-the-internet-explorer-favorites-folder-to-a-32-bit-version-of-windows), [Opera](https://help.opera.com/en/latest/features/#bookmarks:~:text=Click%20the%20import/-,export%20button,-on%20the%20bottom), [and more...](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive)) - Browser extension [`archivebox-exporter`](https://github.com/tjhorner/archivebox-exporter) (realtime archiving from Chrome/Chromium/Firefox) - [Pocket](https://getpocket.com/export), [Pinboard](https://pinboard.in/export/), [Instapaper](https://www.instapaper.com/user), [Shaarli](https://shaarli.readthedocs.io/en/master/Usage/#importexport), [Delicious](https://www.groovypost.com/howto/howto/export-delicious-bookmarks-xml/), [Reddit Saved](https://github.com/csu/export-saved-reddit), [Wallabag](https://doc.wallabag.org/en/user/import/wallabagv2.html), [Unmark.it](http://help.unmark.it/import-export), [OneTab](https://www.addictivetips.com/web/onetab-save-close-all-chrome-tabs-to-restore-export-or-import/), [and more...](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive) -- Proxy archiving with [`archivebox-proxy`](https://codeberg.org/brunoschroeder/archivebox-proxy) (realtime archiving of all traffic from any browser or device) +- Proxy archiving with [`archivebox-proxy`](https://codeberg.org/brunoschroeder/archivebox-proxy) (realtime archiving of all traffic from any browser or device) From 702c70fcfaa4550f68974548def75966b4622627 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 23 Jan 2024 10:47:08 -0800 Subject: [PATCH 004/227] add link to proxy archiving issue --- README.md 
| 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index b4491e2a..08901575 100644 --- a/README.md +++ b/README.md @@ -476,7 +476,7 @@ ArchiveBox supports many input formats for URLs, including Pocket & Pinboard exp - [Browser history](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive) or [browser bookmarks](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive) (see instructions for: [Chrome](https://support.google.com/chrome/answer/96816?hl=en), [Firefox](https://support.mozilla.org/en-US/kb/export-firefox-bookmarks-to-backup-or-transfer), [Safari](https://github.com/ArchiveBox/ArchiveBox/assets/511499/24ad068e-0fa6-41f4-a7ff-4c26fc91f71a), [IE](https://support.microsoft.com/en-us/help/211089/how-to-import-and-export-the-internet-explorer-favorites-folder-to-a-32-bit-version-of-windows), [Opera](https://help.opera.com/en/latest/features/#bookmarks:~:text=Click%20the%20import/-,export%20button,-on%20the%20bottom), [and more...](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive)) - Browser extension [`archivebox-exporter`](https://github.com/tjhorner/archivebox-exporter) (realtime archiving from Chrome/Chromium/Firefox) - [Pocket](https://getpocket.com/export), [Pinboard](https://pinboard.in/export/), [Instapaper](https://www.instapaper.com/user), [Shaarli](https://shaarli.readthedocs.io/en/master/Usage/#importexport), [Delicious](https://www.groovypost.com/howto/howto/export-delicious-bookmarks-xml/), [Reddit Saved](https://github.com/csu/export-saved-reddit), [Wallabag](https://doc.wallabag.org/en/user/import/wallabagv2.html), [Unmark.it](http://help.unmark.it/import-export), [OneTab](https://www.addictivetips.com/web/onetab-save-close-all-chrome-tabs-to-restore-export-or-import/), [and more...](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive) -- Proxy archiving 
with [`archivebox-proxy`](https://codeberg.org/brunoschroeder/archivebox-proxy) (realtime archiving of all traffic from any browser or device) +- Proxy archiving with [`archivebox-proxy`](https://codeberg.org/brunoschroeder/archivebox-proxy) ([realtime archiving](https://github.com/ArchiveBox/ArchiveBox/issues/577) of all traffic from any browser or device) From 7036428d7e2aad2b314daafc7eddcfca00f7fa6d Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 23 Jan 2024 13:13:22 -0800 Subject: [PATCH 005/227] Update documentation_change.md --- .github/ISSUE_TEMPLATE/documentation_change.md | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/ISSUE_TEMPLATE/documentation_change.md b/.github/ISSUE_TEMPLATE/documentation_change.md index a02e9374..99b8775f 100644 --- a/.github/ISSUE_TEMPLATE/documentation_change.md +++ b/.github/ISSUE_TEMPLATE/documentation_change.md @@ -6,6 +6,7 @@ labels: '' assignees: '' --- + ## Wiki Page URL From 481554c5213875123d9ebbf9b3e96ac6114c6630 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 23 Jan 2024 13:27:23 -0800 Subject: [PATCH 006/227] update README links to related projects --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 8418fdcc..85d5ed27 100644 --- a/README.md +++ b/README.md @@ -486,9 +486,9 @@ ArchiveBox supports many input formats for URLs, including Pocket & Pinboard exp - TXT, RSS, XML, JSON, CSV, SQL, HTML, Markdown, or [any other text-based format...](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#Import-a-list-of-URLs-from-a-text-file) - [Browser history](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive) or [browser bookmarks](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive) (see instructions for: [Chrome](https://support.google.com/chrome/answer/96816?hl=en), [Firefox](https://support.mozilla.org/en-US/kb/export-firefox-bookmarks-to-backup-or-transfer), 
[Safari](https://github.com/ArchiveBox/ArchiveBox/assets/511499/24ad068e-0fa6-41f4-a7ff-4c26fc91f71a), [IE](https://support.microsoft.com/en-us/help/211089/how-to-import-and-export-the-internet-explorer-favorites-folder-to-a-32-bit-version-of-windows), [Opera](https://help.opera.com/en/latest/features/#bookmarks:~:text=Click%20the%20import/-,export%20button,-on%20the%20bottom), [and more...](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive)) -- Browser extension [`archivebox-exporter`](https://github.com/tjhorner/archivebox-exporter) (realtime archiving from Chrome/Chromium/Firefox) +- Browser extension [`archivebox-exporter`](https://github.com/ArchiveBox/archivebox-extension) (realtime archiving from Chrome/Chromium/Firefox) - [Pocket](https://getpocket.com/export), [Pinboard](https://pinboard.in/export/), [Instapaper](https://www.instapaper.com/user), [Shaarli](https://shaarli.readthedocs.io/en/master/Usage/#importexport), [Delicious](https://www.groovypost.com/howto/howto/export-delicious-bookmarks-xml/), [Reddit Saved](https://github.com/csu/export-saved-reddit), [Wallabag](https://doc.wallabag.org/en/user/import/wallabagv2.html), [Unmark.it](http://help.unmark.it/import-export), [OneTab](https://www.addictivetips.com/web/onetab-save-close-all-chrome-tabs-to-restore-export-or-import/), [Firefox Sync](https://github.com/ArchiveBox/ArchiveBox/issues/648), [and more...](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive) -- Proxy archiving with [`archivebox-proxy`](https://codeberg.org/brunoschroeder/archivebox-proxy) ([realtime archiving](https://github.com/ArchiveBox/ArchiveBox/issues/577) of all traffic from any browser or device) +- Proxy archiving with [`archivebox-proxy`](https://github.com/ArchiveBox/archivebox-proxy) ([realtime archiving](https://github.com/ArchiveBox/ArchiveBox/issues/577) of all traffic from any browser or device) From 
c8094887f89aeca83d062fde73f9ce1bc47391f9 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 23 Jan 2024 13:27:48 -0800 Subject: [PATCH 007/227] cherry-pick README changes from dev --- README.md | 240 ++++++++++++++++++++++++++++++++---------------------- 1 file changed, 142 insertions(+), 98 deletions(-) diff --git a/README.md b/README.md index 08901575..85d5ed27 100644 --- a/README.md +++ b/README.md @@ -1,27 +1,16 @@ -
- +
+

ArchiveBox
Open-source self-hosted web archiving.


-▶️ Quickstart | -Demo | -GitHub | -Documentation | -Info & Motivation | -Community +▶️ Quickstart | Demo | GitHub | Documentation | Info & Motivation | Community
- - -   - - - - +     ## Internet Archiving Ecosystem @@ -1476,16 +1473,10 @@ Extractors take the URL of a page to archive, write their output to the filesyst -- Home: [ArchiveBox.io](https://archivebox.io) -- Demo: [Demo.ArchiveBox.io](https://demo.archivebox.io) -- Docs: [Docs.ArchiveBox.io](https://docs.archivebox.io) -- Releases: [Github.com/ArchiveBox/ArchiveBox/releases](https://github.com/ArchiveBox/ArchiveBox/releases) -- Wiki: [Github.com/ArchiveBox/ArchiveBox/wiki](https://github.com/ArchiveBox/ArchiveBox/wiki) -- Issues: [Github.com/ArchiveBox/ArchiveBox/issues](https://github.com/ArchiveBox/ArchiveBox/issues) -- Discussions: [Github.com/ArchiveBox/ArchiveBox/discussions](https://github.com/ArchiveBox/ArchiveBox/discussions) -- Community Chat: [Zulip Chat (preferred)](https://zulip.archivebox.io) or [Matrix Chat (old)](https://app.element.io/#/room/#archivebox:matrix.org) +- [ArchiveBox.io Homepage](https://archivebox.io) / [Source Code (Github)](https://github.com/ArchiveBox/ArchiveBox) / [Demo Server](https://demo.archivebox.io) +- [Documentation Wiki](https://github.com/ArchiveBox/ArchiveBox/wiki) / [API Reference Docs](https://docs.archivebox.io) / [Changelog](https://github.com/ArchiveBox/ArchiveBox/releases) +- [Bug Tracker](https://github.com/ArchiveBox/ArchiveBox/issues) / [Discussions](https://github.com/ArchiveBox/ArchiveBox/discussions) / [Community Chat Forum (Zulip)](https://zulip.archivebox.io) - Social Media: [Twitter](https://twitter.com/ArchiveBoxApp), [LinkedIn](https://www.linkedin.com/company/archivebox/), [YouTube](https://www.youtube.com/@ArchiveBoxApp), [Alternative.to](https://alternativeto.net/software/archivebox/about/), [Reddit](https://www.reddit.com/r/ArchiveBox/) -- Donations: [Github.com/ArchiveBox/ArchiveBox/wiki/Donations](https://github.com/ArchiveBox/ArchiveBox/wiki/Donations) --- @@ -1496,6 +1487,8 @@ Extractors take the URL of a page to archive, write their output to the filesyst     + +
ArchiveBox operates as a US 501(c)(3) nonprofit (sponsored by HCB), donations are tax-deductible.

From 2d26728c2ab074e66466b73a0b219d8f1de89ee4 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Sun, 28 Jan 2024 04:27:40 -0800 Subject: [PATCH 085/227] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 60144ed3..86ec1d37 100644 --- a/README.md +++ b/README.md @@ -1164,7 +1164,7 @@ ArchiveBox is neither the highest fidelity nor the simplest tool available for s
-Our Community Wikia tries to be a comprehensive index of the broader web archiving community... +Our Community Wiki strives to be a comprehensive index of the broader web archiving community...
- [Community Wiki](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community) From a4be98dd2bfba6b22724a59d9851df1f6c4be877 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Sun, 28 Jan 2024 04:30:35 -0800 Subject: [PATCH 086/227] Update README.md --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 86ec1d37..5ded344a 100644 --- a/README.md +++ b/README.md @@ -1485,10 +1485,10 @@ Extractors take the URL of a page to archive, write their output to the filesyst 🏛️ Contact us for professional support 💬


  -   - - - +   +   +   +
ArchiveBox operates as a US 501(c)(3) nonprofit (sponsored by HCB), donations are tax-deductible.

From bd19b794e563beb72d2afabebadf87ebb92c2fc2 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 30 Jan 2024 01:01:16 -0800 Subject: [PATCH 087/227] copy readme from dev --- README.md | 594 +++++++++++++++++++++++++++--------------------------- 1 file changed, 300 insertions(+), 294 deletions(-) diff --git a/README.md b/README.md index 61c143e9..5ded344a 100644 --- a/README.md +++ b/README.md @@ -23,39 +23,28 @@ curl -sSL 'https://get.archivebox.io' | sh # (or see pip/brew/Docker instruct Without active preservation effort, everything on the internet eventually dissapears or degrades. Archive.org does a great job as a free central archive, but they require all archives to be public, and they can't save every type of content. -*ArchiveBox is an open source tool that helps you archive web content on your own (or privately within an organization): save copies of browser bookmarks, preserve evidence for legal cases, backup photos from FB / Insta / Flickr, download your media from YT / Soundcloud / etc., snapshot research papers & academic citations, and more...* +*ArchiveBox is an open source tool that helps organizations and individuals archive web content and retain control over their data: save copies of browser bookmarks, preserve evidence for legal cases, backup photos from FB / Insta / Flickr, download your media from YT / Soundcloud / etc., snapshot research papers & academic citations, and more...* -> ➡️ *Use ArchiveBox as a [command-line package](#quickstart) and/or [self-hosted web app](#quickstart) on Linux, macOS, or in [Docker](#quickstart).* +> ➡️ *Use ArchiveBox on [Linux](#quickstart)/[macOS](#quickstart)/[Windows](#quickstart)/[Docker](#quickstart) as a [CLI tool](#usage), [self-hosted Web App](https://github.com/ArchiveBox/ArchiveBox/wiki/Publishing-Your-Archive), [`pip` library](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#python-shell-usage), or [one-off command](#static-archive-exporting).*
-📥 **You can feed ArchiveBox URLs one at a time, or schedule regular imports** from browser bookmarks or history, feeds like RSS, bookmark services like Pocket/Pinboard, and more. See input formats for a full list. +📥 **You can feed ArchiveBox URLs one at a time, or schedule regular imports** from your bookmarks or history, social media feeds or RSS, link-saving services like Pocket/Pinboard, our [Browser Extension](https://chromewebstore.google.com/detail/archivebox-exporter/habonpimjphpdnmcfkaockjnffodikoj), and more. See Input Formats for a full list. snapshot detail page -💾 **It saves snapshots of the URLs you feed it in several redundant formats.** +**It saves snapshots of the URLs you feed it in several redundant formats.** It also detects any content featured *inside* each webpage & extracts it out into a folder: -- `HTML/Generic websites -> HTML, PDF, PNG, WARC, Singlefile` -- `YouTube/SoundCloud/etc. -> MP3/MP4 + subtitles, description, thumbnail` -- `News articles -> article body TXT + title, author, featured images` -- `Github/Gitlab/etc. links -> git cloned source code` -- *[and more...](#output-formats)* +- 🌐 **HTML**/**Any websites** ➡️ `original HTML+CSS+JS`, `singlefile HTML`, `screenshot PNG`, `PDF`, `WARC`, ... +- 🎥 **Social Media**/**News** ➡️ `post content TXT`, `comments`, `title`, `author`, `images` +- 🎬 **YouTube**/**SoundCloud**/etc. ➡️ `MP3/MP4`s, `subtitles`, `metadata`, `thumbnail`, ... +- 💾 **Github**/**Gitlab**/etc. links ➡️ `clone of GIT source code`, `README`, `images`, ... +- ✨ *and more, see [Output Formats](#output-formats) below...* -It uses normal filesystem folders to organize archives (no complicated proprietary formats), and offers a CLI + web UI. +It uses [standard tools](#dependencies) like Chrome, `wget`, & `yt-dlp`, and stores data in ordinary [files & folders](#archive-layout) (no complex proprietary formats). 
--- -🏛️ ArchiveBox is used by many *[professionals](https://zulip.archivebox.io/#narrow/stream/167-enterprise/topic/welcome/near/1191102) and [hobbyists](https://zulip.archivebox.io/#narrow/stream/158-development)* who save content off the web, for example: - -- **Individuals:** - `backing up browser bookmarks/history`, `saving FB/Insta/etc. content`, `shopping lists` -- **Journalists:** - `crawling and collecting research`, `preserving quoted material`, `fact-checking and review` -- **Lawyers:** - `evidence collection`, `hashing & integrity verifying`, `search, tagging, & review` -- **Researchers:** - `collecting AI training sets`, `feeding analysis / web crawling pipelines` - The goal is to sleep soundly knowing the part of the internet you care about will be automatically preserved in durable, easily accessible formats [for decades](#background--motivation) after it goes down.
@@ -70,32 +59,45 @@ The goal is to sleep soundly knowing the part of the internet you care about wil
-**📦  Get ArchiveBox with `docker` / `apt` / `brew` / `pip3` / `nix` / etc. ([see Quickstart below](#quickstart)).** +**📦  Install ArchiveBox using your preferred method: `docker` / `pip` / `apt` / `brew` / etc. ([see full Quickstart below](#quickstart)).** -```bash -# Get ArchiveBox with Docker Compose (recommended) or Docker -curl -sSL 'https://docker-compose.archivebox.io' > docker-compose.yml -docker pull archivebox/archivebox -# Or install with your preferred package manager (see Quickstart below for apt, brew, and more) +
Expand for quick copy-pastable install commands...   ⤵️ +
+
mkdir ~/archivebox; cd ~/archivebox    # create a dir somewhere for your archivebox data
+
+# Option A: Get ArchiveBox with Docker Compose (recommended): +curl -sSL 'https://docker-compose.archivebox.io' > docker-compose.yml # edit options in this file as-needed +docker compose run archivebox init --setup +# docker compose run archivebox add 'https://example.com' +# docker compose run archivebox help +# docker compose up +
+
+# Option B: Or use it as a plain Docker container: +docker run -it -v $PWD:/data archivebox/archivebox init --setup +# docker run -it -v $PWD:/data archivebox/archivebox add 'https://example.com' +# docker run -it -v $PWD:/data archivebox/archivebox help +# docker run -it -v $PWD:/data -p 8000:8000 archivebox/archivebox +
+
+# Option C: Or install it with your preferred pkg manager (see Quickstart below for apt, brew, and more) pip install archivebox - -# Or use the optional auto setup script to install it +archivebox init --setup +# archivebox add 'https://example.com' +# archivebox help +# archivebox server 0.0.0.0:8000 +
+
+# Option D: Or use the optional auto setup script to install it curl -sSL 'https://get.archivebox.io' | sh -``` +
+
+Open http://localhost:8000 to see your server's Web UI ➡️ +
+
-**🔢 Example usage: adding links to archive.** -```bash -archivebox add 'https://example.com' # add URLs one at a time -archivebox add < ~/Downloads/bookmarks.json # or pipe in URLs in any text-based format -archivebox schedule --every=day --depth=1 https://example.com/rss.xml # or auto-import URLs regularly on a schedule -``` -**🔢 Example usage: viewing the archived content.** -```bash -archivebox server 0.0.0.0:8000 # use the interactive web UI -archivebox list 'https://example.com' # use the CLI commands (--help for more) -ls ./archive/*/index.json # or browse directly via the filesystem -```


@@ -123,12 +125,23 @@ ls ./archive/*/index.json # or browse directly via the filesyste ## 🤝 Professional Integration -*[Contact us](https://zulip.archivebox.io/#narrow/stream/167-enterprise/topic/welcome/near/1191102) if your institution/org wants to use ArchiveBox professionally.* +ArchiveBox is free for everyone to self-host, but we also provide support, security review, and custom integrations to help NGOs, governments, and other organizations [run ArchiveBox professionally](https://zulip.archivebox.io/#narrow/stream/167-enterprise/topic/welcome/near/1191102): -- setup & support, team permissioning, hashing, audit logging, backups, custom archiving etc. -- for **individuals**, **NGOs**, **academia**, **governments**, **journalism**, **law**, and more... +- 🗞️ **Journalists:** + `crawling and collecting research`, `preserving quoted material`, `fact-checking and review` +- ⚖️ **Lawyers:** + `collecting & preserving evidence`, `hashing / integrity checking / chain-of-custody`, `tagging & review` +- 🔬 **Researchers:** + `analyzing social media trends`, `collecting LLM training data`, `crawling to feed other pipelines` +- 👩🏽 **Individuals:** + `saving legacy social media / memoirs`, `preserving portfolios / resume`, `backing up news articles` -*We are a 501(c)(3) nonprofit and all our work goes towards supporting open-source development.* +> ***[Contact our team](https://zulip.archivebox.io/#narrow/stream/167-enterprise/topic/welcome/near/1191102)** if your institution/org wants to use ArchiveBox professionally.* +> +> - setup & support, team permissioning, hashing, audit logging, backups, custom archiving etc. +> - for **individuals**, **NGOs**, **academia**, **governments**, **journalism**, **law**, and more... + +*We are a 🏛️ 501(c)(3) nonprofit and all our work goes towards supporting open-source development.*
@@ -137,6 +150,8 @@ ls ./archive/*/index.json # or browse directly via the filesyste grassgrass
+ + # Quickstart **🖥  Supported OSs:** Linux/BSD, macOS, Windows (Docker)   **👾  CPUs:** `amd64` (`x86_64`), `arm64` (`arm8`), `arm7` (raspi>=3)
@@ -146,7 +161,7 @@ ls ./archive/*/index.json # or browse directly via the filesyste #### ✳️  Easy Setup -
+
Docker docker-compose (macOS/Linux/Windows)   👈  recommended   (click to expand)
👍 Docker Compose is recommended for the easiest install/update UX + best security + all the extras out-of-the-box. @@ -155,9 +170,10 @@ ls ./archive/*/index.json # or browse directly via the filesyste
  • Install Docker on your system (if not already installed).
  • Download the docker-compose.yml file into a new empty directory (can be anywhere).
    mkdir ~/archivebox && cd ~/archivebox
    -curl -O 'https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/dev/docker-compose.yml'
    +# Read and edit docker-compose.yml options as-needed after downloading
    +curl -sSL 'https://docker-compose.archivebox.io' > docker-compose.yml
     
  • -
  • Run the initial setup and create an admin user. +
  • Run the initial setup to create an admin user (or set ADMIN_USER/PASS in docker-compose.yml)
    docker compose run archivebox init --setup
     
  • Next steps: Start the server then login to the Web UI http://127.0.0.1:8000 ⇢ Admin. @@ -187,6 +203,7 @@ docker run -v $PWD:/data -it archivebox/archivebox init --setup
    docker run -v $PWD:/data -p 8000:8000 archivebox/archivebox
     # completely optional, CLI can always be used without running a server
     # docker run -v $PWD:/data -it [subcommand] [--args]
    +docker run -v $PWD:/data -it archivebox/archivebox help
     
  • @@ -216,8 +233,41 @@ See "Against curl | sh as a #### 🛠  Package Manager Setup + +
    -aptitude apt (Ubuntu/Debian) +Pip pip (macOS/Linux/BSD) +
    +
      + +
    1. Install Python >= v3.10 and Node >= v18 on your system (if not already installed).
    2. +
    3. Install the ArchiveBox package using pip3 (or pipx). +
      pip3 install archivebox
      +
      +
    4. +
    5. Create a new empty directory and initialize your collection (can be anywhere). +
      mkdir ~/archivebox && cd ~/archivebox
      +archivebox init --setup
      +# install any missing extras like wget/git/ripgrep/etc. manually as needed
      +
      +
    6. +
    7. Optional: Start the server then login to the Web UI http://127.0.0.1:8000 ⇢ Admin. +
      archivebox server 0.0.0.0:8000
      +# completely optional, CLI can always be used without running a server
      +# archivebox [subcommand] [--args]
      +archivebox help
      +
      +
    8. +
    + +See below for more usage examples using the CLI, Web UI, or filesystem/SQL/Python to manage your archive.
    +See the pip-archivebox repo for more details about this distribution. +

    +
    + + +
    +aptitude apt (Ubuntu/Debian/etc.)
    1. Add the ArchiveBox repository to your sources.
      @@ -241,6 +291,7 @@ archivebox init --setup # if any problems, install with pip instead
      archivebox server 0.0.0.0:8000
       # completely optional, CLI can always be used without running a server
       # archivebox [subcommand] [--args]
      +archivebox help
       
    @@ -251,7 +302,7 @@ See the debian-a
    -homebrew brew (macOS) +homebrew brew (macOS only)
    1. Install Homebrew on your system (if not already installed).
    2. @@ -269,6 +320,7 @@ archivebox init --setup # if any problems, install with pip instead
      archivebox server 0.0.0.0:8000
       # completely optional, CLI can always be used without running a server
       # archivebox [subcommand] [--args]
      +archivebox help
       
    @@ -278,35 +330,6 @@ See the homebr

    -
    -Pip pip (macOS/Linux/BSD) -
    -
      - -
    1. Install Python >= v3.9 and Node >= v18 on your system (if not already installed).
    2. -
    3. Install the ArchiveBox package using pip3. -
      pip3 install archivebox
      -
      -
    4. -
    5. Create a new empty directory and initialize your collection (can be anywhere). -
      mkdir ~/archivebox && cd ~/archivebox
      -archivebox init --setup
      -# install any missing extras like wget/git/ripgrep/etc. manually as needed
      -
      -
    6. -
    7. Optional: Start the server then login to the Web UI http://127.0.0.1:8000 ⇢ Admin. -
      archivebox server 0.0.0.0:8000
      -# completely optional, CLI can always be used without running a server
      -# archivebox [subcommand] [--args]
      -
      -
    8. -
    - -See below for more usage examples using the CLI, Web UI, or filesystem/SQL/Python to manage your archive.
    -See the pip-archivebox repo for more details about this distribution. -

    -
    -
    Arch pacman / FreeBSD pkg / Nix nix (Arch/FreeBSD/NixOS/more)
    @@ -345,7 +368,7 @@ See below for usage examples using the CLI, W
    ✨ Alpha (contributors wanted!): for more info, see the: Electron ArchiveBox repo. -
    +
    @@ -419,124 +442,133 @@ For more discussion on managed and paid hosting options see here: -docker compose up -d # start the Web UI server in the background -docker compose run archivebox add 'https://example.com' # add a test URL to snapshot w/ Docker Compose - -archivebox list 'https://example.com' # fetch it with pip-installed archivebox on the host -docker compose run archivebox list 'https://example.com' # or w/ Docker Compose -docker run -it -v $PWD:/data archivebox/archivebox list 'https://example.com' # or w/ Docker, all equivalent - - -
    +curl sh automatic setup script CLI Usage Examples (non-Docker)
    - -##### Bare Metal Usage (`pip`/`apt`/`brew`/etc.) - -
    -
    -Click to expand... -
    -
    
     archivebox init --setup      # safe to run init multiple times (also how you update versions)
    -archivebox version           # get archivebox version info and more
    +archivebox version           # get archivebox version info + check dependencies
    +archivebox help              # get list of archivebox subcommands that can be run
     archivebox add --depth=1 'https://news.ycombinator.com'
     
    -
    -
    - -##### Docker Compose Usage
    +
    -Click to expand... +Docker Docker Compose CLI Usage Examples
    -
    
     # make sure you have `docker-compose.yml` from the Quickstart instructions first
     docker compose run archivebox init --setup
     docker compose run archivebox version
    +docker compose run archivebox help
     docker compose run archivebox add --depth=1 'https://news.ycombinator.com'
    +# to start webserver: docker compose up
     
    -
    -
    - -##### Docker Usage
    +
    -Click to expand... +Docker Docker CLI Usage Examples
    -
    
     docker run -v $PWD:/data -it archivebox/archivebox init --setup
     docker run -v $PWD:/data -it archivebox/archivebox version
    +docker run -v $PWD:/data -it archivebox/archivebox help
    +docker run -v $PWD:/data -it archivebox/archivebox add --depth=1 'https://news.ycombinator.com'
    +# to start webserver: docker run -v $PWD:/data -it -p 8000:8000 archivebox/archivebox
    +
    +
    + +
    + +
    +🗄  SQL/Python/Filesystem Usage +
    
    +archivebox shell           # explore the Python library API in a REPL
    +sqlite3 ./index.sqlite3    # run SQL queries directly on your index
    +ls ./archive/*/index.html  # or inspect snapshot data directly on the filesystem
    +
    +
    + + +
    + +
    +🖥  Web UI Usage +
    
    +# Start the server on bare metal (pip/apt/brew/etc):
    +archivebox manage createsuperuser              # create a new admin user via CLI
    +archivebox server 0.0.0.0:8000                 # start the server
    +
    +# Or with Docker Compose: +nano docker-compose.yml # setup initial ADMIN_USERNAME & ADMIN_PASSWORD +docker compose up # start the server +
    +# Or with a Docker container: +docker run -v $PWD:/data -it archivebox/archivebox archivebox manage createsuperuser +docker run -v $PWD:/data -it -p 8000:8000 archivebox/archivebox +
    + +Open
    http://localhost:8000 to see your server's Web UI ➡️ +
    +Optional: Change permissions to allow non-logged-in users + +
    
    +archivebox config --set PUBLIC_ADD_VIEW=True   # allow guests to submit URLs 
    +archivebox config --set PUBLIC_SNAPSHOTS=True  # allow guests to see snapshot content
    +archivebox config --set PUBLIC_INDEX=True      # allow guests to see list of all snapshots
    +# or
    +docker compose run archivebox config --set ...
    +
    +# restart the server to apply any config changes
    +
    +
    + +
    +
    + +> [!TIP] +> Whether in Docker or not, ArchiveBox commands work the same way, and can be used to access the same data on-disk. +> For example, you could run the Web UI in Docker Compose, and run one-off commands with `pip`-installed ArchiveBox. + +
    +Expand to show comparison...
    + +
    
    +archivebox add --depth=1 'https://example.com'                     # add a URL with pip-installed archivebox on the host
    +docker compose run archivebox add --depth=1 'https://example.com'                       # or w/ Docker Compose
    +docker run -it -v $PWD:/data archivebox/archivebox add --depth=1 'https://example.com'  # or w/ Docker, all equivalent
     
    -
    -#### Next Steps - -- `archivebox help/version` to see the list of available subcommands and currently installed version info -- `archivebox setup/init/config/status/manage` to administer your collection -- `archivebox add/schedule/remove/update/list/shell/oneshot` to manage Snapshots in the archive -- `archivebox schedule` to pull in fresh URLs regularly from [bookmarks/history/Pocket/Pinboard/RSS/etc.](#input-formats) - - -#### 🖥  Web UI Usage - -##### Start the Web Server -```bash -# Bare metal (pip/apt/brew/etc): -archivebox server 0.0.0.0:8000 # open http://127.0.0.1:8000 to view it - -# Docker Compose: -docker compose up - -# Docker: -docker run -v $PWD:/data -it -p 8000:8000 archivebox/archivebox -``` - -##### Allow Public Access or Create an Admin User -```bash -archivebox manage createsuperuser # create a new admin username & pass -# OR # OR -archivebox config --set PUBLIC_ADD_VIEW=True # allow guests to submit URLs -archivebox config --set PUBLIC_SNAPSHOTS=True # allow guests to see snapshot content -archivebox config --set PUBLIC_INDEX=True # allow guests to see list of all snapshots - -# restart the server to apply any config changes -``` - -*Docker hint:* Set the [`ADMIN_USERNAME` & `ADMIN_PASSWORD`)](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#admin_username--admin_password) env variables to auto-create an admin user on first-run. - -#### 🗄  SQL/Python/Filesystem Usage - -```bash -sqlite3 ./index.sqlite3 # run SQL queries on your index -archivebox shell # explore the Python API in a REPL -ls ./archive/*/index.html # or inspect snapshots on the filesystem -```
    @@ -557,25 +589,28 @@ ls ./archive/*/index.html # or inspect snapshots on the filesystem ---
    -lego +lego

    # Overview -## Input Formats + -ArchiveBox supports many input formats for URLs, including Pocket & Pinboard exports, Browser bookmarks, Browser history, plain text, HTML, markdown, and more! +## Input Formats: How to pass URLs into ArchiveBox for saving -*Click these links for instructions on how to prepare your links from these sources:* +- The official ArchiveBox Browser Extension (provides realtime archiving from Chrome/Chromium/Firefox browsers) + +- Manual imports of URLs from RSS, JSON, CSV, TXT, SQL, HTML, Markdown, or [any other text-based format...](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#Import-a-list-of-URLs-from-a-text-file) + +- [MITM Proxy](https://mitmproxy.org/) archiving with [`archivebox-proxy`](https://github.com/ArchiveBox/archivebox-proxy) ([realtime archiving](https://github.com/ArchiveBox/ArchiveBox/issues/577) of all traffic from any device going through the proxy) + +- Exported [browser history](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive) or [browser bookmarks](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive) (see instructions for: [Chrome](https://support.google.com/chrome/answer/96816?hl=en), [Firefox](https://support.mozilla.org/en-US/kb/export-firefox-bookmarks-to-backup-or-transfer), [Safari](https://github.com/ArchiveBox/ArchiveBox/assets/511499/24ad068e-0fa6-41f4-a7ff-4c26fc91f71a), [IE](https://support.microsoft.com/en-us/help/211089/how-to-import-and-export-the-internet-explorer-favorites-folder-to-a-32-bit-version-of-windows), [Opera](https://help.opera.com/en/latest/features/#bookmarks:~:text=Click%20the%20import/-,export%20button,-on%20the%20bottom), [and more...](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive)) + +- Links from [Pocket](https://getpocket.com/export), [Pinboard](https://pinboard.in/export/), [Instapaper](https://www.instapaper.com/user), 
[Shaarli](https://shaarli.readthedocs.io/en/master/Usage/#importexport), [Delicious](https://www.groovypost.com/howto/howto/export-delicious-bookmarks-xml/), [Reddit Saved](https://github.com/csu/export-saved-reddit), [Wallabag](https://doc.wallabag.org/en/user/import/wallabagv2.html), [Unmark.it](http://help.unmark.it/import-export), [OneTab](https://www.addictivetips.com/web/onetab-save-close-all-chrome-tabs-to-restore-export-or-import/), [Firefox Sync](https://github.com/ArchiveBox/ArchiveBox/issues/648), [and more...](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive) -- TXT, RSS, XML, JSON, CSV, SQL, HTML, Markdown, or [any other text-based format...](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#Import-a-list-of-URLs-from-a-text-file) -- [Browser history](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive) or [browser bookmarks](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive) (see instructions for: [Chrome](https://support.google.com/chrome/answer/96816?hl=en), [Firefox](https://support.mozilla.org/en-US/kb/export-firefox-bookmarks-to-backup-or-transfer), [Safari](https://github.com/ArchiveBox/ArchiveBox/assets/511499/24ad068e-0fa6-41f4-a7ff-4c26fc91f71a), [IE](https://support.microsoft.com/en-us/help/211089/how-to-import-and-export-the-internet-explorer-favorites-folder-to-a-32-bit-version-of-windows), [Opera](https://help.opera.com/en/latest/features/#bookmarks:~:text=Click%20the%20import/-,export%20button,-on%20the%20bottom), [and more...](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive)) -- Browser extension [`archivebox-exporter`](https://github.com/ArchiveBox/archivebox-extension) (realtime archiving from Chrome/Chromium/Firefox) -- [Pocket](https://getpocket.com/export), [Pinboard](https://pinboard.in/export/), [Instapaper](https://www.instapaper.com/user), 
[Shaarli](https://shaarli.readthedocs.io/en/master/Usage/#importexport), [Delicious](https://www.groovypost.com/howto/howto/export-delicious-bookmarks-xml/), [Reddit Saved](https://github.com/csu/export-saved-reddit), [Wallabag](https://doc.wallabag.org/en/user/import/wallabagv2.html), [Unmark.it](http://help.unmark.it/import-export), [OneTab](https://www.addictivetips.com/web/onetab-save-close-all-chrome-tabs-to-restore-export-or-import/), [Firefox Sync](https://github.com/ArchiveBox/ArchiveBox/issues/648), [and more...](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive) -- Proxy archiving with [`archivebox-proxy`](https://github.com/ArchiveBox/archivebox-proxy) ([realtime archiving](https://github.com/ArchiveBox/ArchiveBox/issues/577) of all traffic from any browser or device) @@ -601,30 +636,41 @@ It also includes a built-in scheduled import feature with `archivebox schedule`
    -## Output Formats -Inside each Snapshot folder, ArchiveBox saves these different types of extractor outputs as plain files: + + +## Output Formats: What ArchiveBox saves for each URL -`./archive/TIMESTAMP/*` -- **Index:** `index.html` & `index.json` HTML and JSON index files containing metadata and details -- **Title**, **Favicon**, **Headers** Response headers, site favicon, and parsed site title -- **SingleFile:** `singlefile.html` HTML snapshot rendered with headless Chrome using SingleFile -- **Wget Clone:** `example.com/page-name.html` wget clone of the site with `warc/TIMESTAMP.gz` -- Chrome Headless - - **PDF:** `output.pdf` Printed PDF of site using headless chrome - - **Screenshot:** `screenshot.png` 1440x900 screenshot of site using headless chrome - - **DOM Dump:** `output.html` DOM Dump of the HTML after rendering using headless chrome -- **Article Text:** `article.html/json` Article text extraction using Readability & Mercury -- **Archive.org Permalink:** `archive.org.txt` A link to the saved site on archive.org -- **Audio & Video:** `media/` all audio/video files + playlists, including subtitles & metadata with youtube-dl (or yt-dlp) -- **Source Code:** `git/` clone of any repository found on GitHub, Bitbucket, or GitLab links -- _More coming soon! See the [Roadmap](https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap)..._ +For each web page added, ArchiveBox creates a Snapshot folder and preserves its content as ordinary files inside the folder (e.g. HTML, PDF, PNG, JSON, etc.). -It does everything out-of-the-box by default, but you can disable or tweak [individual archive methods](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration) via environment variables / config. +It uses all available methods out-of-the-box, but you can disable extractors and fine-tune the [configuration](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration) as-needed. +
    +
    +Expand to see the full list of ways ArchiveBox saves each page... + + +./archive/{Snapshot.id}/
    +
      +
    • Index: index.html & index.json HTML and JSON index files containing metadata and details
    • +
    • Title, Favicon, Headers Response headers, site favicon, and parsed site title
    • +
    • SingleFile: singlefile.html HTML snapshot rendered with headless Chrome using SingleFile
    • +
    • Wget Clone: example.com/page-name.html wget clone of the site with warc/TIMESTAMP.gz
    • +
    • Chrome Headless
        +
      • PDF: output.pdf Printed PDF of site using headless chrome
      • +
      • Screenshot: screenshot.png 1440x900 screenshot of site using headless chrome
      • +
      • DOM Dump: output.html DOM Dump of the HTML after rendering using headless chrome
      • +
    • +
    • Article Text: article.html/json Article text extraction using Readability & Mercury
    • +
    • Archive.org Permalink: archive.org.txt A link to the saved site on archive.org
    • +
    • Audio & Video: media/ all audio/video files + playlists, including subtitles & metadata with youtube-dl (or yt-dlp)
    • +
    • Source Code: git/ clone of any repository found on GitHub, Bitbucket, or GitLab links
    • +
    • More coming soon! See the Roadmap...
    • +
    +

    ## Configuration @@ -632,52 +678,56 @@ It does everything out-of-the-box by default, but you can disable or tweak [indi ArchiveBox can be configured via environment variables, by using the `archivebox config` CLI, or by editing `./ArchiveBox.conf` directly. - -```bash -archivebox config # view the entire config +
    +
    +Expand to see examples... +
    archivebox config                               # view the entire config
     archivebox config --get CHROME_BINARY           # view a specific value
    -
    +
    archivebox config --set CHROME_BINARY=chromium # persist a config using CLI # OR echo CHROME_BINARY=chromium >> ArchiveBox.conf # persist a config using file # OR env CHROME_BINARY=chromium archivebox ... # run with a one-off config -``` +
    +These methods also work the same way when run inside Docker, see the Docker Configuration wiki page for details. +

    -These methods also work the same way when run inside Docker, see the Docker Configuration wiki page for details. +The configuration is documented here: **[Configuration Wiki](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration)**, and loaded here: [`archivebox/config.py`](https://github.com/ArchiveBox/ArchiveBox/blob/dev/archivebox/config.py). -**The config loading logic with all the options defined is here: [`archivebox/config.py`](https://github.com/ArchiveBox/ArchiveBox/blob/dev/archivebox/config.py).** - -Most options are also documented on the **[Configuration Wiki page](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration)**. - -#### Most Common Options to Tweak - -```bash + +
    +Expand to see the most common options to tweak... +
    
     # e.g. archivebox config --set TIMEOUT=120
    -
    +# or   docker compose run archivebox config --set TIMEOUT=120
    +
    TIMEOUT=120 # default: 60 add more seconds on slower networks CHECK_SSL_VALIDITY=True # default: False True = allow saving URLs w/ bad SSL SAVE_ARCHIVE_DOT_ORG=False # default: True False = disable Archive.org saving MAX_MEDIA_SIZE=1500m # default: 750m raise/lower youtubedl output size - +
    PUBLIC_INDEX=True # default: True whether anon users can view index PUBLIC_SNAPSHOTS=True # default: True whether anon users can view pages PUBLIC_ADD_VIEW=False # default: False whether anon users can add new URLs - +
    CHROME_USER_AGENT="Mozilla/5.0 ..." # change these to get around bot blocking WGET_USER_AGENT="Mozilla/5.0 ..." CURL_USER_AGENT="Mozilla/5.0 ..." -``` - +
    +

    ## Dependencies -To achieve high-fidelity archives in as many situations as possible, ArchiveBox depends on a variety of 3rd-party tools that specialize in extracting different types of content. +To achieve high-fidelity archives in as many situations as possible, ArchiveBox depends on a variety of 3rd-party libraries and tools that specialize in extracting different types of content. + +> Under-the-hood, ArchiveBox uses [Django](https://www.djangoproject.com/start/overview/) to power its [Web UI](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#ui-usage) and [SQLite](https://www.sqlite.org/locrsf.html) + the filesystem to provide [fast & durable metadata storage](https://www.sqlite.org/locrsf.html) w/ [deterministic upgrades](https://stackoverflow.com/a/39976321/2156113). ArchiveBox bundles industry-standard tools like [Google Chrome](https://github.com/ArchiveBox/ArchiveBox/wiki/Chromium-Install), [`wget`, `yt-dlp`, `readability`, etc.](#dependencies) internally, and its operation can be [tuned, secured, and extended](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration) as-needed for many different applications. +
    -Expand to learn more about ArchiveBox's dependencies...
    +Expand to learn more about ArchiveBox's internals & dependencies...
    > *TIP: For better security, easier updating, and to avoid polluting your host system with extra dependencies,**it is strongly recommended to use the [⭐️ official Docker image](https://github.com/ArchiveBox/ArchiveBox/wiki/Docker)** with everything pre-installed for the best experience.* @@ -724,14 +774,13 @@ Installing directly on **Windows without Docker or WSL/WSL2/Cygwin is not offici ## Archive Layout -All of ArchiveBox's state (including the SQLite DB, archived assets, config, logs, etc.) is stored in a single folder called the "ArchiveBox Data Folder". -Data folders can be created anywhere (`~/archivebox` or `$PWD/data` as seen in our examples), and you can create more than one for different collections. +All of ArchiveBox's state (SQLite DB, archived assets, config, logs, etc.) is stored in a single folder called the "ArchiveBox Data Folder".
    Expand to learn more about the layout of Archivebox's data on-disk...
    - +Data folders can be created anywhere (`~/archivebox` or `$PWD/data` as seen in our examples), and you can create as many data folders as you want to hold different collections. All archivebox CLI commands are designed to be run from inside an ArchiveBox data folder, starting with archivebox init to initialize a new collection inside an empty directory.
    mkdir ~/archivebox && cd ~/archivebox   # just an example, can be anywhere
    @@ -774,7 +823,7 @@ Each snapshot subfolder ./archive/TIMESTAMP/ includes a static 
     
    @@ -783,14 +832,17 @@ You can export the main index to browse it statically as plain HTML files in a f > *NOTE: These exports are not paginated, exporting many URLs or the entire archive at once may be slow. Use the filtering CLI flags on the `archivebox list` command to export specific Snapshots or ranges.* -```bash +```bash| +# do a one-off single URL archive without needing a data dir initialized +archivebox oneshot 'https://example.com' + # archivebox list --help archivebox list --html --with-headers > index.html # export to static html table archivebox list --json --with-headers > index.json # export to json blob archivebox list --csv=timestamp,url,title > index.csv # export to csv spreadsheet # (if using Docker Compose, add the -T flag when piping) -# docker compose run -T archivebox list --html --filter-type=search snozzberries > index.json +# docker compose run -T archivebox list --html 'https://example.com' > index.json ``` The paths in the static exports are relative, make sure to keep them next to your `./archive` folder when backing them up or viewing them. @@ -806,8 +858,6 @@ The paths in the static exports are relative, make sure to keep them next to you
    ---- -
    security graphic
    @@ -823,7 +873,7 @@ If you're importing pages with private content or URLs containing secret tokens
    -Click to expand... +Expand to learn about privacy, permissions, and user accounts... ```bash @@ -838,6 +888,7 @@ archivebox config --set SAVE_ARCHIVE_DOT_ORG=False # disable saving all URLs in archivebox config --set PUBLIC_INDEX=False archivebox config --set PUBLIC_SNAPSHOTS=False archivebox config --set PUBLIC_ADD_VIEW=False +archivebox manage createsuperuser # if extra paranoid or anti-Google: archivebox config --set SAVE_FAVICON=False # disable favicon fetching (it calls a Google API passing the URL's domain part only) @@ -867,7 +918,7 @@ Be aware that malicious archived JS can access the contents of other pages in yo
    -Click to expand... +Expand to see risks and mitigations... ```bash @@ -903,7 +954,7 @@ For various reasons, many large sites (Reddit, Twitter, Cloudflare, etc.) active
    -Click to expand... +Click to learn how to set up user agents, cookies, and site logins...
    @@ -926,7 +977,7 @@ ArchiveBox appends a hash with the current date `https://example.com#2020-10-24`
    -Click to expand... +Click to learn how the `Re-Snapshot` feature works...
    @@ -954,12 +1005,11 @@ Improved support for saving multiple snapshots of a single URL without this hash ### Storage Requirements -Because ArchiveBox is designed to ingest a large volume of URLs with multiple copies of each URL stored by different 3rd-party tools, it can be quite disk-space intensive. -There also also some special requirements when using filesystems like NFS/SMB/FUSE. +Because ArchiveBox is designed to ingest a large volume of URLs with multiple copies of each URL stored by different 3rd-party tools, it can be quite disk-space intensive. There are also some special requirements when using filesystems like NFS/SMB/FUSE.
    -Click to expand... +Click to learn more about ArchiveBox's filesystem and hosting requirements...
    @@ -1030,10 +1080,6 @@ If using Docker or NFS/SMB/FUSE for the `data/archive/` folder, you may need to

    - ---- - -
    paisley graphic @@ -1047,7 +1093,7 @@ ArchiveBox aims to enable more of the internet to be saved from deterioration by
    -Click to read more... +Click to read more about why archiving is important and how to do it ethically...
    @@ -1082,7 +1128,7 @@ A variety of open and closed-source archiving projects exist, but few provide a
    -Click to read more...
    +Click to read about how we differ from other centralized archiving services and open source tools...
    ArchiveBox tries to be a robust, set-and-forget archiving solution suitable for archiving RSS feeds, bookmarks, or your entire browsing history (beware, it may be too big to store), including private/authenticated content that you wouldn't otherwise share with a centralized service. @@ -1111,33 +1157,21 @@ ArchiveBox is neither the highest fidelity nor the simplest tool available for s
    -
    -
    -dependencies graphic -
    + ## Internet Archiving Ecosystem - -Our Community Wiki page serves as an index of the broader web archiving community. - -
      -
    • See where archivists hang out online
    • -
    • Explore other open-source tools for your web archiving needs
    • -
    • Learn which organizations are the big players in the web archiving space
    • -
    -
    -Explore our index of web archiving software, blogs, and communities around the world... +Our Community Wiki strives to be a comprehensive index of the broader web archiving community...
    - [Community Wiki](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community) - - [The Master Lists](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community#the-master-lists) - _Community-maintained indexes of archiving tools and institutions._ - [Web Archiving Software](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community#web-archiving-projects) - _Open source tools and projects in the internet archiving space._ + _List of ArchiveBox alternatives and open source projects in the internet archiving space._ + - [Awesome-Web-Archiving Lists](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community#the-master-lists) + _Community-maintained indexes of archiving tools and institutions like `iipc/awesome-web-archiving`._ - [Reading List](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community#reading-list) _Articles, posts, and blogs relevant to ArchiveBox and web archiving in general._ - [Communities](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community#communities) @@ -1154,11 +1188,8 @@ Our Community Wiki page serves as an index of the broader web archiving communit > ✨ **[Hire the team that built Archivebox](https://zulip.archivebox.io/#narrow/stream/167-enterprise/topic/welcome/near/1191102) to work on your project.** ([@ArchiveBoxApp](https://twitter.com/ArchiveBoxApp)) -(We also offer general software consulting across many industries) -
    ----
    documentation graphic @@ -1333,28 +1364,19 @@ archivebox init --setup
    -#### Run the linters +#### Run the linters / tests
    Click to expand... ```bash ./bin/lint.sh -``` -(uses `flake8` and `mypy`) - -
    - -#### Run the integration tests - -
    Click to expand... - -```bash ./bin/test.sh ``` -(uses `pytest -s`) +(uses `flake8`, `mypy`, and `pytest -s`)
    + #### Make migrations or enter a django shell
    Click to expand... @@ -1449,47 +1471,31 @@ Extractors take the URL of a page to archive, write their output to the filesyst ## Further Reading -- Home: [ArchiveBox.io](https://archivebox.io) -- Demo: [Demo.ArchiveBox.io](https://demo.archivebox.io) -- Docs: [Docs.ArchiveBox.io](https://docs.archivebox.io) -- Releases: [Github.com/ArchiveBox/ArchiveBox/releases](https://github.com/ArchiveBox/ArchiveBox/releases) -- Wiki: [Github.com/ArchiveBox/ArchiveBox/wiki](https://github.com/ArchiveBox/ArchiveBox/wiki) -- Issues: [Github.com/ArchiveBox/ArchiveBox/issues](https://github.com/ArchiveBox/ArchiveBox/issues) -- Discussions: [Github.com/ArchiveBox/ArchiveBox/discussions](https://github.com/ArchiveBox/ArchiveBox/discussions) -- Community Chat: [Zulip Chat (preferred)](https://zulip.archivebox.io) or [Matrix Chat (old)](https://app.element.io/#/room/#archivebox:matrix.org) + + +- [ArchiveBox.io Homepage](https://archivebox.io) / [Source Code (Github)](https://github.com/ArchiveBox/ArchiveBox) / [Demo Server](https://demo.archivebox.io) +- [Documentation Wiki](https://github.com/ArchiveBox/ArchiveBox/wiki) / [API Reference Docs](https://docs.archivebox.io) / [Changelog](https://github.com/ArchiveBox/ArchiveBox/releases) +- [Bug Tracker](https://github.com/ArchiveBox/ArchiveBox/issues) / [Discussions](https://github.com/ArchiveBox/ArchiveBox/discussions) / [Community Chat Forum (Zulip)](https://zulip.archivebox.io) - Social Media: [Twitter](https://twitter.com/ArchiveBoxApp), [LinkedIn](https://www.linkedin.com/company/archivebox/), [YouTube](https://www.youtube.com/@ArchiveBoxApp), [Alternative.to](https://alternativeto.net/software/archivebox/about/), [Reddit](https://www.reddit.com/r/ArchiveBox/) -- Donations: [Github.com/ArchiveBox/ArchiveBox/wiki/Donations](https://github.com/ArchiveBox/ArchiveBox/wiki/Donations) --- +
    +🏛️ Contact us for professional support 💬


    - -
    - -This project is maintained mostly in my spare time with the help from generous contributors. - - -

    - -**🏛️ [Contact us for professional support](https://docs.sweeting.me/s/archivebox-consulting-services) 💬** - -
    -     - - -
    -ArchiveBox operates as a US 501(c)(3) nonprofit, donations are tax-deductible.
    (fiscally sponsored by HCB EIN: 81-2908499)

    - -(网站存档 / 爬虫) - - - - -
    -
    -✨ Have spare CPU/disk/bandwidth and want to help the world?
    Check out our Good Karma Kit...
    +   +   +
    +ArchiveBox operates as a US 501(c)(3) nonprofit (sponsored by HCB), donations are tax-deductible. +

    +  +  +
    +ArchiveBox was started by Nick Sweeting in 2017, and has grown steadily with help from our amazing contributors. +
    +✨ Have spare CPU/disk/bandwidth after all your 网站存档爬 and want to help the world?
    Check out our Good Karma Kit...
    From d936b9eb8ae539e7bfa586748bdaafdd902ffd08 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 30 Jan 2024 01:33:33 -0800 Subject: [PATCH 088/227] Update README.md to fix trailing words --- README.md | 149 +++++++++++++++++++++++++++++------------------------- 1 file changed, 80 insertions(+), 69 deletions(-) diff --git a/README.md b/README.md index 5ded344a..75208349 100644 --- a/README.md +++ b/README.md @@ -25,23 +25,25 @@ Without active preservation effort, everything on the internet eventually dissap *ArchiveBox is an open source tool that helps organizations and individuals archive web content and retain control over their data: save copies of browser bookmarks, preserve evidence for legal cases, backup photos from FB / Insta / Flickr, download your media from YT / Soundcloud / etc., snapshot research papers & academic citations, and more...* -> ➡️ *Use ArchiveBox on [Linux](#quickstart)/[macOS](#quickstart)/[Windows](#quickstart)/[Docker](#quickstart) as a [CLI tool](#usage), [self-hosted Web App](https://github.com/ArchiveBox/ArchiveBox/wiki/Publishing-Your-Archive), [`pip` library](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#python-shell-usage), or [one-off command](#static-archive-exporting).* +> ➡️ *ArchiveBox is available on [Linux](#quickstart)/[macOS](#quickstart)/[Windows](#quickstart)/[Docker](#quickstart) as a [CLI tool](#usage), [self-hosted Web App](https://github.com/ArchiveBox/ArchiveBox/wiki/Publishing-Your-Archive), [`pip` library](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#python-shell-usage), or [one-off command](#static-archive-exporting).*
    -📥 **You can feed ArchiveBox URLs one at a time, or schedule regular imports** from your bookmarks or history, social media feeds or RSS, link-saving services like Pocket/Pinboard, our [Browser Extension](https://chromewebstore.google.com/detail/archivebox-exporter/habonpimjphpdnmcfkaockjnffodikoj), and more. See Input Formats for a full list. +📥 **You can feed ArchiveBox URLs one at a time, or schedule regular imports** from your bookmarks or history, social media feeds or RSS, link-saving services like Pocket/Pinboard, our [Browser Extension](https://chromewebstore.google.com/detail/archivebox-exporter/habonpimjphpdnmcfkaockjnffodikoj), and more. +See Input Formats for a full list... snapshot detail page **It saves snapshots of the URLs you feed it in several redundant formats.** -It also detects any content featured *inside* each webpage & extracts it out into a folder: +It also detects any content featured *inside* pages & extracts it out into a folder: - 🌐 **HTML**/**Any websites** ➡️ `original HTML+CSS+JS`, `singlefile HTML`, `screenshot PNG`, `PDF`, `WARC`, ... - 🎥 **Social Media**/**News** ➡️ `post content TXT`, `comments`, `title`, `author`, `images` - 🎬 **YouTube**/**SoundCloud**/etc. ➡️ `MP3/MP4`s, `subtitles`, `metadata`, `thumbnail`, ... - 💾 **Github**/**Gitlab**/etc. links ➡️ `clone of GIT source code`, `README`, `images`, ... - ✨ *and more, see [Output Formats](#output-formats) below...* -It uses [standard tools](#dependencies) like Chrome, `wget`, & `yt-dlp`, and stores data in ordinary [files & folders](#archive-layout) (no complex proprietary formats). +It uses [standard tools](#dependencies) like Chrome, `wget`, & `yt-dlp`, and stores data in ordinary [files & folders](#archive-layout). +*(no complex proprietary formats)* --- @@ -59,7 +61,7 @@ The goal is to sleep soundly knowing the part of the internet you care about wil
    -**📦  Install ArchiveBox using your preferred method: `docker` / `pip` / `apt` / `brew` / etc. ([see full Quickstart below](#quickstart)).** +**📦  Install ArchiveBox using your preferred method: `docker` / `pip` / `apt` / etc. ([see full Quickstart](#quickstart)).**
    @@ -116,7 +118,7 @@ curl -sSL 'https://get.archivebox.io' | sh - [**Extracts a wide variety of content out-of-the-box**](https://github.com/ArchiveBox/ArchiveBox/issues/51): [media (yt-dlp), articles (readability), code (git), etc.](#output-formats) - [**Supports scheduled/realtime importing**](https://github.com/ArchiveBox/ArchiveBox/wiki/Scheduled-Archiving) from [many types of sources](#input-formats) - [**Uses standard, durable, long-term formats**](#output-formats) like HTML, JSON, PDF, PNG, MP4, TXT, and WARC -- [**Usable as a oneshot CLI**](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#CLI-Usage), [**self-hosted web UI**](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#UI-Usage), [Python API](https://docs.archivebox.io/en/latest/modules.html) (BETA), [REST API](https://github.com/ArchiveBox/ArchiveBox/issues/496) (ALPHA), or [desktop app](https://github.com/ArchiveBox/electron-archivebox) (ALPHA) +- [**Usable as a oneshot CLI**](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#CLI-Usage), [**self-hosted web UI**](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#UI-Usage), [Python API](https://docs.archivebox.io/en/latest/modules.html) (BETA), [REST API](https://github.com/ArchiveBox/ArchiveBox/issues/496) (ALPHA), or [desktop app](https://github.com/ArchiveBox/electron-archivebox) - [**Saves all pages to archive.org as well**](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#save_archive_dot_org) by default for redundancy (can be [disabled](https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#stealth-mode) for local-only mode) - Advanced users: support for archiving [content requiring login/paywall/cookies](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#chrome_user_data_dir) (see wiki security caveats!) 
- Planned: support for running [JS during archiving](https://github.com/ArchiveBox/ArchiveBox/issues/51) to adblock, [autoscroll](https://github.com/ArchiveBox/ArchiveBox/issues/80), [modal-hide](https://github.com/ArchiveBox/ArchiveBox/issues/175), [thread-expand](https://github.com/ArchiveBox/ArchiveBox/issues/345) @@ -128,13 +130,13 @@ curl -sSL 'https://get.archivebox.io' | sh ArchiveBox is free for everyone to self-host, but we also provide support, security review, and custom integrations to help NGOs, governments, and other organizations [run ArchiveBox professionally](https://zulip.archivebox.io/#narrow/stream/167-enterprise/topic/welcome/near/1191102): - 🗞️ **Journalists:** - `crawling and collecting research`, `preserving quoted material`, `fact-checking and review` + `crawling during research`, `preserving cited pages`, `fact-checking & review` - ⚖️ **Lawyers:** - `collecting & preserving evidence`, `hashing / integrity checking / chain-of-custody`, `tagging & review` + `collecting & preserving evidence`, `detecting changes`, `tagging & review` - 🔬 **Researchers:** - `analyzing social media trends`, `collecting LLM training data`, `crawling to feed other pipelines` + `analyzing social media trends`, `getting LLM training sets`, `crawling pipelines` - 👩🏽 **Individuals:** - `saving legacy social media / memoirs`, `preserving portfolios / resume`, `backing up news articles` + `saving bookmarks`, `preserving portfolio content`, `legacy / memoirs archival` > ***[Contact our team](https://zulip.archivebox.io/#narrow/stream/167-enterprise/topic/welcome/near/1191102)** if your institution/org wants to use ArchiveBox professionally.* > @@ -154,7 +156,7 @@ ArchiveBox is free for everyone to self-host, but we also provide support, secur # Quickstart -**🖥  Supported OSs:** Linux/BSD, macOS, Windows (Docker)   **👾  CPUs:** `amd64` (`x86_64`), `arm64` (`arm8`), `arm7` (raspi>=3)
    +**🖥  Supported OSs:** Linux/BSD, macOS, Windows (Docker)   **👾  CPUs:** `amd64` (`x86_64`), `arm64`, `arm7` (raspi>=3)
    Note: On `arm7` the `playwright` package is not available, so `chromium` must be installed manually if needed.
    @@ -432,7 +434,7 @@ For more discussion on managed and paid hosting options see here: (depending on how you chose to install it) ```bash mkdir -p ~/archivebox/data # create a new data dir anywhere @@ -601,15 +604,20 @@ docker run -it -v $PWD:/data archivebox/archivebox add --depth=1 'https://exampl ## Input Formats: How to pass URLs into ArchiveBox for saving -- The official ArchiveBox Browser Extension (provides realtime archiving from Chrome/Chromium/Firefox browsers) +- The official ArchiveBox Browser Extension + Provides realtime archiving from Chrome/Chromium/Firefox browsers -- Manual imports of URLs from RSS, JSON, CSV, TXT, SQL, HTML, Markdown, or [any other text-based format...](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#Import-a-list-of-URLs-from-a-text-file) +- Manual imports of URLs from RSS, JSON, CSV, TXT, SQL, HTML, Markdown + ArchiveBox supports injecting URLs in [any other text-based format...](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#Import-a-list-of-URLs-from-a-text-file) -- [MITM Proxy](https://mitmproxy.org/) archiving with [`archivebox-proxy`](https://github.com/ArchiveBox/archivebox-proxy) ([realtime archiving](https://github.com/ArchiveBox/ArchiveBox/issues/577) of all traffic from any device going through the proxy) +- [MITM Proxy](https://mitmproxy.org/) archiving with [`archivebox-proxy`](https://github.com/ArchiveBox/archivebox-proxy) + Provides [realtime archiving](https://github.com/ArchiveBox/ArchiveBox/issues/577) of all traffic from any device going through the proxy. 
-- Exported [browser history](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive) or [browser bookmarks](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive) (see instructions for: [Chrome](https://support.google.com/chrome/answer/96816?hl=en), [Firefox](https://support.mozilla.org/en-US/kb/export-firefox-bookmarks-to-backup-or-transfer), [Safari](https://github.com/ArchiveBox/ArchiveBox/assets/511499/24ad068e-0fa6-41f4-a7ff-4c26fc91f71a), [IE](https://support.microsoft.com/en-us/help/211089/how-to-import-and-export-the-internet-explorer-favorites-folder-to-a-32-bit-version-of-windows), [Opera](https://help.opera.com/en/latest/features/#bookmarks:~:text=Click%20the%20import/-,export%20button,-on%20the%20bottom), [and more...](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive)) +- Exported [browser history](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive) or [browser bookmarks](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive) + See instructions for: Chrome, Firefox, Safari, IE, Opera, and more... 
    -- Links from [Pocket](https://getpocket.com/export), [Pinboard](https://pinboard.in/export/), [Instapaper](https://www.instapaper.com/user), [Shaarli](https://shaarli.readthedocs.io/en/master/Usage/#importexport), [Delicious](https://www.groovypost.com/howto/howto/export-delicious-bookmarks-xml/), [Reddit Saved](https://github.com/csu/export-saved-reddit), [Wallabag](https://doc.wallabag.org/en/user/import/wallabagv2.html), [Unmark.it](http://help.unmark.it/import-export), [OneTab](https://www.addictivetips.com/web/onetab-save-close-all-chrome-tabs-to-restore-export-or-import/), [Firefox Sync](https://github.com/ArchiveBox/ArchiveBox/issues/648), [and more...](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive) +- Links exported from Bookmarks on social media sites or bookmarking services + See instructions for: Pocket, Pinboard, Instapaper, Shaarli, Delicious, Reddit Saved, Wallabag, Unmark.it, OneTab, Firefox Sync, and more... @@ -650,7 +658,7 @@ It uses all available methods out-of-the-box, but you can disable extractors and
    -Expand to see the full list of ways ArchiveBox saves each page... +Expand to see the full list of ways it saves each page... ./archive/{Snapshot.id}/
    @@ -677,7 +685,7 @@ It uses all available methods out-of-the-box, but you can disable extractors and -ArchiveBox can be configured via environment variables, by using the `archivebox config` CLI, or by editing `./ArchiveBox.conf` directly. +ArchiveBox can be configured via environment variables, by using the `archivebox config` CLI, or by editing `./ArchiveBox.conf`.
    Expand to see examples... @@ -722,8 +730,9 @@ CURL_USER_AGENT="Mozilla/5.0 ..." To achieve high-fidelity archives in as many situations as possible, ArchiveBox depends on a variety of 3rd-party libraries and tools that specialize in extracting different types of content. -> Under-the-hood, ArchiveBox uses [Django](https://www.djangoproject.com/start/overview/) to power its [Web UI](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#ui-usage) and [SQlite](https://www.sqlite.org/locrsf.html) + the filesystem to provide [fast & durable metadata storage](https://www.sqlite.org/locrsf.html) w/ [determinisitc upgrades](https://stackoverflow.com/a/39976321/2156113). ArchiveBox bundles industry-standard tools like [Google Chrome](https://github.com/ArchiveBox/ArchiveBox/wiki/Chromium-Install), [`wget`, `yt-dlp`, `readability`, etc.](#dependencies) internally, and its operation can be [tuned, secured, and extended](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration) as-needed for many different applications. +> Under-the-hood, ArchiveBox uses [Django](https://www.djangoproject.com/start/overview/) to power its [Web UI](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#ui-usage) and [SQlite](https://www.sqlite.org/locrsf.html) + the filesystem to provide [fast & durable metadata storage](https://www.sqlite.org/locrsf.html) w/ [determinisitc upgrades](https://stackoverflow.com/a/39976321/2156113). +For the actual archiving, ArchiveBox bundles industry-standard tools like [Google Chrome](https://github.com/ArchiveBox/ArchiveBox/wiki/Chromium-Install), [`wget`, `yt-dlp`, `readability`, etc.](#dependencies) internally, and its operation can be [tuned, secured, and extended](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration) as-needed for many different applications.
    @@ -774,7 +783,7 @@ Installing directly on **Windows without Docker or WSL/WSL2/Cygwin is not offici ## Archive Layout -All of ArchiveBox's state (SQLite DB, archived assets, config, logs, etc.) is stored in a single folder called the "ArchiveBox Data Folder". +All of ArchiveBox's state (SQLite DB, archived assets, config, logs, etc.) is stored in a single folder (`data/`).
    @@ -823,17 +832,17 @@ Each snapshot subfolder ./archive/TIMESTAMP/ includes a static
    Expand to learn how to export your ArchiveBox collection...
    +
    +

    NOTE: These exports are not paginated; exporting many URLs or the entire archive at once may be slow. Use the filtering CLI flags on the archivebox list command to export specific Snapshots or ranges.

    +
    -> *NOTE: These exports are not paginated, exporting many URLs or the entire archive at once may be slow. Use the filtering CLI flags on the `archivebox list` command to export specific Snapshots or ranges.* - -```bash| -# do a one-off single URL archive wihout needing a data dir initialized +
    # do a one-off single URL archive without needing a data dir initialized
     archivebox oneshot 'https://example.com'
     
     # archivebox list --help
    @@ -843,16 +852,17 @@ archivebox list --csv=timestamp,url,title > index.csv  # export to csv spreadshe
     
     # (if using Docker Compose, add the -T flag when piping)
     # docker compose run -T archivebox list --html 'https://example.com' > index.json
    -```
    +
    The paths in the static exports are relative, make sure to keep them next to your `./archive` folder when backing them up or viewing them. -#### Learn More - -- https://github.com/ArchiveBox/ArchiveBox/wiki/Publishing-Your-Archive#2-export-and-host-it-as-static-html -- https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#publishing -- https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#public_index--public_snapshots--public_add_view +

    Learn More

    +

    @@ -876,8 +886,7 @@ If you're importing pages with private content or URLs containing secret tokens Expand to learn about privacy, permissions, and user accounts... -```bash -# don't save private content to ArchiveBox, e.g.: +
    # don't save private content to ArchiveBox, e.g.:
     archivebox add 'https://docs.google.com/document/d/12345somePrivateDocument'
     archivebox add 'https://vimeo.com/somePrivateVideo'
     
    @@ -893,19 +902,22 @@ archivebox manage createsuperuser
     # if extra paranoid or anti-Google:
     archivebox config --set SAVE_FAVICON=False          # disable favicon fetching (it calls a Google API passing the URL's domain part only)
     archivebox config --set CHROME_BINARY=chromium      # ensure it's using Chromium instead of Chrome
    -```
    +
    -> *CAUTION: Assume anyone *viewing* your archives will be able to see any cookies, session tokens, or private URLs passed to ArchiveBox during archiving.* -> *Make sure to secure your ArchiveBox data and don't share snapshots with others without stripping out sensitive headers and content first.* +
    +

    CAUTION: Assume anyone viewing your archives will be able to see any cookies, session tokens, or private URLs passed to ArchiveBox during archiving. +Make sure to secure your ArchiveBox data and don't share snapshots with others without stripping out sensitive headers and content first.

    +
    -#### Learn More - -- https://github.com/ArchiveBox/ArchiveBox/wiki/Publishing-Your-Archive -- https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview -- https://github.com/ArchiveBox/ArchiveBox/wiki/Chromium-Install#setting-up-a-chromium-user-profile -- https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#chrome_user_data_dir -- https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#cookies_file +

    Learn More

    +

    @@ -921,28 +933,27 @@ Be aware that malicious archived JS can access the contents of other pages in yo Expand to see risks and mitigations... -```bash -# visiting an archived page with malicious JS: +
    # visiting an archived page with malicious JS:
     https://127.0.0.1:8000/archive/1602401954/example.com/index.html
     
     # example.com/index.js can now make a request to read everything from:
     https://127.0.0.1:8000/index.html
     https://127.0.0.1:8000/archive/*
     # then example.com/index.js can send it off to some evil server
    -```
    +
    -The admin UI is also served from the same origin as replayed JS, so malicious pages could also potentially use your ArchiveBox login cookies to perform admin actions (e.g. adding/removing links, running extractors, etc.). We are planning to fix this security shortcoming in a future version by using separate ports/origins to serve the Admin UI and archived content (see [Issue #239](https://github.com/ArchiveBox/ArchiveBox/issues/239)). - -> *NOTE: Only the `wget` & `dom` extractor methods execute archived JS when viewing snapshots, all other archive methods produce static output that does not execute JS on viewing.* -> *If you are worried about these issues ^ you should disable these extractors using `archivebox config --set SAVE_WGET=False SAVE_DOM=False`.* - -#### Learn More - -- https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview -- https://github.com/ArchiveBox/ArchiveBox/issues/239 -- https://github.com/ArchiveBox/ArchiveBox/security/advisories/GHSA-cr45-98w9-gwqx (`CVE-2023-45815`) -- https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#publishing +
    +

    NOTE: Only the wget & dom extractor methods execute archived JS when viewing snapshots, all other archive methods produce static output that does not execute JS on viewing. +If you are worried about these issues ^ you should disable these extractors using archivebox config --set SAVE_WGET=False SAVE_DOM=False.

    +
    +

    Learn More

    +

    @@ -958,13 +969,14 @@ For various reasons, many large sites (Reddit, Twitter, Cloudflare, etc.) active
    -- Set [`CHROME_USER_AGENT`, `WGET_USER_AGENT`, `CURL_USER_AGENT`](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#curl_user_agent) to impersonate a real browser (instead of an ArchiveBox bot) -- Set up a logged-in browser session for archiving using [`CHROME_DATA_DIR` & `COOKIES_FILE`](https://github.com/ArchiveBox/ArchiveBox/wiki/Chromium-Install#setting-up-a-chromium-user-profile) -- Rewrite your URLs before archiving to swap in an alternative frontend thats more bot-friendly e.g. - `reddit.com/some/url` -> `teddit.net/some/url`: https://github.com/mendel5/alternative-front-ends + - -In the future we plan on adding support for running JS scripts during archiving to block ads, cookie popups, modals, and fix other issues. Follow here for progress: [Issue #51](https://github.com/ArchiveBox/ArchiveBox/issues/51). +In the future we plan on adding support for running JS scripts during archiving to block ads, cookie popups, modals, and fix other issues. Follow here for progress: Issue #51.

    @@ -983,11 +995,10 @@ ArchiveBox appends a hash with the current date `https://example.com#2020-10-24` Because ArchiveBox uniquely identifies snapshots by URL, it must use a workaround to take multiple snapshots of the same URL (otherwise they would show up as a single Snapshot entry). It makes the URLs of repeated snapshots unique by adding a hash with the archive date at the end: -```bash -archivebox add 'https://example.com#2020-10-24' +
    archivebox add 'https://example.com#2020-10-24'
     ...
     archivebox add 'https://example.com#2020-10-25'
    -```
    +
    The Re-Snapshot Button button in the Admin UI is a shortcut for this hash-date multi-snapshotting workaround. @@ -1164,7 +1175,7 @@ ArchiveBox is neither the highest fidelity nor the simplest tool available for s
    -Our Community Wiki strives to be a comprehensive index of the broader web archiving community... +Our Community Wiki strives to be a comprehensive index of the web archiving industry...
    - [Community Wiki](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community) From b72a8ab65418273beaf9187940223b4d09e8cf60 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 30 Jan 2024 01:46:33 -0800 Subject: [PATCH 089/227] README.md fixes --- README.md | 47 ++++++++++++++++++++++++++--------------------- 1 file changed, 26 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index 75208349..9dd8a1af 100644 --- a/README.md +++ b/README.md @@ -30,13 +30,13 @@ Without active preservation effort, everything on the internet eventually dissap
    📥 **You can feed ArchiveBox URLs one at a time, or schedule regular imports** from your bookmarks or history, social media feeds or RSS, link-saving services like Pocket/Pinboard, our [Browser Extension](https://chromewebstore.google.com/detail/archivebox-exporter/habonpimjphpdnmcfkaockjnffodikoj), and more. -See Input Formats for a full list... +See Input Formats for a full list of supported input formats... snapshot detail page **It saves snapshots of the URLs you feed it in several redundant formats.** It also detects any content featured *inside* pages & extracts it out into a folder: -- 🌐 **HTML**/**Any websites** ➡️ `original HTML+CSS+JS`, `singlefile HTML`, `screenshot PNG`, `PDF`, `WARC`, ... +- 🌐 **HTML**/**Any websites** ➡️ `original HTML+CSS+JS`, `singlefile HTML`, `screenshot PNG`, `PDF`, `WARC`, `article text MD`, `headers JSON`, `title`, `favicon`, ... - 🎥 **Social Media**/**News** ➡️ `post content TXT`, `comments`, `title`, `author`, `images` - 🎬 **YouTube**/**SoundCloud**/etc. ➡️ `MP3/MP4`s, `subtitles`, `metadata`, `thumbnail`, ... - 💾 **Github**/**Gitlab**/etc. links ➡️ `clone of GIT source code`, `README`, `images`, ... @@ -166,7 +166,7 @@ ArchiveBox is free for everyone to self-host, but we also provide support, secur
    Docker docker-compose (macOS/Linux/Windows)   👈  recommended   (click to expand)
    -👍 Docker Compose is recommended for the easiest install/update UX + best security + all the extras out-of-the-box. +👍 Docker Compose is recommended for the easiest install/update UX + best security + all extras out-of-the-box.

    1. Install Docker on your system (if not already installed).
    2. @@ -336,8 +336,7 @@ See the homebr Arch pacman / FreeBSD pkg / Nix nix (Arch/FreeBSD/NixOS/more)
      -> [!WARNING] -> *These are contributed by external volunteers and may lag behind the official `pip` channel.* +> *Warning: These are contributed by external volunteers and may lag behind the official `pip` channel.*
    -Self-hosting PlatformsTrueNAS / YunoHost / Cloudron / UNRAID / etc. (self-hosting solutions) +Self-hosting Platforms TrueNAS / UNRAID / YunoHost / Cloudron / etc. (self-hosting solutions)
    -> [!WARNING] -> *These are contributed by external volunteers and may lag behind the official `pip` channel.* +> *Warning: These are contributed by external volunteers and may lag behind the official `pip` channel.*
  • Article Text: article.html/json Article text extraction using Readability & Mercury
  • Archive.org Permalink: archive.org.txt A link to the saved site on archive.org
  • -
  • Audio & Video: media/ all audio/video files + playlists, including subtitles & metadata with youtube-dl (or yt-dlp)
  • +
  • Audio & Video: media/ all audio/video files + playlists, including subtitles & metadata w/ yt-dlp
  • Source Code: git/ clone of any repository found on GitHub, Bitbucket, or GitLab links
  • More coming soon! See the Roadmap...
  • @@ -737,7 +737,7 @@ To achieve high-fidelity archives in as many situations as possible, ArchiveBox > Under-the-hood, ArchiveBox uses [Django](https://www.djangoproject.com/start/overview/) to power its [Web UI](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#ui-usage) and [SQlite](https://www.sqlite.org/locrsf.html) + the filesystem to provide [fast & durable metadata storage](https://www.sqlite.org/locrsf.html) w/ [determinisitc upgrades](https://stackoverflow.com/a/39976321/2156113). -For the actual archiving, ArchiveBox bundles industry-standard tools like [Google Chrome](https://github.com/ArchiveBox/ArchiveBox/wiki/Chromium-Install), [`wget`, `yt-dlp`, `readability`, etc.](#dependencies) internally, and its operation can be [tuned, secured, and extended](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration) as-needed for many different applications. +ArchiveBox bundles industry-standard tools like [Google Chrome](https://github.com/ArchiveBox/ArchiveBox/wiki/Chromium-Install), [`wget`, `yt-dlp`, `readability`, etc.](#dependencies) internally, and its operation can be [tuned, secured, and extended](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration) as-needed for many different applications.
    @@ -788,7 +788,7 @@ Installing directly on **Windows without Docker or WSL/WSL2/Cygwin is not offici ## Archive Layout -All of ArchiveBox's state (SQLite DB, archived assets, config, logs, etc.) is stored in a single folder (`data/`). +All of ArchiveBox's state (SQLite DB, content, config, logs, etc.) is stored in a single folder per collection.
    @@ -824,11 +824,11 @@ Each snapshot subfolder ./archive/TIMESTAMP/ includes a static Learn More
    @@ -864,9 +864,9 @@ The paths in the static exports are relative, make sure to keep them next to you

    Learn More

    @@ -917,11 +917,11 @@ archivebox config --set CHROME_BINARY=chromium # ensure it's using Chromium

    Learn More

    @@ -954,10 +954,10 @@ https://127.0.0.1:8000/archive/*

    Learn More

    @@ -975,7 +975,7 @@ For various reasons, many large sites (Reddit, Twitter, Cloudflare, etc.) active
      -
    • Set CHROME_USER_AGENT, WGET_USER_AGENT, CURL_USER_AGENT to impersonate a real browser (instead of an ArchiveBox bot)
    • +
    • Set CHROME_USER_AGENT, WGET_USER_AGENT, CURL_USER_AGENT to impersonate a real browser (by default, ArchiveBox reveals that it's a bot when using the default user agent settings)
    • Set up a logged-in browser session for archiving using CHROME_DATA_DIR & COOKIES_FILE
    • Rewrite your URLs before archiving to swap in an alternative frontend thats more bot-friendly e.g.
      reddit.com/some/url -> teddit.net/some/url: https://github.com/mendel5/alternative-front-ends
    • @@ -994,7 +994,7 @@ ArchiveBox appends a hash with the current date `https://example.com#2020-10-24`
      -Click to learn how the `Re-Snapshot` feature works... +Click to learn how the Re-Snapshot feature works...
      From ba851b17a69e59cf909359cdfde0d99808e0bab6 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 30 Jan 2024 02:20:38 -0800 Subject: [PATCH 091/227] more README html-ifying --- README.md | 113 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 59 insertions(+), 54 deletions(-) diff --git a/README.md b/README.md index 6d2f6c62..d3c0b16f 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,7 @@ Without active preservation effort, everything on the internet eventually dissap **It saves snapshots of the URLs you feed it in several redundant formats.** It also detects any content featured *inside* pages & extracts it out into a folder: -- 🌐 **HTML**/**Any websites** ➡️ `original HTML+CSS+JS`, `singlefile HTML`, `screenshot PNG`, `PDF`, `WARC`, `article text MD`, `headers JSON`, `title`, `favicon`, ... +- 🌐 **HTML**/**Any websites** ➡️ `original HTML+CSS+JS`, `singlefile HTML`, `screenshot PNG`, `PDF`, `WARC`, `title`, `article text`, `favicon`, `headers`, ... - 🎥 **Social Media**/**News** ➡️ `post content TXT`, `comments`, `title`, `author`, `images` - 🎬 **YouTube**/**SoundCloud**/etc. ➡️ `MP3/MP4`s, `subtitles`, `metadata`, `thumbnail`, ... - 💾 **Github**/**Gitlab**/etc. links ➡️ `clone of GIT source code`, `README`, `images`, ... @@ -134,7 +134,7 @@ ArchiveBox is free for everyone to self-host, but we also provide support, secur - ⚖️ **Lawyers:** `collecting & preserving evidence`, `detecting changes`, `tagging & review` - 🔬 **Researchers:** - `analyzing social media trends`, `getting LLM training sets`, `crawling pipelines` + `analyzing social media trends`, `getting LLM training data`, `crawling pipelines` - 👩🏽 **Individuals:** `saving bookmarks`, `preserving portfolio content`, `legacy / memoirs archival` @@ -471,8 +471,8 @@ docker compose run archivebox help curl sh automatic setup script CLI Usage Examples (non-Docker)
      
      -# make sure you have pip-installed ArchiveBox and it's available in your $PATH first
      -
      +# make sure you have pip-installed ArchiveBox and it's available in your $PATH first  
      +
      # archivebox [subcommand] [--args] archivebox init --setup # safe to run init multiple times (also how you update versions) archivebox version # get archivebox version info + check dependencies @@ -488,7 +488,7 @@ archivebox add --depth=1 'https://news.ycombinator.com'
      
       # make sure you have `docker-compose.yml` from the Quickstart instructions first
      -
      +
      # docker compose run archivebox [subcommand [--args] docker compose run archivebox init --setup docker compose run archivebox version @@ -505,7 +505,7 @@ docker compose run archivebox add --depth=1 'https://news.ycombinator.com'
      
       # make sure you create and cd into in a new empty directory first  
      -
      +
      # docker run -it -v $PWD:/data archivebox/archivebox [subcommand [--args] docker run -v $PWD:/data -it archivebox/archivebox init --setup docker run -v $PWD:/data -it archivebox/archivebox version @@ -610,19 +610,19 @@ docker run -it -v $PWD:/data archivebox/archivebox add --depth=1 'https://exampl ## Input Formats: How to pass URLs into ArchiveBox for saving -- The official ArchiveBox Browser Extension - Provides realtime archiving of browsing history or selected pages from Chrome/Chromium/Firefox browsers +- From the official ArchiveBox Browser Extension + Provides realtime archiving of browsing history or selected pages from Chrome/Chromium/Firefox browsers. -- Manual imports of URLs from RSS, JSON, CSV, TXT, SQL, HTML, Markdown, etc. files - ArchiveBox supports injesting URLs in [any text-based format...](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#Import-a-list-of-URLs-from-a-text-file) +- From manual imports of URLs from RSS, JSON, CSV, TXT, SQL, HTML, Markdown, etc. files + ArchiveBox supports injesting URLs in [any text-based format](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#Import-a-list-of-URLs-from-a-text-file). -- Manually exported [browser history](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive) or [browser bookmarks](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive) from any browser +- From manually exported [browser history](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive) or [browser bookmarks](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive) (in Netscape format) See instructions for: Chrome, Firefox, Safari, IE, Opera, and more... 
-- [MITM Proxy](https://mitmproxy.org/) archiving with [`archivebox-proxy`](https://github.com/ArchiveBox/archivebox-proxy) +- From URLs visited through a [MITM Proxy](https://mitmproxy.org/) with [`archivebox-proxy`](https://github.com/ArchiveBox/archivebox-proxy) Provides [realtime archiving](https://github.com/ArchiveBox/ArchiveBox/issues/577) of all traffic from any device going through the proxy. -- Links from bookmarking services or social media (e.g. Twitter bookmarks, Reddit saved posts, etc.) +- From bookmarking services or social media (e.g. Twitter bookmarks, Reddit saved posts, etc.) See instructions for: Pocket, Pinboard, Instapaper, Shaarli, Delicious, Reddit Saved, Wallabag, Unmark.it, OneTab, Firefox Sync, and more... @@ -743,44 +743,47 @@ ArchiveBox bundles industry-standard tools like [Google Chrome](https://github.c
      Expand to learn more about ArchiveBox's internals & dependencies...
      -> *TIP: For better security, easier updating, and to avoid polluting your host system with extra dependencies,**it is strongly recommended to use the [⭐️ official Docker image](https://github.com/ArchiveBox/ArchiveBox/wiki/Docker)** with everything pre-installed for the best experience.* +
      +

      TIP: For better security, easier updating, and to avoid polluting your host system with extra dependencies, it is strongly recommended to use the ⭐️ official Docker image with everything pre-installed for the best experience.

      +
      These optional dependencies used for archiving sites include: -archivebox --version CLI output screenshot showing dependencies installed +archivebox --version CLI output screenshot showing dependencies installed +
        +
      • chromium / chrome (for screenshots, PDF, DOM HTML, and headless JS scripts)
      • +
      • node & npm (for readability, mercury, and singlefile)
      • +
      • wget (for plain HTML, static files, and WARC saving)
      • +
      • curl (for fetching headers, favicon, and posting to Archive.org)
      • +
      • yt-dlp or youtube-dl (for audio, video, and subtitles)
      • +
      • git (for cloning git repos)
      • +
      • singlefile (for saving into a self-contained html file)
      • +
      • postlight/parser (for discussion threads, forums, and articles)
      • +
      • readability (for articles and long text content)
      • +
      • and more as we grow...
      • +
      -- `chromium` / `chrome` (for screenshots, PDF, DOM HTML, and headless JS scripts) -- `node` & `npm` (for readability, mercury, and singlefile) -- `wget` (for plain HTML, static files, and WARC saving) -- `curl` (for fetching headers, favicon, and posting to Archive.org) -- `yt-dlp` or `youtube-dl` (for audio, video, and subtitles) -- `git` (for cloning git repos) -- `singlefile` (for saving into a self-contained html file) -- `postlight/parser` (for discussion threads, forums, and articles) -- `readability` (for articles and long text content) -- and more as we grow... - -You don't need to install every dependency to use ArchiveBox. ArchiveBox will automatically disable extractors that rely on dependencies that aren't installed, based on what is configured and available in your `$PATH`. - +You don't need to install every dependency to use ArchiveBox. ArchiveBox will automatically disable extractors that rely on dependencies that aren't installed, based on what is configured and available in your $PATH. + If not using Docker, make sure to keep the dependencies up-to-date yourself and check that ArchiveBox isn't reporting any incompatibility with the versions you install. -```bash -# install python3 and archivebox with your system package manager +
      #install python3 and archivebox with your system package manager
       # apt/brew/pip/etc install ... (see Quickstart instructions above)
      -
      +
      archivebox setup # auto install all the extractors and extras archivebox --version # see info and check validity of installed dependencies -``` +
      + +Installing directly on Windows without Docker or WSL/WSL2/Cygwin is not officially supported (I cannot respond to Windows support tickets), but some advanced users have reported getting it working. -Installing directly on **Windows without Docker or WSL/WSL2/Cygwin is not officially supported** (I cannot respond to Windows support tickets), but some advanced users have reported getting it working. - -#### Learn More - -- https://github.com/ArchiveBox/ArchiveBox/wiki/Install#dependencies -- https://github.com/ArchiveBox/ArchiveBox/wiki/Chromium-Install -- https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives -- https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting#installing +

      Learn More

      +

      @@ -948,8 +951,8 @@ https://127.0.0.1:8000/archive/*
      -

      NOTE: Only the wget & dom extractor methods execute archived JS when viewing snapshots, all other archive methods produce static output that does not execute JS on viewing. -If you are worried about these issues ^ you should disable these extractors using archivebox config --set SAVE_WGET=False SAVE_DOM=False.

      +

      NOTE: Only the wget & dom extractor methods execute archived JS when viewing snapshots, all other archive methods produce static output that does not execute JS on viewing.
      +If you are worried about these issues ^ you should disable these extractors using:
      archivebox config --set SAVE_WGET=False SAVE_DOM=False.

      Learn More

      @@ -1007,13 +1010,14 @@ archivebox add 'https://example.com#2020-10-25' The Re-Snapshot Button button in the Admin UI is a shortcut for this hash-date multi-snapshotting workaround. -Improved support for saving multiple snapshots of a single URL without this hash-date workaround will be [added eventually](https://github.com/ArchiveBox/ArchiveBox/issues/179) (along with the ability to view diffs of the changes between runs). +Improved support for saving multiple snapshots of a single URL without this hash-date workaround will be added eventually (along with the ability to view diffs of the changes between runs). -#### Learn More - -- https://github.com/ArchiveBox/ArchiveBox/issues/179 -- https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#explanation-of-buttons-in-the-web-ui---admin-snapshots-list +

      Learn More

      +

      @@ -1036,14 +1040,15 @@ Disk usage can be reduced by using a compressed/deduplicated filesystem like ZFS If using Docker or NFS/SMB/FUSE for the `data/archive/` folder, you may need to set [`PUID` & `PGID`](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#puid--pgid) and [disable `root_squash`](https://github.com/ArchiveBox/ArchiveBox/issues/1304) on your fileshare server. -#### Learn More - -- https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#Disk-Layout -- https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#output-folder -- https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#large-archives -- https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#puid--pgid -- https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#do-not-run-as-root +

      Learn More

      +
    From 6f87bf014e8ad568799dc768dfdfe284275ed0cc Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 30 Jan 2024 02:29:30 -0800 Subject: [PATCH 092/227] Update README.md fix CSS resizing table images --- README.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index d3c0b16f..7ec85114 100644 --- a/README.md +++ b/README.md @@ -1069,31 +1069,31 @@ If using Docker or NFS/SMB/FUSE for the `data/archive/` folder, you may need to -brew install archivebox
    -archivebox version +brew install archivebox
    +archivebox version -archivebox init
    +archivebox init
    -archivebox add +archivebox add -archivebox data dir +archivebox data dir -archivebox server +archivebox server -archivebox server add +archivebox server add -archivebox server list +archivebox server list -archivebox server detail +archivebox server detail From 21584cdd7200a24fec745a2b93e2dd852753c0a4 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 30 Jan 2024 02:34:30 -0800 Subject: [PATCH 093/227] Update README.md --- README.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 7ec85114..61fb5ed8 100644 --- a/README.md +++ b/README.md @@ -1035,7 +1035,11 @@ Because ArchiveBox is designed to ingest a large volume of URLs with multiple co **ArchiveBox can use anywhere from ~1gb per 1000 articles, to ~50gb per 1000 articles**, mostly dependent on whether you're saving audio & video using `SAVE_MEDIA=True` and whether you lower `MEDIA_MAX_SIZE=750mb`. -Disk usage can be reduced by using a compressed/deduplicated filesystem like ZFS/BTRFS, or by turning off extractors methods you don't need. You can also deduplicate content with a tool like [fdupes](https://github.com/adrianlopezroche/fdupes) or [rdfind](https://github.com/pauldreik/rdfind). **Don't store large collections on older filesystems like EXT3/FAT** as they may not be able to handle more than 50k directory entries in the `archive/` folder. **Try to keep the `index.sqlite3` file on local drive (not a network mount)** or SSD for maximum performance, however the `archive/` folder can be on a network mount or slower HDD. +Disk usage can be reduced by using a compressed/deduplicated filesystem like ZFS/BTRFS, or by turning off extractors methods you don't need. You can also deduplicate content with a tool like [fdupes](https://github.com/adrianlopezroche/fdupes) or [rdfind](https://github.com/pauldreik/rdfind). 
+ +**Don't store large collections on older filesystems like EXT3/FAT** as they may not be able to handle more than 50k directory entries in the `archive/` folder. + +**Try to keep the `index.sqlite3` file on local drive (not a network mount)** or SSD for maximum performance, however the `archive/` folder can be on a network mount or slower HDD. If using Docker or NFS/SMB/FUSE for the `data/archive/` folder, you may need to set [`PUID` & `PGID`](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#puid--pgid) and [disable `root_squash`](https://github.com/ArchiveBox/ArchiveBox/issues/1304) on your fileshare server. @@ -1124,7 +1128,7 @@ Whether it's to resist censorship by saving articles before they get taken down

    - Image from Perma.cc...
    +Image from Perma.cc...
    From 68d12b4ccb464de4ce464403d72569452ce2622a Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 30 Jan 2024 02:38:39 -0800 Subject: [PATCH 094/227] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 61fb5ed8..f0186438 100644 --- a/README.md +++ b/README.md @@ -21,9 +21,9 @@ curl -sSL 'https://get.archivebox.io' | sh # (or see pip/brew/Docker instruct **ArchiveBox is a powerful, self-hosted internet archiving solution to collect, save, and view websites offline.** -Without active preservation effort, everything on the internet eventually dissapears or degrades. Archive.org does a great job as a free central archive, but they require all archives to be public, and they can't save every type of content. +Without active preservation effort, everything on the internet eventually dissapears or degrades. Archive.org does a great job as a centralized service, but they require all saved URLs to be public, and they can't save every type of content. 
-*ArchiveBox is an open source tool that helps organizations and individuals archive web content and retain control over their data: save copies of browser bookmarks, preserve evidence for legal cases, backup photos from FB / Insta / Flickr, download your media from YT / Soundcloud / etc., snapshot research papers & academic citations, and more...* +*ArchiveBox is an open source tool that helps organizations & individuals archive web content and retain control over their data: save copies of your bookmarks, preserve evidence for legal cases, backup photos from FB/Insta/Flickr, download media from YT/Soundcloud/etc., snapshot research papers, and more...* > ➡️ *ArchiveBox is available on [Linux](#quickstart)/[macOS](#quickstart)/[Windows](#quickstart)/[Docker](#quickstart) as a [CLI tool](#usage), [self-hosted Web App](https://github.com/ArchiveBox/ArchiveBox/wiki/Publishing-Your-Archive), [`pip` library](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#python-shell-usage), or [one-off command](#static-archive-exporting).* From 1f5c6d1df87f592818899c39fac7eebf7f791736 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 30 Jan 2024 02:39:11 -0800 Subject: [PATCH 095/227] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f0186438..5b473226 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ curl -sSL 'https://get.archivebox.io' | sh # (or see pip/brew/Docker instruct **ArchiveBox is a powerful, self-hosted internet archiving solution to collect, save, and view websites offline.** -Without active preservation effort, everything on the internet eventually dissapears or degrades. Archive.org does a great job as a centralized service, but they require all saved URLs to be public, and they can't save every type of content. +Without active preservation effort, everything on the internet eventually dissapears or degrades. 
Archive.org does a great job as a centralized service, but they require saved URLs to be public, and they can't save every type of content. *ArchiveBox is an open source tool that helps organizations & individuals archive web content and retain control over their data: save copies of your bookmarks, preserve evidence for legal cases, backup photos from FB/Insta/Flickr, download media from YT/Soundcloud/etc., snapshot research papers, and more...* From 780dac3b120cf5d2adb3a62ce4a6628bbb4d7360 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 30 Jan 2024 02:41:07 -0800 Subject: [PATCH 096/227] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 5b473226..bb544f05 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ curl -sSL 'https://get.archivebox.io' | sh # (or see pip/brew/Docker instruct **ArchiveBox is a powerful, self-hosted internet archiving solution to collect, save, and view websites offline.** -Without active preservation effort, everything on the internet eventually dissapears or degrades. Archive.org does a great job as a centralized service, but they require saved URLs to be public, and they can't save every type of content. +Without active preservation effort, everything on the internet eventually dissapears or degrades. Archive.org does a great job as a centralized service, but saved URLs have to be public, and they can't save every type of content. 
*ArchiveBox is an open source tool that helps organizations & individuals archive web content and retain control over their data: save copies of your bookmarks, preserve evidence for legal cases, backup photos from FB/Insta/Flickr, download media from YT/Soundcloud/etc., snapshot research papers, and more...* From a3dc7106ee66c63ebbabfd3c89e66997becdb26b Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 30 Jan 2024 02:42:54 -0800 Subject: [PATCH 097/227] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index bb544f05..d000efb4 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,7 @@ curl -sSL 'https://get.archivebox.io' | sh # (or see pip/brew/Docker instruct Without active preservation effort, everything on the internet eventually dissapears or degrades. Archive.org does a great job as a centralized service, but saved URLs have to be public, and they can't save every type of content. -*ArchiveBox is an open source tool that helps organizations & individuals archive web content and retain control over their data: save copies of your bookmarks, preserve evidence for legal cases, backup photos from FB/Insta/Flickr, download media from YT/Soundcloud/etc., snapshot research papers, and more...* +*ArchiveBox is an open source tool that lets organizations & individuals archive both public & private web content while retaining control over their data. 
It can be used to save copies of bookmarks, preserve evidence for legal cases, backup photos from FB/Insta/Flickr, download media from YT/Soundcloud/etc., snapshot research papers, and more...* > ➡️ *ArchiveBox is available on [Linux](#quickstart)/[macOS](#quickstart)/[Windows](#quickstart)/[Docker](#quickstart) as a [CLI tool](#usage), [self-hosted Web App](https://github.com/ArchiveBox/ArchiveBox/wiki/Publishing-Your-Archive), [`pip` library](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#python-shell-usage), or [one-off command](#static-archive-exporting).* From 85be7f891abd09d84ef0b7d053adb258db615caf Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 30 Jan 2024 02:44:04 -0800 Subject: [PATCH 098/227] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index d000efb4..9ac76b55 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,7 @@ curl -sSL 'https://get.archivebox.io' | sh # (or see pip/brew/Docker instruct Without active preservation effort, everything on the internet eventually dissapears or degrades. Archive.org does a great job as a centralized service, but saved URLs have to be public, and they can't save every type of content. -*ArchiveBox is an open source tool that lets organizations & individuals archive both public & private web content while retaining control over their data. It can be used to save copies of bookmarks, preserve evidence for legal cases, backup photos from FB/Insta/Flickr, download media from YT/Soundcloud/etc., snapshot research papers, and more...* +*ArchiveBox is an open source tool that lets organizations & individuals archive both public & private web content while retaining control over their data. 
It can be used to save copies of bookmarks, preserve evidence for legal cases, backup photos from FB/Insta/Flickr or media from YT/Soundcloud/etc., save research papers, and more...* > ➡️ *ArchiveBox is available on [Linux](#quickstart)/[macOS](#quickstart)/[Windows](#quickstart)/[Docker](#quickstart) as a [CLI tool](#usage), [self-hosted Web App](https://github.com/ArchiveBox/ArchiveBox/wiki/Publishing-Your-Archive), [`pip` library](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#python-shell-usage), or [one-off command](#static-archive-exporting).* From b004aa517058c79fe6b4946ce1cd9fcf444b389c Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 30 Jan 2024 02:47:13 -0800 Subject: [PATCH 099/227] Update README.md --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 9ac76b55..a319ff73 100644 --- a/README.md +++ b/README.md @@ -25,12 +25,13 @@ Without active preservation effort, everything on the internet eventually dissap *ArchiveBox is an open source tool that lets organizations & individuals archive both public & private web content while retaining control over their data. 
It can be used to save copies of bookmarks, preserve evidence for legal cases, backup photos from FB/Insta/Flickr or media from YT/Soundcloud/etc., save research papers, and more...* -> ➡️ *ArchiveBox is available on [Linux](#quickstart)/[macOS](#quickstart)/[Windows](#quickstart)/[Docker](#quickstart) as a [CLI tool](#usage), [self-hosted Web App](https://github.com/ArchiveBox/ArchiveBox/wiki/Publishing-Your-Archive), [`pip` library](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#python-shell-usage), or [one-off command](#static-archive-exporting).* +> ➡️ *ArchiveBox is available on [Linux](#quickstart), [macOS](#quickstart), [Windows](#quickstart), and [Docker](#quickstart) and can be used as a +> [CLI tool](#usage), [self-hosted Web App](https://github.com/ArchiveBox/ArchiveBox/wiki/Publishing-Your-Archive), [`pip` library](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#python-shell-usage), or [one-off command](#static-archive-exporting).*
    📥 **You can feed ArchiveBox URLs one at a time, or schedule regular imports** from your bookmarks or history, social media feeds or RSS, link-saving services like Pocket/Pinboard, our [Browser Extension](https://chromewebstore.google.com/detail/archivebox-exporter/habonpimjphpdnmcfkaockjnffodikoj), and more. -See Input Formats for a full list of supported input formats... +See Input Formats for a full list of supported input formats... snapshot detail page From eed91485923b7f8610b49da11f1f87afa1316a2e Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 30 Jan 2024 02:49:34 -0800 Subject: [PATCH 100/227] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index a319ff73..63ea145d 100644 --- a/README.md +++ b/README.md @@ -25,8 +25,8 @@ Without active preservation effort, everything on the internet eventually dissap *ArchiveBox is an open source tool that lets organizations & individuals archive both public & private web content while retaining control over their data. It can be used to save copies of bookmarks, preserve evidence for legal cases, backup photos from FB/Insta/Flickr or media from YT/Soundcloud/etc., save research papers, and more...* -> ➡️ *ArchiveBox is available on [Linux](#quickstart), [macOS](#quickstart), [Windows](#quickstart), and [Docker](#quickstart) and can be used as a -> [CLI tool](#usage), [self-hosted Web App](https://github.com/ArchiveBox/ArchiveBox/wiki/Publishing-Your-Archive), [`pip` library](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#python-shell-usage), or [one-off command](#static-archive-exporting).* +> ➡️ *ArchiveBox is available via `pip install archivebox` on [Linux](#quickstart), [macOS](#quickstart), and [Windows](#quickstart) (WSL2), or via [Docker](#quickstart). 
+> It can be used as a [CLI tool](#usage), [self-hosted Web App](https://github.com/ArchiveBox/ArchiveBox/wiki/Publishing-Your-Archive), [`pip` library](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#python-shell-usage), or [one-off command](#static-archive-exporting).*
    From e42a7390fbfbd7de18e2799173dd94667fbcd28e Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 30 Jan 2024 02:50:36 -0800 Subject: [PATCH 101/227] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 63ea145d..b801582a 100644 --- a/README.md +++ b/README.md @@ -25,8 +25,8 @@ Without active preservation effort, everything on the internet eventually dissap *ArchiveBox is an open source tool that lets organizations & individuals archive both public & private web content while retaining control over their data. It can be used to save copies of bookmarks, preserve evidence for legal cases, backup photos from FB/Insta/Flickr or media from YT/Soundcloud/etc., save research papers, and more...* -> ➡️ *ArchiveBox is available via `pip install archivebox` on [Linux](#quickstart), [macOS](#quickstart), and [Windows](#quickstart) (WSL2), or via [Docker](#quickstart). -> It can be used as a [CLI tool](#usage), [self-hosted Web App](https://github.com/ArchiveBox/ArchiveBox/wiki/Publishing-Your-Archive), [`pip` library](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#python-shell-usage), or [one-off command](#static-archive-exporting).* +> ➡️ *Get ArchiveBox with `pip install archivebox` on [Linux](#quickstart), [macOS](#quickstart), and [Windows](#quickstart) (WSL2), or via [Docker](#quickstart). +> Then it can be used as a [CLI tool](#usage), [self-hosted Web App](https://github.com/ArchiveBox/ArchiveBox/wiki/Publishing-Your-Archive), [`pip` library](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#python-shell-usage), or [one-off command](#static-archive-exporting).*
    From 30739224402b955393a582315eb4ffbb047679e3 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 30 Jan 2024 02:52:33 -0800 Subject: [PATCH 102/227] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index b801582a..ee1871dd 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,7 @@ Without active preservation effort, everything on the internet eventually dissap *ArchiveBox is an open source tool that lets organizations & individuals archive both public & private web content while retaining control over their data. It can be used to save copies of bookmarks, preserve evidence for legal cases, backup photos from FB/Insta/Flickr or media from YT/Soundcloud/etc., save research papers, and more...* -> ➡️ *Get ArchiveBox with `pip install archivebox` on [Linux](#quickstart), [macOS](#quickstart), and [Windows](#quickstart) (WSL2), or via [Docker](#quickstart). +> ➡️ *Get ArchiveBox with `pip install archivebox` on [Linux](#quickstart), [macOS](#quickstart), and [Windows](#quickstart) (WSL2), or via **[⭐️ Docker](#quickstart)**. > Then it can be used as a [CLI tool](#usage), [self-hosted Web App](https://github.com/ArchiveBox/ArchiveBox/wiki/Publishing-Your-Archive), [`pip` library](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#python-shell-usage), or [one-off command](#static-archive-exporting).*
    From 2c51430a318caaa8f5803f88940fa63c2856d2e1 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 30 Jan 2024 02:53:25 -0800 Subject: [PATCH 103/227] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index ee1871dd..373cb272 100644 --- a/README.md +++ b/README.md @@ -25,8 +25,8 @@ Without active preservation effort, everything on the internet eventually dissap *ArchiveBox is an open source tool that lets organizations & individuals archive both public & private web content while retaining control over their data. It can be used to save copies of bookmarks, preserve evidence for legal cases, backup photos from FB/Insta/Flickr or media from YT/Soundcloud/etc., save research papers, and more...* -> ➡️ *Get ArchiveBox with `pip install archivebox` on [Linux](#quickstart), [macOS](#quickstart), and [Windows](#quickstart) (WSL2), or via **[⭐️ Docker](#quickstart)**. -> Then it can be used as a [CLI tool](#usage), [self-hosted Web App](https://github.com/ArchiveBox/ArchiveBox/wiki/Publishing-Your-Archive), [`pip` library](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#python-shell-usage), or [one-off command](#static-archive-exporting).* +> ➡️ Get ArchiveBox with `pip install archivebox` on [Linux](#quickstart), [macOS](#quickstart), and [Windows](#quickstart) (WSL2), or via **[Docker](#quickstart)** ⭐️. +> *Then it can be used as a [CLI tool](#usage), [self-hosted Web App](https://github.com/ArchiveBox/ArchiveBox/wiki/Publishing-Your-Archive), [`pip` library](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#python-shell-usage), or [one-off command](#static-archive-exporting).*
    From dd1216546c19b32c71bbdf35fede742830cd74bb Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 30 Jan 2024 02:56:28 -0800 Subject: [PATCH 104/227] Update README.md --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 373cb272..7afaafa6 100644 --- a/README.md +++ b/README.md @@ -38,13 +38,13 @@ Without active preservation effort, everything on the internet eventually dissap **It saves snapshots of the URLs you feed it in several redundant formats.** It also detects any content featured *inside* pages & extracts it out into a folder: - 🌐 **HTML**/**Any websites** ➡️ `original HTML+CSS+JS`, `singlefile HTML`, `screenshot PNG`, `PDF`, `WARC`, `title`, `article text`, `favicon`, `headers`, ... -- 🎥 **Social Media**/**News** ➡️ `post content TXT`, `comments`, `title`, `author`, `images` +- 🎥 **Social Media**/**News** ➡️ `post content TXT`, `comments`, `title`, `author`, `images`, ... - 🎬 **YouTube**/**SoundCloud**/etc. ➡️ `MP3/MP4`s, `subtitles`, `metadata`, `thumbnail`, ... - 💾 **Github**/**Gitlab**/etc. links ➡️ `clone of GIT source code`, `README`, `images`, ... - ✨ *and more, see [Output Formats](#output-formats) below...* -It uses [standard tools](#dependencies) like Chrome, `wget`, & `yt-dlp`, and stores data in ordinary [files & folders](#archive-layout). -*(no complex proprietary formats)* +It uses [standard tools](#dependencies) like Chrome, `wget`, & `yt-dlp`, and stores data in [ordinary files & folders](#archive-layout) and SQLite3. 
+*(no complex proprietary formats, accessible decades in the future without needing to run ArchiveBox)* --- From b15bc27bb3619d773110ccf7cc2f4253b6c33369 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 30 Jan 2024 02:59:54 -0800 Subject: [PATCH 105/227] Update README.md --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 7afaafa6..cd190e95 100644 --- a/README.md +++ b/README.md @@ -44,11 +44,12 @@ It also detects any content featured *inside* pages & extracts it out into a fol - ✨ *and more, see [Output Formats](#output-formats) below...* It uses [standard tools](#dependencies) like Chrome, `wget`, & `yt-dlp`, and stores data in [ordinary files & folders](#archive-layout) and SQLite3. -*(no complex proprietary formats, accessible decades in the future without needing to run ArchiveBox)* +*(no complex proprietary formats, all data is readable without needing to run ArchiveBox)* + +The goal is to sleep soundly knowing the part of the internet you care about will be automatically preserved in durable, easily accessible formats [for decades](#background--motivation) after it goes down. --- -The goal is to sleep soundly knowing the part of the internet you care about will be automatically preserved in durable, easily accessible formats [for decades](#background--motivation) after it goes down.


    From c908d3e8d081c158d6178a48b284278d494ac1eb Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 30 Jan 2024 03:04:40 -0800 Subject: [PATCH 106/227] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index cd190e95..a35e5984 100644 --- a/README.md +++ b/README.md @@ -142,7 +142,7 @@ ArchiveBox is free for everyone to self-host, but we also provide support, secur > ***[Contact our team](https://zulip.archivebox.io/#narrow/stream/167-enterprise/topic/welcome/near/1191102)** if your institution/org wants to use ArchiveBox professionally.* > -> - setup & support, team permissioning, hashing, audit logging, backups, custom archiving etc. +> - setup & support, custom features, hashing & audit logging, backups, chain-of-custody setup, etc. > - for **individuals**, **NGOs**, **academia**, **governments**, **journalism**, **law**, and more... *We are a 🏛️ 501(c)(3) nonprofit and all our work goes towards supporting open-source development.* From dd2864128dedf1052aafb165d19ab0efc384bff7 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 30 Jan 2024 03:05:33 -0800 Subject: [PATCH 107/227] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a35e5984..78f15662 100644 --- a/README.md +++ b/README.md @@ -142,7 +142,7 @@ ArchiveBox is free for everyone to self-host, but we also provide support, secur > ***[Contact our team](https://zulip.archivebox.io/#narrow/stream/167-enterprise/topic/welcome/near/1191102)** if your institution/org wants to use ArchiveBox professionally.* > -> - setup & support, custom features, hashing & audit logging, backups, chain-of-custody setup, etc. +> - setup & support, custom features, security help, hashing & audit logging for chain-of-custody, etc. > - for **individuals**, **NGOs**, **academia**, **governments**, **journalism**, **law**, and more... 
*We are a 🏛️ 501(c)(3) nonprofit and all our work goes towards supporting open-source development.* From 114002aa5d0aa78eaeeb8c7aea83181c43b74631 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 30 Jan 2024 03:09:06 -0800 Subject: [PATCH 108/227] Update README.md --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 78f15662..65873741 100644 --- a/README.md +++ b/README.md @@ -463,10 +463,10 @@ docker compose run archivebox help #### ArchiveBox Subcommands -- `archivebox` `help`/`version` to see the list of available subcommands and currently installed version info -- `archivebox` `setup`/`init`/`config`/`status`/`manage` to administer your collection -- `archivebox` `add`/`schedule`/`remove`/`update`/`list`/`shell`/`oneshot` to manage Snapshots in the archive -- `archivebox` `schedule` to pull in fresh URLs regularly from [bookmarks/history/Pocket/Pinboard/RSS/etc.](#input-formats) +- `archivebox` `help`/`version` to see the list of available subcommands / currently installed version info +- `archivebox` `setup`/`init`/`config`/`status`/`shell`/`manage` to administer your collection +- `archivebox` `add`/`oneshot`/`schedule` to pull in fresh URLs from [bookmarks/history/RSS/etc.](#input-formats) +- `archivebox` `list`/`update`/`remove` to manage existing Snapshots in your collection
    From 53732c29581b2d347d9d03cd96cb29f3492057c1 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 30 Jan 2024 03:11:30 -0800 Subject: [PATCH 109/227] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 65873741..b3f900ff 100644 --- a/README.md +++ b/README.md @@ -25,8 +25,8 @@ Without active preservation effort, everything on the internet eventually dissap *ArchiveBox is an open source tool that lets organizations & individuals archive both public & private web content while retaining control over their data. It can be used to save copies of bookmarks, preserve evidence for legal cases, backup photos from FB/Insta/Flickr or media from YT/Soundcloud/etc., save research papers, and more...* -> ➡️ Get ArchiveBox with `pip install archivebox` on [Linux](#quickstart), [macOS](#quickstart), and [Windows](#quickstart) (WSL2), or via **[Docker](#quickstart)** ⭐️. -> *Then it can be used as a [CLI tool](#usage), [self-hosted Web App](https://github.com/ArchiveBox/ArchiveBox/wiki/Publishing-Your-Archive), [`pip` library](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#python-shell-usage), or [one-off command](#static-archive-exporting).* +> ➡️ Get ArchiveBox with `pip install archivebox` on [Linux](#quickstart), [macOS](#quickstart), and [Windows](#quickstart) (WSL2), or via **[Docker](#quickstart)** ⭐️. +> *Once installed, it can be used as a [CLI tool](#usage), [self-hosted Web App](https://github.com/ArchiveBox/ArchiveBox/wiki/Publishing-Your-Archive), [`pip` library](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#python-shell-usage), or [one-off command](#static-archive-exporting).*
    From 55f1ec5b4bc152702b8c60c2c9ad3484c4940c7b Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 30 Jan 2024 03:12:02 -0800 Subject: [PATCH 110/227] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index b3f900ff..21a5ec7a 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ Without active preservation effort, everything on the internet eventually dissap *ArchiveBox is an open source tool that lets organizations & individuals archive both public & private web content while retaining control over their data. It can be used to save copies of bookmarks, preserve evidence for legal cases, backup photos from FB/Insta/Flickr or media from YT/Soundcloud/etc., save research papers, and more...* > ➡️ Get ArchiveBox with `pip install archivebox` on [Linux](#quickstart), [macOS](#quickstart), and [Windows](#quickstart) (WSL2), or via **[Docker](#quickstart)** ⭐️. -> *Once installed, it can be used as a [CLI tool](#usage), [self-hosted Web App](https://github.com/ArchiveBox/ArchiveBox/wiki/Publishing-Your-Archive), [`pip` library](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#python-shell-usage), or [one-off command](#static-archive-exporting).* +> *Once installed, it can be used as a [CLI tool](#usage), [self-hosted Web App](https://github.com/ArchiveBox/ArchiveBox/wiki/Publishing-Your-Archive), [Python library](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#python-shell-usage), or [one-off command](#static-archive-exporting).*
    From 3f50922f72bed4c0ec490e9bc0b836183f248639 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 30 Jan 2024 03:13:50 -0800 Subject: [PATCH 111/227] Update README.md --- README.md | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 21a5ec7a..fcdfe8de 100644 --- a/README.md +++ b/README.md @@ -51,18 +51,6 @@ The goal is to sleep soundly knowing the part of the internet you care about wil --- -
    -

    -bookshelf graphic   logo   bookshelf graphic -

    -Demo | Screenshots | Usage -
    -. . . . . . . . . . . . . . . . . . . . . . . . . . . . -

    -
    - -
    - **📦  Install ArchiveBox using your preferred method: `docker` / `pip` / `apt` / etc. ([see full Quickstart](#quickstart)).** @@ -105,6 +93,12 @@ curl -sSL 'https://get.archivebox.io' | sh


    +bookshelf graphic   logo   bookshelf graphic +

    +Demo | Screenshots | Usage +
    +. . . . . . . . . . . . . . . . . . . . . . . . . . . . +

    cli init screenshot cli init screenshot server snapshot admin screenshot From 26481d77c7b2ac3c9a8807ac63ef5f33f60d1de0 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 30 Jan 2024 03:15:22 -0800 Subject: [PATCH 112/227] Update README.md --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index fcdfe8de..9ef1557d 100644 --- a/README.md +++ b/README.md @@ -28,11 +28,15 @@ Without active preservation effort, everything on the internet eventually dissap > ➡️ Get ArchiveBox with `pip install archivebox` on [Linux](#quickstart), [macOS](#quickstart), and [Windows](#quickstart) (WSL2), or via **[Docker](#quickstart)** ⭐️. > *Once installed, it can be used as a [CLI tool](#usage), [self-hosted Web App](https://github.com/ArchiveBox/ArchiveBox/wiki/Publishing-Your-Archive), [Python library](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#python-shell-usage), or [one-off command](#static-archive-exporting).* +

    +
    📥 **You can feed ArchiveBox URLs one at a time, or schedule regular imports** from your bookmarks or history, social media feeds or RSS, link-saving services like Pocket/Pinboard, our [Browser Extension](https://chromewebstore.google.com/detail/archivebox-exporter/habonpimjphpdnmcfkaockjnffodikoj), and more. See Input Formats for a full list of supported input formats... +
    + snapshot detail page **It saves snapshots of the URLs you feed it in several redundant formats.** @@ -43,6 +47,8 @@ It also detects any content featured *inside* pages & extracts it out into a fol - 💾 **Github**/**Gitlab**/etc. links ➡️ `clone of GIT source code`, `README`, `images`, ... - ✨ *and more, see [Output Formats](#output-formats) below...* +--- + It uses [standard tools](#dependencies) like Chrome, `wget`, & `yt-dlp`, and stores data in [ordinary files & folders](#archive-layout) and SQLite3. *(no complex proprietary formats, all data is readable without needing to run ArchiveBox)* From da38950cea16842e795b30bbed27a29b0ec72814 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 30 Jan 2024 03:17:07 -0800 Subject: [PATCH 113/227] Update README.md --- README.md | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 9ef1557d..d4c69660 100644 --- a/README.md +++ b/README.md @@ -16,8 +16,9 @@ curl -sSL 'https://get.archivebox.io' | sh # (or see pip/brew/Docker instructions below)
    --> -
    +
    +
    **ArchiveBox is a powerful, self-hosted internet archiving solution to collect, save, and view websites offline.** @@ -25,8 +26,11 @@ Without active preservation effort, everything on the internet eventually dissap *ArchiveBox is an open source tool that lets organizations & individuals archive both public & private web content while retaining control over their data. It can be used to save copies of bookmarks, preserve evidence for legal cases, backup photos from FB/Insta/Flickr or media from YT/Soundcloud/etc., save research papers, and more...* +
    + > ➡️ Get ArchiveBox with `pip install archivebox` on [Linux](#quickstart), [macOS](#quickstart), and [Windows](#quickstart) (WSL2), or via **[Docker](#quickstart)** ⭐️. -> *Once installed, it can be used as a [CLI tool](#usage), [self-hosted Web App](https://github.com/ArchiveBox/ArchiveBox/wiki/Publishing-Your-Archive), [Python library](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#python-shell-usage), or [one-off command](#static-archive-exporting).* + +*Once installed, it can be used as a [CLI tool](#usage), [self-hosted Web App](https://github.com/ArchiveBox/ArchiveBox/wiki/Publishing-Your-Archive), [Python library](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#python-shell-usage), or [one-off command](#static-archive-exporting).*

    @@ -47,14 +51,18 @@ It also detects any content featured *inside* pages & extracts it out into a fol - 💾 **Github**/**Gitlab**/etc. links ➡️ `clone of GIT source code`, `README`, `images`, ... - ✨ *and more, see [Output Formats](#output-formats) below...* ---- +
    +
    +
    -It uses [standard tools](#dependencies) like Chrome, `wget`, & `yt-dlp`, and stores data in [ordinary files & folders](#archive-layout) and SQLite3. +🛠️ It uses [standard tools](#dependencies) like Chrome, `wget`, & `yt-dlp`, and stores data in [ordinary files & folders](#archive-layout) and SQLite3. *(no complex proprietary formats, all data is readable without needing to run ArchiveBox)* The goal is to sleep soundly knowing the part of the internet you care about will be automatically preserved in durable, easily accessible formats [for decades](#background--motivation) after it goes down. ---- +
    +
    +
    **📦  Install ArchiveBox using your preferred method: `docker` / `pip` / `apt` / etc. ([see full Quickstart](#quickstart)).** From 31392f8c3461d4d9094e0b8e95048683e2ec1719 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 30 Jan 2024 03:18:29 -0800 Subject: [PATCH 114/227] Update README.md --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index d4c69660..7bbb233b 100644 --- a/README.md +++ b/README.md @@ -55,17 +55,17 @@ It also detects any content featured *inside* pages & extracts it out into a fol

    -🛠️ It uses [standard tools](#dependencies) like Chrome, `wget`, & `yt-dlp`, and stores data in [ordinary files & folders](#archive-layout) and SQLite3. +🛠️ ArchiveBox uses [standard tools](#dependencies) like Chrome, `wget`, & `yt-dlp`, and stores data in [ordinary files & folders](#archive-layout). *(no complex proprietary formats, all data is readable without needing to run ArchiveBox)* The goal is to sleep soundly knowing the part of the internet you care about will be automatically preserved in durable, easily accessible formats [for decades](#background--motivation) after it goes down. -
    +

    -**📦  Install ArchiveBox using your preferred method: `docker` / `pip` / `apt` / etc. ([see full Quickstart](#quickstart)).** +**📦  Install ArchiveBox using your preferred method: `docker` / `pip` / `apt` / etc. ([see full Quickstart below](#quickstart)).**
    From 5ee85107e6e9dadc2f23bf2eb0cc33b596a81016 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 30 Jan 2024 03:18:57 -0800 Subject: [PATCH 115/227] Update README.md --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index 7bbb233b..48500785 100644 --- a/README.md +++ b/README.md @@ -53,7 +53,6 @@ It also detects any content featured *inside* pages & extracts it out into a fol

    -
    🛠️ ArchiveBox uses [standard tools](#dependencies) like Chrome, `wget`, & `yt-dlp`, and stores data in [ordinary files & folders](#archive-layout). *(no complex proprietary formats, all data is readable without needing to run ArchiveBox)* From 13e3322993d782b423855554004d2ee5d2a60892 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 30 Jan 2024 03:22:09 -0800 Subject: [PATCH 116/227] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 48500785..f0680abc 100644 --- a/README.md +++ b/README.md @@ -149,7 +149,7 @@ ArchiveBox is free for everyone to self-host, but we also provide support, secur > ***[Contact our team](https://zulip.archivebox.io/#narrow/stream/167-enterprise/topic/welcome/near/1191102)** if your institution/org wants to use ArchiveBox professionally.* > -> - setup & support, custom features, security help, hashing & audit logging for chain-of-custody, etc. +> - setup & support, hosting, custom features, security, hashing & audit logging for chain-of-custody, etc. > - for **individuals**, **NGOs**, **academia**, **governments**, **journalism**, **law**, and more... 
*We are a 🏛️ 501(c)(3) nonprofit and all our work goes towards supporting open-source development.* From ea0563d85b0a76ad070e203a5cd2205f10177521 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 30 Jan 2024 03:23:01 -0800 Subject: [PATCH 117/227] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f0680abc..a0971be7 100644 --- a/README.md +++ b/README.md @@ -147,7 +147,7 @@ ArchiveBox is free for everyone to self-host, but we also provide support, secur - 👩🏽 **Individuals:** `saving bookmarks`, `preserving portfolio content`, `legacy / memoirs archival` -> ***[Contact our team](https://zulip.archivebox.io/#narrow/stream/167-enterprise/topic/welcome/near/1191102)** if your institution/org wants to use ArchiveBox professionally.* +> ***[Contact our team](https://zulip.archivebox.io/#narrow/stream/167-enterprise/topic/welcome/near/1191102)** if your institution/org wants to use ArchiveBox professionally. We offer services such as:* > > - setup & support, hosting, custom features, security, hashing & audit logging for chain-of-custody, etc. > - for **individuals**, **NGOs**, **academia**, **governments**, **journalism**, **law**, and more... 
From dcd9b7bd14bd2f12d9fcac092808cc02c94d129b Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 30 Jan 2024 03:32:08 -0800 Subject: [PATCH 118/227] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index a0971be7..b7bd0ff2 100644 --- a/README.md +++ b/README.md @@ -1510,7 +1510,7 @@ Extractors take the URL of a page to archive, write their output to the filesyst - [ArchiveBox.io Homepage](https://archivebox.io) / [Source Code (Github)](https://github.com/ArchiveBox/ArchiveBox) / [Demo Server](https://demo.archivebox.io) - [Documentation Wiki](https://github.com/ArchiveBox/ArchiveBox/wiki) / [API Reference Docs](https://docs.archivebox.io) / [Changelog](https://github.com/ArchiveBox/ArchiveBox/releases) - [Bug Tracker](https://github.com/ArchiveBox/ArchiveBox/issues) / [Discussions](https://github.com/ArchiveBox/ArchiveBox/discussions) / [Community Chat Forum (Zulip)](https://zulip.archivebox.io) -- Social Media: [Twitter](https://twitter.com/ArchiveBoxApp), [LinkedIn](https://www.linkedin.com/company/archivebox/), [YouTube](https://www.youtube.com/@ArchiveBoxApp), [Alternative.to](https://alternativeto.net/software/archivebox/about/), [Reddit](https://www.reddit.com/r/ArchiveBox/) +- Find us on social media: [Twitter](https://twitter.com/ArchiveBoxApp), [LinkedIn](https://www.linkedin.com/company/archivebox/), [YouTube](https://www.youtube.com/@ArchiveBoxApp), [SaaSHub](https://www.saashub.com/archivebox), [Alternative.to](https://alternativeto.net/software/archivebox/about/), [Reddit](https://www.reddit.com/r/ArchiveBox/) --- @@ -1524,7 +1524,7 @@ Extractors take the URL of a page to archive, write their output to the filesyst  
    -ArchiveBox operates as a US 501(c)(3) nonprofit (sponsored by HCB), donations are tax-deductible. +ArchiveBox operates as a US 501(c)(3) nonprofit (sponsored by HCB), direct donations are tax-deductible.

        From a4bd4410775d46863a4c1a16e48e5acf5722e8a5 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Wed, 31 Jan 2024 01:59:43 -0800 Subject: [PATCH 119/227] Update config.py fix trim-filenames --- archivebox/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/archivebox/config.py b/archivebox/config.py index 0dfc41dd..3186a6b0 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -154,7 +154,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = { 'CHROME_SANDBOX': {'type': bool, 'default': lambda c: not c['IN_DOCKER']}, 'YOUTUBEDL_ARGS': {'type': list, 'default': lambda c: [ '--restrict-filenames', - '--trim-filenames', + '--trim-filenames', '128', '--write-description', '--write-info-json', '--write-annotations', From 402aac2366b17ea3186730f20457812c77b4266d Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Wed, 31 Jan 2024 11:48:43 -0800 Subject: [PATCH 120/227] Update README.md --- README.md | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index b7bd0ff2..66743966 100644 --- a/README.md +++ b/README.md @@ -190,7 +190,7 @@ curl -sSL 'https://docker-compose.archivebox.io' > docker-compose.yml
  • Next steps: Start the server then login to the Web UI http://127.0.0.1:8000 ⇢ Admin.
    docker compose up
     # completely optional, CLI can always be used without running a server
    -# docker compose run [-T] archivebox [subcommand] [--args]
    +# docker compose run [-T] archivebox [subcommand] [--help]
     docker compose run archivebox add 'https://example.com'
     docker compose run archivebox help
     
  • @@ -213,7 +213,7 @@ docker run -v $PWD:/data -it archivebox/archivebox init --setup
  • Optional: Start the server then login to the Web UI http://127.0.0.1:8000 ⇢ Admin.
    docker run -v $PWD:/data -p 8000:8000 archivebox/archivebox
     # completely optional, CLI can always be used without running a server
    -# docker run -v $PWD:/data -it [subcommand] [--args]
    +# docker run -v $PWD:/data -it [subcommand] [--help]
     docker run -v $PWD:/data -it archivebox/archivebox help
     
  • @@ -265,7 +265,7 @@ archivebox init --setup
  • Optional: Start the server then login to the Web UI http://127.0.0.1:8000 ⇢ Admin.
    archivebox server 0.0.0.0:8000
     # completely optional, CLI can always be used without running a server
    -# archivebox [subcommand] [--args]
    +# archivebox [subcommand] [--help]
     archivebox help
     
  • @@ -301,7 +301,7 @@ archivebox init --setup # if any problems, install with pip instead
  • Optional: Start the server then login to the Web UI http://127.0.0.1:8000 ⇢ Admin.
    archivebox server 0.0.0.0:8000
     # completely optional, CLI can always be used without running a server
    -# archivebox [subcommand] [--args]
    +# archivebox [subcommand] [--help]
     archivebox help
     
  • @@ -330,7 +330,7 @@ archivebox init --setup # if any problems, install with pip instead
  • Optional: Start the server then login to the Web UI http://127.0.0.1:8000 ⇢ Admin.
    archivebox server 0.0.0.0:8000
     # completely optional, CLI can always be used without running a server
    -# archivebox [subcommand] [--args]
    +# archivebox [subcommand] [--help]
     archivebox help
     
  • @@ -458,13 +458,13 @@ ArchiveBox commands can be run in a terminal directly on your host, or via Docke mkdir -p ~/archivebox/data # create a new data dir anywhere cd ~/archivebox/data # IMPORTANT: cd into the directory -# archivebox [subcommand] [--args] +# archivebox [subcommand] [--help] archivebox help -# equivalent: docker compose run archivebox [subcommand [--args] +# equivalent: docker compose run archivebox [subcommand [--help] docker compose run archivebox help -# equivalent: docker run -it -v $PWD:/data archivebox/archivebox [subcommand [--args] +# equivalent: docker run -it -v $PWD:/data archivebox/archivebox [subcommand [--help] docker run -it -v $PWD:/data archivebox/archivebox help ``` @@ -482,7 +482,7 @@ docker compose run archivebox help
    
     # make sure you have pip-installed ArchiveBox and it's available in your $PATH first  
     
    -# archivebox [subcommand] [--args] +# archivebox [subcommand] [--help] archivebox init --setup # safe to run init multiple times (also how you update versions) archivebox version # get archivebox version info + check dependencies archivebox help # get list of archivebox subcommands that can be run @@ -498,7 +498,7 @@ archivebox add --depth=1 'https://news.ycombinator.com'
    
     # make sure you have `docker-compose.yml` from the Quickstart instructions first
     
    -# docker compose run archivebox [subcommand [--args] +# docker compose run archivebox [subcommand [--help] docker compose run archivebox init --setup docker compose run archivebox version docker compose run archivebox help @@ -515,7 +515,7 @@ docker compose run archivebox add --depth=1 'https://news.ycombinator.com'
    
     # make sure you create and cd into a new empty directory first  
     
    -# docker run -it -v $PWD:/data archivebox/archivebox [subcommand [--args] +# docker run -it -v $PWD:/data archivebox/archivebox [subcommand [--help] docker run -v $PWD:/data -it archivebox/archivebox init --setup docker run -v $PWD:/data -it archivebox/archivebox version docker run -v $PWD:/data -it archivebox/archivebox help From 9f8ad4b126959f5593d6f22a0b8ecc1eb5a9e697 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Thu, 1 Feb 2024 01:13:04 -0800 Subject: [PATCH 121/227] fix missing closing square brackets in readme cli examples --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 66743966..c5230039 100644 --- a/README.md +++ b/README.md @@ -461,10 +461,10 @@ cd ~/archivebox/data # IMPORTANT: cd into the directory # archivebox [subcommand] [--help] archivebox help -# equivalent: docker compose run archivebox [subcommand [--help] +# equivalent: docker compose run archivebox [subcommand] [--help] docker compose run archivebox help -# equivalent: docker run -it -v $PWD:/data archivebox/archivebox [subcommand [--help] +# equivalent: docker run -it -v $PWD:/data archivebox/archivebox [subcommand] [--help] docker run -it -v $PWD:/data archivebox/archivebox help ``` @@ -498,7 +498,7 @@ archivebox add --depth=1 'https://news.ycombinator.com'
    
     # make sure you have `docker-compose.yml` from the Quickstart instructions first
     
    -# docker compose run archivebox [subcommand [--help] +# docker compose run archivebox [subcommand] [--help] docker compose run archivebox init --setup docker compose run archivebox version docker compose run archivebox help @@ -515,7 +515,7 @@ docker compose run archivebox add --depth=1 'https://news.ycombinator.com'
    
     # make sure you create and cd into a new empty directory first  
     
    -# docker run -it -v $PWD:/data archivebox/archivebox [subcommand [--help] +# docker run -it -v $PWD:/data archivebox/archivebox [subcommand] [--help] docker run -v $PWD:/data -it archivebox/archivebox init --setup docker run -v $PWD:/data -it archivebox/archivebox version docker run -v $PWD:/data -it archivebox/archivebox help From babd273fc0e63809932b81fa46ddc68805a74f04 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Thu, 1 Feb 2024 01:40:33 -0800 Subject: [PATCH 122/227] Update README.md --- README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index c5230039..e8492472 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,6 @@ curl -sSL 'https://get.archivebox.io' | sh # (or see pip/brew/Docker instruct Without active preservation effort, everything on the internet eventually dissapears or degrades. Archive.org does a great job as a centralized service, but saved URLs have to be public, and they can't save every type of content. *ArchiveBox is an open source tool that lets organizations & individuals archive both public & private web content while retaining control over their data. It can be used to save copies of bookmarks, preserve evidence for legal cases, backup photos from FB/Insta/Flickr or media from YT/Soundcloud/etc., save research papers, and more...* -
    > ➡️ Get ArchiveBox with `pip install archivebox` on [Linux](#quickstart), [macOS](#quickstart), and [Windows](#quickstart) (WSL2), or via **[Docker](#quickstart)** ⭐️. @@ -51,10 +50,13 @@ It also detects any content featured *inside* pages & extracts it out into a fol - 💾 **Github**/**Gitlab**/etc. links ➡️ `clone of GIT source code`, `README`, `images`, ... - ✨ *and more, see [Output Formats](#output-formats) below...* +You can run ArchiveBox as a Docker web app to manage these snapshots, or continue accessing the same collection using the `pip`-installed CLI, Python API, and SQLite3 APIs. +All the ways of using it are equivalent, and provide matching features like adding tags, scheduling regular crawls, viewing logs, and more... +

    -🛠️ ArchiveBox uses [standard tools](#dependencies) like Chrome, `wget`, & `yt-dlp`, and stores data in [ordinary files & folders](#archive-layout). +🛠️ ArchiveBox uses [standard tools](#dependencies) like Chrome, [`wget`](https://www.gnu.org/software/wget/), & [`yt-dlp`](https://github.com/yt-dlp/yt-dlp), and stores data in [ordinary files & folders](#archive-layout). *(no complex proprietary formats, all data is readable without needing to run ArchiveBox)* The goal is to sleep soundly knowing the part of the internet you care about will be automatically preserved in durable, easily accessible formats [for decades](#background--motivation) after it goes down. From eb62b4403619d89e352cb497521641c70286e1f4 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Thu, 8 Feb 2024 00:11:27 -0800 Subject: [PATCH 123/227] Update README on Docker Hub when docker is built --- .github/workflows/docker.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 75c7658c..5102aecb 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -81,6 +81,13 @@ jobs: - name: Image digest run: echo ${{ steps.docker_build.outputs.digest }} + + - name: Update README + uses: peter-evans/dockerhub-description@v4 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_PASSWORD }} + repository: archivebox/archivebox # This ugly bit is necessary if you don't want your cache to grow forever # until it hits GitHub's limit of 5GB. 
From f5aaeb6de7a780808db8d79c20af3b53857d6414 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Thu, 8 Feb 2024 00:47:48 -0800 Subject: [PATCH 124/227] Update docker.yml --- .github/workflows/docker.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 5102aecb..871f0260 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -85,8 +85,8 @@ jobs: - name: Update README uses: peter-evans/dockerhub-description@v4 with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_PASSWORD }} + username: ${{ secrets.DOCKER_USERNAME }} + password: ${{ secrets.DOCKER_PASSWORD }} repository: archivebox/archivebox # This ugly bit is necessary if you don't want your cache to grow forever From 19aefc85e6c3801ac6c77246c1534fc9758739df Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Thu, 8 Feb 2024 18:58:12 -0800 Subject: [PATCH 125/227] fix get_system_user failing on uid 999 in k3s --- archivebox/config.py | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/archivebox/config.py b/archivebox/config.py index 3186a6b0..1edd2eeb 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -366,24 +366,32 @@ ALLOWDENYLIST_REGEX_FLAGS: int = re.IGNORECASE | re.UNICODE | re.MULTILINE ############################## Version Config ################################## -def get_system_user(): - SYSTEM_USER = getpass.getuser() or os.getlogin() +def get_system_user() -> str: + # some host OS's are unable to provide a username (k3s, Windows), making this complicated + # uid 999 is especially problematic and breaks many attempts + SYSTEM_USER = None + FALLBACK_USER_PLACHOLDER = f'user_{os.getuid()}' + + # Option 1 try: import pwd - return pwd.getpwuid(os.geteuid()).pw_name or SYSTEM_USER - except KeyError: - # Process' UID might not map to a user in cases such as running the Docker image - # (where 
`archivebox` is 999) as a different UID. - pass - except ModuleNotFoundError: - # pwd doesn't exist on windows - pass - except Exception: - # this should never happen, uncomment to debug - # raise + SYSTEM_USER = SYSTEM_USER or pwd.getpwuid(os.geteuid()).pw_name + except (ModuleNotFoundError, Exception): pass - return SYSTEM_USER + # Option 2 + try: + SYSTEM_USER = SYSTEM_USER or getpass.getuser() + except Exception: + pass + + # Option 3 + try: + SYSTEM_USER = SYSTEM_USER or os.getlogin() + except Exception: + pass + + return SYSTEM_USER or FALLBACK_USER_PLACHOLDER def get_version(config): try: From 00d2d20a631ca507c50c951b67859e2eb52ed7f4 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 12 Feb 2024 02:04:07 -0800 Subject: [PATCH 126/227] Update README.md --- README.md | 66 +++++++++++++++++++++++++++++++------------------------ 1 file changed, 37 insertions(+), 29 deletions(-) diff --git a/README.md b/README.md index e8492472..b8892b06 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@    
    @@ -72,10 +72,9 @@ The goal is to sleep soundly knowing the part of the internet you care about wil
      Expand for quick copy-pastable install commands...   ⤵️
    -
    mkdir ~/archivebox; cd ~/archivebox    # create a dir somewhere for your archivebox data
    -
    -# Option A: Get ArchiveBox with Docker Compose (recommended): -curl -sSL 'https://docker-compose.archivebox.io' > docker-compose.yml # edit options in this file as-needed +
    # Option A: Get ArchiveBox with Docker Compose (recommended):
    +mkdir -p ~/archivebox/data && cd ~/archivebox
    +curl -fsSL 'https://docker-compose.archivebox.io' > docker-compose.yml   # edit options in this file as-needed
     docker compose run archivebox init --setup
     # docker compose run archivebox add 'https://example.com'
     # docker compose run archivebox help
    @@ -83,6 +82,7 @@ docker compose run archivebox init --setup
     

    # Option B: Or use it as a plain Docker container: +mkdir -p ~/archivebox/data && cd ~/archivebox/data docker run -it -v $PWD:/data archivebox/archivebox init --setup # docker run -it -v $PWD:/data archivebox/archivebox add 'https://example.com' # docker run -it -v $PWD:/data archivebox/archivebox help @@ -91,6 +91,7 @@ docker run -it -v $PWD:/data archivebox/archivebox init --setup
    # Option C: Or install it with your preferred pkg manager (see Quickstart below for apt, brew, and more) pip install archivebox +mkdir -p ~/archivebox/data && cd ~/archivebox/data archivebox init --setup # archivebox add 'https://example.com' # archivebox help @@ -98,7 +99,7 @@ archivebox init --setup

    # Option D: Or use the optional auto setup script to install it -curl -sSL 'https://get.archivebox.io' | sh +curl -fsSL 'https://get.archivebox.io' | sh

    Open http://localhost:8000 to see your server's Web UI ➡️ @@ -182,9 +183,9 @@ ArchiveBox is free for everyone to self-host, but we also provide support, secur
    1. Install Docker on your system (if not already installed).
    2. Download the docker-compose.yml file into a new empty directory (can be anywhere). -
      mkdir ~/archivebox && cd ~/archivebox
      +
      mkdir -p ~/archivebox/data && cd ~/archivebox
       # Read and edit docker-compose.yml options as-needed after downloading
      -curl -sSL 'https://docker-compose.archivebox.io' > docker-compose.yml
      +curl -fsSL 'https://docker-compose.archivebox.io' > docker-compose.yml
       
    3. Run the initial setup to create an admin user (or set ADMIN_USER/PASS in docker-compose.yml)
      docker compose run archivebox init --setup
      @@ -208,7 +209,7 @@ See below for more usage examples using the C
       
      1. Install Docker on your system (if not already installed).
      2. Create a new empty directory and initialize your collection (can be anywhere). -
        mkdir ~/archivebox && cd ~/archivebox
        +
        mkdir -p ~/archivebox/data && cd ~/archivebox/data
         docker run -v $PWD:/data -it archivebox/archivebox init --setup
         
      3. @@ -231,7 +232,7 @@ See below for more usage examples using the C
        1. Install Docker on your system (optional, highly recommended but not required).
        2. Run the automatic setup script. -
          curl -sSL 'https://get.archivebox.io' | sh
          +
          curl -fsSL 'https://get.archivebox.io' | sh
        @@ -256,12 +257,16 @@ See "Against curl | sh as a
      4. Install Python >= v3.10 and Node >= v18 on your system (if not already installed).
      5. Install the ArchiveBox package using pip3 (or pipx).
        pip3 install archivebox
        +archivebox version
        +# install any missing extras shown using apt/brew/pkg/etc.
        +#    python@3.10 node curl wget git ripgrep ...
         
        +See the Install: Bare Metal Wiki for full install instructions for each OS...
      6. Create a new empty directory and initialize your collection (can be anywhere). -
        mkdir ~/archivebox && cd ~/archivebox
        -archivebox init --setup
        -# install any missing extras like wget/git/ripgrep/etc. manually as needed
        +
        mkdir -p ~/archivebox/data && cd ~/archivebox/data   # for example
    +archivebox init --setup   # initialize a new collection
    +# (--setup auto-installs and links JS dependencies: singlefile, readability, etc.)
         
      7. Optional: Start the server then login to the Web UI http://127.0.0.1:8000 ⇢ Admin. @@ -274,7 +279,8 @@ archivebox help
      See below for more usage examples using the CLI, Web UI, or filesystem/SQL/Python to manage your archive.
      -See the pip-archivebox repo for more details about this distribution. +
      +See the pip-archivebox repo for more details about this distribution.

    @@ -295,10 +301,10 @@ sudo python3 -m pip install --upgrade --ignore-installed archivebox # pip need
  • Create a new empty directory and initialize your collection (can be anywhere). -
    mkdir ~/archivebox && cd ~/archivebox
    +
    mkdir -p ~/archivebox/data && cd ~/archivebox/data
     archivebox init --setup           # if any problems, install with pip instead
     
    -Note: If you encounter issues with NPM/NodeJS, install a more recent version.

    +Note: If you encounter issues or want more granular instructions, see the Install: Bare Metal Wiki.

  • Optional: Start the server then login to the Web UI http://127.0.0.1:8000 ⇢ Admin.
    archivebox server 0.0.0.0:8000
    @@ -323,9 +329,10 @@ See the debian-a
     
    brew tap archivebox/archivebox
     brew install archivebox
     
    +See the
    Install: Bare Metal Wiki for more granular instructions for macOS... ➡️
  • Create a new empty directory and initialize your collection (can be anywhere). -
    mkdir ~/archivebox && cd ~/archivebox
    +
    mkdir -p ~/archivebox/data && cd ~/archivebox/data
     archivebox init --setup         # if any problems, install with pip instead
     
  • @@ -334,7 +341,7 @@ archivebox init --setup # if any problems, install with pip instead # completely optional, CLI can always be used without running a server # archivebox [subcommand] [--help] archivebox help - +

    @@ -351,7 +358,7 @@ See the homebr
    • Arch: yay -S archivebox (contributed by @imlonghao)
    • -
    • FreeBSD: curl -sSL 'https://get.archivebox.io' | sh (uses pkg + pip3 under-the-hood)
    • +
    • FreeBSD: curl -fsSL 'https://get.archivebox.io' | sh (uses pkg + pip3 under-the-hood)
    • Nix: nix-env --install archivebox (contributed by @siraben)
    • Guix: guix install archivebox (contributed by @rakino)
    • More: contribute another distribution...!
    • @@ -461,13 +468,14 @@ mkdir -p ~/archivebox/data # create a new data dir anywhere cd ~/archivebox/data # IMPORTANT: cd into the directory # archivebox [subcommand] [--help] +archivebox version archivebox help # equivalent: docker compose run archivebox [subcommand] [--help] docker compose run archivebox help # equivalent: docker run -it -v $PWD:/data archivebox/archivebox [subcommand] [--help] - docker run -it -v $PWD:/data archivebox/archivebox help +docker run -it -v $PWD:/data archivebox/archivebox help ``` #### ArchiveBox Subcommands @@ -677,7 +685,7 @@ It uses all available methods out-of-the-box, but you can disable extractors and Expand to see the full list of ways it saves each page... -./archive/{Snapshot.id}/
      +data/archive/{Snapshot.id}/
      • Index: index.html & index.json HTML and JSON index files containing metadata and details
      • Title, Favicon, Headers Response headers, site favicon, and parsed site title
      • @@ -808,18 +816,18 @@ All of ArchiveBox's state (SQLite DB, content, config, logs, etc.) is stored in
        Expand to learn more about the layout of Archivebox's data on-disk...
        -Data folders can be created anywhere (`~/archivebox` or `$PWD/data` as seen in our examples), and you can create as many data folders as you want to hold different collections. +Data folders can be created anywhere (`~/archivebox/data` or `$PWD/data` as seen in our examples), and you can create as many data folders as you want to hold different collections. All archivebox CLI commands are designed to be run from inside an ArchiveBox data folder, starting with archivebox init to initialize a new collection inside an empty directory. -
        mkdir ~/archivebox && cd ~/archivebox   # just an example, can be anywhere
        +
        mkdir -p ~/archivebox/data && cd ~/archivebox/data   # just an example, can be anywhere
         archivebox init
        -The on-disk layout is optimized to be easy to browse by hand and durable long-term. The main index is a standard index.sqlite3 database in the root of the data folder (it can also be exported as static JSON/HTML), and the archive snapshots are organized by date-added timestamp in the ./archive/ subfolder. +The on-disk layout is optimized to be easy to browse by hand and durable long-term. The main index is a standard index.sqlite3 database in the root of the data folder (it can also be exported as static JSON/HTML), and the archive snapshots are organized by date-added timestamp in the data/archive/ subfolder. -
        /data/
        +
        data/
             index.sqlite3
             ArchiveBox.conf
             archive/
        @@ -834,7 +842,7 @@ The on-disk layout is optimized to be easy to browse by hand and durable long-te
                     ...
         
        -Each snapshot subfolder ./archive/TIMESTAMP/ includes a static index.json and index.html describing its contents, and the snapshot extractor outputs are plain files within the folder. +Each snapshot subfolder data/archive/TIMESTAMP/ includes a static index.json and index.html describing its contents, and the snapshot extractor outputs are plain files within the folder.

        Learn More

          @@ -1048,9 +1056,9 @@ Because ArchiveBox is designed to ingest a large volume of URLs with multiple co Disk usage can be reduced by using a compressed/deduplicated filesystem like ZFS/BTRFS, or by turning off extractors methods you don't need. You can also deduplicate content with a tool like [fdupes](https://github.com/adrianlopezroche/fdupes) or [rdfind](https://github.com/pauldreik/rdfind). -**Don't store large collections on older filesystems like EXT3/FAT** as they may not be able to handle more than 50k directory entries in the `archive/` folder. +**Don't store large collections on older filesystems like EXT3/FAT** as they may not be able to handle more than 50k directory entries in the `data/archive/` folder. -**Try to keep the `index.sqlite3` file on local drive (not a network mount)** or SSD for maximum performance, however the `archive/` folder can be on a network mount or slower HDD. +**Try to keep the `data/index.sqlite3` file on local drive (not a network mount)** or SSD for maximum performance, however the `data/archive/` folder can be on a network mount or slower HDD. If using Docker or NFS/SMB/FUSE for the `data/archive/` folder, you may need to set [`PUID` & `PGID`](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#puid--pgid) and [disable `root_squash`](https://github.com/ArchiveBox/ArchiveBox/issues/1304) on your fileshare server. @@ -1441,7 +1449,7 @@ https://stackoverflow.com/questions/1074212/how-can-i-see-the-raw-sql-queries-dj ArchiveBox [`extractors`](https://github.com/ArchiveBox/ArchiveBox/blob/dev/archivebox/extractors/media.py) are external binaries or Python/Node scripts that ArchiveBox runs to archive content on a page. 
-Extractors take the URL of a page to archive, write their output to the filesystem `archive/TIMESTAMP/EXTRACTOR/...`, and return an [`ArchiveResult`](https://github.com/ArchiveBox/ArchiveBox/blob/dev/archivebox/core/models.py#:~:text=return%20qs-,class%20ArchiveResult,-(models.Model)%3A) entry which is saved to the database (visible on the `Log` page in the UI). +Extractors take the URL of a page to archive, write their output to the filesystem `data/archive/TIMESTAMP/EXTRACTOR/...`, and return an [`ArchiveResult`](https://github.com/ArchiveBox/ArchiveBox/blob/dev/archivebox/core/models.py#:~:text=return%20qs-,class%20ArchiveResult,-(models.Model)%3A) entry which is saved to the database (visible on the `Log` page in the UI). *Check out how we added **[`archivebox/extractors/singlefile.py`](https://github.com/ArchiveBox/ArchiveBox/blob/dev/archivebox/extractors/singlefile.py)** as an example of the process: [Issue #399](https://github.com/ArchiveBox/ArchiveBox/issues/399) + [PR #403](https://github.com/ArchiveBox/ArchiveBox/pull/403).* From 3ad32509e985236f82f3558f31b856623b1eb261 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 12 Feb 2024 02:09:39 -0800 Subject: [PATCH 127/227] Update FUNDING.yml --- .github/FUNDING.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml index ff0edb0f..d3fbf26a 100644 --- a/.github/FUNDING.yml +++ b/.github/FUNDING.yml @@ -1,3 +1,3 @@ github: pirate patreon: theSquashSH -custom: ["https://twitter.com/ArchiveBoxApp", "https://paypal.me/NicholasSweeting", "https://www.blockchain.com/eth/address/0x5D4c34D4a121Fe08d1dDB7969F07550f2dB9f471", "https://www.blockchain.com/btc/address/1HuxXriPE2Bbnag3jJrqa3bkNHrs297dYH"] +custom: ["https://hcb.hackclub.com/donations/start/archivebox", "https://paypal.me/NicholasSweeting"] From 91c46411990147fa9db4a0b35a3a195bad78673f Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 12 Feb 2024 21:26:34 -0800 Subject: [PATCH 128/227] 
skip dir size calculation when path is too long --- archivebox/system.py | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/archivebox/system.py b/archivebox/system.py index d80a2cb5..6e03846f 100644 --- a/archivebox/system.py +++ b/archivebox/system.py @@ -146,20 +146,24 @@ def get_dir_size(path: Union[str, Path], recursive: bool=True, pattern: Optional recursively and limiting to a given filter list """ num_bytes, num_dirs, num_files = 0, 0, 0 - for entry in os.scandir(path): - if (pattern is not None) and (pattern not in entry.path): - continue - if entry.is_dir(follow_symlinks=False): - if not recursive: + try: + for entry in os.scandir(path): + if (pattern is not None) and (pattern not in entry.path): continue - num_dirs += 1 - bytes_inside, dirs_inside, files_inside = get_dir_size(entry.path) - num_bytes += bytes_inside - num_dirs += dirs_inside - num_files += files_inside - else: - num_bytes += entry.stat(follow_symlinks=False).st_size - num_files += 1 + if entry.is_dir(follow_symlinks=False): + if not recursive: + continue + num_dirs += 1 + bytes_inside, dirs_inside, files_inside = get_dir_size(entry.path) + num_bytes += bytes_inside + num_dirs += dirs_inside + num_files += files_inside + else: + num_bytes += entry.stat(follow_symlinks=False).st_size + num_files += 1 + except OSError: + # e.g. 
FileNameTooLong or other error while trying to read dir + pass return num_bytes, num_dirs, num_files From 903c72fa8871f4bf17afa032064853edcdb30120 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 12 Feb 2024 21:28:52 -0800 Subject: [PATCH 129/227] fix typing errors --- archivebox/system.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/archivebox/system.py b/archivebox/system.py index 6e03846f..bced0bac 100644 --- a/archivebox/system.py +++ b/archivebox/system.py @@ -30,8 +30,7 @@ def run(cmd, *args, input=None, capture_output=True, timeout=None, check=False, if capture_output: if ('stdout' in kwargs) or ('stderr' in kwargs): - raise ValueError('stdout and stderr arguments may not be used ' - 'with capture_output.') + raise ValueError('stdout and stderr arguments may not be used with capture_output.') kwargs['stdout'] = PIPE kwargs['stderr'] = PIPE @@ -175,7 +174,7 @@ def dedupe_cron_jobs(cron: CronTab) -> CronTab: deduped: Set[Tuple[str, str]] = set() for job in list(cron): - unique_tuple = (str(job.slices), job.command) + unique_tuple = (str(job.slices), str(job.command)) if unique_tuple not in deduped: deduped.add(unique_tuple) cron.remove(job) From 2a845d1976c5108e002c437dff4b62a1019165c1 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Sun, 18 Feb 2024 04:13:55 -0800 Subject: [PATCH 130/227] Update README.md --- README.md | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index b8892b06..64c08a3d 100644 --- a/README.md +++ b/README.md @@ -168,8 +168,7 @@ ArchiveBox is free for everyone to self-host, but we also provide support, secur # Quickstart -**🖥  Supported OSs:** Linux/BSD, macOS, Windows (Docker)   **👾  CPUs:** `amd64` (`x86_64`), `arm64`, `arm7` (raspi>=3)
          -Note: On `arm7` the `playwright` package is not available, so `chromium` must be installed manually if needed. +**🖥  [Supported OSs](https://github.com/ArchiveBox/ArchiveBox/wiki/Install#supported-systems):** Linux/BSD, macOS, Windows (Docker)   **👾  CPUs:** `amd64` (`x86_64`), `arm64`, `arm7` (raspi>=3)

          @@ -196,10 +195,12 @@ curl -fsSL 'https://docker-compose.archivebox.io' > docker-compose.yml # docker compose run [-T] archivebox [subcommand] [--help] docker compose run archivebox add 'https://example.com' docker compose run archivebox help -
        +
        +For more info, see Install: Docker Compose in the Wiki. ➡️ + -See below for more usage examples using the CLI, Web UI, or filesystem/SQL/Python to manage your archive. +See below for more usage examples using the CLI, Web UI, or filesystem/SQL/Python to manage your archive.

        @@ -212,6 +213,7 @@ See below for more usage examples using the C
        mkdir -p ~/archivebox/data && cd ~/archivebox/data
         docker run -v $PWD:/data -it archivebox/archivebox init --setup
         
        +For more info, see Install: Docker in the Wiki. ➡️
      • Optional: Start the server then login to the Web UI http://127.0.0.1:8000 ⇢ Admin.
        docker run -v $PWD:/data -p 8000:8000 archivebox/archivebox
        
        From 74e3645c6a97db216fde873d1159a745cf70c1d2 Mon Sep 17 00:00:00 2001
        From: Nick Sweeting 
        Date: Sun, 18 Feb 2024 04:14:57 -0800
        Subject: [PATCH 131/227] Update README.md
        
        ---
         README.md | 2 +-
         1 file changed, 1 insertion(+), 1 deletion(-)
        
        diff --git a/README.md b/README.md
        index 64c08a3d..66f968a5 100644
        --- a/README.md
        +++ b/README.md
        @@ -10,7 +10,7 @@
         
         
         
        -         
        +