diff --git a/.github/workflows/homebrew.yml b/.github/workflows/homebrew.yml index d9bb05f1..af7a0795 100644 --- a/.github/workflows/homebrew.yml +++ b/.github/workflows/homebrew.yml @@ -23,11 +23,12 @@ jobs: cd brew_dist/ brew install --build-bottle ./archivebox.rb # brew bottle archivebox + archivebox version - name: Add some links to test run: | mkdir data && cd data - archivebox init + archivebox init --setup archivebox add 'https://example.com' archivebox version archivebox status diff --git a/.gitignore b/.gitignore index a80c30ba..f8fefbfb 100644 --- a/.gitignore +++ b/.gitignore @@ -24,3 +24,6 @@ data1/ data2/ data3/ output/ + +# vim +*.sw? diff --git a/Dockerfile b/Dockerfile index c9daca81..dda3c97a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,13 +1,22 @@ # This is the Dockerfile for ArchiveBox, it bundles the following dependencies: -# python3, ArchiveBox, curl, wget, git, chromium, youtube-dl, single-file +# python3, ArchiveBox, curl, wget, git, chromium, youtube-dl, yt-dlp, single-file # Usage: +# git submodule update --init --recursive +# git pull --recurse-submodules # docker build . -t archivebox --no-cache # docker run -v "$PWD/data":/data archivebox init # docker run -v "$PWD/data":/data archivebox add 'https://example.com' # docker run -v "$PWD/data":/data -it archivebox manage createsuperuser # docker run -v "$PWD/data":/data -p 8000:8000 archivebox server +# Multi-arch build: +# docker buildx create --use +# docker buildx build . --platform=linux/amd64,linux/arm64,linux/arm/v7 --push -t archivebox/archivebox:latest -t archivebox/archivebox:dev +# +# Read more about [developing +# Archivebox](https://github.com/ArchiveBox/ArchiveBox#archivebox-development). 
-FROM python:3.9-slim-buster + +FROM python:3.10-slim-bullseye LABEL name="archivebox" \ maintainer="Nick Sweeting " \ @@ -48,11 +57,12 @@ RUN apt-get update -qq \ && apt-get install -qq -y --no-install-recommends \ wget curl chromium git ffmpeg youtube-dl ripgrep \ fontconfig fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-kacst fonts-symbola fonts-noto fonts-freefont-ttf \ + && ln -s /usr/bin/chromium /usr/bin/chromium-browser \ && rm -rf /var/lib/apt/lists/* # Install Node environment RUN curl -s https://deb.nodesource.com/gpgkey/nodesource.gpg.key | apt-key add - \ - && echo 'deb https://deb.nodesource.com/node_15.x buster main' >> /etc/apt/sources.list \ + && echo 'deb https://deb.nodesource.com/node_17.x bullseye main' >> /etc/apt/sources.list \ && apt-get update -qq \ && apt-get install -qq -y --no-install-recommends \ nodejs \ @@ -80,7 +90,8 @@ RUN apt-get update -qq \ build-essential python-dev python3-dev \ && echo 'empty placeholder for setup.py to use' > "$CODE_DIR/archivebox/README.md" \ && python3 -c 'from distutils.core import run_setup; result = run_setup("./setup.py", stop_after="init"); print("\n".join(result.install_requires + result.extras_require["sonic"]))' > /tmp/requirements.txt \ - && pip install --quiet -r /tmp/requirements.txt \ + && pip install --no-cache-dir -r /tmp/requirements.txt \ + && pip install --no-cache-dir --upgrade youtube-dl yt-dlp \ && apt-get purge -y build-essential python-dev python3-dev \ && apt-get autoremove -y \ && rm -rf /var/lib/apt/lists/* @@ -103,13 +114,14 @@ RUN pip install -e . 
WORKDIR "$DATA_DIR" ENV IN_DOCKER=True \ CHROME_SANDBOX=False \ - CHROME_BINARY="chromium" \ + CHROME_BINARY="/usr/bin/chromium-browser" \ USE_SINGLEFILE=True \ SINGLEFILE_BINARY="$NODE_DIR/node_modules/.bin/single-file" \ USE_READABILITY=True \ READABILITY_BINARY="$NODE_DIR/node_modules/.bin/readability-extractor" \ USE_MERCURY=True \ - MERCURY_BINARY="$NODE_DIR/node_modules/.bin/mercury-parser" + MERCURY_BINARY="$NODE_DIR/node_modules/.bin/mercury-parser" \ + YOUTUBEDL_BINARY="yt-dlp" # Print version for nice docker finish summary # RUN archivebox version @@ -119,8 +131,9 @@ RUN /app/bin/docker_entrypoint.sh archivebox version VOLUME "$DATA_DIR" EXPOSE 8000 -HEALTHCHECK --interval=30s --timeout=20s --retries=15 \ - CMD curl --silent 'http://localhost:8000/admin/login/' || exit 1 +# Optional: +# HEALTHCHECK --interval=30s --timeout=20s --retries=15 \ +# CMD curl --silent 'http://localhost:8000/admin/login/' || exit 1 ENTRYPOINT ["dumb-init", "--", "/app/bin/docker_entrypoint.sh"] CMD ["archivebox", "server", "--quick-init", "0.0.0.0:8000"] diff --git a/README.md b/README.md index 5febf75d..039dd6bb 100644 --- a/README.md +++ b/README.md @@ -51,10 +51,13 @@ The goal is to sleep soundly knowing the part of the internet you care about wil
-**📦  Get ArchiveBox with `docker-compose` / `docker` / `apt` / `brew` / `pip3` ([see Quickstart below](#quickstart)).** +**📦  Get ArchiveBox with Docker / `apt` / `brew` / `pip3` / etc. ([see Quickstart below](#quickstart)).** ```bash -# Or use this auto setup script to install it for you (optional) +# Follow the instructions for your package manager in the quickstart, e.g.: +pip3 install archivebox + +# Or use the optional auto setup script to install it for you: curl -sSL 'https://get.archivebox.io' | sh ``` @@ -81,15 +84,15 @@ ls ./archive/*/index.json # or browse directly via the filesyste ## Key Features -- [**Free & open source**](https://github.com/ArchiveBox/ArchiveBox/blob/master/LICENSE), doesn't require signing up for anything, stores all data locally +- [**Free & open source**](https://github.com/ArchiveBox/ArchiveBox/blob/master/LICENSE), doesn't require signing up online, stores all data locally - [**Powerful, intuitive command line interface**](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#CLI-Usage) with [modular optional dependencies](#dependencies) - [**Comprehensive documentation**](https://github.com/ArchiveBox/ArchiveBox/wiki), [active development](https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap), and [rich community](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community) -- [**Extracts a wide variety of content out-of-the-box**](https://github.com/ArchiveBox/ArchiveBox/issues/51): [media (youtube-dl), articles (readability), code (git), etc.](#output-formats) +- [**Extracts a wide variety of content out-of-the-box**](https://github.com/ArchiveBox/ArchiveBox/issues/51): [media (youtube-dl or yt-dlp), articles (readability), code (git), etc.](#output-formats) - [**Supports scheduled/realtime importing**](https://github.com/ArchiveBox/ArchiveBox/wiki/Scheduled-Archiving) from [many types of sources](#input-formats) - [**Uses standard, durable, long-term formats**](#saves-lots-of-useful-stuff-for-each-imported-link) 
like HTML, JSON, PDF, PNG, and WARC - [**Usable as a oneshot CLI**](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#CLI-Usage), [**self-hosted web UI**](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#UI-Usage), [Python API](https://docs.archivebox.io/en/latest/modules.html) (BETA), [REST API](https://github.com/ArchiveBox/ArchiveBox/issues/496) (ALPHA), or [desktop app](https://github.com/ArchiveBox/electron-archivebox) (ALPHA) -- [**Saves all pages to archive.org as well**](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#submit_archive_dot_org) by default for redundancy (can be [disabled](https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#stealth-mode) for local-only mode) -- Planned: support for archiving [content requiring a login/paywall/cookies](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#chrome_user_data_dir) (working, but ill-advised until some pending fixes are released) +- [**Saves all pages to archive.org as well**](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#save_archive_dot_org) by default for redundancy (can be [disabled](https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#stealth-mode) for local-only mode) +- Advanced users: support for archiving [content requiring login/paywall/cookies](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#chrome_user_data_dir) (see wiki security caveats!) - Planned: support for running [JS during archiving](https://github.com/ArchiveBox/ArchiveBox/issues/51) to adblock, [autoscroll](https://github.com/ArchiveBox/ArchiveBox/issues/80), [modal-hide](https://github.com/ArchiveBox/ArchiveBox/issues/175), [thread-expand](https://github.com/ArchiveBox/ArchiveBox/issues/345)...

@@ -165,14 +168,16 @@ See below for more usage examples using the C See below for more usage examples using the CLI, Web UI, or filesystem/SQL/Python to manage your archive.
-See setup.sh for the source code of the auto-install script. +See setup.sh for the source code of the auto-install script.
+See the "Against curl | sh as an install method" blog post for my thoughts on the shortcomings of this install method.


-#### 🛠  Manual Setup +#### 🛠  Package Manager Setup +
aptitude apt (Ubuntu/Debian)
@@ -272,7 +277,7 @@ See the pip-archive Arch pacman / FreeBSD pkg / Nix nix (Arch/FreeBSD/NixOS/more)
#### Install and run a specific GitHub branch @@ -975,7 +988,8 @@ cd path/to/test/data/ archivebox shell archivebox manage dbshell ``` -(uses `pytest -s`) +(uses `pytest -s`) +https://stackoverflow.com/questions/1074212/how-can-i-see-the-raw-sql-queries-django-is-running @@ -1067,7 +1081,7 @@ Extractors take the URL of a page to archive, write their output to the filesyst
-This project is maintained mostly in
my spare time with the help from generous contributors and Monadical (✨ hire them for dev work!). +This project is maintained mostly in my spare time with help from generous contributors and Monadical (✨ hire them for dev work!).

diff --git a/_config.yml b/_config.yml index c50ff38d..9f63db0d 100644 --- a/_config.yml +++ b/_config.yml @@ -1 +1,3 @@ -theme: jekyll-theme-merlot \ No newline at end of file +production_url: https://archivebox.io +theme: jekyll-theme-merlot +# Github Pages static site settings for https://archivebox.io diff --git a/archivebox/cli/archivebox_add.py b/archivebox/cli/archivebox_add.py index 2c3d7ce3..ed05584c 100644 --- a/archivebox/cli/archivebox_add.py +++ b/archivebox/cli/archivebox_add.py @@ -30,11 +30,17 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional help="Tag the added URLs with the provided tags e.g. --tag=tag1,tag2,tag3", ) parser.add_argument( - '--update-all', #'-n', + '--update', #'-u', action='store_true', default=not ONLY_NEW, # when ONLY_NEW=True we skip updating old links help="Also retry previously skipped/failed links when adding new links", ) + parser.add_argument( + '--update-all', #'-n', + action='store_true', + default=False, + help="Also update ALL links in index when finished adding new links", + ) parser.add_argument( '--index-only', #'-o', action='store_true', @@ -104,6 +110,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional urls=stdin_urls or urls, depth=command.depth, tag=command.tag, + update=command.update, update_all=command.update_all, index_only=command.index_only, overwrite=command.overwrite, diff --git a/archivebox/cli/archivebox_schedule.py b/archivebox/cli/archivebox_schedule.py index f528e6a6..d4747906 100644 --- a/archivebox/cli/archivebox_schedule.py +++ b/archivebox/cli/archivebox_schedule.py @@ -51,6 +51,11 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional action='store_true', help='Re-archive any URLs that have been previously archived, overwriting existing Snapshots', ) + parser.add_argument( + '--update', + action='store_true', + help='Re-pull any URLs that have been previously added, as needed to fill missing 
ArchiveResults', + ) group.add_argument( '--clear', # '-c' action='store_true', @@ -94,6 +99,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional every=command.every, depth=command.depth, overwrite=command.overwrite, + update=command.update, import_path=command.import_path, out_dir=pwd or OUTPUT_DIR, ) diff --git a/archivebox/config.py b/archivebox/config.py index 3c88adbb..f20303ac 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -26,11 +26,12 @@ import io import re import sys import json +import inspect import getpass import platform import shutil -import sqlite3 import django +from sqlite3 import dbapi2 as sqlite3 from hashlib import md5 from pathlib import Path @@ -48,6 +49,9 @@ from .config_stubs import ( ConfigDefaultDict, ) + +### Pre-Fetch Minimal System Config + SYSTEM_USER = getpass.getuser() or os.getlogin() try: @@ -65,6 +69,8 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = { 'USE_COLOR': {'type': bool, 'default': lambda c: c['IS_TTY']}, 'SHOW_PROGRESS': {'type': bool, 'default': lambda c: (c['IS_TTY'] and platform.system() != 'Darwin')}, # progress bars are buggy on mac, disable for now 'IN_DOCKER': {'type': bool, 'default': False}, + 'PUID': {'type': int, 'default': os.getuid()}, + 'PGID': {'type': int, 'default': os.getgid()}, # TODO: 'SHOW_HINTS': {'type: bool, 'default': True}, }, @@ -79,6 +85,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = { 'URL_BLACKLIST': {'type': str, 'default': r'\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$'}, # to avoid downloading code assets as their own pages 'URL_WHITELIST': {'type': str, 'default': None}, 'ENFORCE_ATOMIC_WRITES': {'type': bool, 'default': True}, + 'TAG_SEPARATOR_PATTERN': {'type': str, 'default': r'[,]'}, }, 'SERVER_CONFIG': { @@ -93,9 +100,11 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = { 'SNAPSHOTS_PER_PAGE': {'type': int, 'default': 40}, 'CUSTOM_TEMPLATES_DIR': {'type': str, 'default': None}, 'TIME_ZONE': {'type': str, 
'default': 'UTC'}, + 'TIMEZONE': {'type': str, 'default': 'UTC'}, 'REVERSE_PROXY_USER_HEADER': {'type': str, 'default': 'Remote-User'}, 'REVERSE_PROXY_WHITELIST': {'type': str, 'default': ''}, 'LOGOUT_REDIRECT_URL': {'type': str, 'default': '/'}, + 'PREVIEW_ORIGINALS': {'type': bool, 'default': True}, }, 'ARCHIVE_METHOD_TOGGLES': { @@ -122,9 +131,9 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = { 'CHECK_SSL_VALIDITY': {'type': bool, 'default': True}, 'MEDIA_MAX_SIZE': {'type': str, 'default': '750m'}, - 'CURL_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.61 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) curl/{CURL_VERSION}'}, - 'WGET_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.61 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) wget/{WGET_VERSION}'}, - 'CHROME_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.61 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)'}, + 'CURL_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/605.1.15 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) curl/{CURL_VERSION}'}, + 'WGET_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/605.1.15 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) wget/{WGET_VERSION}'}, + 'CHROME_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/605.1.15 ArchiveBox/{VERSION} 
(+https://github.com/ArchiveBox/ArchiveBox/)'}, 'COOKIES_FILE': {'type': str, 'default': None}, 'CHROME_USER_DATA_DIR': {'type': str, 'default': None}, @@ -139,10 +148,18 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = { '--no-call-home', '--write-sub', '--all-subs', - '--write-auto-sub', + # There are too many of these and youtube + # throttles you with HTTP error 429 + #'--write-auto-subs', '--convert-subs=srt', '--yes-playlist', '--continue', + # This flag doesn't exist in youtube-dl + # only in yt-dlp + '--no-abort-on-error', + # --ignore-errors must come AFTER + # --no-abort-on-error + # https://github.com/yt-dlp/yt-dlp/issues/4914 '--ignore-errors', '--geo-bypass', '--add-metadata', @@ -164,6 +181,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = { '--compressed' ]}, 'GIT_ARGS': {'type': list, 'default': ['--recursive']}, + 'SINGLEFILE_ARGS': {'type': list, 'default' : None} }, 'SEARCH_BACKEND_CONFIG' : { @@ -197,7 +215,8 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = { 'SINGLEFILE_BINARY': {'type': str, 'default': lambda c: bin_path('single-file')}, 'READABILITY_BINARY': {'type': str, 'default': lambda c: bin_path('readability-extractor')}, 'MERCURY_BINARY': {'type': str, 'default': lambda c: bin_path('mercury-parser')}, - 'YOUTUBEDL_BINARY': {'type': str, 'default': 'youtube-dl'}, + #'YOUTUBEDL_BINARY': {'type': str, 'default': 'youtube-dl'}, + 'YOUTUBEDL_BINARY': {'type': str, 'default': 'yt-dlp'}, 'NODE_BINARY': {'type': str, 'default': 'node'}, 'RIPGREP_BINARY': {'type': str, 'default': 'rg'}, 'CHROME_BINARY': {'type': str, 'default': None}, @@ -321,6 +340,15 @@ ALLOWED_IN_OUTPUT_DIR = { 'static_index.json', } +def get_version(config): + return json.loads((Path(config['PACKAGE_DIR']) / 'package.json').read_text(encoding='utf-8').strip())['version'] + +def get_commit_hash(config): + try: + return list((config['PACKAGE_DIR'] / '../.git/refs/heads/').glob('*'))[0].read_text().strip() + except Exception: + return None + ############################## 
Derived Config ################################## @@ -345,14 +373,20 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = { 'DIR_OUTPUT_PERMISSIONS': {'default': lambda c: c['OUTPUT_PERMISSIONS'].replace('6', '7').replace('4', '5')}, 'ARCHIVEBOX_BINARY': {'default': lambda c: sys.argv[0] or bin_path('archivebox')}, - 'VERSION': {'default': lambda c: json.loads((Path(c['PACKAGE_DIR']) / 'package.json').read_text(encoding='utf-8').strip())['version']}, - + 'VERSION': {'default': lambda c: get_version(c)}, + 'COMMIT_HASH': {'default': lambda c: get_commit_hash(c)}, + 'PYTHON_BINARY': {'default': lambda c: sys.executable}, 'PYTHON_ENCODING': {'default': lambda c: sys.stdout.encoding.upper()}, 'PYTHON_VERSION': {'default': lambda c: '{}.{}.{}'.format(*sys.version_info[:3])}, - 'DJANGO_BINARY': {'default': lambda c: django.__file__.replace('__init__.py', 'bin/django-admin.py')}, + 'DJANGO_BINARY': {'default': lambda c: inspect.getfile(django)}, 'DJANGO_VERSION': {'default': lambda c: '{}.{}.{} {} ({})'.format(*django.VERSION)}, + + 'SQLITE_BINARY': {'default': lambda c: inspect.getfile(sqlite3)}, + 'SQLITE_VERSION': {'default': lambda c: sqlite3.version}, + #'SQLITE_JOURNAL_MODE': {'default': lambda c: 'wal'}, # set at runtime below, interesting but unused for now + #'SQLITE_OPTIONS': {'default': lambda c: ['JSON1']}, # set at runtime below 'USE_CURL': {'default': lambda c: c['USE_CURL'] and (c['SAVE_FAVICON'] or c['SAVE_TITLE'] or c['SAVE_ARCHIVE_DOT_ORG'])}, 'CURL_VERSION': {'default': lambda c: bin_version(c['CURL_BINARY']) if c['USE_CURL'] else None}, @@ -373,6 +407,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = { 'USE_SINGLEFILE': {'default': lambda c: c['USE_SINGLEFILE'] and c['SAVE_SINGLEFILE']}, 'SINGLEFILE_VERSION': {'default': lambda c: bin_version(c['SINGLEFILE_BINARY']) if c['USE_SINGLEFILE'] else None}, + 'SINGLEFILE_ARGS': {'default': lambda c: c['SINGLEFILE_ARGS'] or []}, 'USE_READABILITY': {'default': lambda c: c['USE_READABILITY'] and c['SAVE_READABILITY']}, 
'READABILITY_VERSION': {'default': lambda c: bin_version(c['READABILITY_BINARY']) if c['USE_READABILITY'] else None}, @@ -652,7 +687,9 @@ def bin_version(binary: Optional[str]) -> Optional[str]: return None try: - version_str = run([abspath, "--version"], stdout=PIPE).stdout.strip().decode() + version_str = run([abspath, "--version"], stdout=PIPE, env={'LANG': 'C'}).stdout.strip().decode() + if not version_str: + version_str = run([abspath, "--version"], stdout=PIPE).stdout.strip().decode() # take first 3 columns of first line of version info return ' '.join(version_str.split('\n')[0].strip().split()[:3]) except OSError: @@ -795,6 +832,7 @@ def get_data_locations(config: ConfigDict) -> ConfigValue: 'path': config['OUTPUT_DIR'].resolve(), 'enabled': True, 'is_valid': (config['OUTPUT_DIR'] / SQL_INDEX_FILENAME).exists(), + 'is_mount': os.path.ismount(config['OUTPUT_DIR'].resolve()), }, 'SOURCES_DIR': { 'path': config['SOURCES_DIR'].resolve(), @@ -810,6 +848,7 @@ def get_data_locations(config: ConfigDict) -> ConfigValue: 'path': config['ARCHIVE_DIR'].resolve(), 'enabled': True, 'is_valid': config['ARCHIVE_DIR'].exists(), + 'is_mount': os.path.ismount(config['ARCHIVE_DIR'].resolve()), }, 'CONFIG_FILE': { 'path': config['CONFIG_FILE'].resolve(), @@ -820,18 +859,12 @@ def get_data_locations(config: ConfigDict) -> ConfigValue: 'path': (config['OUTPUT_DIR'] / SQL_INDEX_FILENAME).resolve(), 'enabled': True, 'is_valid': (config['OUTPUT_DIR'] / SQL_INDEX_FILENAME).exists(), + 'is_mount': os.path.ismount((config['OUTPUT_DIR'] / SQL_INDEX_FILENAME).resolve()), }, } def get_dependency_info(config: ConfigDict) -> ConfigValue: return { - 'ARCHIVEBOX_BINARY': { - 'path': bin_path(config['ARCHIVEBOX_BINARY']), - 'version': config['VERSION'], - 'hash': bin_hash(config['ARCHIVEBOX_BINARY']), - 'enabled': True, - 'is_valid': True, - }, 'PYTHON_BINARY': { 'path': bin_path(config['PYTHON_BINARY']), 'version': config['PYTHON_VERSION'], @@ -839,6 +872,13 @@ def get_dependency_info(config: 
ConfigDict) -> ConfigValue: 'enabled': True, 'is_valid': bool(config['PYTHON_VERSION']), }, + 'SQLITE_BINARY': { + 'path': bin_path(config['SQLITE_BINARY']), + 'version': config['SQLITE_VERSION'], + 'hash': bin_hash(config['SQLITE_BINARY']), + 'enabled': True, + 'is_valid': bool(config['SQLITE_VERSION']), + }, 'DJANGO_BINARY': { 'path': bin_path(config['DJANGO_BINARY']), 'version': config['DJANGO_VERSION'], @@ -846,6 +886,14 @@ def get_dependency_info(config: ConfigDict) -> ConfigValue: 'enabled': True, 'is_valid': bool(config['DJANGO_VERSION']), }, + 'ARCHIVEBOX_BINARY': { + 'path': bin_path(config['ARCHIVEBOX_BINARY']), + 'version': config['VERSION'], + 'hash': bin_hash(config['ARCHIVEBOX_BINARY']), + 'enabled': True, + 'is_valid': True, + }, + 'CURL_BINARY': { 'path': bin_path(config['CURL_BINARY']), 'version': config['CURL_VERSION'], @@ -931,7 +979,7 @@ def get_chrome_info(config: ConfigDict) -> ConfigValue: 'TIMEOUT': config['TIMEOUT'], 'RESOLUTION': config['RESOLUTION'], 'CHECK_SSL_VALIDITY': config['CHECK_SSL_VALIDITY'], - 'CHROME_BINARY': config['CHROME_BINARY'], + 'CHROME_BINARY': bin_path(config['CHROME_BINARY']), 'CHROME_HEADLESS': config['CHROME_HEADLESS'], 'CHROME_SANDBOX': config['CHROME_SANDBOX'], 'CHROME_USER_AGENT': config['CHROME_USER_AGENT'], @@ -972,13 +1020,22 @@ globals().update(CONFIG) # Set timezone to UTC and umask to OUTPUT_PERMISSIONS -os.environ["TZ"] = 'UTC' +assert TIMEZONE == 'UTC', 'The server timezone should always be set to UTC' # we may allow this to change later +os.environ["TZ"] = TIMEZONE os.umask(0o777 - int(DIR_OUTPUT_PERMISSIONS, base=8)) # noqa: F821 # add ./node_modules/.bin to $PATH so we can use node scripts in extractors NODE_BIN_PATH = str((Path(CONFIG["OUTPUT_DIR"]).absolute() / 'node_modules' / '.bin')) sys.path.append(NODE_BIN_PATH) +# OPTIONAL: also look around the host system for node modules to use +# avoid enabling this unless absolutely needed, +# having overlapping potential sources of libs is a big source of 
bugs/confusing to users +# DEV_NODE_BIN_PATH = str((Path(CONFIG["PACKAGE_DIR"]).absolute() / '..' / 'node_modules' / '.bin')) +# sys.path.append(DEV_NODE_BIN_PATH) +# USER_NODE_BIN_PATH = str(Path('~/.node_modules/.bin').resolve()) +# sys.path.append(USER_NODE_BIN_PATH) + # disable stderr "you really shouldnt disable ssl" warnings with library config if not CONFIG['CHECK_SSL_VALIDITY']: import urllib3 @@ -986,6 +1043,13 @@ if not CONFIG['CHECK_SSL_VALIDITY']: requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning) urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) +# get SQLite database version, compile options, and runtime options +# TODO: make this a less hacky proper assertion checker helper function in somewhere like setup_django +#cursor = sqlite3.connect(':memory:').cursor() +#DYNAMIC_CONFIG_SCHEMA['SQLITE_VERSION'] = lambda c: cursor.execute("SELECT sqlite_version();").fetchone()[0] +#DYNAMIC_CONFIG_SCHEMA['SQLITE_JOURNAL_MODE'] = lambda c: cursor.execute('PRAGMA journal_mode;').fetchone()[0] +#DYNAMIC_CONFIG_SCHEMA['SQLITE_OPTIONS'] = lambda c: [option[0] for option in cursor.execute('PRAGMA compile_options;').fetchall()] +#cursor.close() ########################### Config Validity Checkers ########################### @@ -1082,6 +1146,7 @@ def check_dependencies(config: ConfigDict=CONFIG, show_help: bool=True) -> None: stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#save_media') stderr() + def check_data_folder(out_dir: Union[str, Path, None]=None, config: ConfigDict=CONFIG) -> None: output_dir = out_dir or config['OUTPUT_DIR'] assert isinstance(output_dir, (str, Path)) @@ -1156,11 +1221,10 @@ def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG, # without running migrations automatically (user runs them manually by calling init) django.setup() - from django.conf import settings # log startup message to the error log - with 
open(settings.ERROR_LOG, "a+", encoding='utf-8') as f: + with open(settings.ERROR_LOG, "a", encoding='utf-8') as f: command = ' '.join(sys.argv) ts = datetime.now(timezone.utc).strftime('%Y-%m-%d__%H:%M:%S') f.write(f"\n> {command}; ts={ts} version={config['VERSION']} docker={config['IN_DOCKER']} is_tty={config['IS_TTY']}\n") @@ -1170,10 +1234,17 @@ def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG, # Enable WAL mode in sqlite3 from django.db import connection with connection.cursor() as cursor: + + # Set Journal mode to WAL to allow for multiple writers current_mode = cursor.execute("PRAGMA journal_mode") if current_mode != 'wal': cursor.execute("PRAGMA journal_mode=wal;") + # Set max blocking delay for concurrent writes and write sync mode + # https://litestream.io/tips/#busy-timeout + cursor.execute("PRAGMA busy_timeout = 5000;") + cursor.execute("PRAGMA synchronous = NORMAL;") + # Create cache table in DB if needed try: from django.core.cache import cache @@ -1181,7 +1252,6 @@ def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG, except django.db.utils.OperationalError: call_command("createcachetable", verbosity=0) - # if archivebox gets imported multiple times, we have to close # the sqlite3 whenever we init from scratch to avoid multiple threads # sharing the same connection by accident diff --git a/archivebox/config_stubs.py b/archivebox/config_stubs.py index f9c22a0c..ead541a5 100644 --- a/archivebox/config_stubs.py +++ b/archivebox/config_stubs.py @@ -98,6 +98,7 @@ class ConfigDict(BaseConfig, total=False): WGET_ARGS: List[str] CURL_ARGS: List[str] GIT_ARGS: List[str] + TAG_SEPARATOR_PATTERN: str ConfigDefaultValueGetter = Callable[[ConfigDict], ConfigValue] diff --git a/archivebox/core/migrations/0021_auto_20220914_0934.py b/archivebox/core/migrations/0021_auto_20220914_0934.py new file mode 100644 index 00000000..4ef09034 --- /dev/null +++ b/archivebox/core/migrations/0021_auto_20220914_0934.py @@ 
-0,0 +1,18 @@ +# Generated by Django 3.1.14 on 2022-09-14 09:34 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0020_auto_20210410_1031'), + ] + + operations = [ + migrations.AlterField( + model_name='archiveresult', + name='extractor', + field=models.CharField(choices=[('favicon', 'favicon'), ('headers', 'headers'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('wget', 'wget'), ('title', 'title'), ('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('archive_org', 'archive_org')], max_length=32), + ), + ] diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py index 70cffa85..d2c91d9f 100644 --- a/archivebox/core/settings.py +++ b/archivebox/core/settings.py @@ -19,7 +19,7 @@ from ..config import ( SQL_INDEX_FILENAME, OUTPUT_DIR, LOGS_DIR, - TIME_ZONE, + TIMEZONE, ) IS_MIGRATING = 'makemigrations' in sys.argv[:3] or 'migrate' in sys.argv[:3] @@ -157,7 +157,7 @@ DATABASES = { 'timeout': 60, 'check_same_thread': False, }, - 'TIME_ZONE': 'UTC', + 'TIME_ZONE': TIMEZONE, # DB setup is sometimes modified at runtime by setup_django() in config.py } } @@ -227,7 +227,8 @@ USE_L10N = True USE_TZ = True DATETIME_FORMAT = 'Y-m-d g:iA' SHORT_DATETIME_FORMAT = 'Y-m-d h:iA' -TIME_ZONE = TIME_ZONE # noqa +TIME_ZONE = TIMEZONE # django convention is TIME_ZONE, archivebox config uses TIMEZONE, they are equivalent + from django.conf.locale.en import formats as en_formats diff --git a/archivebox/core/urls.py b/archivebox/core/urls.py index 87a302b8..8a3f0e22 100644 --- a/archivebox/core/urls.py +++ b/archivebox/core/urls.py @@ -6,7 +6,7 @@ from django.contrib.staticfiles.urls import staticfiles_urlpatterns from django.conf import settings from django.views.generic.base import RedirectView -from core.views import HomepageView, SnapshotView, PublicIndexView, AddView +from core.views import HomepageView, 
SnapshotView, PublicIndexView, AddView, HealthCheckView # print('DEBUG', settings.DEBUG) @@ -24,14 +24,16 @@ urlpatterns = [ path('admin/core/snapshot/add/', RedirectView.as_view(url='/add/')), path('add/', AddView.as_view(), name='add'), - + path('accounts/login/', RedirectView.as_view(url='/admin/login/')), path('accounts/logout/', RedirectView.as_view(url='/admin/logout/')), path('accounts/', include('django.contrib.auth.urls')), path('admin/', admin.site.urls), - + + path('health/', HealthCheckView.as_view(), name='healthcheck'), + path('index.html', RedirectView.as_view(url='/')), path('index.json', static.serve, {'document_root': settings.OUTPUT_DIR, 'path': 'index.json'}), path('', HomepageView.as_view(), name='Home'), diff --git a/archivebox/core/views.py b/archivebox/core/views.py index 5385add9..3f3fec12 100644 --- a/archivebox/core/views.py +++ b/archivebox/core/views.py @@ -38,7 +38,7 @@ class HomepageView(View): if PUBLIC_INDEX: return redirect('/public') - + return redirect(f'/admin/login/?next={request.path}') @@ -205,7 +205,7 @@ class SnapshotView(View): content_type="text/html", status=404, ) - + class PublicIndexView(ListView): template_name = 'public_index.html' @@ -220,7 +220,7 @@ class PublicIndexView(ListView): 'FOOTER_INFO': FOOTER_INFO, } - def get_queryset(self, **kwargs): + def get_queryset(self, **kwargs): qs = super().get_queryset(**kwargs) query = self.request.GET.get('q') if query and query.strip(): @@ -249,7 +249,7 @@ class AddView(UserPassesTestMixin, FormView): url = self.request.GET.get('url', None) if url: return {'url': url if '://' in url else f'https://{url}'} - + return super().get_initial() def test_func(self): @@ -295,3 +295,18 @@ class AddView(UserPassesTestMixin, FormView): "form": AddLinkForm() }) return render(template_name=self.template_name, request=self.request, context=context) + + +class HealthCheckView(View): + """ + A Django view that renders plain text "OK" for service discovery tools + """ + def get(self, 
request): + """ + Handle a GET request + """ + return HttpResponse( + 'OK', + content_type='text/plain', + status=200 + ) diff --git a/archivebox/extractors/__init__.py b/archivebox/extractors/__init__.py index be5832e7..1acefdf4 100644 --- a/archivebox/extractors/__init__.py +++ b/archivebox/extractors/__init__.py @@ -1,12 +1,14 @@ __package__ = 'archivebox.extractors' import os +import sys from pathlib import Path from typing import Optional, List, Iterable, Union from datetime import datetime, timezone from django.db.models import QuerySet +from ..core.settings import ERROR_LOG from ..index.schema import Link from ..index.sql import write_link_to_sql_index from ..index import ( @@ -42,7 +44,6 @@ from .headers import should_save_headers, save_headers def get_default_archive_methods(): return [ - ('title', should_save_title, save_title), ('favicon', should_save_favicon, save_favicon), ('headers', should_save_headers, save_headers), ('singlefile', should_save_singlefile, save_singlefile), @@ -50,7 +51,8 @@ def get_default_archive_methods(): ('screenshot', should_save_screenshot, save_screenshot), ('dom', should_save_dom, save_dom), ('wget', should_save_wget, save_wget), - ('readability', should_save_readability, save_readability), # keep readability below wget and singlefile, as it depends on them + ('title', should_save_title, save_title), # keep title and readability below wget and singlefile, as it depends on them + ('readability', should_save_readability, save_readability), ('mercury', should_save_mercury, save_mercury), ('git', should_save_git, save_git), ('media', should_save_media, save_media), @@ -127,10 +129,27 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s # print('{black} X {}{reset}'.format(method_name, **ANSI)) stats['skipped'] += 1 except Exception as e: + # Disabled until https://github.com/ArchiveBox/ArchiveBox/issues/984 + # and https://github.com/ArchiveBox/ArchiveBox/issues/1014 + # are fixed. 
+ """ raise Exception('Exception in archive_methods.save_{}(Link(url={}))'.format( method_name, link.url, )) from e + """ + # Instead, use the kludgy workaround from + # https://github.com/ArchiveBox/ArchiveBox/issues/984#issuecomment-1150541627 + with open(ERROR_LOG, "a", encoding='utf-8') as f: + command = ' '.join(sys.argv) + ts = datetime.now(timezone.utc).strftime('%Y-%m-%d__%H:%M:%S') + f.write(("\n" + 'Exception in archive_methods.save_{}(Link(url={})) command={}; ts={}'.format( + method_name, + link.url, + command, + ts + ) + "\n")) + #f.write(f"\n> {command}; ts={ts} version={config['VERSION']} docker={config['IN_DOCKER']} is_tty={config['IS_TTY']}\n") # print(' ', stats) @@ -182,7 +201,7 @@ def archive_links(all_links: Union[Iterable[Link], QuerySet], overwrite: bool=Fa except KeyboardInterrupt: log_archiving_paused(num_links, idx, link.timestamp) raise SystemExit(0) - except BaseException: # lgtm [py/catch-base-exception] + except BaseException: print() raise diff --git a/archivebox/extractors/media.py b/archivebox/extractors/media.py index e41a4002..7d73024f 100644 --- a/archivebox/extractors/media.py +++ b/archivebox/extractors/media.py @@ -33,7 +33,7 @@ def should_save_media(link: Link, out_dir: Optional[Path]=None, overwrite: Optio @enforce_types def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIMEOUT) -> ArchiveResult: - """Download playlists or individual video, audio, and subtitles using youtube-dl""" + """Download playlists or individual video, audio, and subtitles using youtube-dl or yt-dlp""" out_dir = out_dir or Path(link.link_dir) output: ArchiveOutput = 'media' @@ -43,6 +43,7 @@ def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIME YOUTUBEDL_BINARY, *YOUTUBEDL_ARGS, *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate']), + # TODO: add --cookies-from-browser={CHROME_USER_DATA_DIR} link.url, ] status = 'succeeded' @@ -60,7 +61,7 @@ def save_media(link: Link, out_dir: 
Optional[Path]=None, timeout: int=MEDIA_TIME pass else: hints = ( - 'Got youtube-dl response code: {}.'.format(result.returncode), + 'Got youtube-dl (or yt-dlp) response code: {}.'.format(result.returncode), *result.stderr.decode().split('\n'), ) raise ArchiveError('Failed to save media', hints) @@ -71,8 +72,18 @@ def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIME timer.end() # add video description and subtitles to full-text index + # Let's try a few different index_texts = [ - text_file.read_text(encoding='utf-8').strip() + # errors: + # * 'strict' to raise a ValueError exception if there is an + # encoding error. The default value of None has the same effect. + # * 'ignore' ignores errors. Note that ignoring encoding errors + # can lead to data loss. + # * 'xmlcharrefreplace' is only supported when writing to a + # file. Characters not supported by the encoding are replaced with + # the appropriate XML character reference &#nnn;. + # There are a few more options described in https://docs.python.org/3/library/functions.html#open + text_file.read_text(encoding='utf-8', errors='xmlcharrefreplace').strip() for text_file in ( *output_path.glob('*.description'), *output_path.glob('*.srt'), diff --git a/archivebox/extractors/readability.py b/archivebox/extractors/readability.py index 7e5ed592..a1689f95 100644 --- a/archivebox/extractors/readability.py +++ b/archivebox/extractors/readability.py @@ -10,9 +10,7 @@ from ..index.schema import Link, ArchiveResult, ArchiveError from ..system import run, atomic_write from ..util import ( enforce_types, - download_url, is_static_file, - ) from ..config import ( TIMEOUT, @@ -22,28 +20,8 @@ from ..config import ( READABILITY_VERSION, ) from ..logging_util import TimedProgress +from .title import get_html -@enforce_types -def get_html(link: Link, path: Path) -> str: - """ - Try to find wget, singlefile and then dom files. - If none is found, download the url again. 
- """ - canonical = link.canonical_outputs() - abs_path = path.absolute() - sources = [canonical["singlefile_path"], canonical["wget_path"], canonical["dom_path"]] - document = None - for source in sources: - try: - with open(abs_path / source, "r", encoding="utf-8") as f: - document = f.read() - break - except (FileNotFoundError, TypeError): - continue - if document is None: - return download_url(link.url) - else: - return document @enforce_types def should_save_readability(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool: diff --git a/archivebox/extractors/singlefile.py b/archivebox/extractors/singlefile.py index 3279960e..f7b1b686 100644 --- a/archivebox/extractors/singlefile.py +++ b/archivebox/extractors/singlefile.py @@ -17,6 +17,7 @@ from ..config import ( SAVE_SINGLEFILE, DEPENDENCIES, SINGLEFILE_VERSION, + SINGLEFILE_ARGS, CHROME_BINARY, ) from ..logging_util import TimedProgress @@ -45,10 +46,31 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO # SingleFile CLI Docs: https://github.com/gildas-lormeau/SingleFile/tree/master/cli browser_args = '--browser-args={}'.format(json.dumps(browser_args[1:])) - cmd = [ - DEPENDENCIES['SINGLEFILE_BINARY']['path'], + options = [ + *SINGLEFILE_ARGS, '--browser-executable-path={}'.format(CHROME_BINARY), browser_args, + ] + + # Deduplicate options (single-file doesn't like when you use the same option two times) + # + # NOTE: Options names that come first clobber conflicting names that come later + # My logic is SINGLEFILE_ARGS is the option that affects the singlefile command with most + # specificity, therefore the user sets it with a lot intent, therefore it should take precedence + # kind of like the ergonomic principle of lexical scope in programming languages. 
+ seen_option_names = [] + def test_seen(argument): + option_name = argument.split("=")[0] + if option_name in seen_option_names: + return False + else: + seen_option_names.append(option_name) + return True + deduped_options = list(filter(test_seen, options)) + + cmd = [ + DEPENDENCIES['SINGLEFILE_BINARY']['path'], + *deduped_options, link.url, output, ] diff --git a/archivebox/extractors/title.py b/archivebox/extractors/title.py index 272eebc8..19a78591 100644 --- a/archivebox/extractors/title.py +++ b/archivebox/extractors/title.py @@ -58,6 +58,27 @@ class TitleParser(HTMLParser): if tag.lower() == "title": self.inside_title_tag = False +@enforce_types +def get_html(link: Link, path: Path, timeout: int=TIMEOUT) -> str: + """ + Try to find wget, singlefile and then dom files. + If none is found, download the url again. + """ + canonical = link.canonical_outputs() + abs_path = path.absolute() + sources = [canonical["singlefile_path"], canonical["wget_path"], canonical["dom_path"]] + document = None + for source in sources: + try: + with open(abs_path / source, "r", encoding="utf-8") as f: + document = f.read() + break + except (FileNotFoundError, TypeError): + continue + if document is None: + return download_url(link.url, timeout=timeout) + else: + return document @enforce_types def should_save_title(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool: @@ -90,7 +111,7 @@ def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) - status = 'succeeded' timer = TimedProgress(timeout, prefix=' ') try: - html = download_url(link.url, timeout=timeout) + html = get_html(link, out_dir, timeout=timeout) try: # try using relatively strict html parser first parser = TitleParser() diff --git a/archivebox/index/html.py b/archivebox/index/html.py index d45f66ea..66e26fab 100644 --- a/archivebox/index/html.py +++ b/archivebox/index/html.py @@ -24,6 +24,7 @@ from ..config import ( FOOTER_INFO, HTML_INDEX_FILENAME, 
SAVE_ARCHIVE_DOT_ORG, + PREVIEW_ORIGINALS, ) MAIN_INDEX_TEMPLATE = 'static_index.html' @@ -105,6 +106,7 @@ def link_details_template(link: Link) -> str: 'status_color': 'success' if link.is_archived else 'danger', 'oldest_archive_date': ts_to_date_str(link.oldest_archive_date), 'SAVE_ARCHIVE_DOT_ORG': SAVE_ARCHIVE_DOT_ORG, + 'PREVIEW_ORIGINALS': PREVIEW_ORIGINALS, }) @enforce_types diff --git a/archivebox/index/sql.py b/archivebox/index/sql.py index 2fcabd61..420b9de6 100644 --- a/archivebox/index/sql.py +++ b/archivebox/index/sql.py @@ -1,5 +1,7 @@ __package__ = 'archivebox.index' +import re + from io import StringIO from pathlib import Path from typing import List, Tuple, Iterator @@ -8,7 +10,10 @@ from django.db import transaction from .schema import Link from ..util import enforce_types, parse_date -from ..config import OUTPUT_DIR +from ..config import ( + OUTPUT_DIR, + TAG_SEPARATOR_PATTERN, +) ### Main Links Index @@ -33,9 +38,11 @@ def remove_from_sql_main_index(snapshots: QuerySet, atomic: bool=False, out_dir: def write_link_to_sql_index(link: Link): from core.models import Snapshot, ArchiveResult info = {k: v for k, v in link._asdict().items() if k in Snapshot.keys} - tags = info.pop("tags") - if tags is None: - tags = [] + + tag_list = list(dict.fromkeys( + tag.strip() for tag in re.split(TAG_SEPARATOR_PATTERN, link.tags or '') + )) + info.pop('tags') try: info["timestamp"] = Snapshot.objects.get(url=link.url).timestamp @@ -44,7 +51,7 @@ def write_link_to_sql_index(link: Link): info["timestamp"] = str(float(info["timestamp"]) + 1.0) snapshot, _ = Snapshot.objects.update_or_create(url=link.url, defaults=info) - snapshot.save_tags(tags) + snapshot.save_tags(tag_list) for extractor, entries in link.history.items(): for entry in entries: @@ -104,10 +111,9 @@ def write_sql_link_details(link: Link, out_dir: Path=OUTPUT_DIR) -> None: snap = write_link_to_sql_index(link) snap.title = link.title - tag_set = ( - set(tag.strip() for tag in (link.tags or 
'').split(',')) - ) - tag_list = list(tag_set) or [] + tag_list = list(dict.fromkeys( + tag.strip() for tag in re.split(TAG_SEPARATOR_PATTERN, link.tags or '') + )) snap.save() snap.save_tags(tag_list) diff --git a/archivebox/logging_util.py b/archivebox/logging_util.py index a8c4e590..49ee12d7 100644 --- a/archivebox/logging_util.py +++ b/archivebox/logging_util.py @@ -432,7 +432,13 @@ def log_archive_method_finished(result: "ArchiveResult"): # Prettify error output hints string and limit to five lines hints = getattr(result.output, 'hints', None) or () if hints: - hints = hints if isinstance(hints, (list, tuple)) else hints.split('\n') + if isinstance(hints, (list, tuple, type(_ for _ in ()))): + hints = [hint.decode() for hint in hints if isinstance(hint, bytes)] + else: + if isinstance(hints, bytes): + hints = hints.decode() + hints = hints.split('\n') + hints = ( ' {}{}{}'.format(ANSI['lightyellow'], line.strip(), ANSI['reset']) for line in hints[:5] if line.strip() @@ -566,7 +572,7 @@ def printable_config(config: ConfigDict, prefix: str='') -> str: def printable_folder_status(name: str, folder: Dict) -> str: if folder['enabled']: if folder['is_valid']: - color, symbol, note = 'green', '√', 'valid' + color, symbol, note, num_files = 'green', '√', 'valid', '' else: color, symbol, note, num_files = 'red', 'X', 'invalid', '?' 
else: @@ -581,6 +587,10 @@ def printable_folder_status(name: str, folder: Dict) -> str: ) else: num_files = 'missing' + + if folder.get('is_mount'): + # add symbol @ next to filecount if path is a remote filesystem mount + num_files = f'{num_files} @' if num_files else '@' path = str(folder['path']).replace(str(OUTPUT_DIR), '.') if folder['path'] else '' if path and ' ' in path: diff --git a/archivebox/main.py b/archivebox/main.py index 65588ef7..0499f73f 100755 --- a/archivebox/main.py +++ b/archivebox/main.py @@ -4,8 +4,9 @@ import os import sys import shutil import platform +from django.utils import timezone from pathlib import Path -from datetime import date +from datetime import date, datetime from typing import Dict, List, Optional, Iterable, IO, Union from crontab import CronTab, CronSlices @@ -70,7 +71,12 @@ from .config import ( IS_TTY, DEBUG, IN_DOCKER, + PUID, + PGID, USER, + TIMEZONE, + ENFORCE_ATOMIC_WRITES, + OUTPUT_PERMISSIONS, PYTHON_BINARY, ARCHIVEBOX_BINARY, ONLY_NEW, @@ -90,6 +96,7 @@ from .config import ( check_data_folder, write_config_file, VERSION, + COMMIT_HASH, CODE_LOCATIONS, EXTERNAL_LOCATIONS, DATA_LOCATIONS, @@ -203,32 +210,44 @@ def help(out_dir: Path=OUTPUT_DIR) -> None: def version(quiet: bool=False, out_dir: Path=OUTPUT_DIR) -> None: """Print the ArchiveBox version and dependency information""" - - if quiet: - print(VERSION) - else: - # ArchiveBox v0.5.6 - # Cpython Linux Linux-4.19.121-linuxkit-x86_64-with-glibc2.28 x86_64 (in Docker) (in TTY) - print('ArchiveBox v{}'.format(VERSION)) + + print(VERSION) + + if not quiet: + # 0.6.3 + # ArchiveBox v0.6.3 Cpython Linux Linux-4.19.121-linuxkit-x86_64-with-glibc2.28 x86_64 (in Docker) (in TTY) + # DEBUG=False IN_DOCKER=True IS_TTY=True TZ=UTC FS_ATOMIC=True FS_REMOTE=False FS_PERMS=644 501:20 SEARCH_BACKEND=ripgrep + p = platform.uname() print( + 'ArchiveBox v{}'.format(VERSION), + *((COMMIT_HASH[:7],) if COMMIT_HASH else ()), sys.implementation.name.title(), p.system, 
platform.platform(), p.machine, ) + OUTPUT_IS_REMOTE_FS = DATA_LOCATIONS['OUTPUT_DIR']['is_mount'] or DATA_LOCATIONS['ARCHIVE_DIR']['is_mount'] print( - f'IN_DOCKER={IN_DOCKER}', f'DEBUG={DEBUG}', + f'IN_DOCKER={IN_DOCKER}', f'IS_TTY={IS_TTY}', - f'TZ={os.environ.get("TZ", "UTC")}', - f'SEARCH_BACKEND_ENGINE={SEARCH_BACKEND_ENGINE}', + f'TZ={TIMEZONE}', + #f'DB=django.db.backends.sqlite3 (({CONFIG["SQLITE_JOURNAL_MODE"]})', # add this if we have more useful info to show eventually + f'FS_ATOMIC={ENFORCE_ATOMIC_WRITES}', + f'FS_REMOTE={OUTPUT_IS_REMOTE_FS}', + f'FS_PERMS={OUTPUT_PERMISSIONS} {PUID}:{PGID}', + f'SEARCH_BACKEND={SEARCH_BACKEND_ENGINE}', ) print() print('{white}[i] Dependency versions:{reset}'.format(**ANSI)) for name, dependency in DEPENDENCIES.items(): print(printable_dependency_version(name, dependency)) + + # add a newline between core dependencies and extractor dependencies for easier reading + if name == 'ARCHIVEBOX_BINARY': + print() print() print('{white}[i] Source-code locations:{reset}'.format(**ANSI)) @@ -427,7 +446,7 @@ def init(force: bool=False, quick: bool=False, setup: bool=False, out_dir: Path= print(' archivebox server # then visit http://127.0.0.1:8000') print() print(' To add new links, you can run:') - print(" archivebox add ~/some/path/or/url/to/list_of_links.txt") + print(" archivebox add < ~/some/path/to/list_of_links.txt") print() print(' For more usage and examples, run:') print(' archivebox help') @@ -554,7 +573,8 @@ def oneshot(url: str, extractors: str="", out_dir: Path=OUTPUT_DIR): def add(urls: Union[str, List[str]], tag: str='', depth: int=0, - update_all: bool=not ONLY_NEW, + update: bool=not ONLY_NEW, + update_all: bool=False, index_only: bool=False, overwrite: bool=False, # duplicate: bool=False, # TODO: reuse the logic from admin.py resnapshot to allow adding multiple snapshots by appending timestamp automatically @@ -587,6 +607,7 @@ def add(urls: Union[str, List[str]], # save verbatim args to sources write_ahead_log 
= save_text_as_source('\n'.join(urls), filename='{ts}-import.txt', out_dir=out_dir) + new_links += parse_links_from_source(write_ahead_log, root_url=None, parser=parser) # If we're going one level deeper, download each link and look for more links @@ -594,8 +615,11 @@ def add(urls: Union[str, List[str]], if new_links and depth == 1: log_crawl_started(new_links) for new_link in new_links: - downloaded_file = save_file_as_source(new_link.url, filename=f'{new_link.timestamp}-crawl-{new_link.domain}.txt', out_dir=out_dir) - new_links_depth += parse_links_from_source(downloaded_file, root_url=new_link.url) + try: + downloaded_file = save_file_as_source(new_link.url, filename=f'{new_link.timestamp}-crawl-{new_link.domain}.txt', out_dir=out_dir) + new_links_depth += parse_links_from_source(downloaded_file, root_url=new_link.url) + except Exception as err: + stderr('[!] Failed to get contents of URL {new_link.url}', err, color='red') imported_links = list({link.url: link for link in (new_links + new_links_depth)}.values()) @@ -618,11 +642,21 @@ def add(urls: Union[str, List[str]], if extractors: archive_kwargs["methods"] = extractors - if update_all: + stderr() + + ts = datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S') + + if update: + stderr(f'[*] [{ts}] Archiving + updating {len(imported_links)}/{len(all_links)}', len(imported_links), 'URLs from added set...', color='green') + archive_links(imported_links, overwrite=overwrite, **archive_kwargs) + elif update_all: + stderr(f'[*] [{ts}] Archiving + updating {len(all_links)}/{len(all_links)}', len(all_links), 'URLs from entire library...', color='green') archive_links(all_links, overwrite=overwrite, **archive_kwargs) elif overwrite: + stderr(f'[*] [{ts}] Archiving + overwriting {len(imported_links)}/{len(all_links)}', len(imported_links), 'URLs from added set...', color='green') archive_links(imported_links, overwrite=True, **archive_kwargs) elif new_links: + stderr(f'[*] [{ts}] Archiving 
{len(new_links)}/{len(all_links)} URLs from added set...', color='green') archive_links(new_links, overwrite=False, **archive_kwargs) @@ -1113,6 +1147,7 @@ def schedule(add: bool=False, every: Optional[str]=None, depth: int=0, overwrite: bool=False, + update: bool=not ONLY_NEW, import_path: Optional[str]=None, out_dir: Path=OUTPUT_DIR): """Set ArchiveBox to regularly import URLs at specific times using cron""" @@ -1142,6 +1177,7 @@ def schedule(add: bool=False, *([ 'add', *(['--overwrite'] if overwrite else []), + *(['--update'] if update else []), f'--depth={depth}', f'"{import_path}"', ] if import_path else ['update']), diff --git a/archivebox/parsers/__init__.py b/archivebox/parsers/__init__.py index 2451f0f5..c033ab28 100644 --- a/archivebox/parsers/__init__.py +++ b/archivebox/parsers/__init__.py @@ -149,7 +149,17 @@ def run_parser_functions(to_parse: IO[str], timer, root_url: Optional[str]=None, def save_text_as_source(raw_text: str, filename: str='{ts}-stdin.txt', out_dir: Path=OUTPUT_DIR) -> str: ts = str(datetime.now(timezone.utc).timestamp()).split('.', 1)[0] source_path = str(out_dir / SOURCES_DIR_NAME / filename.format(ts=ts)) - atomic_write(source_path, raw_text) + + referenced_texts = '' + + for entry in raw_text.split(): + try: + if Path(entry).exists(): + referenced_texts += Path(entry).read_text() + except Exception as err: + print(err) + + atomic_write(source_path, raw_text + '\n' + referenced_texts) log_source_saved(source_file=source_path) return source_path @@ -176,7 +186,7 @@ def save_file_as_source(path: str, timeout: int=TIMEOUT, filename: str='{ts}-{ba ANSI['reset'], )) print(' ', e) - raise SystemExit(1) + raise e else: # Source is a path to a local file on the filesystem diff --git a/archivebox/parsers/pocket_api.py b/archivebox/parsers/pocket_api.py index afad70ed..eec4d73b 100644 --- a/archivebox/parsers/pocket_api.py +++ b/archivebox/parsers/pocket_api.py @@ -47,11 +47,11 @@ def get_pocket_articles(api: Pocket, since=None, page=0): def 
link_from_article(article: dict, sources: list): - url: str = article['resolved_url'] or article['given_url'] + url: str = article.get('resolved_url') or article['given_url'] broken_protocol = _BROKEN_PROTOCOL_RE.match(url) if broken_protocol: url = url.replace(f'{broken_protocol.group(1)}:/', f'{broken_protocol.group(1)}://') - title = article['resolved_title'] or article['given_title'] or url + title = article.get('resolved_title') or article.get('given_title') or url return Link( url=url, diff --git a/archivebox/parsers/wallabag_atom.py b/archivebox/parsers/wallabag_atom.py index 32740097..3a39c54a 100644 --- a/archivebox/parsers/wallabag_atom.py +++ b/archivebox/parsers/wallabag_atom.py @@ -34,13 +34,19 @@ def parse_wallabag_atom_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]: trailing_removed = entry.split('', 1)[0] leading_removed = trailing_removed.strip() - rows = leading_removed.split('\n') + splits_fixed = leading_removed.replace('"\n href="', '" href="') + rows = splits_fixed.split('\n') - def get_row(key): - return [r.strip() for r in rows if r.strip().startswith('<{}'.format(key))][0] + def get_row(prefix): + return [ + row.strip() + for row in rows + if row.strip().startswith('<{}'.format(prefix)) + ][0] title = str_between(get_row('title'), '<![CDATA[', ']]>').strip() - url = str_between(get_row('link rel="via"'), '', '') + url_inside_link = str_between(get_row('link rel="via"'), '', '') + url_inside_attr = str_between(get_row('link rel="via"'), 'href="', '"/>') ts_str = str_between(get_row('published'), '', '') time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z") try: @@ -49,7 +55,7 @@ def parse_wallabag_atom_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]: tags = None yield Link( - url=htmldecode(url), + url=htmldecode(url_inside_attr or url_inside_link), timestamp=str(time.timestamp()), title=htmldecode(title) or None, tags=tags or '', diff --git a/archivebox/templates/admin/base.html b/archivebox/templates/admin/base.html index 
9dc62516..0592fa0a 100644 --- a/archivebox/templates/admin/base.html +++ b/archivebox/templates/admin/base.html @@ -197,7 +197,7 @@ // select the action button from the dropdown container.find('select[name=action]') - .find('op:selected').removeAttr('selected').end() + .find('[selected]').removeAttr('selected').end() .find('[value=' + action_type + ']').attr('selected', 'selected').click() // click submit & replace the archivebox logo with a spinner diff --git a/archivebox/templates/core/add.html b/archivebox/templates/core/add.html index 4a60b02e..7d6efc6c 100644 --- a/archivebox/templates/core/add.html +++ b/archivebox/templates/core/add.html @@ -28,6 +28,14 @@   Add more URLs ➕ {% else %} +
{% csrf_token %}

Add new URLs to your archive


@@ -48,10 +56,9 @@ {% endif %} diff --git a/archivebox/templates/core/snapshot.html b/archivebox/templates/core/snapshot.html index ccb74227..d4f43d3d 100644 --- a/archivebox/templates/core/snapshot.html +++ b/archivebox/templates/core/snapshot.html @@ -414,6 +414,7 @@ {% endif %} + {% if PREVIEW_ORIGINALS %}
@@ -427,6 +428,7 @@
+ {% endif %}
diff --git a/bin/setup.sh b/bin/setup.sh index 30a3ea54..a3f6c102 100755 --- a/bin/setup.sh +++ b/bin/setup.sh @@ -91,9 +91,9 @@ echo " This is a helper script which installs the ArchiveBox dependencies on echo " You may be prompted for a sudo password in order to install the following:" echo "" echo " - archivebox" -echo " - python3, pip, nodejs, npm (languages used by ArchiveBox, and its extractor modules)" -echo " - curl, wget, git, youtube-dl (used for extracting title, favicon, git, media, and more)" -echo " - chromium (skips this if any Chrome/Chromium version is already installed)" +echo " - python3, pip, nodejs, npm (languages used by ArchiveBox, and its extractor modules)" +echo " - curl, wget, git, youtube-dl, yt-dlp (used for extracting title, favicon, git, media, and more)" +echo " - chromium (skips this if any Chrome/Chromium version is already installed)" echo "" echo " If you'd rather install these manually as-needed, you can find detailed documentation here:" echo " https://github.com/ArchiveBox/ArchiveBox/wiki/Install" @@ -115,13 +115,13 @@ if which apt-get > /dev/null; then fi echo echo "[+] Installing ArchiveBox system dependencies using apt..." - sudo apt-get install -y git python3 python3-pip python3-distutils wget curl youtube-dl ffmpeg git nodejs npm ripgrep + sudo apt-get install -y git python3 python3-pip python3-distutils wget curl youtube-dl yt-dlp ffmpeg git nodejs npm ripgrep sudo apt-get install -y libgtk2.0-0 libgtk-3-0 libnotify-dev libgconf-2-4 libnss3 libxss1 libasound2 libxtst6 xauth xvfb libgbm-dev || sudo apt-get install -y chromium || sudo apt-get install -y chromium-browser || true sudo apt-get install -y archivebox sudo apt-get --only-upgrade install -y archivebox echo "" - echo "[+] Installing ArchiveBox python dependencies using pip..." - sudo python3.7 -m pip install --upgrade --ignore-installed archivebox + echo "[+] Installing ArchiveBox python dependencies using pip3..." 
+ sudo python3 -m pip install --upgrade --ignore-installed archivebox # On Mac: elif which brew > /dev/null; then echo "[+] Installing ArchiveBox system dependencies using brew..." @@ -129,16 +129,16 @@ elif which brew > /dev/null; then brew update brew install --fetch-HEAD -f archivebox echo "" - echo "[+] Installing ArchiveBox python dependencies using pip..." + echo "[+] Installing ArchiveBox python dependencies using pip3..." python3 -m pip install --upgrade --ignore-installed archivebox elif which pkg > /dev/null; then - echo "[+] Installing ArchiveBox system dependencies using pkg..." - sudo pkg install -y python37 py37-pip py37-sqlite3 node npm wget curl youtube_dl ffmpeg git ripgrep + echo "[+] Installing ArchiveBox system dependencies using pkg and pip (python3.9)..." + sudo pkg install -y python3 py39-pip py39-sqlite3 npm wget curl youtube_dl ffmpeg git ripgrep sudo pkg install -y chromium echo "" echo "[+] Installing ArchiveBox python dependencies using pip..." - sudo python3.7 -m pip install --upgrade --ignore-installed archivebox - alias python3=python3.7 + # don't use sudo here so that pip installs in $HOME/.local instead of into /usr/local + python3 -m pip install --upgrade --ignore-installed archivebox else echo "[!] Warning: Could not find aptitude/homebrew/pkg! May not be able to install all dependencies automatically." 
echo "" @@ -192,7 +192,7 @@ echo "[√] Server started on http://0.0.0.0:8000 and data directory initialized echo " cd ~/archivebox" echo " ps aux | grep archivebox" echo " pkill -f archivebox" -echo " pip3 install --upgrade archivebox" +echo " python3 -m pip install --upgrade archivebox" echo " archivebox server --quick-init 0.0.0.0:8000" echo " archivebox manage createsuperuser" echo " archivebox add 'https://example.com'" diff --git a/brew_dist b/brew_dist index 95a1c1a0..a4314719 160000 --- a/brew_dist +++ b/brew_dist @@ -1 +1 @@ -Subproject commit 95a1c1a0875841d076f06106bd4c2307504928c2 +Subproject commit a4314719746de549f359c2fa975762fc73b62f94 diff --git a/docker-compose.yml b/docker-compose.yml index 3b2959d5..7e494e65 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -8,7 +8,7 @@ # Documentation: # https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#docker-compose -version: '2.4' +version: '2.4' # '3.9' or greater also works services: archivebox: @@ -23,15 +23,21 @@ services: # - SEARCH_BACKEND_ENGINE=sonic # uncomment these if you enable sonic below # - SEARCH_BACKEND_HOST_NAME=sonic # - SEARCH_BACKEND_PASSWORD=SecretPassword + # dns: # uncomment to use pihole below for ad/tracker blocking during archiving + # - pihole volumes: - ./data:/data # - ./archivebox:/app/archivebox # for developers working on archivebox - # To run the Sonic full-text search backend, first download the config file to sonic.cfg + + ### Optional Addons: tweak these examples as needed for your specific use case + + ### Example: To run the Sonic full-text search backend, first download the config file to sonic.cfg # curl -O https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/master/etc/sonic.cfg # after starting, backfill any existing Snapshots into the index: docker-compose run archivebox update --index-only + # sonic: - # image: valeriansaliou/sonic:v1.3.0 + # image: valeriansaliou/sonic:v1.3.1 # expose: # - 1491 # environment: @@ -39,12 +45,25 @@ services: # volumes: 
# - ./sonic.cfg:/etc/sonic.cfg:ro # - ./data/sonic:/var/lib/sonic/store + + + ### Example: To run pihole in order to block ad/tracker requests during archiving, + # uncomment this block and set up pihole using its admin interface + + # pihole: + # image: pihole/pihole:latest + # ports: + # - 80:80 # uncomment to access the admin HTTP interface on http://localhost:80 + # environment: + # WEBPASSWORD: 'set a secure password here or it will be random' + # volumes: + # - ./data/pihole:/etc/pihole + # - ./data/dnsmasq:/etc/dnsmasq.d - ### Optional Addons: tweak these examples as needed for your specific use case + ### Example: Run scheduled imports in a docker instead of using cron on the + # host machine, add tasks and see more info with archivebox schedule --help - # Example: Run scheduled imports in a docker instead of using cron on the - # host machine, add tasks and see more info with archivebox schedule --help # scheduler: # image: archivebox/archivebox:latest # command: schedule --foreground --every=day --depth=1 'https://getpocket.com/users/USERNAME/feed/all' @@ -54,7 +73,9 @@ services: # volumes: # - ./data:/data - # Example: Put Nginx in front of the ArchiveBox server for SSL termination + + ### Example: Put Nginx in front of the ArchiveBox server for SSL termination + # nginx: # image: nginx:alpine # ports: @@ -64,7 +85,9 @@ services: # - ./etc/nginx/nginx.conf:/etc/nginx/nginx.conf # - ./data:/var/www - # Example: run all your ArchiveBox traffic through a WireGuard VPN tunnel + + ### Example: run all your ArchiveBox traffic through a WireGuard VPN tunnel + # wireguard: # image: linuxserver/wireguard # network_mode: 'service:archivebox' @@ -78,14 +101,16 @@ services: # - /lib/modules:/lib/modules # - ./wireguard.conf:/config/wg0.conf:ro - # Example: Run PYWB in parallel and auto-import WARCs from ArchiveBox + + ### Example: Run PYWB in parallel and auto-import WARCs from ArchiveBox + # pywb: # image: webrecorder/pywb:latest - # entrypoint: /bin/sh 'wb-manager 
add default /archivebox/archive/*/warc/*.warc.gz; wayback --proxy;' + # entrypoint: /bin/sh -c '(wb-manager init default || test $$? -eq 2) && wb-manager add default /archivebox/archive/*/warc/*.warc.gz; wayback;' # environment: # - INIT_COLLECTION=archivebox # ports: # - 8080:8080 # volumes: - # ./data:/archivebox - # ./data/wayback:/webarchive + # - ./data:/archivebox + # - ./data/wayback:/webarchive diff --git a/etc/ArchiveBox.conf.default b/etc/ArchiveBox.conf.default index 982a1931..03048a42 100644 --- a/etc/ArchiveBox.conf.default +++ b/etc/ArchiveBox.conf.default @@ -55,7 +55,7 @@ # CURL_BINARY = curl # GIT_BINARY = git # WGET_BINARY = wget -# YOUTUBEDL_BINARY = youtube-dl +# YOUTUBEDL_BINARY = yt-dlp # CHROME_BINARY = chromium # CHROME_USER_DATA_DIR="~/.config/google-chrome/Default" diff --git a/package-lock.json b/package-lock.json index 5d61f755..eab7b512 100644 --- a/package-lock.json +++ b/package-lock.json @@ -5,11 +5,11 @@ "requires": true, "dependencies": { "@babel/runtime-corejs2": { - "version": "7.13.10", - "resolved": "https://registry.npmjs.org/@babel/runtime-corejs2/-/runtime-corejs2-7.13.10.tgz", - "integrity": "sha512-rZw5P1ZewO6XZTDxtXuAuAFUqfNXyM8HO/9WiaDd34Anka0uFTpo0RvBLeV775AEE/zKw3LQB+poZw/O9lrZBg==", + "version": "7.17.11", + "resolved": "https://registry.npmjs.org/@babel/runtime-corejs2/-/runtime-corejs2-7.17.11.tgz", + "integrity": "sha512-pJe8Aerb88TGVi1Xe/AE36aRCPrg+h6ktZPGl6xaJvOfTLcMMuogQu3BYcxeXPTNHhSYbmsDVYBs8CfAxeFFTg==", "requires": { - "core-js": "^2.6.5", + "core-js": "^2.6.12", "regenerator-runtime": "^0.13.4" } }, @@ -28,9 +28,8 @@ } }, "@postlight/mercury-parser": { - "version": "2.2.0", - "resolved": "https://registry.npmjs.org/@postlight/mercury-parser/-/mercury-parser-2.2.0.tgz", - "integrity": "sha512-nz6dIvCAaiv74o1vhhp0BRsUe+ysPbZG5mVNpJmgLoI/goOBqRMM3Yg8uXtnv++e7tzKFSXdls8b2/zKk1qL0Q==", + "version": "git+https://github.com/postlight/mercury-parser.git#9cd9662bcbfea00b773fad691a4f6e53394ff543", + "from": 
"git+https://github.com/postlight/mercury-parser.git", "requires": { "@babel/runtime-corejs2": "^7.2.0", "@postlight/ci-failed-test-reporter": "^1.0", @@ -50,35 +49,7 @@ "url": "^0.11.0", "valid-url": "^1.0.9", "wuzzy": "^0.1.4", - "yargs-parser": "^13.0.0" - }, - "dependencies": { - "http-headers": { - "version": "3.0.2", - "bundled": true, - "requires": { - "next-line": "^1.1.0" - } - }, - "jquery": { - "version": "3.4.1", - "bundled": true - }, - "moment": { - "version": "2.23.0", - "bundled": true - }, - "moment-timezone": { - "version": "0.5.26", - "bundled": true, - "requires": { - "moment": ">= 2.9.0" - } - }, - "next-line": { - "version": "1.1.0", - "bundled": true - } + "yargs-parser": "^15.0.1" } }, "@postman/form-data": { @@ -105,9 +76,9 @@ "integrity": "sha512-RbzJvlNzmRq5c3O09UipeuXno4tA1FE6ikOjxZK0tuxVv3412l64l5t1W5pj4+rJq9vpkm/kwiR07aZXnsKPxw==" }, "@types/node": { - "version": "16.0.0", - "resolved": "https://registry.npmjs.org/@types/node/-/node-16.0.0.tgz", - "integrity": "sha512-TmCW5HoZ2o2/z2EYi109jLqIaPIi9y/lc2LmDCWzuCi35bcaQ+OtUh6nwBiFK7SOu25FAU5+YKdqFZUwtqGSdg==", + "version": "17.0.4", + "resolved": "https://registry.npmjs.org/@types/node/-/node-17.0.4.tgz", + "integrity": "sha512-6xwbrW4JJiJLgF+zNypN5wr2ykM9/jHcL7rQ8fZe2vuftggjzZeRSM4OwRc6Xk8qWjwJ99qVHo/JgOGmomWRog==", "optional": true }, "@types/yauzl": { @@ -170,9 +141,9 @@ } }, "ansi-regex": { - "version": "5.0.0", - "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.0.tgz", - "integrity": "sha512-bY6fj56OUQ0hU1KjFNDQuJFezqKdrAyFdIevADiqrWHwSlbmBNMHp5ak2f40Pm8JTFyM2mqxkG6ngkHO11f/lg==" + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.1.tgz", + "integrity": "sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==" }, "ansi-styles": { "version": "4.3.0", @@ -188,9 +159,9 @@ "integrity": "sha1-jCpe8kcv2ep0KwTHenUJO6J1fJM=" }, "asn1": { - "version": "0.2.4", - "resolved": 
"https://registry.npmjs.org/asn1/-/asn1-0.2.4.tgz", - "integrity": "sha512-jxwzQpLQjSmWXgwaCZE9Nz+glAG01yF1QnWgbhGwHI5A6FRIEY6IVqtHhIepHqI7/kyEyQEagBC5mBEFlIYvdg==", + "version": "0.2.6", + "resolved": "https://registry.npmjs.org/asn1/-/asn1-0.2.6.tgz", + "integrity": "sha512-ix/FxPn0MDjeyJ7i/yoHGFt/EX6LyNbxSEhPPXODPL+KB0VPk86UYfL0lMdy+KCnv+fmvIzySwaK5COwqVbWTQ==", "requires": { "safer-buffer": "~2.1.0" } @@ -445,9 +416,9 @@ } }, "debug": { - "version": "4.3.2", - "resolved": "https://registry.npmjs.org/debug/-/debug-4.3.2.tgz", - "integrity": "sha512-mOp8wKcvj7XxC78zLgw/ZA+6TSgkoE2C/ienthhRD298T7UNwAg9diBpLRxC0mOezLl4B0xV7M0cCO6P/O0Xhw==", + "version": "4.3.3", + "resolved": "https://registry.npmjs.org/debug/-/debug-4.3.3.tgz", + "integrity": "sha512-/zxw5+vh1Tfv+4Qn7a5nsbcJKPaSvCDhojn6FEl9vupwK2VCSDtEiEtqr8DFtzYFOdz63LBkxec7DYuc2jon6Q==", "requires": { "ms": "2.1.2" } @@ -515,9 +486,9 @@ } }, "dompurify": { - "version": "2.3.0", - "resolved": "https://registry.npmjs.org/dompurify/-/dompurify-2.3.0.tgz", - "integrity": "sha512-VV5C6Kr53YVHGOBKO/F86OYX6/iLTw2yVSI721gKetxpHCK/V5TaLEf9ODjRgl1KLSWRMY6cUhAbv/c+IUnwQw==" + "version": "2.3.4", + "resolved": "https://registry.npmjs.org/dompurify/-/dompurify-2.3.4.tgz", + "integrity": "sha512-6BVcgOAVFXjI0JTjEvZy901Rghm+7fDQOrNIcxB4+gdhj6Kwp6T9VBhBY/AbagKHJocRkDYGd6wvI+p4/10xtQ==" }, "domutils": { "version": "1.5.1", @@ -702,9 +673,9 @@ } }, "glob": { - "version": "7.1.7", - "resolved": "https://registry.npmjs.org/glob/-/glob-7.1.7.tgz", - "integrity": "sha512-OvD9ENzPLbegENnYP5UUfJIirTg4+XwMWGaQfQTY0JenxNvvIKP3U3/tAQSPIu/lHxXYSZmpXlUHeqAIdKzBLQ==", + "version": "7.2.0", + "resolved": "https://registry.npmjs.org/glob/-/glob-7.2.0.tgz", + "integrity": "sha512-lmLf6gtyrPq8tTjSmrO94wBeQbFR3HbLHbuyD69wuyQkImp2hWqMGB47OX65FBkPffO641IP9jWa1z4ivqG26Q==", "requires": { "fs.realpath": "^1.0.0", "inflight": "^1.0.4", @@ -729,9 +700,9 @@ } }, "heap": { - "version": "0.2.6", - "resolved": 
"https://registry.npmjs.org/heap/-/heap-0.2.6.tgz", - "integrity": "sha1-CH4fELBGky/IWU3Z5tN4r8nR5aw=" + "version": "0.2.7", + "resolved": "https://registry.npmjs.org/heap/-/heap-0.2.7.tgz", + "integrity": "sha512-2bsegYkkHO+h/9MGbn6KWcE45cHZgPANo5LXF7EvWdT0yT2EguSVO1nDgU5c8+ZOPwp2vMNa7YFsJhVcDR9Sdg==" }, "html-encoding-sniffer": { "version": "1.0.2", @@ -773,12 +744,12 @@ } }, "http-signature": { - "version": "1.3.5", - "resolved": "https://registry.npmjs.org/http-signature/-/http-signature-1.3.5.tgz", - "integrity": "sha512-NwoTQYSJoFt34jSBbwzDHDofoA61NGXzu6wXh95o1Ry62EnmKjXb/nR/RknLeZ3G/uGwrlKNY2z7uPt+Cdl7Tw==", + "version": "1.3.6", + "resolved": "https://registry.npmjs.org/http-signature/-/http-signature-1.3.6.tgz", + "integrity": "sha512-3adrsD6zqo4GsTqtO7FyrejHNv+NgiIfAfv68+jVlFmSr9OGy7zrxONceFRLKvnnZA5jbxQBX1u9PpB6Wi32Gw==", "requires": { "assert-plus": "^1.0.0", - "jsprim": "^1.2.2", + "jsprim": "^2.0.2", "sshpk": "^1.14.1" } }, @@ -848,6 +819,11 @@ "resolved": "https://registry.npmjs.org/isstream/-/isstream-0.1.2.tgz", "integrity": "sha1-R+Y/evVa+m+S4VAOaQ64uFKcCZo=" }, + "jquery": { + "version": "3.6.0", + "resolved": "https://registry.npmjs.org/jquery/-/jquery-3.6.0.tgz", + "integrity": "sha512-JVzAR/AjBvVt2BmYhxRCSYysDsPcssdmTFnzyLEts9qNwmjmu4JTAMYubEfwVOSwpQ1I1sKKFcxhZCI2buerfw==" + }, "jsbn": { "version": "0.1.1", "resolved": "https://registry.npmjs.org/jsbn/-/jsbn-0.1.1.tgz", @@ -887,9 +863,9 @@ } }, "json-schema": { - "version": "0.2.3", - "resolved": "https://registry.npmjs.org/json-schema/-/json-schema-0.2.3.tgz", - "integrity": "sha1-tIDIkuWaLwWVTOcnvT8qTogvnhM=" + "version": "0.4.0", + "resolved": "https://registry.npmjs.org/json-schema/-/json-schema-0.4.0.tgz", + "integrity": "sha512-es94M3nTIfsEPisRafak+HDLfHXnKBhV3vU5eqPcS3flIWqcxJWgXHXiey3YrpaNsanY5ei1VoYEbOzijuq9BA==" }, "json-schema-traverse": { "version": "0.4.1", @@ -902,20 +878,20 @@ "integrity": "sha1-Epai1Y/UXxmg9s4B1lcB4sc1tus=" }, "jsprim": { - "version": "1.4.1", - "resolved": 
"https://registry.npmjs.org/jsprim/-/jsprim-1.4.1.tgz", - "integrity": "sha1-MT5mvB5cwG5Di8G3SZwuXFastqI=", + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/jsprim/-/jsprim-2.0.2.tgz", + "integrity": "sha512-gqXddjPqQ6G40VdnI6T6yObEC+pDNvyP95wdQhkWkg7crHH3km5qP1FsOXEkzEQwnz6gz5qGTn1c2Y52wP3OyQ==", "requires": { "assert-plus": "1.0.0", "extsprintf": "1.3.0", - "json-schema": "0.2.3", + "json-schema": "0.4.0", "verror": "1.10.0" } }, "jszip": { - "version": "3.6.0", - "resolved": "https://registry.npmjs.org/jszip/-/jszip-3.6.0.tgz", - "integrity": "sha512-jgnQoG9LKnWO3mnVNBnfhkh0QknICd1FGSrXcgrl67zioyJ4wgx25o9ZqwNtrROSflGBCGYnJfjrIyRIby1OoQ==", + "version": "3.7.1", + "resolved": "https://registry.npmjs.org/jszip/-/jszip-3.7.1.tgz", + "integrity": "sha512-ghL0tz1XG9ZEmRMcEN2vt7xabrDdqHHeykgARpmZ0BiIctWxM47Vt63ZO2dnp4QYt/xJVLLy5Zv1l/xRdh2byg==", "requires": { "lie": "~3.3.0", "pako": "~1.0.2", @@ -1078,11 +1054,24 @@ "resolved": "https://registry.npmjs.org/mkdirp-classic/-/mkdirp-classic-0.5.3.tgz", "integrity": "sha512-gKLcREMhtuZRwRAfqP3RFW+TK4JqApVBtOIftVgjuABpAtpxhPGaDcfvbhNvD0B8iD1oUr/txX35NjcaY6Ns/A==" }, + "moment": { + "version": "2.29.3", + "resolved": "https://registry.npmjs.org/moment/-/moment-2.29.3.tgz", + "integrity": "sha512-c6YRvhEo//6T2Jz/vVtYzqBzwvPT95JBQ+smCytzf7c50oMZRsR/a4w88aD34I+/QVSfnoAnSBFPJHItlOMJVw==" + }, "moment-parseformat": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/moment-parseformat/-/moment-parseformat-3.0.0.tgz", "integrity": "sha512-dVgXe6b6DLnv4CHG7a1zUe5mSXaIZ3c6lSHm/EKeVeQI2/4pwe0VRde8OyoCE1Ro2lKT5P6uT9JElF7KDLV+jw==" }, + "moment-timezone": { + "version": "0.5.26", + "resolved": "https://registry.npmjs.org/moment-timezone/-/moment-timezone-0.5.26.tgz", + "integrity": "sha512-sFP4cgEKTCymBBKgoxZjYzlSovC20Y6J7y3nanDc5RoBIXKlZhoYwBoZGe3flwU6A372AcRwScH8KiwV6zjy1g==", + "requires": { + "moment": ">= 2.9.0" + } + }, "ms": { "version": "2.1.2", "resolved": 
"https://registry.npmjs.org/ms/-/ms-2.1.2.tgz", @@ -1094,9 +1083,33 @@ "integrity": "sha1-/K5XhTBStqm66CCOQN19PC0wRgM=" }, "node-fetch": { - "version": "2.6.1", - "resolved": "https://registry.npmjs.org/node-fetch/-/node-fetch-2.6.1.tgz", - "integrity": "sha512-V4aYg89jEoVRxRb2fJdAg8FHvI7cEyYdVAh94HH0UIK8oJxUfkjlDQN9RbMx+bEjP7+ggMiFRprSti032Oipxw==" + "version": "2.6.7", + "resolved": "https://registry.npmjs.org/node-fetch/-/node-fetch-2.6.7.tgz", + "integrity": "sha512-ZjMPFEfVx5j+y2yF35Kzx5sF7kDzxuDj6ziH4FFbOp87zKDZNx8yExJIb05OGF4Nlt9IHFIMBkRl41VdvcNdbQ==", + "requires": { + "whatwg-url": "^5.0.0" + }, + "dependencies": { + "tr46": { + "version": "0.0.3", + "resolved": "https://registry.npmjs.org/tr46/-/tr46-0.0.3.tgz", + "integrity": "sha1-gYT9NH2snNwYWZLzpmIuFLnZq2o=" + }, + "webidl-conversions": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-3.0.1.tgz", + "integrity": "sha1-JFNCdeKnvGvnvIZhHMFq4KVlSHE=" + }, + "whatwg-url": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-5.0.0.tgz", + "integrity": "sha1-lmRU6HZUYuN2RNNib2dCzotwll0=", + "requires": { + "tr46": "~0.0.3", + "webidl-conversions": "^3.0.0" + } + } + } }, "nth-check": { "version": "1.0.2", @@ -1207,9 +1220,9 @@ "integrity": "sha512-2qHaIQr2VLRFoxe2nASzsV6ef4yOOH+Fi9FBOVH6cqeSgUnoyySPZkxzLuzd+RYOQTRpROA0ztTMqxROKSb/nA==" }, "postman-request": { - "version": "2.88.1-postman.29", - "resolved": "https://registry.npmjs.org/postman-request/-/postman-request-2.88.1-postman.29.tgz", - "integrity": "sha512-QuL3+AvGlmPLb1Qf0t/rM8M4U8LCYbADZBijUNToLl6l37i65KH8wY1gTLWLxlw2I6ugxUfX2Zyyk5/J5HFZIg==", + "version": "2.88.1-postman.31", + "resolved": "https://registry.npmjs.org/postman-request/-/postman-request-2.88.1-postman.31.tgz", + "integrity": "sha512-OJbYqP7ItxQ84yHyuNpDywCZB0HYbpHJisMQ9lb1cSL3N5H3Td6a2+3l/a74UMd3u82BiGC5yQyYmdOIETP/nQ==", "requires": { "@postman/form-data": "~3.1.1", "@postman/tunnel-agent": 
"^0.6.3", @@ -1308,16 +1321,16 @@ } }, "ws": { - "version": "7.5.2", - "resolved": "https://registry.npmjs.org/ws/-/ws-7.5.2.tgz", - "integrity": "sha512-lkF7AWRicoB9mAgjeKbGqVUekLnSNO4VjKVnuPHpQeOxZOErX6BPXwJk70nFslRCEEA8EVW7ZjKwXaP9N+1sKQ==" + "version": "7.5.6", + "resolved": "https://registry.npmjs.org/ws/-/ws-7.5.6.tgz", + "integrity": "sha512-6GLgCqo2cy2A2rjCNFlxQS6ZljG/coZfZXclldI8FB/1G3CCI36Zd8xy2HrFVACi8tfk5XrgLQEk+P0Tnz9UcA==" } } }, "qs": { - "version": "6.5.2", - "resolved": "https://registry.npmjs.org/qs/-/qs-6.5.2.tgz", - "integrity": "sha512-N5ZAX4/LxJmF+7wN74pUD6qAh9/wnvdQcjq9TZjevvXzSUo7bfmw91saqMjzGS2xq91/odN2dW/WOl7qQHNDGA==" + "version": "6.5.3", + "resolved": "https://registry.npmjs.org/qs/-/qs-6.5.3.tgz", + "integrity": "sha512-qxXIEh4pCGfHICj1mAJQ2/2XVZkjCDTcEgfoSQxc/fYivUZxTkk7L3bDBJSoNrEzXI17oUO5Dp07ktqE5KzczA==" }, "querystring": { "version": "0.2.0", @@ -1334,9 +1347,9 @@ }, "dependencies": { "acorn": { - "version": "8.4.1", - "resolved": "https://registry.npmjs.org/acorn/-/acorn-8.4.1.tgz", - "integrity": "sha512-asabaBSkEKosYKMITunzX177CXxQ4Q8BSSzMTKD+FefUhipQC70gfW5SiUDhYQ3vk8G+81HqQk7Fv9OXwwn9KA==" + "version": "8.6.0", + "resolved": "https://registry.npmjs.org/acorn/-/acorn-8.6.0.tgz", + "integrity": "sha512-U1riIR+lBSNi3IbxtaHOIKdH8sLFv3NYfNv8sg7ZsNhcfl4HF2++BfqqrNAxoCLQW1iiylOj76ecnaUxz+z9yw==" }, "acorn-globals": { "version": "6.0.0", @@ -1417,9 +1430,9 @@ } }, "estraverse": { - "version": "5.2.0", - "resolved": "https://registry.npmjs.org/estraverse/-/estraverse-5.2.0.tgz", - "integrity": "sha512-BxbNGGNm0RyRYvUdHpIwv9IWzeM9XClbOxwoATuFdOE7ZE6wHL+HQ5T8hoPM+zHvmKzzsEqhgy0GrQ5X13afiQ==" + "version": "5.3.0", + "resolved": "https://registry.npmjs.org/estraverse/-/estraverse-5.3.0.tgz", + "integrity": "sha512-MMdARuVEQziNTeJD8DgMqmhwR11BRQ/cBP+pLtYdSTnf3MIO8fFeiINEbX36ZdNlfU/7A9f3gUw49B3oQsvwBA==" }, "form-data": { "version": "3.0.1", @@ -1440,9 +1453,9 @@ } }, "jsdom": { - "version": "16.6.0", - "resolved": 
"https://registry.npmjs.org/jsdom/-/jsdom-16.6.0.tgz", - "integrity": "sha512-Ty1vmF4NHJkolaEmdjtxTfSfkdb8Ywarwf63f+F8/mDD1uLSSWDxDuMiZxiPhwunLrn9LOSVItWj4bLYsLN3Dg==", + "version": "16.7.0", + "resolved": "https://registry.npmjs.org/jsdom/-/jsdom-16.7.0.tgz", + "integrity": "sha512-u9Smc2G1USStM+s/x1ru5Sxrl6mPYCbByG1U/hUmqaVsm4tbNyS7CicOSRyuGQYZhTu0h84qkZZQ/I+dzizSVw==", "requires": { "abab": "^2.0.5", "acorn": "^8.2.4", @@ -1469,7 +1482,7 @@ "whatwg-encoding": "^1.0.5", "whatwg-mimetype": "^2.3.0", "whatwg-url": "^8.5.0", - "ws": "^7.4.5", + "ws": "^7.4.6", "xml-name-validator": "^3.0.0" } }, @@ -1512,9 +1525,9 @@ } }, "ws": { - "version": "7.5.2", - "resolved": "https://registry.npmjs.org/ws/-/ws-7.5.2.tgz", - "integrity": "sha512-lkF7AWRicoB9mAgjeKbGqVUekLnSNO4VjKVnuPHpQeOxZOErX6BPXwJk70nFslRCEEA8EVW7ZjKwXaP9N+1sKQ==" + "version": "7.5.6", + "resolved": "https://registry.npmjs.org/ws/-/ws-7.5.6.tgz", + "integrity": "sha512-6GLgCqo2cy2A2rjCNFlxQS6ZljG/coZfZXclldI8FB/1G3CCI36Zd8xy2HrFVACi8tfk5XrgLQEk+P0Tnz9UcA==" } } }, @@ -1529,9 +1542,9 @@ } }, "regenerator-runtime": { - "version": "0.13.7", - "resolved": "https://registry.npmjs.org/regenerator-runtime/-/regenerator-runtime-0.13.7.tgz", - "integrity": "sha512-a54FxoJDIr27pgf7IgeQGxmqUNYrcV338lf/6gH456HZ/PhX+5BcwHXG9ajESmwe6WRO0tAzRUrRmNONWgkrew==" + "version": "0.13.9", + "resolved": "https://registry.npmjs.org/regenerator-runtime/-/regenerator-runtime-0.13.9.tgz", + "integrity": "sha512-p3VT+cOEgxFsRRA9X4lkI1E+k2/CtnKtU4gcxyaCUreilL/vqI6CdZ3wxVUx3UOUg+gnUOQQcRI7BmSI656MYA==" }, "request": { "version": "2.88.2", @@ -1569,6 +1582,17 @@ "jsprim": "^1.2.2", "sshpk": "^1.7.0" } + }, + "jsprim": { + "version": "1.4.2", + "resolved": "https://registry.npmjs.org/jsprim/-/jsprim-1.4.2.tgz", + "integrity": "sha512-P2bSOMAc/ciLz6DzgjVlGJP9+BrJWu5UDGK70C2iweC5QBIeFf0ZXRvGjEj2uYgrY2MkAAhsSWHDWlFtEroZWw==", + "requires": { + "assert-plus": "1.0.0", + "extsprintf": "1.3.0", + "json-schema": "0.4.0", + "verror": "1.10.0" + } 
} } }, @@ -1683,9 +1707,9 @@ }, "dependencies": { "acorn": { - "version": "8.4.1", - "resolved": "https://registry.npmjs.org/acorn/-/acorn-8.4.1.tgz", - "integrity": "sha512-asabaBSkEKosYKMITunzX177CXxQ4Q8BSSzMTKD+FefUhipQC70gfW5SiUDhYQ3vk8G+81HqQk7Fv9OXwwn9KA==" + "version": "8.6.0", + "resolved": "https://registry.npmjs.org/acorn/-/acorn-8.6.0.tgz", + "integrity": "sha512-U1riIR+lBSNi3IbxtaHOIKdH8sLFv3NYfNv8sg7ZsNhcfl4HF2++BfqqrNAxoCLQW1iiylOj76ecnaUxz+z9yw==" }, "acorn-globals": { "version": "6.0.0", @@ -1766,9 +1790,9 @@ } }, "estraverse": { - "version": "5.2.0", - "resolved": "https://registry.npmjs.org/estraverse/-/estraverse-5.2.0.tgz", - "integrity": "sha512-BxbNGGNm0RyRYvUdHpIwv9IWzeM9XClbOxwoATuFdOE7ZE6wHL+HQ5T8hoPM+zHvmKzzsEqhgy0GrQ5X13afiQ==" + "version": "5.3.0", + "resolved": "https://registry.npmjs.org/estraverse/-/estraverse-5.3.0.tgz", + "integrity": "sha512-MMdARuVEQziNTeJD8DgMqmhwR11BRQ/cBP+pLtYdSTnf3MIO8fFeiINEbX36ZdNlfU/7A9f3gUw49B3oQsvwBA==" }, "form-data": { "version": "3.0.1", @@ -1797,9 +1821,9 @@ } }, "jsdom": { - "version": "16.6.0", - "resolved": "https://registry.npmjs.org/jsdom/-/jsdom-16.6.0.tgz", - "integrity": "sha512-Ty1vmF4NHJkolaEmdjtxTfSfkdb8Ywarwf63f+F8/mDD1uLSSWDxDuMiZxiPhwunLrn9LOSVItWj4bLYsLN3Dg==", + "version": "16.7.0", + "resolved": "https://registry.npmjs.org/jsdom/-/jsdom-16.7.0.tgz", + "integrity": "sha512-u9Smc2G1USStM+s/x1ru5Sxrl6mPYCbByG1U/hUmqaVsm4tbNyS7CicOSRyuGQYZhTu0h84qkZZQ/I+dzizSVw==", "requires": { "abab": "^2.0.5", "acorn": "^8.2.4", @@ -1826,7 +1850,7 @@ "whatwg-encoding": "^1.0.5", "whatwg-mimetype": "^2.3.0", "whatwg-url": "^8.5.0", - "ws": "^7.4.5", + "ws": "^7.4.6", "xml-name-validator": "^3.0.0" } }, @@ -1869,9 +1893,9 @@ } }, "ws": { - "version": "7.5.2", - "resolved": "https://registry.npmjs.org/ws/-/ws-7.5.2.tgz", - "integrity": "sha512-lkF7AWRicoB9mAgjeKbGqVUekLnSNO4VjKVnuPHpQeOxZOErX6BPXwJk70nFslRCEEA8EVW7ZjKwXaP9N+1sKQ==" + "version": "7.5.6", + "resolved": 
"https://registry.npmjs.org/ws/-/ws-7.5.6.tgz", + "integrity": "sha512-6GLgCqo2cy2A2rjCNFlxQS6ZljG/coZfZXclldI8FB/1G3CCI36Zd8xy2HrFVACi8tfk5XrgLQEk+P0Tnz9UcA==" } } }, @@ -1882,9 +1906,9 @@ "optional": true }, "sshpk": { - "version": "1.16.1", - "resolved": "https://registry.npmjs.org/sshpk/-/sshpk-1.16.1.tgz", - "integrity": "sha512-HXXqVUq7+pcKeLqqZj6mHFUMvXtOJt1uoUx09pFW6011inTMxqI8BA8PM95myrIyyKwdnzjdFjLiE6KBPVtJIg==", + "version": "1.17.0", + "resolved": "https://registry.npmjs.org/sshpk/-/sshpk-1.17.0.tgz", + "integrity": "sha512-/9HIEs1ZXGhSPE8X6Ccm7Nam1z8KcoCqPdI7ecm1N33EzAetWahvQWVqLZtaZQ+IDKX4IyA2o0gBzqIMkAagHQ==", "requires": { "asn1": "~0.2.3", "assert-plus": "^1.0.0", @@ -1916,13 +1940,13 @@ "integrity": "sha1-PYRT5ydKLkShQrPchEnftk2a3jo=" }, "string-width": { - "version": "4.2.2", - "resolved": "https://registry.npmjs.org/string-width/-/string-width-4.2.2.tgz", - "integrity": "sha512-XBJbT3N4JhVumXE0eoLU9DCjcaF92KLNqTmFCnG1pf8duUxFGwtP6AD6nkjw9a3IdiRtL3E2w3JDiE/xi3vOeA==", + "version": "4.2.3", + "resolved": "https://registry.npmjs.org/string-width/-/string-width-4.2.3.tgz", + "integrity": "sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==", "requires": { "emoji-regex": "^8.0.0", "is-fullwidth-code-point": "^3.0.0", - "strip-ansi": "^6.0.0" + "strip-ansi": "^6.0.1" } }, "string_decoder": { @@ -1934,11 +1958,11 @@ } }, "strip-ansi": { - "version": "6.0.0", - "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.0.tgz", - "integrity": "sha512-AuvKTrTfQNYNIctbR1K/YGTR1756GycPsg7b9bdV9Duqur4gv6aKqHXah67Z8ImS7WEz5QVcOtlfW2rZEugt6w==", + "version": "6.0.1", + "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz", + "integrity": "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==", "requires": { - "ansi-regex": "^5.0.0" + "ansi-regex": "^5.0.1" } }, "strong-data-uri": { @@ -2187,9 +2211,9 @@ } }, "wuzzy": { - "version": "0.1.6", - 
"resolved": "https://registry.npmjs.org/wuzzy/-/wuzzy-0.1.6.tgz", - "integrity": "sha512-x1lDcj0VvzJ1ygDpd9LWMnQVei6gEkUbCcZUG8TPnXhlPbaQWQa32ab/6xbm/samxJ2T3Y2+P3xHeeQIAcEvqQ==", + "version": "0.1.8", + "resolved": "https://registry.npmjs.org/wuzzy/-/wuzzy-0.1.8.tgz", + "integrity": "sha512-FUzKQepFSTnANsDYwxpIzGJ/dIJaqxuMre6tzzbvWwFAiUHPsI1nVQVCLK4Xqr67KO7oYAK0kaCcI/+WYj/7JA==", "requires": { "lodash": "^4.17.15" } @@ -2231,9 +2255,9 @@ } }, "yargs-parser": { - "version": "13.1.2", - "resolved": "https://registry.npmjs.org/yargs-parser/-/yargs-parser-13.1.2.tgz", - "integrity": "sha512-3lbsNRf/j+A4QuSZfDRA7HRSfWrzO0YjqTJd5kjAq37Zep1CEgaYmrH9Q3GwPiB9cHyd1Y1UwggGhJGoxipbzg==", + "version": "15.0.3", + "resolved": "https://registry.npmjs.org/yargs-parser/-/yargs-parser-15.0.3.tgz", + "integrity": "sha512-/MVEVjTXy/cGAjdtQf8dW3V9b97bPN7rNn8ETj6BmAQL7ibC7O1Q9SPJbGjgh3SlwoBNXMzj/ZGIj8mBgl12YA==", "requires": { "camelcase": "^5.0.0", "decamelize": "^1.2.0" diff --git a/package.json b/package.json index 782a7be7..405a2ed1 100644 --- a/package.json +++ b/package.json @@ -6,7 +6,7 @@ "repository": "github:ArchiveBox/ArchiveBox", "license": "MIT", "dependencies": { - "@postlight/mercury-parser": "^2.2.0", + "@postlight/mercury-parser": "git+https://github.com/postlight/mercury-parser.git", "readability-extractor": "git+https://github.com/ArchiveBox/readability-extractor.git", "single-file": "git+https://github.com/gildas-lormeau/SingleFile.git" } diff --git a/setup.py b/setup.py index a9d8a509..346d3b62 100755 --- a/setup.py +++ b/setup.py @@ -42,6 +42,7 @@ INSTALL_REQUIRES = [ "django-extensions>=3.0.3", "dateparser>=1.0.0", "youtube-dl>=2021.04.17", + "yt-dlp>=2021.4.11", "python-crontab>=2.5.1", "croniter>=0.3.34", "w3lib>=1.22.0", diff --git a/stdeb.cfg b/stdeb.cfg index 6664c6c7..571d4245 100644 --- a/stdeb.cfg +++ b/stdeb.cfg @@ -5,7 +5,7 @@ Package3: archivebox Suite: focal Suite3: focal Build-Depends: debhelper, dh-python, python3-all, python3-pip, 
python3-setuptools, python3-wheel, python3-stdeb -Depends3: nodejs, wget, curl, git, ffmpeg, youtube-dl, python3-all, python3-pip, python3-setuptools, python3-croniter, python3-crontab, python3-dateparser, python3-django, python3-django-extensions, python3-django-jsonfield, python3-mypy-extensions, python3-requests, python3-w3lib, ripgrep +Depends3: nodejs, wget, curl, git, ffmpeg, youtube-dl, yt-dlp, python3-all, python3-pip, python3-setuptools, python3-croniter, python3-crontab, python3-dateparser, python3-django, python3-django-extensions, python3-django-jsonfield, python3-mypy-extensions, python3-requests, python3-w3lib, ripgrep X-Python3-Version: >= 3.7 XS-Python-Version: >= 3.7 Setup-Env-Vars: DEB_BUILD_OPTIONS=nocheck