move almost all config into new archivebox.CONSTANTS

Nick Sweeting 2024-09-25 05:10:09 -07:00
parent f5e8d99fdf
commit bb65b2dbec
32 changed files with 982 additions and 840 deletions


@@ -1,14 +1,15 @@
__package__ = 'archivebox' __package__ = 'archivebox'
# print('INSTALLING MONKEY PATCHES')
from .monkey_patches import * # print('INSTALLING MONKEY PATCHES')
from .monkey_patches import * # noqa
# print('DONE INSTALLING MONKEY PATCHES')
import os import os
import importlib import importlib.metadata
from pathlib import Path from pathlib import Path
PACKAGE_DIR = Path(__file__).resolve().parent # archivebox source code dir PACKAGE_DIR = Path(__file__).resolve().parent # archivebox source code dir
DATA_DIR = Path(os.curdir).resolve() # archivebox user data dir DATA_DIR = Path(os.curdir).resolve() # archivebox user data dir
@@ -28,7 +29,9 @@ def _detect_installed_version():
raise Exception('Failed to detect installed archivebox version!') raise Exception('Failed to detect installed archivebox version!')
VERSION = _detect_installed_version()
__version__ = _detect_installed_version() __version__ = VERSION
# print('DONE INSTALLING MONKEY PATCHES')
from .constants import CONSTANTS
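For orientation, here is a minimal usage sketch (not part of the diff) of what the new package-level attributes above expose, assuming the constants.py module added later in this commit:

import archivebox
from archivebox.constants import CONSTANTS   # also reachable as archivebox.CONSTANTS

print(archivebox.VERSION)            # same string as archivebox.__version__
print(CONSTANTS.ARCHIVE_DIR)         # DATA_DIR / 'archive'
print(CONSTANTS.DATABASE_FILE)       # DATA_DIR / 'index.sqlite3'

# CONSTANTS is a NamedTuple instance: its fields are read-only, and items()/keys()/values()
# wrap _asdict() so it can also be iterated like a mapping
for name, value in CONSTANTS.items():
    print(name, value)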


@@ -26,10 +26,7 @@ import io
import re import re
import sys import sys
import json import json
import inspect
import getpass
import shutil import shutil
import requests
import archivebox import archivebox
from hashlib import md5 from hashlib import md5
@@ -38,7 +35,6 @@ from datetime import datetime, timezone
from typing import Optional, Type, Tuple, Dict from typing import Optional, Type, Tuple, Dict
from subprocess import run, PIPE, DEVNULL, STDOUT, TimeoutExpired from subprocess import run, PIPE, DEVNULL, STDOUT, TimeoutExpired
from configparser import ConfigParser from configparser import ConfigParser
import importlib.metadata
from pydantic_pkgr import SemVer from pydantic_pkgr import SemVer
from rich.progress import Progress from rich.progress import Progress
@@ -49,7 +45,6 @@ from django.db.backends.sqlite3.base import Database as sqlite3
from .config_stubs import ( from .config_stubs import (
AttrDict, AttrDict,
SimpleConfigValueDict,
ConfigValue, ConfigValue,
ConfigDict, ConfigDict,
ConfigDefaultValue, ConfigDefaultValue,
@@ -61,7 +56,7 @@ from .misc.logging import (
ANSI, ANSI,
COLOR_DICT, COLOR_DICT,
stderr, stderr,
hint, hint, # noqa
) )
# print('STARTING CONFIG LOADING') # print('STARTING CONFIG LOADING')
@@ -165,8 +160,8 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
'MEDIA_MAX_SIZE': {'type': str, 'default': '750m'}, 'MEDIA_MAX_SIZE': {'type': str, 'default': '750m'},
'USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)'}, 'USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)'},
'CURL_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT'] + ' curl/{CURL_VERSION}'}, 'CURL_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT']}, # + ' curl/{CURL_VERSION}'},
'WGET_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT'] + ' wget/{WGET_VERSION}'}, 'WGET_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT']}, # + ' wget/{WGET_VERSION}'},
'COOKIES_FILE': {'type': str, 'default': None}, 'COOKIES_FILE': {'type': str, 'default': None},
@@ -254,12 +249,12 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
'CURL_BINARY': {'type': str, 'default': 'curl'}, 'CURL_BINARY': {'type': str, 'default': 'curl'},
'GIT_BINARY': {'type': str, 'default': 'git'}, 'GIT_BINARY': {'type': str, 'default': 'git'},
'WGET_BINARY': {'type': str, 'default': 'wget'}, # also can accept wget2 'WGET_BINARY': {'type': str, 'default': 'wget'}, # also can accept wget2
'SINGLEFILE_BINARY': {'type': str, 'default': lambda c: bin_path('single-file')},
'READABILITY_BINARY': {'type': str, 'default': lambda c: bin_path('readability-extractor')},
'MERCURY_BINARY': {'type': str, 'default': lambda c: bin_path('postlight-parser')}, 'MERCURY_BINARY': {'type': str, 'default': lambda c: bin_path('postlight-parser')},
'YOUTUBEDL_BINARY': {'type': str, 'default': 'yt-dlp'}, # also can accept youtube-dl
'NODE_BINARY': {'type': str, 'default': 'node'}, 'NODE_BINARY': {'type': str, 'default': 'node'},
'RIPGREP_BINARY': {'type': str, 'default': 'rg'}, # 'YOUTUBEDL_BINARY': {'type': str, 'default': 'yt-dlp'}, # also can accept youtube-dl
# 'SINGLEFILE_BINARY': {'type': str, 'default': lambda c: bin_path('single-file')},
# 'READABILITY_BINARY': {'type': str, 'default': lambda c: bin_path('readability-extractor')},
# 'RIPGREP_BINARY': {'type': str, 'default': 'rg'},
'POCKET_CONSUMER_KEY': {'type': str, 'default': None}, 'POCKET_CONSUMER_KEY': {'type': str, 'default': None},
'POCKET_ACCESS_TOKENS': {'type': dict, 'default': {}}, 'POCKET_ACCESS_TOKENS': {'type': dict, 'default': {}},
@@ -308,212 +303,16 @@ CONFIG_FILENAME = 'ArchiveBox.conf'
STATICFILE_EXTENSIONS = {
# 99.999% of the time, URLs ending in these extensions are static files
# that can be downloaded as-is, not html pages that need to be rendered
'gif', 'jpeg', 'jpg', 'png', 'tif', 'tiff', 'wbmp', 'ico', 'jng', 'bmp',
'svg', 'svgz', 'webp', 'ps', 'eps', 'ai',
'mp3', 'mp4', 'm4a', 'mpeg', 'mpg', 'mkv', 'mov', 'webm', 'm4v',
'flv', 'wmv', 'avi', 'ogg', 'ts', 'm3u8',
'pdf', 'txt', 'rtf', 'rtfd', 'doc', 'docx', 'ppt', 'pptx', 'xls', 'xlsx',
'atom', 'rss', 'css', 'js', 'json',
'dmg', 'iso', 'img',
'rar', 'war', 'hqx', 'zip', 'gz', 'bz2', '7z',
# Less common extensions to consider adding later
# jar, swf, bin, com, exe, dll, deb
# ear, hqx, eot, wmlc, kml, kmz, cco, jardiff, jnlp, run, msi, msp, msm,
# pl pm, prc pdb, rar, rpm, sea, sit, tcl tk, der, pem, crt, xpi, xspf,
# ra, mng, asx, asf, 3gpp, 3gp, mid, midi, kar, jad, wml, htc, mml
# These are always treated as pages, not as static files, never add them:
# html, htm, shtml, xhtml, xml, aspx, php, cgi
}
# When initializing archivebox in a new directory, we check to make sure the dir is
# actually empty so that we dont clobber someone's home directory or desktop by accident.
# These files are exceptions to the is_empty check when we're trying to init a new dir,
# as they could be from a previous archivebox version, system artifacts, dependencies, etc.
ALLOWED_IN_OUTPUT_DIR = {
".gitignore",
"lost+found",
".DS_Store",
".venv",
"venv",
"virtualenv",
".virtualenv",
"node_modules",
"package.json",
"package-lock.json",
"yarn.lock",
"static",
"sonic",
"search.sqlite3",
CRONTABS_DIR_NAME,
ARCHIVE_DIR_NAME,
SOURCES_DIR_NAME,
LOGS_DIR_NAME,
CACHE_DIR_NAME,
LIB_DIR_NAME,
PERSONAS_DIR_NAME,
SQL_INDEX_FILENAME,
f"{SQL_INDEX_FILENAME}-wal",
f"{SQL_INDEX_FILENAME}-shm",
"queue.sqlite3",
"queue.sqlite3-wal",
"queue.sqlite3-shm",
JSON_INDEX_FILENAME,
HTML_INDEX_FILENAME,
ROBOTS_TXT_FILENAME,
FAVICON_FILENAME,
CONFIG_FILENAME,
f"{CONFIG_FILENAME}.bak",
"static_index.json",
}
ALLOWDENYLIST_REGEX_FLAGS: int = re.IGNORECASE | re.UNICODE | re.MULTILINE ALLOWDENYLIST_REGEX_FLAGS: int = re.IGNORECASE | re.UNICODE | re.MULTILINE
CONSTANTS = { CONSTANTS = archivebox.CONSTANTS._asdict()
"PACKAGE_DIR_NAME": {'default': lambda c: PACKAGE_DIR_NAME},
"LIB_DIR_NAME": {'default': lambda c: LIB_DIR_NAME},
"TEMPLATES_DIR_NAME": {'default': lambda c: TEMPLATES_DIR_NAME},
"ARCHIVE_DIR_NAME": {'default': lambda c: ARCHIVE_DIR_NAME},
"SOURCES_DIR_NAME": {'default': lambda c: SOURCES_DIR_NAME},
"LOGS_DIR_NAME": {'default': lambda c: LOGS_DIR_NAME},
"CACHE_DIR_NAME": {'default': lambda c: CACHE_DIR_NAME},
"PERSONAS_DIR_NAME": {'default': lambda c: PERSONAS_DIR_NAME},
"CRONTABS_DIR_NAME": {'default': lambda c: CRONTABS_DIR_NAME},
"SQL_INDEX_FILENAME": {'default': lambda c: SQL_INDEX_FILENAME},
"JSON_INDEX_FILENAME": {'default': lambda c: JSON_INDEX_FILENAME},
"HTML_INDEX_FILENAME": {'default': lambda c: HTML_INDEX_FILENAME},
"ROBOTS_TXT_FILENAME": {'default': lambda c: ROBOTS_TXT_FILENAME},
"FAVICON_FILENAME": {'default': lambda c: FAVICON_FILENAME},
"CONFIG_FILENAME": {'default': lambda c: CONFIG_FILENAME},
"DEFAULT_CLI_COLORS": {'default': lambda c: DEFAULT_CLI_COLORS},
"ANSI": {'default': lambda c: ANSI},
"COLOR_DICT": {'default': lambda c: COLOR_DICT},
"STATICFILE_EXTENSIONS": {'default': lambda c: STATICFILE_EXTENSIONS},
"ALLOWED_IN_OUTPUT_DIR": {'default': lambda c: ALLOWED_IN_OUTPUT_DIR},
# "ALLOWDENYLIST_REGEX_FLAGS": {'default': lambda c: ALLOWDENYLIST_REGEX_FLAGS},
}
############################## Version Config ################################## ############################## Version Config ##################################
def get_system_user() -> str:
# some host OS's are unable to provide a username (k3s, Windows), making this complicated
# uid 999 is especially problematic and breaks many attempts
SYSTEM_USER = None
FALLBACK_USER_PLACHOLDER = f'user_{os.getuid()}'
# Option 1
try:
import pwd
SYSTEM_USER = SYSTEM_USER or pwd.getpwuid(os.geteuid()).pw_name
except (ModuleNotFoundError, Exception):
pass
# Option 2
try:
SYSTEM_USER = SYSTEM_USER or getpass.getuser()
except Exception:
pass
# Option 3
try:
SYSTEM_USER = SYSTEM_USER or os.getlogin()
except Exception:
pass
return SYSTEM_USER or FALLBACK_USER_PLACHOLDER
def get_version(config):
try:
return importlib.metadata.version(__package__ or 'archivebox')
except importlib.metadata.PackageNotFoundError:
try:
pyproject_config = (config['PACKAGE_DIR'] / 'pyproject.toml').read_text()
for line in pyproject_config:
if line.startswith('version = '):
return line.split(' = ', 1)[-1].strip('"')
except FileNotFoundError:
# building docs, pyproject.toml is not available
return 'dev'
raise Exception('Failed to detect installed archivebox version!')
def get_commit_hash(config) -> Optional[str]:
try:
git_dir = config['PACKAGE_DIR'] / '../.git'
ref = (git_dir / 'HEAD').read_text().strip().split(' ')[-1]
commit_hash = git_dir.joinpath(ref).read_text().strip()
return commit_hash
except Exception:
pass
try:
return list((config['PACKAGE_DIR'] / '../.git/refs/heads/').glob('*'))[0].read_text().strip()
except Exception:
pass
return None
def get_build_time(config) -> str:
if config['IN_DOCKER']:
docker_build_end_time = Path('/VERSION.txt').read_text().rsplit('BUILD_END_TIME=')[-1].split('\n', 1)[0]
return docker_build_end_time
src_last_modified_unix_timestamp = (config['PACKAGE_DIR'] / 'config.py').stat().st_mtime
return datetime.fromtimestamp(src_last_modified_unix_timestamp).strftime('%Y-%m-%d %H:%M:%S %s')
def get_versions_available_on_github(config):
"""
returns a dictionary containing the ArchiveBox GitHub release info for
the recommended upgrade version and the currently installed version
"""
# we only want to perform the (relatively expensive) check for new versions
# when its most relevant, e.g. when the user runs a long-running command
subcommand_run_by_user = sys.argv[3] if len(sys.argv) > 3 else 'help'
long_running_commands = ('add', 'schedule', 'update', 'status', 'server')
if subcommand_run_by_user not in long_running_commands:
return None
github_releases_api = "https://api.github.com/repos/ArchiveBox/ArchiveBox/releases"
response = requests.get(github_releases_api)
if response.status_code != 200:
stderr(f'[!] Warning: GitHub API call to check for new ArchiveBox version failed! (status={response.status_code})', color='lightyellow', config=config)
return None
all_releases = response.json()
installed_version = parse_version_string(config['VERSION'])
# find current version or nearest older version (to link to)
current_version = None
for idx, release in enumerate(all_releases):
release_version = parse_version_string(release['tag_name'])
if release_version <= installed_version:
current_version = release
break
current_version = current_version or all_releases[-1]
# recommended version is whatever comes after current_version in the release list
# (perhaps too conservative to only recommend upgrading one version at a time, but it's safest)
try:
recommended_version = all_releases[idx+1]
except IndexError:
recommended_version = None
return {'recommended_version': recommended_version, 'current_version': current_version}
def can_upgrade(config):
if config['VERSIONS_AVAILABLE'] and config['VERSIONS_AVAILABLE']['recommended_version']:
recommended_version = parse_version_string(config['VERSIONS_AVAILABLE']['recommended_version']['tag_name'])
current_version = parse_version_string(config['VERSIONS_AVAILABLE']['current_version']['tag_name'])
return recommended_version > current_version
return False
############################## Derived Config ################################## ############################## Derived Config ##################################
@@ -523,55 +322,25 @@ def can_upgrade(config):
# These are derived/computed values calculated *after* all user-provided config values are ingested # These are derived/computed values calculated *after* all user-provided config values are ingested
# they appear in `archivebox config` output and are intended to be read-only for the user # they appear in `archivebox config` output and are intended to be read-only for the user
DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = { DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
**CONSTANTS, **{
key: {'default': lambda c: val}
for key, val in archivebox.CONSTANTS.items()
},
'TERM_WIDTH': {'default': lambda c: lambda: shutil.get_terminal_size((100, 10)).columns},
'USER': {'default': lambda c: get_system_user()},
'ANSI': {'default': lambda c: DEFAULT_CLI_COLORS if c['USE_COLOR'] else AttrDict({k: '' for k in DEFAULT_CLI_COLORS.keys()})},
'PACKAGE_DIR': {'default': lambda c: Path(__file__).resolve().parent}, 'PACKAGE_DIR': {'default': lambda c: archivebox.PACKAGE_DIR.resolve()},
'TEMPLATES_DIR': {'default': lambda c: c['PACKAGE_DIR'] / TEMPLATES_DIR_NAME}, 'TEMPLATES_DIR': {'default': lambda c: c['PACKAGE_DIR'] / TEMPLATES_DIR_NAME},
'CUSTOM_TEMPLATES_DIR': {'default': lambda c: c['CUSTOM_TEMPLATES_DIR'] and Path(c['CUSTOM_TEMPLATES_DIR'])}, 'CUSTOM_TEMPLATES_DIR': {'default': lambda c: c['CUSTOM_TEMPLATES_DIR'] and Path(c['CUSTOM_TEMPLATES_DIR'])},
'OUTPUT_DIR': {'default': lambda c: Path(c['OUTPUT_DIR']).resolve() if c['OUTPUT_DIR'] else Path(os.curdir).resolve()},
'ARCHIVE_DIR': {'default': lambda c: c['OUTPUT_DIR'] / ARCHIVE_DIR_NAME},
'SOURCES_DIR': {'default': lambda c: c['OUTPUT_DIR'] / SOURCES_DIR_NAME},
'LOGS_DIR': {'default': lambda c: c['OUTPUT_DIR'] / LOGS_DIR_NAME},
'CACHE_DIR': {'default': lambda c: c['OUTPUT_DIR'] / CACHE_DIR_NAME},
'LIB_DIR': {'default': lambda c: c['OUTPUT_DIR'] / LIB_DIR_NAME},
'BIN_DIR': {'default': lambda c: c['OUTPUT_DIR'] / LIB_DIR_NAME / 'bin'},
'PERSONAS_DIR': {'default': lambda c: c['OUTPUT_DIR'] / PERSONAS_DIR_NAME},
'CONFIG_FILE': {'default': lambda c: Path(c['CONFIG_FILE']).resolve() if c['CONFIG_FILE'] else c['OUTPUT_DIR'] / CONFIG_FILENAME},
'COOKIES_FILE': {'default': lambda c: c['COOKIES_FILE'] and Path(c['COOKIES_FILE']).resolve()},
'URL_DENYLIST_PTN': {'default': lambda c: c['URL_DENYLIST'] and re.compile(c['URL_DENYLIST'] or '', ALLOWDENYLIST_REGEX_FLAGS)}, 'URL_DENYLIST_PTN': {'default': lambda c: c['URL_DENYLIST'] and re.compile(c['URL_DENYLIST'] or '', ALLOWDENYLIST_REGEX_FLAGS)},
'URL_ALLOWLIST_PTN': {'default': lambda c: c['URL_ALLOWLIST'] and re.compile(c['URL_ALLOWLIST'] or '', ALLOWDENYLIST_REGEX_FLAGS)}, 'URL_ALLOWLIST_PTN': {'default': lambda c: c['URL_ALLOWLIST'] and re.compile(c['URL_ALLOWLIST'] or '', ALLOWDENYLIST_REGEX_FLAGS)},
'DIR_OUTPUT_PERMISSIONS': {'default': lambda c: c['OUTPUT_PERMISSIONS'].replace('6', '7').replace('4', '5')}, # exec is always needed to list directories 'DIR_OUTPUT_PERMISSIONS': {'default': lambda c: c['OUTPUT_PERMISSIONS'].replace('6', '7').replace('4', '5')}, # exec is always needed to list directories
'ARCHIVEBOX_BINARY': {'default': lambda c: sys.argv[0] or bin_path('archivebox')},
'NODE_BIN_PATH': {'default': lambda c: str((Path(c["OUTPUT_DIR"]).absolute() / 'node_modules' / '.bin'))},
'VERSION': {'default': lambda c: get_version(c).split('+', 1)[0]}, # remove +editable from user-displayed version string
'COMMIT_HASH': {'default': lambda c: get_commit_hash(c)}, # short git commit hash of codebase HEAD commit
'BUILD_TIME': {'default': lambda c: get_build_time(c)}, # docker build completed time or python src last modified time
'VERSIONS_AVAILABLE': {'default': lambda c: False}, # get_versions_available_on_github(c)},
'CAN_UPGRADE': {'default': lambda c: False}, # can_upgrade(c)},
'PYTHON_BINARY': {'default': lambda c: sys.executable},
'PYTHON_VERSION': {'default': lambda c: '{}.{}.{}'.format(*sys.version_info[:3])},
'DJANGO_BINARY': {'default': lambda c: inspect.getfile(django)},
'DJANGO_VERSION': {'default': lambda c: '{}.{}.{}'.format(*django.VERSION[:3])},
'SQLITE_BINARY': {'default': lambda c: inspect.getfile(sqlite3)},
'SQLITE_VERSION': {'default': lambda c: sqlite3.version},
#'SQLITE_JOURNAL_MODE': {'default': lambda c: 'wal'}, # set at runtime below, interesting if changed later but unused for now because its always expected to be wal
#'SQLITE_OPTIONS': {'default': lambda c: ['JSON1']}, # set at runtime below
'USE_CURL': {'default': lambda c: c['USE_CURL'] and (c['SAVE_FAVICON'] or c['SAVE_TITLE'] or c['SAVE_ARCHIVE_DOT_ORG'])}, 'USE_CURL': {'default': lambda c: c['USE_CURL'] and (c['SAVE_FAVICON'] or c['SAVE_TITLE'] or c['SAVE_ARCHIVE_DOT_ORG'])},
'CURL_VERSION': {'default': lambda c: bin_version(c['CURL_BINARY']) if c['USE_CURL'] else None}, 'CURL_VERSION': {'default': lambda c: bin_version(c['CURL_BINARY']) if c['USE_CURL'] else None},
'CURL_USER_AGENT': {'default': lambda c: c['CURL_USER_AGENT'].format(**c)}, # 'CURL_USER_AGENT': {'default': lambda c: c['CURL_USER_AGENT'].format(**c)},
'CURL_ARGS': {'default': lambda c: c['CURL_ARGS'] or []}, 'CURL_ARGS': {'default': lambda c: c['CURL_ARGS'] or []},
'CURL_EXTRA_ARGS': {'default': lambda c: c['CURL_EXTRA_ARGS'] or []}, 'CURL_EXTRA_ARGS': {'default': lambda c: c['CURL_EXTRA_ARGS'] or []},
'SAVE_FAVICON': {'default': lambda c: c['USE_CURL'] and c['SAVE_FAVICON']}, 'SAVE_FAVICON': {'default': lambda c: c['USE_CURL'] and c['SAVE_FAVICON']},
@@ -580,23 +349,14 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
'USE_WGET': {'default': lambda c: c['USE_WGET'] and (c['SAVE_WGET'] or c['SAVE_WARC'])}, 'USE_WGET': {'default': lambda c: c['USE_WGET'] and (c['SAVE_WGET'] or c['SAVE_WARC'])},
'WGET_VERSION': {'default': lambda c: bin_version(c['WGET_BINARY']) if c['USE_WGET'] else None}, 'WGET_VERSION': {'default': lambda c: bin_version(c['WGET_BINARY']) if c['USE_WGET'] else None},
'WGET_AUTO_COMPRESSION': {'default': lambda c: wget_supports_compression(c) if c['USE_WGET'] else False}, 'WGET_AUTO_COMPRESSION': {'default': lambda c: wget_supports_compression(c) if c['USE_WGET'] else False},
'WGET_USER_AGENT': {'default': lambda c: c['WGET_USER_AGENT'].format(**c)}, # 'WGET_USER_AGENT': {'default': lambda c: c['WGET_USER_AGENT'].format(**c)},
'SAVE_WGET': {'default': lambda c: c['USE_WGET'] and c['SAVE_WGET']}, 'SAVE_WGET': {'default': lambda c: c['USE_WGET'] and c['SAVE_WGET']},
'SAVE_WARC': {'default': lambda c: c['USE_WGET'] and c['SAVE_WARC']}, 'SAVE_WARC': {'default': lambda c: c['USE_WGET'] and c['SAVE_WARC']},
'WGET_ARGS': {'default': lambda c: c['WGET_ARGS'] or []}, 'WGET_ARGS': {'default': lambda c: c['WGET_ARGS'] or []},
'WGET_EXTRA_ARGS': {'default': lambda c: c['WGET_EXTRA_ARGS'] or []}, 'WGET_EXTRA_ARGS': {'default': lambda c: c['WGET_EXTRA_ARGS'] or []},
# 'RIPGREP_VERSION': {'default': lambda c: bin_version(c['RIPGREP_BINARY']) if c['USE_RIPGREP'] else None},
'USE_SINGLEFILE': {'default': lambda c: c['USE_SINGLEFILE'] and c['SAVE_SINGLEFILE']},
'SINGLEFILE_VERSION': {'default': lambda c: bin_version(c['SINGLEFILE_BINARY']) if c['USE_SINGLEFILE'] else None},
'SINGLEFILE_ARGS': {'default': lambda c: c['SINGLEFILE_ARGS'] or []},
'SINGLEFILE_EXTRA_ARGS': {'default': lambda c: c['SINGLEFILE_EXTRA_ARGS'] or []},
'USE_READABILITY': {'default': lambda c: c['USE_READABILITY'] and c['SAVE_READABILITY']},
'READABILITY_VERSION': {'default': lambda c: bin_version(c['READABILITY_BINARY']) if c['USE_READABILITY'] else None},
'USE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['SAVE_MERCURY']}, 'USE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['SAVE_MERCURY']},
'SAVE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['USE_NODE']},
'MERCURY_VERSION': {'default': lambda c: '1.0.0' if shutil.which(str(bin_path(c['MERCURY_BINARY']))) else None}, # mercury doesnt expose version info until this is merged https://github.com/postlight/parser/pull/750 'MERCURY_VERSION': {'default': lambda c: '1.0.0' if shutil.which(str(bin_path(c['MERCURY_BINARY']))) else None}, # mercury doesnt expose version info until this is merged https://github.com/postlight/parser/pull/750
'MERCURY_ARGS': {'default': lambda c: c['MERCURY_ARGS'] or []}, 'MERCURY_ARGS': {'default': lambda c: c['MERCURY_ARGS'] or []},
'MERCURY_EXTRA_ARGS': {'default': lambda c: c['MERCURY_EXTRA_ARGS'] or []}, 'MERCURY_EXTRA_ARGS': {'default': lambda c: c['MERCURY_EXTRA_ARGS'] or []},
@@ -605,21 +365,12 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
'GIT_VERSION': {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None}, 'GIT_VERSION': {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None},
'SAVE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']}, 'SAVE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
'USE_YOUTUBEDL': {'default': lambda c: c['USE_YOUTUBEDL'] and c['SAVE_MEDIA']}, 'USE_NODE': {'default': lambda c: True},
'YOUTUBEDL_VERSION': {'default': lambda c: bin_version(c['YOUTUBEDL_BINARY']) if c['USE_YOUTUBEDL'] else None},
'SAVE_MEDIA': {'default': lambda c: c['USE_YOUTUBEDL'] and c['SAVE_MEDIA']},
'YOUTUBEDL_ARGS': {'default': lambda c: c['YOUTUBEDL_ARGS'] or []},
'YOUTUBEDL_EXTRA_ARGS': {'default': lambda c: c['YOUTUBEDL_EXTRA_ARGS'] or []},
'SAVE_READABILITY': {'default': lambda c: c['USE_READABILITY'] and c['USE_NODE']},
'SAVE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['USE_NODE']},
'USE_NODE': {'default': lambda c: c['USE_NODE'] and (c['SAVE_READABILITY'] or c['SAVE_SINGLEFILE'] or c['SAVE_MERCURY'])},
'NODE_VERSION': {'default': lambda c: bin_version(c['NODE_BINARY']) if c['USE_NODE'] else None}, 'NODE_VERSION': {'default': lambda c: bin_version(c['NODE_BINARY']) if c['USE_NODE'] else None},
'DEPENDENCIES': {'default': lambda c: get_dependency_info(c)}, 'DEPENDENCIES': {'default': lambda c: get_dependency_info(c)},
'CODE_LOCATIONS': {'default': lambda c: get_code_locations(c)}, # 'CODE_LOCATIONS': {'default': lambda c: get_code_locations(c)},
'DATA_LOCATIONS': {'default': lambda c: get_data_locations(c)}, # 'DATA_LOCATIONS': {'default': lambda c: get_data_locations(c)},
'SAVE_ALLOWLIST_PTN': {'default': lambda c: c['SAVE_ALLOWLIST'] and {re.compile(k, ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_ALLOWLIST'].items()}}, 'SAVE_ALLOWLIST_PTN': {'default': lambda c: c['SAVE_ALLOWLIST'] and {re.compile(k, ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_ALLOWLIST'].items()}},
'SAVE_DENYLIST_PTN': {'default': lambda c: c['SAVE_DENYLIST'] and {re.compile(k, ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_DENYLIST'].items()}}, 'SAVE_DENYLIST_PTN': {'default': lambda c: c['SAVE_DENYLIST'] and {re.compile(k, ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_DENYLIST'].items()}},
@@ -696,12 +447,10 @@ def load_config_val(key: str,
raise Exception('Config values can only be str, bool, int, or json') raise Exception('Config values can only be str, bool, int, or json')
def load_config_file(out_dir: str | None=None) -> Optional[ConfigDict]: def load_config_file(out_dir: str | None=archivebox.DATA_DIR) -> Optional[ConfigDict]:
"""load the ini-formatted config file from OUTPUT_DIR/Archivebox.conf""" """load the ini-formatted config file from OUTPUT_DIR/Archivebox.conf"""
out_dir = out_dir or Path(os.getenv('OUTPUT_DIR', '.')).resolve() config_path = archivebox.CONSTANTS.CONFIG_FILE
assert out_dir and out_dir.is_dir()
config_path = Path(out_dir) / CONFIG_FILENAME
if config_path.exists(): if config_path.exists():
config_file = ConfigParser() config_file = ConfigParser()
config_file.optionxform = str config_file.optionxform = str
@@ -718,7 +467,7 @@ def load_config_file(out_dir: str | None=None) -> Optional[ConfigDict]:
return None return None
def write_config_file(config: Dict[str, str], out_dir: str | None=None) -> ConfigDict: def write_config_file(config: Dict[str, str], out_dir: str | None=archivebox.DATA_DIR) -> ConfigDict:
"""load the ini-formatted config file from OUTPUT_DIR/Archivebox.conf""" """load the ini-formatted config file from OUTPUT_DIR/Archivebox.conf"""
from .system import atomic_write from .system import atomic_write
@@ -737,8 +486,7 @@ def write_config_file(config: Dict[str, str], out_dir: str | None=None) -> Confi
""") """)
out_dir = out_dir or Path(os.getenv('OUTPUT_DIR', '.')).resolve() config_path = archivebox.CONSTANTS.CONFIG_FILE
config_path = Path(out_dir) / CONFIG_FILENAME
if not config_path.exists(): if not config_path.exists():
atomic_write(config_path, CONFIG_HEADER) atomic_write(config_path, CONFIG_HEADER)
@@ -833,7 +581,7 @@ def load_config(defaults: ConfigDefaultDict,
stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration') stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration')
stderr() stderr()
# raise # raise
raise SystemExit(2) # raise SystemExit(2)
return AttrDict(extended_config) return AttrDict(extended_config)
@@ -984,98 +732,6 @@ def wget_supports_compression(config):
except (FileNotFoundError, OSError): except (FileNotFoundError, OSError):
return False return False
def get_code_locations(config: ConfigDict) -> SimpleConfigValueDict:
return {
'PACKAGE_DIR': {
'path': (config['PACKAGE_DIR']).resolve(),
'enabled': True,
'is_valid': (config['PACKAGE_DIR'] / '__main__.py').exists(),
},
'TEMPLATES_DIR': {
'path': (config['TEMPLATES_DIR']).resolve(),
'enabled': True,
'is_valid': (config['TEMPLATES_DIR'] / 'static').exists(),
},
'LIB_DIR': {
'path': (config['LIB_DIR']).resolve(),
'enabled': True,
'is_valid': config['LIB_DIR'].is_dir(),
},
# 'NODE_MODULES_DIR': {
# 'path': ,
# 'enabled': ,
# 'is_valid': (...).exists(),
# },
}
def get_data_locations(config: ConfigDict) -> ConfigValue:
return {
# OLD: migrating to personas
# 'CHROME_USER_DATA_DIR': {
# 'path': os.path.abspath(config['CHROME_USER_DATA_DIR']),
# 'enabled': config['USE_CHROME'] and config['CHROME_USER_DATA_DIR'],
# 'is_valid': False if config['CHROME_USER_DATA_DIR'] is None else (Path(config['CHROME_USER_DATA_DIR']) / 'Default').exists(),
# },
# 'COOKIES_FILE': {
# 'path': os.path.abspath(config['COOKIES_FILE']),
# 'enabled': config['USE_WGET'] and config['COOKIES_FILE'],
# 'is_valid': False if config['COOKIES_FILE'] is None else Path(config['COOKIES_FILE']).exists(),
# },
"OUTPUT_DIR": {
"path": config["OUTPUT_DIR"].resolve(),
"enabled": True,
"is_valid": (config["OUTPUT_DIR"] / SQL_INDEX_FILENAME).exists(),
"is_mount": os.path.ismount(config["OUTPUT_DIR"].resolve()),
},
"CONFIG_FILE": {
"path": config["CONFIG_FILE"].resolve(),
"enabled": True,
"is_valid": config["CONFIG_FILE"].exists(),
},
"SQL_INDEX": {
"path": (config["OUTPUT_DIR"] / SQL_INDEX_FILENAME).resolve(),
"enabled": True,
"is_valid": (config["OUTPUT_DIR"] / SQL_INDEX_FILENAME).exists(),
"is_mount": os.path.ismount((config["OUTPUT_DIR"] / SQL_INDEX_FILENAME).resolve()),
},
"ARCHIVE_DIR": {
"path": config["ARCHIVE_DIR"].resolve(),
"enabled": True,
"is_valid": config["ARCHIVE_DIR"].exists(),
"is_mount": os.path.ismount(config["ARCHIVE_DIR"].resolve()),
},
"SOURCES_DIR": {
"path": config["SOURCES_DIR"].resolve(),
"enabled": True,
"is_valid": config["SOURCES_DIR"].exists(),
},
"PERSONAS_DIR": {
"path": config["PERSONAS_DIR"].resolve(),
"enabled": True,
"is_valid": config["PERSONAS_DIR"].exists(),
},
"LOGS_DIR": {
"path": config["LOGS_DIR"].resolve(),
"enabled": True,
"is_valid": config["LOGS_DIR"].exists(),
},
"CACHE_DIR": {
"path": config["CACHE_DIR"].resolve(),
"enabled": True,
"is_valid": config["CACHE_DIR"].exists(),
},
"CUSTOM_TEMPLATES_DIR": {
"path": config["CUSTOM_TEMPLATES_DIR"] and Path(config["CUSTOM_TEMPLATES_DIR"]).resolve(),
"enabled": bool(config["CUSTOM_TEMPLATES_DIR"]),
"is_valid": config["CUSTOM_TEMPLATES_DIR"] and Path(config["CUSTOM_TEMPLATES_DIR"]).exists(),
},
# managed by bin/docker_entrypoint.sh and python-crontab:
# 'CRONTABS_DIR': {
# 'path': config['CRONTABS_DIR'].resolve(),
# 'enabled': True,
# 'is_valid': config['CRONTABS_DIR'].exists(),
# },
}
def get_dependency_info(config: ConfigDict) -> ConfigValue: def get_dependency_info(config: ConfigDict) -> ConfigValue:
return { return {
@@ -1129,20 +785,6 @@ def get_dependency_info(config: ConfigDict) -> ConfigValue:
'enabled': config['USE_NODE'], 'enabled': config['USE_NODE'],
'is_valid': bool(config['NODE_VERSION']), 'is_valid': bool(config['NODE_VERSION']),
}, },
'SINGLEFILE_BINARY': {
'path': bin_path(config['SINGLEFILE_BINARY']),
'version': config['SINGLEFILE_VERSION'],
'hash': bin_hash(config['SINGLEFILE_BINARY']),
'enabled': config['USE_SINGLEFILE'],
'is_valid': bool(config['SINGLEFILE_VERSION']),
},
'READABILITY_BINARY': {
'path': bin_path(config['READABILITY_BINARY']),
'version': config['READABILITY_VERSION'],
'hash': bin_hash(config['READABILITY_BINARY']),
'enabled': config['USE_READABILITY'],
'is_valid': bool(config['READABILITY_VERSION']),
},
'MERCURY_BINARY': { 'MERCURY_BINARY': {
'path': bin_path(config['MERCURY_BINARY']), 'path': bin_path(config['MERCURY_BINARY']),
'version': config['MERCURY_VERSION'], 'version': config['MERCURY_VERSION'],
@@ -1157,13 +799,27 @@ def get_dependency_info(config: ConfigDict) -> ConfigValue:
'enabled': config['USE_GIT'], 'enabled': config['USE_GIT'],
'is_valid': bool(config['GIT_VERSION']), 'is_valid': bool(config['GIT_VERSION']),
}, },
'YOUTUBEDL_BINARY': { # 'SINGLEFILE_BINARY': {
'path': bin_path(config['YOUTUBEDL_BINARY']), # 'path': bin_path(config['SINGLEFILE_BINARY']),
'version': config['YOUTUBEDL_VERSION'], # 'version': config['SINGLEFILE_VERSION'],
'hash': bin_hash(config['YOUTUBEDL_BINARY']), # 'hash': bin_hash(config['SINGLEFILE_BINARY']),
'enabled': config['USE_YOUTUBEDL'], # 'enabled': config['USE_SINGLEFILE'],
'is_valid': bool(config['YOUTUBEDL_VERSION']), # 'is_valid': bool(config['SINGLEFILE_VERSION']),
}, # },
# 'READABILITY_BINARY': {
# 'path': bin_path(config['READABILITY_BINARY']),
# 'version': config['READABILITY_VERSION'],
# 'hash': bin_hash(config['READABILITY_BINARY']),
# 'enabled': config['USE_READABILITY'],
# 'is_valid': bool(config['READABILITY_VERSION']),
# },
# 'YOUTUBEDL_BINARY': {
# 'path': bin_path(config['YOUTUBEDL_BINARY']),
# 'version': config['YOUTUBEDL_VERSION'],
# 'hash': bin_hash(config['YOUTUBEDL_BINARY']),
# 'enabled': config['USE_YOUTUBEDL'],
# 'is_valid': bool(config['YOUTUBEDL_VERSION']),
# },
# 'CHROME_BINARY': { # 'CHROME_BINARY': {
# 'path': bin_path(config['CHROME_BINARY']), # 'path': bin_path(config['CHROME_BINARY']),
# 'version': config['CHROME_VERSION'], # 'version': config['CHROME_VERSION'],
@@ -1227,10 +883,6 @@ assert TIMEZONE == 'UTC', 'The server timezone should always be set to UTC' # n
os.environ["TZ"] = TIMEZONE # noqa: F821 os.environ["TZ"] = TIMEZONE # noqa: F821
os.umask(0o777 - int(DIR_OUTPUT_PERMISSIONS, base=8)) # noqa: F821 os.umask(0o777 - int(DIR_OUTPUT_PERMISSIONS, base=8)) # noqa: F821
# add ./node_modules/.bin to $PATH so we can use node scripts in extractors
sys.path.append(CONFIG.NODE_BIN_PATH)
########################### Config Validity Checkers ########################### ########################### Config Validity Checkers ###########################
if not CONFIG.USE_COLOR: if not CONFIG.USE_COLOR:
@@ -1256,6 +908,7 @@ def bump_startup_progress_bar():
def setup_django_minimal(): def setup_django_minimal():
sys.path.append(str(archivebox.PACKAGE_DIR)) sys.path.append(str(archivebox.PACKAGE_DIR))
os.environ.setdefault('OUTPUT_DIR', str(archivebox.DATA_DIR))
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings') os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings')
django.setup() django.setup()
@@ -1267,29 +920,18 @@ def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG,
with Progress(transient=True, expand=True, console=CONSOLE) as INITIAL_STARTUP_PROGRESS: with Progress(transient=True, expand=True, console=CONSOLE) as INITIAL_STARTUP_PROGRESS:
INITIAL_STARTUP_PROGRESS_TASK = INITIAL_STARTUP_PROGRESS.add_task("[green]Loading modules...", total=25) INITIAL_STARTUP_PROGRESS_TASK = INITIAL_STARTUP_PROGRESS.add_task("[green]Loading modules...", total=25)
output_dir = out_dir or Path(config['OUTPUT_DIR']) output_dir = out_dir or archivebox.DATA_DIR
assert isinstance(output_dir, Path) and isinstance(config['PACKAGE_DIR'], Path) assert isinstance(output_dir, Path) and isinstance(archivebox.PACKAGE_DIR, Path)
bump_startup_progress_bar() bump_startup_progress_bar()
try: try:
from django.core.management import call_command from django.core.management import call_command
sys.path.append(str(config['PACKAGE_DIR'])) sys.path.append(str(archivebox.PACKAGE_DIR))
os.environ.setdefault('OUTPUT_DIR', str(output_dir)) os.environ.setdefault('OUTPUT_DIR', str(archivebox.DATA_DIR))
assert (config['PACKAGE_DIR'] / 'core' / 'settings.py').exists(), 'settings.py was not found at archivebox/core/settings.py' os.environ.setdefault("ARCHIVEBOX_DATABASE_NAME", ":memory:")
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings') os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings')
# Check to make sure JSON extension is available in our Sqlite3 instance
try:
cursor = sqlite3.connect(':memory:').cursor()
cursor.execute('SELECT JSON(\'{"a": "b"}\')')
except sqlite3.OperationalError as exc:
stderr(f'[X] Your SQLite3 version is missing the required JSON1 extension: {exc}', color='red')
hint([
'Upgrade your Python version or install the extension manually:',
'https://code.djangoproject.com/wiki/JSON1Extension'
])
bump_startup_progress_bar() bump_startup_progress_bar()
@@ -1310,28 +952,16 @@ def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG,
bump_startup_progress_bar() bump_startup_progress_bar()
from django.conf import settings from django.conf import settings
from plugins_sys.config.apps import SHELL_CONFIG
# log startup message to the error log # log startup message to the error log
with open(settings.ERROR_LOG, "a", encoding='utf-8') as f: with open(settings.ERROR_LOG, "a", encoding='utf-8') as f:
command = ' '.join(sys.argv) command = ' '.join(sys.argv)
ts = datetime.now(timezone.utc).strftime('%Y-%m-%d__%H:%M:%S') ts = datetime.now(timezone.utc).strftime('%Y-%m-%d__%H:%M:%S')
f.write(f"\n> {command}; TS={ts} VERSION={config['VERSION']} IN_DOCKER={config['IN_DOCKER']} IS_TTY={config['IS_TTY']}\n") f.write(f"\n> {command}; TS={ts} VERSION={archivebox.VERSION} IN_DOCKER={SHELL_CONFIG.IN_DOCKER} IS_TTY={SHELL_CONFIG.IS_TTY}\n")
if check_db: if check_db:
# Enable WAL mode in sqlite3
from django.db import connection
with connection.cursor() as cursor:
# Set Journal mode to WAL to allow for multiple writers
current_mode = cursor.execute("PRAGMA journal_mode")
if current_mode != 'wal':
cursor.execute("PRAGMA journal_mode=wal;")
# Set max blocking delay for concurrent writes and write sync mode
# https://litestream.io/tips/#busy-timeout
cursor.execute("PRAGMA busy_timeout = 5000;")
cursor.execute("PRAGMA synchronous = NORMAL;")
# Create cache table in DB if needed # Create cache table in DB if needed
try: try:
from django.core.cache import cache from django.core.cache import cache
@@ -1348,9 +978,9 @@ def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG,
for conn in connections.all(): for conn in connections.all():
conn.close_if_unusable_or_obsolete() conn.close_if_unusable_or_obsolete()
sql_index_path = Path(output_dir) / SQL_INDEX_FILENAME sql_index_path = archivebox.CONSTANTS.DATABASE_FILE
assert sql_index_path.exists(), ( assert sql_index_path.exists(), (
f'No database file {SQL_INDEX_FILENAME} found in: {config["OUTPUT_DIR"]} (Are you in an ArchiveBox collection directory?)') f'No database file {sql_index_path} found in: {archivebox.DATA_DIR} (Are you in an ArchiveBox collection directory?)')
bump_startup_progress_bar() bump_startup_progress_bar()
@@ -1363,7 +993,7 @@ def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG,
logfire.configure() logfire.configure()
logfire.instrument_django(is_sql_commentor_enabled=True) logfire.instrument_django(is_sql_commentor_enabled=True)
logfire.info(f'Started ArchiveBox v{CONFIG.VERSION}', argv=sys.argv) logfire.info(f'Started ArchiveBox v{archivebox.VERSION}', argv=sys.argv)
except KeyboardInterrupt: except KeyboardInterrupt:
raise SystemExit(2) raise SystemExit(2)
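The load_config_file()/write_config_file() hunks above switch from rebuilding the config path out of OUTPUT_DIR on every call to reading it off the new constants object. A condensed, illustrative sketch of that pattern (the helper name here is invented, not part of the commit):

from configparser import ConfigParser
import archivebox

def read_archivebox_conf() -> dict | None:
    # <DATA_DIR>/ArchiveBox.conf, resolved once at import time in constants.py
    config_path = archivebox.CONSTANTS.CONFIG_FILE
    if not config_path.exists():
        return None
    parser = ConfigParser()
    parser.optionxform = str   # keep option names case-sensitive, as the real loader does
    parser.read(config_path)
    return {section: dict(parser[section]) for section in parser.sections()}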

archivebox/constants.py (new file, 249 lines)

@@ -0,0 +1,249 @@
__package__ = 'archivebox'
import os
from types import MappingProxyType
from typing import Set, Dict, NamedTuple, Tuple
from pathlib import Path
from benedict import benedict
import archivebox
from .misc.logging import DEFAULT_CLI_COLORS
###################### Config ##########################
class ConstantsConfig(NamedTuple):
VERSION: str = archivebox.__version__
DEFAULT_CLI_COLORS: Dict[str, str] = DEFAULT_CLI_COLORS
DISABLED_CLI_COLORS: Dict[str, str] = benedict({k: '' for k in DEFAULT_CLI_COLORS})
PACKAGE_DIR: Path = archivebox.PACKAGE_DIR
PACKAGE_DIR_NAME: str = archivebox.PACKAGE_DIR.name
TEMPLATES_DIR_NAME: str = 'templates'
TEMPLATES_DIR: Path = archivebox.PACKAGE_DIR / TEMPLATES_DIR_NAME
STATIC_DIR: Path = TEMPLATES_DIR / 'static'
USER_PLUGINS_DIR_NAME: str = 'user_plugins'
CUSTOM_TEMPLATES_DIR_NAME: str = 'user_templates'
DATA_DIR: Path = archivebox.DATA_DIR
ARCHIVE_DIR_NAME: str = 'archive'
SOURCES_DIR_NAME: str = 'sources'
PERSONAS_DIR_NAME: str = 'personas'
CRONTABS_DIR_NAME: str = 'crontabs'
CACHE_DIR_NAME: str = 'cache'
LOGS_DIR_NAME: str = 'logs'
LIB_DIR_NAME: str = 'lib'
TMP_DIR_NAME: str = 'tmp'
OUTPUT_DIR: Path = archivebox.DATA_DIR
ARCHIVE_DIR: Path = archivebox.DATA_DIR / ARCHIVE_DIR_NAME
SOURCES_DIR: Path = archivebox.DATA_DIR / SOURCES_DIR_NAME
PERSONAS_DIR: Path = archivebox.DATA_DIR / PERSONAS_DIR_NAME
CACHE_DIR: Path = archivebox.DATA_DIR / CACHE_DIR_NAME
LOGS_DIR: Path = archivebox.DATA_DIR / LOGS_DIR_NAME
LIB_DIR: Path = archivebox.DATA_DIR / LIB_DIR_NAME
TMP_DIR: Path = archivebox.DATA_DIR / TMP_DIR_NAME
CUSTOM_TEMPLATES_DIR: Path = archivebox.DATA_DIR / CUSTOM_TEMPLATES_DIR_NAME
USER_PLUGINS_DIR: Path = archivebox.DATA_DIR / USER_PLUGINS_DIR_NAME
LIB_PIP_DIR: Path = LIB_DIR / 'pip'
LIB_NPM_DIR: Path = LIB_DIR / 'npm'
LIB_BROWSERS_DIR: Path = LIB_DIR / 'browsers'
LIB_BIN_DIR: Path = LIB_DIR / 'bin'
BIN_DIR: Path = LIB_BIN_DIR
CONFIG_FILENAME: str = 'ArchiveBox.conf'
SQL_INDEX_FILENAME: str = 'index.sqlite3'
CONFIG_FILE: Path = archivebox.DATA_DIR / CONFIG_FILENAME
DATABASE_FILE: Path = archivebox.DATA_DIR / SQL_INDEX_FILENAME
QUEUE_DATABASE_FILE: Path = archivebox.DATA_DIR / SQL_INDEX_FILENAME.replace('index.', 'queue.')
JSON_INDEX_FILENAME: str = 'index.json'
HTML_INDEX_FILENAME: str = 'index.html'
ROBOTS_TXT_FILENAME: str = 'robots.txt'
FAVICON_FILENAME: str = 'favicon.ico'
STATICFILE_EXTENSIONS: frozenset[str] = frozenset((
# 99.999% of the time, URLs ending in these extensions are static files
# that can be downloaded as-is, not html pages that need to be rendered
'gif', 'jpeg', 'jpg', 'png', 'tif', 'tiff', 'wbmp', 'ico', 'jng', 'bmp',
'svg', 'svgz', 'webp', 'ps', 'eps', 'ai',
'mp3', 'mp4', 'm4a', 'mpeg', 'mpg', 'mkv', 'mov', 'webm', 'm4v',
'flv', 'wmv', 'avi', 'ogg', 'ts', 'm3u8',
'pdf', 'txt', 'rtf', 'rtfd', 'doc', 'docx', 'ppt', 'pptx', 'xls', 'xlsx',
'atom', 'rss', 'css', 'js', 'json',
'dmg', 'iso', 'img',
'rar', 'war', 'hqx', 'zip', 'gz', 'bz2', '7z',
# Less common extensions to consider adding later
# jar, swf, bin, com, exe, dll, deb
# ear, hqx, eot, wmlc, kml, kmz, cco, jardiff, jnlp, run, msi, msp, msm,
# pl pm, prc pdb, rar, rpm, sea, sit, tcl tk, der, pem, crt, xpi, xspf,
# ra, mng, asx, asf, 3gpp, 3gp, mid, midi, kar, jad, wml, htc, mml
# These are always treated as pages, not as static files, never add them:
# html, htm, shtml, xhtml, xml, aspx, php, cgi
))
INGORED_PATHS: frozenset[str] = frozenset((
".git",
".svn",
".DS_Store",
".gitignore",
"lost+found",
".DS_Store",
".env",
"Dockerfile",
))
PIP_RELATED_NAMES: frozenset[str] = frozenset((
".venv",
"venv",
"virtualenv",
".virtualenv",
))
NPM_RELATED_NAMES: frozenset[str] = frozenset((
"node_modules",
"package.json",
"package-lock.json",
"yarn.lock",
))
DATA_DIR_NAMES: frozenset[str] = frozenset((
ARCHIVE_DIR_NAME,
SOURCES_DIR_NAME,
LOGS_DIR_NAME,
CACHE_DIR_NAME,
LIB_DIR_NAME,
PERSONAS_DIR_NAME,
CUSTOM_TEMPLATES_DIR_NAME,
USER_PLUGINS_DIR_NAME,
))
DATA_DIRS: frozenset[Path] = frozenset(archivebox.DATA_DIR / dirname for dirname in DATA_DIR_NAMES)
DATA_FILE_NAMES: frozenset[str] = frozenset((
CONFIG_FILENAME,
SQL_INDEX_FILENAME,
f"{SQL_INDEX_FILENAME}-wal",
f"{SQL_INDEX_FILENAME}-shm",
"queue.sqlite3",
"queue.sqlite3-wal",
"queue.sqlite3-shm",
"search.sqlite3",
JSON_INDEX_FILENAME,
HTML_INDEX_FILENAME,
ROBOTS_TXT_FILENAME,
FAVICON_FILENAME,
CONFIG_FILENAME,
f"{CONFIG_FILENAME}.bak",
"static_index.json",
))
# When initializing archivebox in a new directory, we check to make sure the dir is
# actually empty so that we dont clobber someone's home directory or desktop by accident.
# These files are exceptions to the is_empty check when we're trying to init a new dir,
# as they could be from a previous archivebox version, system artifacts, dependencies, etc.
ALLOWED_IN_OUTPUT_DIR: frozenset[str] = frozenset((
*INGORED_PATHS,
*PIP_RELATED_NAMES,
*NPM_RELATED_NAMES,
*DATA_DIR_NAMES,
*DATA_FILE_NAMES,
"static", # created by old static exports <v0.6.0
"sonic", # created by docker bind mount
))
CODE_LOCATIONS = MappingProxyType(benedict({
'PACKAGE_DIR': {
'path': (archivebox.PACKAGE_DIR).resolve(),
'enabled': True,
'is_valid': (archivebox.PACKAGE_DIR / '__main__.py').exists(),
},
'LIB_DIR': {
'path': LIB_DIR.resolve(),
'enabled': True,
'is_valid': LIB_DIR.is_dir(),
},
'RUNTIME_CONFIG': {
'path': TMP_DIR.resolve(),
'enabled': True,
'is_valid': TMP_DIR.is_dir(),
},
'TEMPLATES_DIR': {
'path': TEMPLATES_DIR.resolve(),
'enabled': True,
'is_valid': STATIC_DIR.exists(),
},
'CUSTOM_TEMPLATES_DIR': {
'path': CUSTOM_TEMPLATES_DIR.resolve(),
'enabled': True,
'is_valid': CUSTOM_TEMPLATES_DIR.is_dir(),
},
}))
DATA_LOCATIONS = MappingProxyType(benedict({
"OUTPUT_DIR": {
"path": archivebox.DATA_DIR.resolve(),
"enabled": True,
"is_valid": DATABASE_FILE.exists(),
"is_mount": os.path.ismount(archivebox.DATA_DIR.resolve()),
},
"CONFIG_FILE": {
"path": CONFIG_FILE.resolve(),
"enabled": True,
"is_valid": CONFIG_FILE.exists(),
},
"SQL_INDEX": {
"path": DATABASE_FILE.resolve(),
"enabled": True,
"is_valid": DATABASE_FILE.exists(),
"is_mount": os.path.ismount(DATABASE_FILE.resolve()),
},
"QUEUE_DATABASE": {
"path": QUEUE_DATABASE_FILE.resolve(),
"enabled": True,
"is_valid": QUEUE_DATABASE_FILE.exists(),
"is_mount": os.path.ismount(QUEUE_DATABASE_FILE.resolve()),
},
"ARCHIVE_DIR": {
"path": ARCHIVE_DIR.resolve(),
"enabled": True,
"is_valid": ARCHIVE_DIR.exists(),
"is_mount": os.path.ismount(ARCHIVE_DIR.resolve()),
},
"SOURCES_DIR": {
"path": SOURCES_DIR.resolve(),
"enabled": True,
"is_valid": SOURCES_DIR.exists(),
},
"PERSONAS_DIR": {
"path": PERSONAS_DIR.resolve(),
"enabled": PERSONAS_DIR.exists(),
"is_valid": PERSONAS_DIR.exists(),
},
"LOGS_DIR": {
"path": LOGS_DIR.resolve(),
"enabled": True,
"is_valid": LOGS_DIR.is_dir(),
},
"CACHE_DIR": {
"path": CACHE_DIR.resolve(),
"enabled": True,
"is_valid": CACHE_DIR.is_dir(),
},
}))
def items(self):
return self._asdict().items()
def keys(self):
return self._asdict().keys()
def values(self):
return self._asdict().values()
CONSTANTS = ConstantsConfig()
CONSTANTS_CONFIG = CONSTANTS
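As the comment above ALLOWED_IN_OUTPUT_DIR explains, these names exist so the "is this directory safe to init into?" check tolerates leftovers from earlier installs. A tiny illustrative helper (invented for this note, not part of the commit) showing how such a frozenset is typically consumed:

from pathlib import Path
from archivebox.constants import CONSTANTS

def dir_is_effectively_empty(path: Path) -> bool:
    """True if every entry is something a prior ArchiveBox run or the system could have left behind."""
    return all(entry.name in CONSTANTS.ALLOWED_IN_OUTPUT_DIR for entry in path.iterdir())

# e.g. guard before initializing a collection in the current directory:
# assert dir_is_effectively_empty(Path.cwd()), 'refusing to init into a non-empty dir'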


@@ -2,7 +2,6 @@ __package__ = 'archivebox.core'
import os import os
import threading
from pathlib import Path from pathlib import Path
from django.contrib import admin, messages from django.contrib import admin, messages
@@ -19,6 +18,7 @@ from django.template import Template, RequestContext
from django.conf import settings from django.conf import settings
from django import forms from django import forms
import archivebox
from signal_webhooks.admin import WebhookAdmin from signal_webhooks.admin import WebhookAdmin
from signal_webhooks.utils import get_webhook_model from signal_webhooks.utils import get_webhook_model
@@ -34,13 +34,13 @@ from queues.tasks import bg_archive_links, bg_archive_link, bg_add
from index.html import snapshot_icons from index.html import snapshot_icons
from logging_util import printable_filesize from logging_util import printable_filesize
from main import add, remove from main import remove
from extractors import archive_links from extractors import archive_links
CONFIG = settings.CONFIG CONFIG = settings.CONFIG
GLOBAL_CONTEXT = {'VERSION': CONFIG.VERSION, 'VERSIONS_AVAILABLE': CONFIG.VERSIONS_AVAILABLE, 'CAN_UPGRADE': CONFIG.CAN_UPGRADE} GLOBAL_CONTEXT = {'VERSION': archivebox.VERSION, 'VERSIONS_AVAILABLE': [], 'CAN_UPGRADE': False}
# Admin URLs # Admin URLs
# /admin/ # /admin/


@@ -2,36 +2,27 @@ __package__ = 'archivebox.core'
import os import os
import sys import sys
import re
import logging
import inspect import inspect
import tempfile
import archivebox
from typing import Dict from typing import Dict
from pathlib import Path from pathlib import Path
import django from benedict import benedict
from django.utils.crypto import get_random_string from django.utils.crypto import get_random_string
import archivebox
from ..config import CONFIG from ..config import CONFIG
from ..config_stubs import AttrDict
assert isinstance(CONFIG, AttrDict)
IS_MIGRATING = 'makemigrations' in sys.argv[:3] or 'migrate' in sys.argv[:3] IS_MIGRATING = 'makemigrations' in sys.argv[:3] or 'migrate' in sys.argv[:3]
IS_TESTING = 'test' in sys.argv[:3] or 'PYTEST_CURRENT_TEST' in os.environ IS_TESTING = 'test' in sys.argv[:3] or 'PYTEST_CURRENT_TEST' in os.environ
IS_SHELL = 'shell' in sys.argv[:3] or 'shell_plus' in sys.argv[:3] IS_SHELL = 'shell' in sys.argv[:3] or 'shell_plus' in sys.argv[:3]
PACKAGE_DIR = archivebox.PACKAGE_DIR
assert PACKAGE_DIR == CONFIG.PACKAGE_DIR
DATA_DIR = archivebox.DATA_DIR
assert DATA_DIR == CONFIG.OUTPUT_DIR
ARCHIVE_DIR = DATA_DIR / 'archive'
assert ARCHIVE_DIR == CONFIG.ARCHIVE_DIR
VERSION = archivebox.__version__ VERSION = archivebox.__version__
PACKAGE_DIR = archivebox.PACKAGE_DIR
DATA_DIR = archivebox.DATA_DIR
ARCHIVE_DIR = archivebox.DATA_DIR / 'archive'
################################################################################ ################################################################################
### ArchiveBox Plugin Settings ### ArchiveBox Plugin Settings
@@ -39,17 +30,16 @@ VERSION = archivebox.__version__
def find_plugins_in_dir(plugins_dir: Path, prefix: str) -> Dict[str, Path]: def find_plugins_in_dir(plugins_dir: Path, prefix: str) -> Dict[str, Path]:
"""{"plugins_pkg.pip": "/app/archivebox/plugins_pkg/pip", "user_plugins.other": "/data/user_plugins/other",...}"""
return { return {
f"{prefix}.{plugin_entrypoint.parent.name}": plugin_entrypoint.parent f"{prefix}.{plugin_entrypoint.parent.name}": plugin_entrypoint.parent
for plugin_entrypoint in sorted(plugins_dir.glob("*/apps.py")) # key=get_plugin_order # Someday enforcing plugin import order may be required, but right now it's not needed for plugin_entrypoint in sorted(plugins_dir.glob("*/apps.py")) # key=get_plugin_order # Someday enforcing plugin import order may be required, but right now it's not needed
} } # "plugins_pkg.pip": "/app/archivebox/plugins_pkg/pip"
PLUGIN_DIRS = { PLUGIN_DIRS = {
'plugins_sys': PACKAGE_DIR / 'plugins_sys', 'plugins_sys': PACKAGE_DIR / 'plugins_sys',
'plugins_pkg': PACKAGE_DIR / 'plugins_pkg', 'plugins_pkg': PACKAGE_DIR / 'plugins_pkg',
'plugins_auth': PACKAGE_DIR / 'plugins_auth', 'plugins_auth': PACKAGE_DIR / 'plugins_auth',
'plugins_search': PACKAGE_DIR / 'plugins_search', 'plugins_search': PACKAGE_DIR / 'plugins_search',
'plugins_extractor': PACKAGE_DIR / 'plugins_extractor', 'plugins_extractor': PACKAGE_DIR / 'plugins_extractor',
'user_plugins': DATA_DIR / 'user_plugins', 'user_plugins': DATA_DIR / 'user_plugins',
} }
@@ -59,17 +49,17 @@ for plugin_prefix, plugin_dir in PLUGIN_DIRS.items():
### Plugins Globals (filled by plugin_type.pluginname.apps.PluginName.register() after Django startup) ### Plugins Globals (filled by plugin_type.pluginname.apps.PluginName.register() after Django startup)
PLUGINS = AttrDict({}) PLUGINS = benedict({})
HOOKS = AttrDict({}) HOOKS = benedict({})
# Created later by Hook.register(settings) when each Plugin.register(settings) is called # Created later by Plugin.register(settings) -> Hook.register(settings):
# CONFIGS = AttrDict({}) # CONFIGS = benedict({})
# BINPROVIDERS = AttrDict({}) # BINPROVIDERS = benedict({})
# BINARIES = AttrDict({}) # BINARIES = benedict({})
# EXTRACTORS = AttrDict({}) # EXTRACTORS = benedict({})
# REPLAYERS = AttrDict({}) # REPLAYERS = benedict({})
# CHECKS = AttrDict({}) # CHECKS = benedict({})
# ADMINDATAVIEWS = AttrDict({}) # ADMINDATAVIEWS = benedict({})
################################################################################ ################################################################################
@@ -113,7 +103,7 @@ INSTALLED_APPS = [
'api', # Django-Ninja-based Rest API interfaces, config, APIToken model, etc. 'api', # Django-Ninja-based Rest API interfaces, config, APIToken model, etc.
# ArchiveBox plugins # ArchiveBox plugins
*INSTALLED_PLUGINS.keys(), # all plugin django-apps found in archivebox/*_plugins and data/user_plugins, *INSTALLED_PLUGINS.keys(), # all plugin django-apps found in archivebox/plugins_* and data/user_plugins,
# plugin.register(settings) is called at import of each plugin (in the order they are listed here), then plugin.ready() is called at AppConfig.ready() time # plugin.register(settings) is called at import of each plugin (in the order they are listed here), then plugin.ready() is called at AppConfig.ready() time
# 3rd-party apps from PyPI that need to be loaded last # 3rd-party apps from PyPI that need to be loaded last
@@ -164,7 +154,7 @@ if LDAP_CONFIG.LDAP_ENABLED:
################################################################################ ################################################################################
STATIC_URL = '/static/' STATIC_URL = '/static/'
TEMPLATES_DIR_NAME = 'templates'
STATICFILES_DIRS = [ STATICFILES_DIRS = [
*([str(CONFIG.CUSTOM_TEMPLATES_DIR / 'static')] if CONFIG.CUSTOM_TEMPLATES_DIR else []), *([str(CONFIG.CUSTOM_TEMPLATES_DIR / 'static')] if CONFIG.CUSTOM_TEMPLATES_DIR else []),
*[ *[
@@ -172,7 +162,7 @@ STATICFILES_DIRS = [
for plugin_dir in PLUGIN_DIRS.values() for plugin_dir in PLUGIN_DIRS.values()
if (plugin_dir / 'static').is_dir() if (plugin_dir / 'static').is_dir()
], ],
str(PACKAGE_DIR / CONFIG.TEMPLATES_DIR_NAME / 'static'), str(PACKAGE_DIR / TEMPLATES_DIR_NAME / 'static'),
] ]
TEMPLATE_DIRS = [ TEMPLATE_DIRS = [
@@ -182,9 +172,9 @@ TEMPLATE_DIRS = [
for plugin_dir in PLUGIN_DIRS.values() for plugin_dir in PLUGIN_DIRS.values()
if (plugin_dir / 'templates').is_dir() if (plugin_dir / 'templates').is_dir()
], ],
str(PACKAGE_DIR / CONFIG.TEMPLATES_DIR_NAME / 'core'), str(PACKAGE_DIR / TEMPLATES_DIR_NAME / 'core'),
str(PACKAGE_DIR / CONFIG.TEMPLATES_DIR_NAME / 'admin'), str(PACKAGE_DIR / TEMPLATES_DIR_NAME / 'admin'),
str(PACKAGE_DIR / CONFIG.TEMPLATES_DIR_NAME), str(PACKAGE_DIR / TEMPLATES_DIR_NAME),
] ]
TEMPLATES = [ TEMPLATES = [
@@ -208,13 +198,14 @@ TEMPLATES = [
### External Service Settings ### External Service Settings
################################################################################ ################################################################################
from ..plugins_sys.config.constants import CONSTANTS
CACHE_DB_FILENAME = 'cache.sqlite3' # CACHE_DB_FILENAME = 'cache.sqlite3'
CACHE_DB_PATH = CONFIG.CACHE_DIR / CACHE_DB_FILENAME # CACHE_DB_PATH = CONSTANTS.CACHE_DIR / CACHE_DB_FILENAME
CACHE_DB_TABLE = 'django_cache' # CACHE_DB_TABLE = 'django_cache'
DATABASE_FILE = DATA_DIR / CONFIG.SQL_INDEX_FILENAME DATABASE_FILE = DATA_DIR / CONSTANTS.SQL_INDEX_FILENAME
DATABASE_NAME = os.environ.get("ARCHIVEBOX_DATABASE_NAME", str(DATABASE_FILE)) DATABASE_NAME = os.environ.get("ARCHIVEBOX_DATABASE_NAME", str(CONSTANTS.DATABASE_FILE))
QUEUE_DATABASE_NAME = DATABASE_NAME.replace('index.sqlite3', 'queue.sqlite3') QUEUE_DATABASE_NAME = DATABASE_NAME.replace('index.sqlite3', 'queue.sqlite3')
@@ -222,6 +213,7 @@ SQLITE_CONNECTION_OPTIONS = {
"TIME_ZONE": CONFIG.TIMEZONE, "TIME_ZONE": CONFIG.TIMEZONE,
"OPTIONS": { "OPTIONS": {
# https://gcollazo.com/optimal-sqlite-settings-for-django/ # https://gcollazo.com/optimal-sqlite-settings-for-django/
# # https://litestream.io/tips/#busy-timeout
"timeout": 5, "timeout": 5,
"check_same_thread": False, "check_same_thread": False,
"transaction_mode": "IMMEDIATE", "transaction_mode": "IMMEDIATE",
@@ -345,7 +337,7 @@ STORAGES = {
"BACKEND": "django.core.files.storage.FileSystemStorage", "BACKEND": "django.core.files.storage.FileSystemStorage",
"OPTIONS": { "OPTIONS": {
"base_url": "/archive/", "base_url": "/archive/",
"location": CONFIG.ARCHIVE_DIR, "location": ARCHIVE_DIR,
}, },
}, },
# "personas": { # "personas": {


@@ -14,7 +14,6 @@ from ..config import (
SAVE_ALLOWLIST_PTN, SAVE_ALLOWLIST_PTN,
SAVE_DENYLIST_PTN, SAVE_DENYLIST_PTN,
) )
from ..core.settings import ERROR_LOG
from ..index.schema import ArchiveResult, Link from ..index.schema import ArchiveResult, Link
from ..index.sql import write_link_to_sql_index from ..index.sql import write_link_to_sql_index
from ..index import ( from ..index import (
@@ -109,6 +108,8 @@ def ignore_methods(to_ignore: List[str]) -> Iterable[str]:
def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None, created_by_id: int | None=None) -> Link: def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None, created_by_id: int | None=None) -> Link:
"""download the DOM, PDF, and a screenshot into a folder named after the link's timestamp""" """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""
from django.conf import settings
from ..search import write_search_index from ..search import write_search_index
# TODO: Remove when the input is changed to be a snapshot. Suboptimal approach. # TODO: Remove when the input is changed to be a snapshot. Suboptimal approach.
@@ -169,7 +170,7 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
stats['skipped'] += 1 stats['skipped'] += 1
except Exception as e: except Exception as e:
# https://github.com/ArchiveBox/ArchiveBox/issues/984#issuecomment-1150541627 # https://github.com/ArchiveBox/ArchiveBox/issues/984#issuecomment-1150541627
with open(ERROR_LOG, "a", encoding='utf-8') as f: with open(settings.ERROR_LOG, "a", encoding='utf-8') as f:
command = ' '.join(sys.argv) command = ' '.join(sys.argv)
ts = datetime.now(timezone.utc).strftime('%Y-%m-%d__%H:%M:%S') ts = datetime.now(timezone.utc).strftime('%Y-%m-%d__%H:%M:%S')
f.write(("\n" + 'Exception in archive_methods.save_{}(Link(url={})) command={}; ts={}'.format( f.write(("\n" + 'Exception in archive_methods.save_{}(Link(url={})) command={}; ts={}'.format(


@@ -1,5 +1,7 @@
__package__ = 'archivebox.extractors' __package__ = 'archivebox.extractors'
import archivebox
from html.parser import HTMLParser from html.parser import HTMLParser
import io import io
from pathlib import Path from pathlib import Path
@@ -8,7 +10,6 @@ from typing import Optional
from ..config import ( from ..config import (
SAVE_HTMLTOTEXT, SAVE_HTMLTOTEXT,
TIMEOUT, TIMEOUT,
VERSION,
) )
from ..index.schema import Link, ArchiveResult, ArchiveError from ..index.schema import Link, ArchiveResult, ArchiveError
from ..logging_util import TimedProgress from ..logging_util import TimedProgress
@ -153,7 +154,7 @@ def save_htmltotext(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
return ArchiveResult( return ArchiveResult(
cmd=cmd, cmd=cmd,
pwd=str(out_dir), pwd=str(out_dir),
cmd_version=VERSION, cmd_version=archivebox.__version__,
output=output, output=output,
status=status, status=status,
index_texts=[extracted_text] if extracted_text else [], index_texts=[extracted_text] if extracted_text else [],
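
This extractor hunk swaps the VERSION constant from ..config for the package-level version attribute. A minimal sketch of the new access pattern (the ArchiveResult field names are copied from the hunk; the helper itself is illustrative):

import archivebox

def result_metadata(cmd: list, out_dir, output: str, status: str) -> dict:
    return {
        'cmd': cmd,
        'pwd': str(out_dir),
        'cmd_version': archivebox.__version__,           # was VERSION imported from ..config
        'output': output,
        'status': status,
    }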


@ -8,17 +8,7 @@ import json
from ..index.schema import Link, ArchiveResult, ArchiveError from ..index.schema import Link, ArchiveResult, ArchiveError
from ..system import run, atomic_write from ..system import run, atomic_write
from ..util import ( from ..util import enforce_types, is_static_file
enforce_types,
is_static_file,
)
from ..config import (
TIMEOUT,
CURL_BINARY,
SAVE_READABILITY,
DEPENDENCIES,
READABILITY_VERSION,
)
from ..logging_util import TimedProgress from ..logging_util import TimedProgress
from .title import get_html from .title import get_html
@ -31,22 +21,29 @@ def get_embed_path(archiveresult=None):
@enforce_types @enforce_types
def should_save_readability(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool: def should_save_readability(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
from plugins_extractor.readability.apps import READABILITY_CONFIG
if is_static_file(link.url): if is_static_file(link.url):
return False return False
out_dir = out_dir or Path(link.link_dir) output_subdir = (Path(out_dir or link.link_dir) / get_output_path())
if not overwrite and (out_dir / get_output_path()).exists(): if not overwrite and output_subdir.exists():
return False return False
return SAVE_READABILITY return READABILITY_CONFIG.SAVE_READABILITY
@enforce_types @enforce_types
def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult: def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=0) -> ArchiveResult:
"""download reader friendly version using @mozilla/readability""" """download reader friendly version using @mozilla/readability"""
from plugins_extractor.readability.apps import READABILITY_CONFIG, READABILITY_BINARY
READABILITY_BIN = READABILITY_BINARY.load()
assert READABILITY_BIN.abspath and READABILITY_BIN.version
out_dir = Path(out_dir or link.link_dir) timeout = timeout or READABILITY_CONFIG.READABILITY_TIMEOUT
output_folder = out_dir.absolute() / get_output_path() output_subdir = Path(out_dir or link.link_dir).absolute() / get_output_path()
output = get_output_path() output = get_output_path()
# Readability Docs: https://github.com/mozilla/readability # Readability Docs: https://github.com/mozilla/readability
@ -54,13 +51,14 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
status = 'succeeded' status = 'succeeded'
# fake command to show the user so they have something to try debugging if get_html fails # fake command to show the user so they have something to try debugging if get_html fails
cmd = [ cmd = [
CURL_BINARY, str(READABILITY_BIN.abspath),
link.url '{dom,singlefile}.html',
link.url,
] ]
readability_content = None readability_content = None
timer = TimedProgress(timeout, prefix=' ') timer = TimedProgress(timeout, prefix=' ')
try: try:
document = get_html(link, out_dir) document = get_html(link, Path(out_dir or link.link_dir))
temp_doc = NamedTemporaryFile(delete=False) temp_doc = NamedTemporaryFile(delete=False)
temp_doc.write(document.encode("utf-8")) temp_doc.write(document.encode("utf-8"))
temp_doc.close() temp_doc.close()
@ -69,26 +67,26 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
raise ArchiveError('Readability could not find HTML to parse for article text') raise ArchiveError('Readability could not find HTML to parse for article text')
cmd = [ cmd = [
DEPENDENCIES['READABILITY_BINARY']['path'], str(READABILITY_BIN.abspath),
temp_doc.name, temp_doc.name,
link.url, link.url,
] ]
result = run(cmd, cwd=out_dir, timeout=timeout) result = run(cmd, cwd=out_dir, timeout=timeout, text=True)
try: try:
result_json = json.loads(result.stdout) result_json = json.loads(result.stdout)
assert result_json and 'content' in result_json, 'Readability output is not valid JSON' assert result_json and 'content' in result_json, 'Readability output is not valid JSON'
except json.JSONDecodeError: except json.JSONDecodeError:
raise ArchiveError('Readability was not able to archive the page (invalid JSON)', result.stdout + result.stderr) raise ArchiveError('Readability was not able to archive the page (invalid JSON)', result.stdout + result.stderr)
output_folder.mkdir(exist_ok=True) output_subdir.mkdir(exist_ok=True)
readability_content = result_json.pop("textContent") readability_content = result_json.pop("textContent")
atomic_write(str(output_folder / "content.html"), result_json.pop("content")) atomic_write(str(output_subdir / "content.html"), result_json.pop("content"))
atomic_write(str(output_folder / "content.txt"), readability_content) atomic_write(str(output_subdir / "content.txt"), readability_content)
atomic_write(str(output_folder / "article.json"), result_json) atomic_write(str(output_subdir / "article.json"), result_json)
output_tail = [ output_tail = [
line.strip() line.strip()
for line in (result.stdout + result.stderr).decode().rsplit('\n', 5)[-5:] for line in (result.stdout + result.stderr).rsplit('\n', 5)[-5:]
if line.strip() if line.strip()
] ]
hints = ( hints = (
@ -111,7 +109,7 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
return ArchiveResult( return ArchiveResult(
cmd=cmd, cmd=cmd,
pwd=str(out_dir), pwd=str(out_dir),
cmd_version=READABILITY_VERSION, cmd_version=str(READABILITY_BIN.version),
output=output, output=output,
status=status, status=status,
index_texts=[readability_content] if readability_content else [], index_texts=[readability_content] if readability_content else [],
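
After this hunk the readability extractor resolves its config and binary through the plugin system at call time rather than via module-level ..config imports. A hedged sketch of that lookup; the plugin module path, READABILITY_CONFIG, and READABILITY_BINARY come from the diff, while the helper name and the hard-coded 'readability' output subdirectory are assumptions:

from pathlib import Path
from typing import Optional

def load_readability_runtime(out_dir: Optional[str], link_dir: str, timeout: int = 0):
    from plugins_extractor.readability.apps import READABILITY_CONFIG, READABILITY_BINARY

    readability_bin = READABILITY_BINARY.load()                   # pydantic_pkgr Binary: resolves abspath + version
    assert readability_bin.abspath and readability_bin.version

    timeout = timeout or READABILITY_CONFIG.READABILITY_TIMEOUT   # replaces the TIMEOUT default argument
    output_subdir = Path(out_dir or link_dir).absolute() / 'readability'
    return readability_bin, timeout, output_subdir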


@ -11,20 +11,19 @@ from contextlib import contextmanager
from urllib.parse import urlparse from urllib.parse import urlparse
from django.db.models import QuerySet, Q from django.db.models import QuerySet, Q
import archivebox
from ..util import ( from ..util import (
scheme, scheme,
enforce_types, enforce_types,
ExtendedEncoder, ExtendedEncoder,
) )
from ..misc.logging import stderr
from ..config import ( from ..config import (
ARCHIVE_DIR_NAME,
SQL_INDEX_FILENAME,
JSON_INDEX_FILENAME,
OUTPUT_DIR,
TIMEOUT, TIMEOUT,
URL_DENYLIST_PTN, URL_DENYLIST_PTN,
URL_ALLOWLIST_PTN, URL_ALLOWLIST_PTN,
stderr,
OUTPUT_PERMISSIONS OUTPUT_PERMISSIONS
) )
from ..logging_util import ( from ..logging_util import (
@ -224,28 +223,28 @@ def timed_index_update(out_path: Path):
@enforce_types @enforce_types
def write_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR, created_by_id: int | None=None) -> None: def write_main_index(links: List[Link], out_dir: Path=archivebox.DATA_DIR, created_by_id: int | None=None) -> None:
"""Writes links to sqlite3 file for a given list of links""" """Writes links to sqlite3 file for a given list of links"""
log_indexing_process_started(len(links)) log_indexing_process_started(len(links))
try: try:
with timed_index_update(out_dir / SQL_INDEX_FILENAME): with timed_index_update(archivebox.CONSTANTS.DATABASE_FILE):
write_sql_main_index(links, out_dir=out_dir, created_by_id=created_by_id) write_sql_main_index(links, out_dir=out_dir, created_by_id=created_by_id)
os.chmod(out_dir / SQL_INDEX_FILENAME, int(OUTPUT_PERMISSIONS, base=8)) # set here because we don't write it with atomic writes os.chmod(archivebox.CONSTANTS.DATABASE_FILE, int(OUTPUT_PERMISSIONS, base=8)) # set here because we don't write it with atomic writes
except (KeyboardInterrupt, SystemExit): except (KeyboardInterrupt, SystemExit):
stderr('[!] Warning: Still writing index to disk...', color='lightyellow') stderr('[!] Warning: Still writing index to disk...', color='lightyellow')
stderr(' Run archivebox init to fix any inconsistencies from an ungraceful exit.') stderr(' Run archivebox init to fix any inconsistencies from an ungraceful exit.')
with timed_index_update(out_dir / SQL_INDEX_FILENAME): with timed_index_update(archivebox.CONSTANTS.DATABASE_FILE):
write_sql_main_index(links, out_dir=out_dir, created_by_id=created_by_id) write_sql_main_index(links, out_dir=out_dir, created_by_id=created_by_id)
os.chmod(out_dir / SQL_INDEX_FILENAME, int(OUTPUT_PERMISSIONS, base=8)) # set here because we don't write it with atomic writes os.chmod(archivebox.CONSTANTS.DATABASE_FILE, int(OUTPUT_PERMISSIONS, base=8)) # set here because we don't write it with atomic writes
raise SystemExit(0) raise SystemExit(0)
log_indexing_process_finished() log_indexing_process_finished()
@enforce_types @enforce_types
def load_main_index(out_dir: Path=OUTPUT_DIR, warn: bool=True) -> List[Link]: def load_main_index(out_dir: Path=archivebox.DATA_DIR, warn: bool=True) -> List[Link]:
"""parse and load existing index with any new links from import_path merged in""" """parse and load existing index with any new links from import_path merged in"""
from core.models import Snapshot from core.models import Snapshot
try: try:
@ -255,8 +254,8 @@ def load_main_index(out_dir: Path=OUTPUT_DIR, warn: bool=True) -> List[Link]:
raise SystemExit(0) raise SystemExit(0)
@enforce_types @enforce_types
def load_main_index_meta(out_dir: Path=OUTPUT_DIR) -> Optional[dict]: def load_main_index_meta(out_dir: Path=archivebox.DATA_DIR) -> Optional[dict]:
index_path = out_dir / JSON_INDEX_FILENAME index_path = out_dir / archivebox.CONSTANTS.JSON_INDEX_FILENAME
if index_path.exists(): if index_path.exists():
with open(index_path, 'r', encoding='utf-8') as f: with open(index_path, 'r', encoding='utf-8') as f:
meta_dict = pyjson.load(f) meta_dict = pyjson.load(f)
@ -407,7 +406,7 @@ def snapshot_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type
return search_filter(snapshots, filter_patterns, filter_type) return search_filter(snapshots, filter_patterns, filter_type)
def get_indexed_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]: def get_indexed_folders(snapshots, out_dir: Path=archivebox.DATA_DIR) -> Dict[str, Optional[Link]]:
"""indexed links without checking archive status or data directory validity""" """indexed links without checking archive status or data directory validity"""
links = (snapshot.as_link() for snapshot in snapshots.iterator(chunk_size=500)) links = (snapshot.as_link() for snapshot in snapshots.iterator(chunk_size=500))
return { return {
@ -415,7 +414,7 @@ def get_indexed_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Option
for link in links for link in links
} }
def get_archived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]: def get_archived_folders(snapshots, out_dir: Path=archivebox.DATA_DIR) -> Dict[str, Optional[Link]]:
"""indexed links that are archived with a valid data directory""" """indexed links that are archived with a valid data directory"""
links = (snapshot.as_link() for snapshot in snapshots.iterator(chunk_size=500)) links = (snapshot.as_link() for snapshot in snapshots.iterator(chunk_size=500))
return { return {
@ -423,7 +422,7 @@ def get_archived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optio
for link in filter(is_archived, links) for link in filter(is_archived, links)
} }
def get_unarchived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]: def get_unarchived_folders(snapshots, out_dir: Path=archivebox.DATA_DIR) -> Dict[str, Optional[Link]]:
"""indexed links that are unarchived with no data directory or an empty data directory""" """indexed links that are unarchived with no data directory or an empty data directory"""
links = (snapshot.as_link() for snapshot in snapshots.iterator(chunk_size=500)) links = (snapshot.as_link() for snapshot in snapshots.iterator(chunk_size=500))
return { return {
@ -431,12 +430,12 @@ def get_unarchived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Opt
for link in filter(is_unarchived, links) for link in filter(is_unarchived, links)
} }
def get_present_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]: def get_present_folders(snapshots, out_dir: Path=archivebox.DATA_DIR) -> Dict[str, Optional[Link]]:
"""dirs that actually exist in the archive/ folder""" """dirs that actually exist in the archive/ folder"""
all_folders = {} all_folders = {}
for entry in (out_dir / ARCHIVE_DIR_NAME).iterdir(): for entry in (out_dir / archivebox.CONSTANTS.ARCHIVE_DIR_NAME).iterdir():
if entry.is_dir(): if entry.is_dir():
link = None link = None
try: try:
@ -448,7 +447,7 @@ def get_present_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Option
return all_folders return all_folders
def get_valid_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]: def get_valid_folders(snapshots, out_dir: Path=archivebox.DATA_DIR) -> Dict[str, Optional[Link]]:
"""dirs with a valid index matched to the main index and archived content""" """dirs with a valid index matched to the main index and archived content"""
links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator(chunk_size=500)] links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator(chunk_size=500)]
return { return {
@ -456,16 +455,16 @@ def get_valid_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional
for link in filter(is_valid, links) for link in filter(is_valid, links)
} }
def get_invalid_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]: def get_invalid_folders(snapshots, out_dir: Path=archivebox.DATA_DIR) -> Dict[str, Optional[Link]]:
"""dirs that are invalid for any reason: corrupted/duplicate/orphaned/unrecognized""" """dirs that are invalid for any reason: corrupted/duplicate/orphaned/unrecognized"""
duplicate = get_duplicate_folders(snapshots, out_dir=OUTPUT_DIR) duplicate = get_duplicate_folders(snapshots, out_dir=out_dir)
orphaned = get_orphaned_folders(snapshots, out_dir=OUTPUT_DIR) orphaned = get_orphaned_folders(snapshots, out_dir=out_dir)
corrupted = get_corrupted_folders(snapshots, out_dir=OUTPUT_DIR) corrupted = get_corrupted_folders(snapshots, out_dir=out_dir)
unrecognized = get_unrecognized_folders(snapshots, out_dir=OUTPUT_DIR) unrecognized = get_unrecognized_folders(snapshots, out_dir=out_dir)
return {**duplicate, **orphaned, **corrupted, **unrecognized} return {**duplicate, **orphaned, **corrupted, **unrecognized}
def get_duplicate_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]: def get_duplicate_folders(snapshots, out_dir: Path=archivebox.DATA_DIR) -> Dict[str, Optional[Link]]:
"""dirs that conflict with other directories that have the same link URL or timestamp""" """dirs that conflict with other directories that have the same link URL or timestamp"""
by_url = {} by_url = {}
by_timestamp = {} by_timestamp = {}
@ -473,7 +472,7 @@ def get_duplicate_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Opti
data_folders = ( data_folders = (
str(entry) str(entry)
for entry in (Path(out_dir) / ARCHIVE_DIR_NAME).iterdir() for entry in archivebox.CONSTANTS.ARCHIVE_DIR.iterdir()
if entry.is_dir() and not snapshots.filter(timestamp=entry.name).exists() if entry.is_dir() and not snapshots.filter(timestamp=entry.name).exists()
) )
@ -499,11 +498,11 @@ def get_duplicate_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Opti
duplicate_folders[path] = link duplicate_folders[path] = link
return duplicate_folders return duplicate_folders
def get_orphaned_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]: def get_orphaned_folders(snapshots, out_dir: Path=archivebox.DATA_DIR) -> Dict[str, Optional[Link]]:
"""dirs that contain a valid index but aren't listed in the main index""" """dirs that contain a valid index but aren't listed in the main index"""
orphaned_folders = {} orphaned_folders = {}
for entry in (Path(out_dir) / ARCHIVE_DIR_NAME).iterdir(): for entry in archivebox.CONSTANTS.ARCHIVE_DIR.iterdir():
if entry.is_dir(): if entry.is_dir():
link = None link = None
try: try:
@ -517,7 +516,7 @@ def get_orphaned_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optio
return orphaned_folders return orphaned_folders
def get_corrupted_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]: def get_corrupted_folders(snapshots, out_dir: Path=archivebox.DATA_DIR) -> Dict[str, Optional[Link]]:
"""dirs that don't contain a valid index and aren't listed in the main index""" """dirs that don't contain a valid index and aren't listed in the main index"""
corrupted = {} corrupted = {}
for snapshot in snapshots.iterator(chunk_size=500): for snapshot in snapshots.iterator(chunk_size=500):
@ -526,11 +525,11 @@ def get_corrupted_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Opti
corrupted[link.link_dir] = link corrupted[link.link_dir] = link
return corrupted return corrupted
def get_unrecognized_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]: def get_unrecognized_folders(snapshots, out_dir: Path=archivebox.DATA_DIR) -> Dict[str, Optional[Link]]:
"""dirs that don't contain recognizable archive data and aren't listed in the main index""" """dirs that don't contain recognizable archive data and aren't listed in the main index"""
unrecognized_folders: Dict[str, Optional[Link]] = {} unrecognized_folders: Dict[str, Optional[Link]] = {}
for entry in (Path(out_dir) / ARCHIVE_DIR_NAME).iterdir(): for entry in (Path(out_dir) / archivebox.CONSTANTS.ARCHIVE_DIR_NAME).iterdir():
if entry.is_dir(): if entry.is_dir():
index_exists = (entry / "index.json").exists() index_exists = (entry / "index.json").exists()
link = None link = None
@ -595,10 +594,10 @@ def is_unarchived(link: Link) -> bool:
return not link.is_archived return not link.is_archived
def fix_invalid_folder_locations(out_dir: Path=OUTPUT_DIR) -> Tuple[List[str], List[str]]: def fix_invalid_folder_locations(out_dir: Path=archivebox.DATA_DIR) -> Tuple[List[str], List[str]]:
fixed = [] fixed = []
cant_fix = [] cant_fix = []
for entry in os.scandir(out_dir / ARCHIVE_DIR_NAME): for entry in os.scandir(out_dir / archivebox.CONSTANTS.ARCHIVE_DIR_NAME):
if entry.is_dir(follow_symlinks=True): if entry.is_dir(follow_symlinks=True):
if (Path(entry.path) / 'index.json').exists(): if (Path(entry.path) / 'index.json').exists():
try: try:
@ -609,7 +608,7 @@ def fix_invalid_folder_locations(out_dir: Path=OUTPUT_DIR) -> Tuple[List[str], L
continue continue
if not entry.path.endswith(f'/{link.timestamp}'): if not entry.path.endswith(f'/{link.timestamp}'):
dest = out_dir / ARCHIVE_DIR_NAME / link.timestamp dest = out_dir /archivebox.CONSTANTS.ARCHIVE_DIR_NAME / link.timestamp
if dest.exists(): if dest.exists():
cant_fix.append(entry.path) cant_fix.append(entry.path)
else: else:
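
Throughout this index module the OUTPUT_DIR, ARCHIVE_DIR_NAME, and SQL_INDEX_FILENAME constants give way to archivebox.DATA_DIR and archivebox.CONSTANTS. A short sketch of the new path handling (the helper names and the '644' default are illustrative; OUTPUT_PERMISSIONS still comes from config in the diff):

import os
import archivebox

def chmod_main_index(permissions: str = '644') -> None:
    # the SQLite index path now comes from CONSTANTS rather than out_dir / SQL_INDEX_FILENAME
    os.chmod(archivebox.CONSTANTS.DATABASE_FILE, int(permissions, base=8))

def snapshot_data_dirs() -> list:
    # data folders are enumerated straight from CONSTANTS.ARCHIVE_DIR
    return [entry for entry in archivebox.CONSTANTS.ARCHIVE_DIR.iterdir() if entry.is_dir()]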


@ -1,11 +1,12 @@
__package__ = 'archivebox.index' __package__ = 'archivebox.index'
import archivebox
from pathlib import Path from pathlib import Path
from datetime import datetime, timezone from datetime import datetime, timezone
from collections import defaultdict from collections import defaultdict
from typing import List, Optional, Iterator, Mapping from typing import List, Optional, Iterator, Mapping
from django.utils.html import format_html, mark_safe from django.utils.html import format_html, mark_safe # type: ignore
from django.core.cache import cache from django.core.cache import cache
from .schema import Link from .schema import Link
@ -19,10 +20,6 @@ from ..util import (
urldecode, urldecode,
) )
from ..config import ( from ..config import (
OUTPUT_DIR,
VERSION,
FOOTER_INFO,
HTML_INDEX_FILENAME,
SAVE_ARCHIVE_DOT_ORG, SAVE_ARCHIVE_DOT_ORG,
PREVIEW_ORIGINALS, PREVIEW_ORIGINALS,
) )
@ -36,10 +33,12 @@ TITLE_LOADING_MSG = 'Not yet archived...'
### Main Links Index ### Main Links Index
@enforce_types @enforce_types
def parse_html_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[str]: def parse_html_main_index(out_dir: Path=archivebox.DATA_DIR) -> Iterator[str]:
"""parse an archive index html file and return the list of urls""" """parse an archive index html file and return the list of urls"""
index_path = Path(out_dir) / HTML_INDEX_FILENAME from plugins_sys.config.constants import CONSTANTS
index_path = Path(out_dir) / CONSTANTS.HTML_INDEX_FILENAME
if index_path.exists(): if index_path.exists():
with open(index_path, 'r', encoding='utf-8') as f: with open(index_path, 'r', encoding='utf-8') as f:
for line in f: for line in f:
@ -59,14 +58,16 @@ def generate_index_from_links(links: List[Link], with_headers: bool):
def main_index_template(links: List[Link], template: str=MAIN_INDEX_TEMPLATE) -> str: def main_index_template(links: List[Link], template: str=MAIN_INDEX_TEMPLATE) -> str:
"""render the template for the entire main index""" """render the template for the entire main index"""
from plugins_sys.config.apps import SHELL_CONFIG, SERVER_CONFIG
return render_django_template(template, { return render_django_template(template, {
'version': VERSION, 'version': archivebox.VERSION,
'git_sha': VERSION, # not used anymore, but kept for backwards compatibility 'git_sha': SHELL_CONFIG.COMMIT_HASH or archivebox.VERSION,
'num_links': str(len(links)), 'num_links': str(len(links)),
'date_updated': datetime.now(timezone.utc).strftime('%Y-%m-%d'), 'date_updated': datetime.now(timezone.utc).strftime('%Y-%m-%d'),
'time_updated': datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M'), 'time_updated': datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M'),
'links': [link._asdict(extended=True) for link in links], 'links': [link._asdict(extended=True) for link in links],
'FOOTER_INFO': FOOTER_INFO, 'FOOTER_INFO': SERVER_CONFIG.FOOTER_INFO,
}) })
@ -74,10 +75,11 @@ def main_index_template(links: List[Link], template: str=MAIN_INDEX_TEMPLATE) ->
@enforce_types @enforce_types
def write_html_link_details(link: Link, out_dir: Optional[str]=None) -> None: def write_html_link_details(link: Link, out_dir: Optional[str]=None) -> None:
from plugins_sys.config.constants import CONSTANTS
out_dir = out_dir or link.link_dir out_dir = out_dir or link.link_dir
rendered_html = link_details_template(link) rendered_html = link_details_template(link)
atomic_write(str(Path(out_dir) / HTML_INDEX_FILENAME), rendered_html) atomic_write(str(Path(out_dir) / CONSTANTS.HTML_INDEX_FILENAME), rendered_html)
@enforce_types @enforce_types
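
The HTML index now pulls its template context from the config plugins instead of module-level constants. A sketch of the context assembly; the keys are copied from the hunk, but bundling them into a standalone helper is an illustrative simplification (the real code also passes the serialized links):

from datetime import datetime, timezone
import archivebox

def main_index_context(num_links: int) -> dict:
    from plugins_sys.config.apps import SHELL_CONFIG, SERVER_CONFIG   # deferred, as in the hunk

    return {
        'version': archivebox.VERSION,
        'git_sha': SHELL_CONFIG.COMMIT_HASH or archivebox.VERSION,    # falls back when no git checkout is available
        'num_links': str(num_links),
        'date_updated': datetime.now(timezone.utc).strftime('%Y-%m-%d'),
        'time_updated': datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M'),
        'FOOTER_INFO': SERVER_CONFIG.FOOTER_INFO,
    }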


@ -8,38 +8,36 @@ from pathlib import Path
from datetime import datetime, timezone from datetime import datetime, timezone
from typing import List, Optional, Iterator, Any, Union from typing import List, Optional, Iterator, Any, Union
import archivebox
from .schema import Link from .schema import Link
from ..system import atomic_write from ..system import atomic_write
from ..util import enforce_types from ..util import enforce_types
from ..config import (
VERSION,
OUTPUT_DIR,
FOOTER_INFO,
DEPENDENCIES,
JSON_INDEX_FILENAME,
ARCHIVE_DIR_NAME,
ANSI
)
MAIN_INDEX_HEADER = {
'info': 'This is an index of site data archived by ArchiveBox: The self-hosted web archive.',
'schema': 'archivebox.index.json',
'copyright_info': FOOTER_INFO,
'meta': {
'project': 'ArchiveBox',
'version': VERSION,
'git_sha': VERSION, # not used anymore, but kept for backwards compatibility
'website': 'https://ArchiveBox.io',
'docs': 'https://github.com/ArchiveBox/ArchiveBox/wiki',
'source': 'https://github.com/ArchiveBox/ArchiveBox',
'issues': 'https://github.com/ArchiveBox/ArchiveBox/issues',
'dependencies': DEPENDENCIES,
},
}
@enforce_types @enforce_types
def generate_json_index_from_links(links: List[Link], with_headers: bool): def generate_json_index_from_links(links: List[Link], with_headers: bool):
from django.conf import settings
from plugins_sys.config.apps import SERVER_CONFIG
MAIN_INDEX_HEADER = {
'info': 'This is an index of site data archived by ArchiveBox: The self-hosted web archive.',
'schema': 'archivebox.index.json',
'copyright_info': SERVER_CONFIG.FOOTER_INFO,
'meta': {
'project': 'ArchiveBox',
'version': archivebox.VERSION,
'git_sha': archivebox.VERSION, # not used anymore, but kept for backwards compatibility
'website': 'https://ArchiveBox.io',
'docs': 'https://github.com/ArchiveBox/ArchiveBox/wiki',
'source': 'https://github.com/ArchiveBox/ArchiveBox',
'issues': 'https://github.com/ArchiveBox/ArchiveBox/issues',
'dependencies': settings.BINARIES.to_dict(),
},
}
if with_headers: if with_headers:
output = { output = {
**MAIN_INDEX_HEADER, **MAIN_INDEX_HEADER,
@ -54,10 +52,12 @@ def generate_json_index_from_links(links: List[Link], with_headers: bool):
@enforce_types @enforce_types
def parse_json_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[Link]: def parse_json_main_index(out_dir: Path=archivebox.DATA_DIR) -> Iterator[Link]:
"""parse an archive index json file and return the list of links""" """parse an archive index json file and return the list of links"""
index_path = Path(out_dir) / JSON_INDEX_FILENAME from plugins_sys.config.constants import CONSTANTS
index_path = Path(out_dir) / CONSTANTS.JSON_INDEX_FILENAME
if index_path.exists(): if index_path.exists():
with open(index_path, 'r', encoding='utf-8') as f: with open(index_path, 'r', encoding='utf-8') as f:
try: try:
@ -77,14 +77,14 @@ def parse_json_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[Link]:
yield Link.from_json(link_json) yield Link.from_json(link_json)
except KeyError: except KeyError:
try: try:
detail_index_path = Path(OUTPUT_DIR) / ARCHIVE_DIR_NAME / link_json['timestamp'] detail_index_path = CONSTANTS.ARCHIVE_DIR / link_json['timestamp']
yield parse_json_link_details(str(detail_index_path)) yield parse_json_link_details(str(detail_index_path))
except KeyError: except KeyError:
# as a last effort, try to guess the missing values out of existing ones # as a last effort, try to guess the missing values out of existing ones
try: try:
yield Link.from_json(link_json, guess=True) yield Link.from_json(link_json, guess=True)
except KeyError: except KeyError:
print(" {lightyellow}! Failed to load the index.json from {}".format(detail_index_path, **ANSI)) # print(" {lightyellow}! Failed to load the index.json from {}".format(detail_index_path, **ANSI))
continue continue
return () return ()
@ -94,15 +94,19 @@ def parse_json_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[Link]:
def write_json_link_details(link: Link, out_dir: Optional[str]=None) -> None: def write_json_link_details(link: Link, out_dir: Optional[str]=None) -> None:
"""write a json file with some info about the link""" """write a json file with some info about the link"""
from plugins_sys.config.constants import CONSTANTS
out_dir = out_dir or link.link_dir out_dir = out_dir or link.link_dir
path = Path(out_dir) / JSON_INDEX_FILENAME path = Path(out_dir) / CONSTANTS.JSON_INDEX_FILENAME
atomic_write(str(path), link._asdict(extended=True)) atomic_write(str(path), link._asdict(extended=True))
@enforce_types @enforce_types
def parse_json_link_details(out_dir: Union[Path, str], guess: Optional[bool]=False) -> Optional[Link]: def parse_json_link_details(out_dir: Union[Path, str], guess: bool=False) -> Optional[Link]:
"""load the json link index from a given directory""" """load the json link index from a given directory"""
existing_index = Path(out_dir) / JSON_INDEX_FILENAME from plugins_sys.config.constants import CONSTANTS
existing_index = Path(out_dir) / CONSTANTS.JSON_INDEX_FILENAME
if existing_index.exists(): if existing_index.exists():
with open(existing_index, 'r', encoding='utf-8') as f: with open(existing_index, 'r', encoding='utf-8') as f:
try: try:
@ -117,7 +121,9 @@ def parse_json_link_details(out_dir: Union[Path, str], guess: Optional[bool]=Fal
def parse_json_links_details(out_dir: Union[Path, str]) -> Iterator[Link]: def parse_json_links_details(out_dir: Union[Path, str]) -> Iterator[Link]:
"""read through all the archive data folders and return the parsed links""" """read through all the archive data folders and return the parsed links"""
for entry in os.scandir(Path(out_dir) / ARCHIVE_DIR_NAME): from plugins_sys.config.constants import CONSTANTS
for entry in os.scandir(CONSTANTS.ARCHIVE_DIR):
if entry.is_dir(follow_symlinks=True): if entry.is_dir(follow_symlinks=True):
if (Path(entry.path) / 'index.json').exists(): if (Path(entry.path) / 'index.json').exists():
try: try:
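
MAIN_INDEX_HEADER moves from a module constant into the function body so that it reads live values (footer text from SERVER_CONFIG, dependency info from the registered binaries). A hedged condensation of that dict; the real code builds it inline in generate_json_index_from_links(), and some static URL fields are omitted here:

import archivebox

def build_main_index_header() -> dict:
    from django.conf import settings
    from plugins_sys.config.apps import SERVER_CONFIG

    return {
        'info': 'This is an index of site data archived by ArchiveBox: The self-hosted web archive.',
        'schema': 'archivebox.index.json',
        'copyright_info': SERVER_CONFIG.FOOTER_INFO,
        'meta': {
            'project': 'ArchiveBox',
            'version': archivebox.VERSION,
            'git_sha': archivebox.VERSION,                 # not used anymore, kept for backwards compatibility
            'dependencies': settings.BINARIES.to_dict(),   # replaces the static DEPENDENCIES dict
        },
    }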


@ -4,8 +4,11 @@ import re
import os import os
import sys import sys
import stat import stat
import shutil
import time import time
import argparse import argparse
import archivebox
from math import log from math import log
from multiprocessing import Process from multiprocessing import Process
from pathlib import Path from pathlib import Path
@ -22,18 +25,7 @@ from rich.panel import Panel
from .system import get_dir_size from .system import get_dir_size
from .util import enforce_types from .util import enforce_types
from .config import ( from .misc.logging import ANSI, stderr
ConfigDict,
OUTPUT_DIR,
VERSION,
ANSI,
IS_TTY,
IN_DOCKER,
TERM_WIDTH,
SHOW_PROGRESS,
SOURCES_DIR_NAME,
stderr,
)
@dataclass @dataclass
class RuntimeStats: class RuntimeStats:
@ -102,7 +94,7 @@ def reject_stdin(caller: str, stdin: Optional[IO]=sys.stdin) -> None:
if not stdin: if not stdin:
return None return None
if IN_DOCKER: if os.environ.get('IN_DOCKER') in ('1', 'true', 'True', 'TRUE', 'yes'):
# when TTY is disabled in docker we cant tell if stdin is being piped in or not # when TTY is disabled in docker we cant tell if stdin is being piped in or not
# if we try to read stdin when its not piped we will hang indefinitely waiting for it # if we try to read stdin when its not piped we will hang indefinitely waiting for it
return None return None
@ -141,9 +133,14 @@ class TimedProgress:
def __init__(self, seconds, prefix=''): def __init__(self, seconds, prefix=''):
self.SHOW_PROGRESS = SHOW_PROGRESS from plugins_sys.config.apps import SHELL_CONFIG
self.SHOW_PROGRESS = SHELL_CONFIG.SHOW_PROGRESS
self.ANSI = SHELL_CONFIG.ANSI
self.TERM_WIDTH = lambda: shutil.get_terminal_size().columns # lambda so it live-updates when terminal is resized
if self.SHOW_PROGRESS: if self.SHOW_PROGRESS:
self.p = Process(target=progress_bar, args=(seconds, prefix)) self.p = Process(target=progress_bar, args=(seconds, prefix, self.ANSI))
self.p.start() self.p.start()
self.stats = {'start_ts': datetime.now(timezone.utc), 'end_ts': None} self.stats = {'start_ts': datetime.now(timezone.utc), 'end_ts': None}
@ -172,7 +169,7 @@ class TimedProgress:
# clear whole terminal line # clear whole terminal line
try: try:
sys.stdout.write('\r{}{}\r'.format((' ' * TERM_WIDTH()), ANSI['reset'])) sys.stdout.write('\r{}{}\r'.format((' ' * self.TERM_WIDTH()), self.ANSI['reset']))
except (IOError, BrokenPipeError): except (IOError, BrokenPipeError):
# ignore when the parent proc has stopped listening to our stdout # ignore when the parent proc has stopped listening to our stdout
pass pass
@ -181,9 +178,10 @@ class TimedProgress:
@enforce_types @enforce_types
def progress_bar(seconds: int, prefix: str='') -> None: def progress_bar(seconds: int, prefix: str='', ANSI: Dict[str, str]=ANSI) -> None:
"""show timer in the form of progress bar, with percentage and seconds remaining""" """show timer in the form of progress bar, with percentage and seconds remaining"""
chunk = '█' if (sys.stdout or sys.__stdout__).encoding.upper() == 'UTF-8' else '#' output_buf = (sys.stdout or sys.__stdout__ or sys.stderr or sys.__stderr__)
chunk = '█' if output_buf and output_buf.encoding.upper() == 'UTF-8' else '#'
last_width = TERM_WIDTH() last_width = TERM_WIDTH()
chunks = last_width - len(prefix) - 20 # number of progress chunks to show (aka max bar width) chunks = last_width - len(prefix) - 20 # number of progress chunks to show (aka max bar width)
try: try:
@ -236,18 +234,15 @@ def log_cli_command(subcommand: str, subcommand_args: List[str], stdin: Optional
args = ' '.join(subcommand_args) args = ' '.join(subcommand_args)
version_msg = '[dark_magenta]\\[i] [{now}] ArchiveBox v{VERSION}: [/dark_magenta][green4]archivebox [green3]{subcommand}[green2] {args}[/green2]'.format( version_msg = '[dark_magenta]\\[i] [{now}] ArchiveBox v{VERSION}: [/dark_magenta][green4]archivebox [green3]{subcommand}[green2] {args}[/green2]'.format(
now=datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S'), now=datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S'),
VERSION=VERSION, VERSION=archivebox.__version__,
subcommand=subcommand, subcommand=subcommand,
args=args, args=args,
) )
# stderr() # stderr()
# stderr('[bright_black] > {pwd}[/]'.format(pwd=pwd, **ANSI)) # stderr('[bright_black] > {pwd}[/]'.format(pwd=pwd, **ANSI))
# stderr() # stderr()
if SHOW_PROGRESS: print(Panel(version_msg), file=sys.stderr)
print(Panel(version_msg), file=sys.stderr)
else:
print(version_msg, file=sys.stderr)
### Parsing Stage ### Parsing Stage
@ -261,7 +256,8 @@ def log_importing_started(urls: Union[str, List[str]], depth: int, index_only: b
)) ))
def log_source_saved(source_file: str): def log_source_saved(source_file: str):
print(' > Saved verbatim input to {}/{}'.format(SOURCES_DIR_NAME, source_file.rsplit('/', 1)[-1])) from plugins_sys.config.constants import CONSTANTS
print(' > Saved verbatim input to {}/{}'.format(CONSTANTS.SOURCES_DIR_NAME, source_file.rsplit('/', 1)[-1]))
def log_parsing_finished(num_parsed: int, parser_name: str): def log_parsing_finished(num_parsed: int, parser_name: str):
_LAST_RUN_STATS.parse_end_ts = datetime.now(timezone.utc) _LAST_RUN_STATS.parse_end_ts = datetime.now(timezone.utc)
@ -293,12 +289,14 @@ def log_indexing_process_finished():
def log_indexing_started(out_path: str): def log_indexing_started(out_path: str):
if IS_TTY: from plugins_sys.config.apps import SHELL_CONFIG
sys.stdout.write(f' > ./{Path(out_path).relative_to(OUTPUT_DIR)}')
if SHELL_CONFIG.IS_TTY:
sys.stdout.write(f' > ./{Path(out_path).relative_to(archivebox.DATA_DIR)}')
def log_indexing_finished(out_path: str): def log_indexing_finished(out_path: str):
print(f'\r √ ./{Path(out_path).relative_to(OUTPUT_DIR)}') print(f'\r √ ./{Path(out_path).relative_to(archivebox.DATA_DIR)}')
### Archiving Stage ### Archiving Stage
@ -447,7 +445,7 @@ def log_archive_method_finished(result: "ArchiveResult"):
) )
docker_hints = () docker_hints = ()
if IN_DOCKER: if os.environ.get('IN_DOCKER') in ('1', 'true', 'True', 'TRUE', 'yes'):
docker_hints = ( docker_hints = (
' docker run -it -v $PWD/data:/data archivebox/archivebox /bin/bash', ' docker run -it -v $PWD/data:/data archivebox/archivebox /bin/bash',
) )
@ -534,7 +532,7 @@ def log_shell_welcome_msg():
### Helpers ### Helpers
@enforce_types @enforce_types
def pretty_path(path: Union[Path, str], pwd: Union[Path, str]=OUTPUT_DIR) -> str: def pretty_path(path: Union[Path, str], pwd: Union[Path, str]=archivebox.DATA_DIR) -> str:
"""convert paths like .../ArchiveBox/archivebox/../output/abc into output/abc""" """convert paths like .../ArchiveBox/archivebox/../output/abc into output/abc"""
pwd = str(Path(pwd)) # .resolve() pwd = str(Path(pwd)) # .resolve()
path = str(path) path = str(path)
@ -577,7 +575,7 @@ def printable_folders(folders: Dict[str, Optional["Link"]],
@enforce_types @enforce_types
def printable_config(config: ConfigDict, prefix: str='') -> str: def printable_config(config: dict, prefix: str='') -> str:
return f'\n{prefix}'.join( return f'\n{prefix}'.join(
f'{key}={val}' f'{key}={val}'
for key, val in config.items() for key, val in config.items()
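
logging_util no longer imports IN_DOCKER, TERM_WIDTH, or SHOW_PROGRESS from ..config; it checks the environment and terminal directly, and TimedProgress reads SHELL_CONFIG when it is instantiated. Two small helpers summarizing those replacements (the function names are invented for illustration):

import os
import shutil

def in_docker() -> bool:
    # replaces the IN_DOCKER config constant with a plain environment check, as in the hunks above
    return os.environ.get('IN_DOCKER') in ('1', 'true', 'True', 'TRUE', 'yes')

def term_width() -> int:
    # evaluated per call (the diff stores it as a lambda on TimedProgress) so terminal resizes are picked up
    return shutil.get_terminal_size().columns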


@ -6,6 +6,8 @@ import shutil
import platform import platform
import archivebox import archivebox
CONSTANTS = archivebox.CONSTANTS
from typing import Dict, List, Optional, Iterable, IO, Union from typing import Dict, List, Optional, Iterable, IO, Union
from pathlib import Path from pathlib import Path
from datetime import date, datetime from datetime import date, datetime
@ -66,47 +68,25 @@ from .index.html import (
) )
from .index.csv import links_to_csv from .index.csv import links_to_csv
from .extractors import archive_links, archive_link, ignore_methods from .extractors import archive_links, archive_link, ignore_methods
from .misc.logging import stderr, hint from .misc.logging import stderr, hint, ANSI
from .misc.checks import check_data_folder, check_dependencies from .misc.checks import check_data_folder, check_dependencies
from .config import ( from .config import (
setup_django_minimal, setup_django_minimal,
ConfigDict, ConfigDict,
ANSI,
IS_TTY, IS_TTY,
DEBUG, DEBUG,
IN_DOCKER, IN_DOCKER,
IN_QEMU, IN_QEMU,
PUID, PUID,
PGID, PGID,
USER,
TIMEZONE, TIMEZONE,
ENFORCE_ATOMIC_WRITES,
OUTPUT_PERMISSIONS,
ONLY_NEW, ONLY_NEW,
OUTPUT_DIR,
SOURCES_DIR,
ARCHIVE_DIR,
LOGS_DIR,
PACKAGE_DIR,
CONFIG_FILE,
ARCHIVE_DIR_NAME,
JSON_INDEX_FILENAME, JSON_INDEX_FILENAME,
HTML_INDEX_FILENAME, HTML_INDEX_FILENAME,
SQL_INDEX_FILENAME, SQL_INDEX_FILENAME,
ALLOWED_IN_OUTPUT_DIR,
LDAP, LDAP,
write_config_file, write_config_file,
VERSION,
COMMIT_HASH,
BUILD_TIME,
CODE_LOCATIONS,
DATA_LOCATIONS,
DEPENDENCIES, DEPENDENCIES,
YOUTUBEDL_BINARY,
YOUTUBEDL_VERSION,
SINGLEFILE_VERSION,
READABILITY_VERSION,
MERCURY_VERSION,
load_all_config, load_all_config,
CONFIG, CONFIG,
USER_CONFIG, USER_CONFIG,
@ -114,7 +94,6 @@ from .config import (
setup_django, setup_django,
) )
from .logging_util import ( from .logging_util import (
TERM_WIDTH,
TimedProgress, TimedProgress,
log_importing_started, log_importing_started,
log_crawl_started, log_crawl_started,
@ -129,9 +108,14 @@ from .logging_util import (
printable_dependency_version, printable_dependency_version,
) )
VERSION = archivebox.VERSION
PACKAGE_DIR = archivebox.PACKAGE_DIR
OUTPUT_DIR = archivebox.DATA_DIR
ARCHIVE_DIR = archivebox.DATA_DIR / 'archive'
@enforce_types @enforce_types
def help(out_dir: Path=OUTPUT_DIR) -> None: def help(out_dir: Path=archivebox.DATA_DIR) -> None:
"""Print the ArchiveBox help message and usage""" """Print the ArchiveBox help message and usage"""
all_subcommands = CLI_SUBCOMMANDS all_subcommands = CLI_SUBCOMMANDS
@ -207,7 +191,7 @@ def version(quiet: bool=False,
"""Print the ArchiveBox version and dependency information""" """Print the ArchiveBox version and dependency information"""
setup_django_minimal() setup_django_minimal()
from plugins_sys.config.apps import SEARCH_BACKEND_CONFIG, STORAGE_CONFIG, SHELL_CONFIG from plugins_sys.config.apps import SEARCH_BACKEND_CONFIG, STORAGE_CONFIG, SHELL_CONFIG, CONSTANTS
from plugins_auth.ldap.apps import LDAP_CONFIG from plugins_auth.ldap.apps import LDAP_CONFIG
from django.conf import settings from django.conf import settings
@ -223,8 +207,8 @@ def version(quiet: bool=False,
p = platform.uname() p = platform.uname()
print( print(
'ArchiveBox v{}'.format(archivebox.__version__), 'ArchiveBox v{}'.format(archivebox.__version__),
f'COMMIT_HASH={COMMIT_HASH[:7] if COMMIT_HASH else "unknown"}', f'COMMIT_HASH={SHELL_CONFIG.COMMIT_HASH[:7] if SHELL_CONFIG.COMMIT_HASH else "unknown"}',
f'BUILD_TIME={BUILD_TIME}', f'BUILD_TIME={SHELL_CONFIG.BUILD_TIME}',
) )
print( print(
f'IN_DOCKER={IN_DOCKER}', f'IN_DOCKER={IN_DOCKER}',
@ -234,7 +218,7 @@ def version(quiet: bool=False,
f'PLATFORM={platform.platform()}', f'PLATFORM={platform.platform()}',
f'PYTHON={sys.implementation.name.title()}', f'PYTHON={sys.implementation.name.title()}',
) )
OUTPUT_IS_REMOTE_FS = DATA_LOCATIONS['OUTPUT_DIR']['is_mount'] or DATA_LOCATIONS['ARCHIVE_DIR']['is_mount'] OUTPUT_IS_REMOTE_FS = CONSTANTS.DATA_LOCATIONS['OUTPUT_DIR']['is_mount'] or CONSTANTS.DATA_LOCATIONS['ARCHIVE_DIR']['is_mount']
print( print(
f'FS_ATOMIC={STORAGE_CONFIG.ENFORCE_ATOMIC_WRITES}', f'FS_ATOMIC={STORAGE_CONFIG.ENFORCE_ATOMIC_WRITES}',
f'FS_REMOTE={OUTPUT_IS_REMOTE_FS}', f'FS_REMOTE={OUTPUT_IS_REMOTE_FS}',
@ -268,17 +252,18 @@ def version(quiet: bool=False,
except Exception as e: except Exception as e:
err = e err = e
loaded_bin = binary loaded_bin = binary
raise
print('', '√' if loaded_bin.is_valid else 'X', '', loaded_bin.name.ljust(21), str(loaded_bin.version).ljust(15), loaded_bin.abspath or str(err)) print('', '√' if loaded_bin.is_valid else 'X', '', loaded_bin.name.ljust(21), str(loaded_bin.version).ljust(15), loaded_bin.abspath or str(err))
print() print()
print('{white}[i] Source-code locations:{reset}'.format(**ANSI)) print('{white}[i] Source-code locations:{reset}'.format(**ANSI))
for name, path in CODE_LOCATIONS.items(): for name, path in CONSTANTS.CODE_LOCATIONS.items():
print(printable_folder_status(name, path)) print(printable_folder_status(name, path))
print() print()
if DATA_LOCATIONS['OUTPUT_DIR']['is_valid']: if CONSTANTS.DATABASE_FILE.exists() or CONSTANTS.ARCHIVE_DIR.exists() or CONSTANTS.CONFIG_FILE.exists():
print('{white}[i] Data locations:{reset}'.format(**ANSI)) print('{white}[i] Data locations:{reset}'.format(**ANSI))
for name, path in DATA_LOCATIONS.items(): for name, path in CONSTANTS.DATA_LOCATIONS.items():
print(printable_folder_status(name, path)) print(printable_folder_status(name, path))
else: else:
print() print()
@ -303,19 +288,19 @@ def run(subcommand: str,
@enforce_types @enforce_types
def init(force: bool=False, quick: bool=False, setup: bool=False, out_dir: Path=OUTPUT_DIR) -> None: def init(force: bool=False, quick: bool=False, setup: bool=False, out_dir: Path=archivebox.DATA_DIR) -> None:
"""Initialize a new ArchiveBox collection in the current directory""" """Initialize a new ArchiveBox collection in the current directory"""
from core.models import Snapshot from core.models import Snapshot
out_dir.mkdir(exist_ok=True) out_dir.mkdir(exist_ok=True)
is_empty = not len(set(os.listdir(out_dir)) - ALLOWED_IN_OUTPUT_DIR) is_empty = not len(set(os.listdir(out_dir)) - CONSTANTS.ALLOWED_IN_OUTPUT_DIR)
if (out_dir / JSON_INDEX_FILENAME).exists(): if (out_dir / archivebox.CONSTANTS.JSON_INDEX_FILENAME).exists():
stderr("[!] This folder contains a JSON index. It is deprecated, and will no longer be kept up to date automatically.", color="lightyellow") stderr("[!] This folder contains a JSON index. It is deprecated, and will no longer be kept up to date automatically.", color="lightyellow")
stderr(" You can run `archivebox list --json --with-headers > static_index.json` to manually generate it.", color="lightyellow") stderr(" You can run `archivebox list --json --with-headers > static_index.json` to manually generate it.", color="lightyellow")
existing_index = (out_dir / SQL_INDEX_FILENAME).exists() existing_index = archivebox.CONSTANTS.DATABASE_FILE.exists()
if is_empty and not existing_index: if is_empty and not existing_index:
print('{green}[+] Initializing a new ArchiveBox v{} collection...{reset}'.format(VERSION, **ANSI)) print('{green}[+] Initializing a new ArchiveBox v{} collection...{reset}'.format(VERSION, **ANSI))
@ -344,25 +329,24 @@ def init(force: bool=False, quick: bool=False, setup: bool=False, out_dir: Path=
else: else:
print('\n{green}[+] Building archive folder structure...{reset}'.format(**ANSI)) print('\n{green}[+] Building archive folder structure...{reset}'.format(**ANSI))
print(f' + ./{ARCHIVE_DIR.relative_to(OUTPUT_DIR)}, ./{SOURCES_DIR.relative_to(OUTPUT_DIR)}, ./{LOGS_DIR.relative_to(OUTPUT_DIR)}...') print(f' + ./{CONSTANTS.ARCHIVE_DIR.relative_to(OUTPUT_DIR)}, ./{CONSTANTS.SOURCES_DIR.relative_to(OUTPUT_DIR)}, ./{CONSTANTS.LOGS_DIR.relative_to(OUTPUT_DIR)}...')
Path(SOURCES_DIR).mkdir(exist_ok=True) Path(CONSTANTS.SOURCES_DIR).mkdir(exist_ok=True)
Path(ARCHIVE_DIR).mkdir(exist_ok=True) Path(CONSTANTS.ARCHIVE_DIR).mkdir(exist_ok=True)
Path(LOGS_DIR).mkdir(exist_ok=True) Path(CONSTANTS.LOGS_DIR).mkdir(exist_ok=True)
print(f' + ./{CONFIG_FILE.relative_to(OUTPUT_DIR)}...') print(f' + ./{CONSTANTS.CONFIG_FILE.relative_to(OUTPUT_DIR)}...')
write_config_file({}, out_dir=out_dir) write_config_file({}, out_dir=out_dir)
if (out_dir / SQL_INDEX_FILENAME).exists(): if CONSTANTS.DATABASE_FILE.exists():
print('\n{green}[*] Verifying main SQL index and running any migrations needed...{reset}'.format(**ANSI)) print('\n{green}[*] Verifying main SQL index and running any migrations needed...{reset}'.format(**ANSI))
else: else:
print('\n{green}[+] Building main SQL index and running initial migrations...{reset}'.format(**ANSI)) print('\n{green}[+] Building main SQL index and running initial migrations...{reset}'.format(**ANSI))
DATABASE_FILE = out_dir / SQL_INDEX_FILENAME
for migration_line in apply_migrations(out_dir): for migration_line in apply_migrations(out_dir):
print(f' {migration_line}') print(f' {migration_line}')
assert DATABASE_FILE.exists() assert CONSTANTS.DATABASE_FILE.exists()
print() print()
print(f' √ ./{DATABASE_FILE.relative_to(OUTPUT_DIR)}') print(f' √ ./{CONSTANTS.DATABASE_FILE.relative_to(OUTPUT_DIR)}')
# from django.contrib.auth.models import User # from django.contrib.auth.models import User
# if IS_TTY and not User.objects.filter(is_superuser=True).exists(): # if IS_TTY and not User.objects.filter(is_superuser=True).exists():
@ -477,7 +461,7 @@ def status(out_dir: Path=OUTPUT_DIR) -> None:
check_data_folder(CONFIG) check_data_folder(CONFIG)
from core.models import Snapshot from core.models import Snapshot
from django.contrib.auth import get_user_model from django.contrib.auth import get_user_model
from plugins_sys.config.apps import SHELL_CONFIG
User = get_user_model() User = get_user_model()
print('{green}[*] Scanning archive main index...{reset}'.format(**ANSI)) print('{green}[*] Scanning archive main index...{reset}'.format(**ANSI))
@ -491,7 +475,7 @@ def status(out_dir: Path=OUTPUT_DIR) -> None:
num_sql_links = links.count() num_sql_links = links.count()
num_link_details = sum(1 for link in parse_json_links_details(out_dir=out_dir)) num_link_details = sum(1 for link in parse_json_links_details(out_dir=out_dir))
print(f' > SQL Main Index: {num_sql_links} links'.ljust(36), f'(found in {SQL_INDEX_FILENAME})') print(f' > SQL Main Index: {num_sql_links} links'.ljust(36), f'(found in {SQL_INDEX_FILENAME})')
print(f' > JSON Link Details: {num_link_details} links'.ljust(36), f'(found in {ARCHIVE_DIR_NAME}/*/index.json)') print(f' > JSON Link Details: {num_link_details} links'.ljust(36), f'(found in {ARCHIVE_DIR.name}/*/index.json)')
print() print()
print('{green}[*] Scanning archive data directories...{reset}'.format(**ANSI)) print('{green}[*] Scanning archive data directories...{reset}'.format(**ANSI))
print(ANSI['lightyellow'], f' {ARCHIVE_DIR}/*', ANSI['reset']) print(ANSI['lightyellow'], f' {ARCHIVE_DIR}/*', ANSI['reset'])
@ -539,7 +523,7 @@ def status(out_dir: Path=OUTPUT_DIR) -> None:
print() print()
print('{green}[*] Scanning recent archive changes and user logins:{reset}'.format(**ANSI)) print('{green}[*] Scanning recent archive changes and user logins:{reset}'.format(**ANSI))
print(ANSI['lightyellow'], f' {LOGS_DIR}/*', ANSI['reset']) print(ANSI['lightyellow'], f' {CONSTANTS.LOGS_DIR}/*', ANSI['reset'])
users = get_admins().values_list('username', flat=True) users = get_admins().values_list('username', flat=True)
print(f' UI users {len(users)}: {", ".join(users)}') print(f' UI users {len(users)}: {", ".join(users)}')
last_login = User.objects.order_by('last_login').last() last_login = User.objects.order_by('last_login').last()
@ -564,7 +548,7 @@ def status(out_dir: Path=OUTPUT_DIR) -> None:
f' > {str(snapshot.downloaded_at)[:16]} ' f' > {str(snapshot.downloaded_at)[:16]} '
f'[{snapshot.num_outputs} {("X", "√")[snapshot.is_archived]} {printable_filesize(snapshot.archive_size)}] ' f'[{snapshot.num_outputs} {("X", "√")[snapshot.is_archived]} {printable_filesize(snapshot.archive_size)}] '
f'"{snapshot.title}": {snapshot.url}' f'"{snapshot.title}": {snapshot.url}'
)[:TERM_WIDTH()], )[:SHELL_CONFIG.TERM_WIDTH],
ANSI['reset'], ANSI['reset'],
) )
print(ANSI['black'], ' ...', ANSI['reset']) print(ANSI['black'], ' ...', ANSI['reset'])
@ -976,7 +960,7 @@ def setup(out_dir: Path=OUTPUT_DIR) -> None:
from rich import print from rich import print
if not (out_dir / ARCHIVE_DIR_NAME).exists(): if not ARCHIVE_DIR.exists():
run_subcommand('init', stdin=None, pwd=out_dir) run_subcommand('init', stdin=None, pwd=out_dir)
setup_django(out_dir=out_dir, check_db=True) setup_django(out_dir=out_dir, check_db=True)
@ -992,9 +976,13 @@ def setup(out_dir: Path=OUTPUT_DIR) -> None:
from plugins_extractor.singlefile.apps import SINGLEFILE_BINARY from plugins_extractor.singlefile.apps import SINGLEFILE_BINARY
print(SINGLEFILE_BINARY.load_or_install().model_dump(exclude={'binproviders_supported', 'loaded_binprovider', 'provider_overrides', 'loaded_abspaths', 'bin_dir', 'loaded_respath'})) print(SINGLEFILE_BINARY.load_or_install().model_dump(exclude={'binproviders_supported', 'loaded_binprovider', 'provider_overrides', 'loaded_abspaths', 'bin_dir', 'loaded_respath'}))
from plugins_extractor.readability.apps import READABILITY_BINARY
print(READABILITY_BINARY.load_or_install().model_dump(exclude={'binproviders_supported', 'loaded_binprovider', 'provider_overrides', 'loaded_abspaths', 'bin_dir', 'loaded_respath'}))
from plugins_pkg.npm.apps import npm from plugins_pkg.npm.apps import npm
print(npm.load_or_install('readability-extractor', overrides={'packages': lambda: ['github:ArchiveBox/readability-extractor']}).model_dump(exclude={'binproviders_supported', 'loaded_binprovider', 'provider_overrides', 'loaded_abspaths', 'bin_dir', 'loaded_respath'})) # TODO: move these to their own plugin binaries
print(npm.load_or_install('postlight-parser', overrides={'packages': lambda: ['@postlight/parser@^2.2.3'], 'version': lambda: '2.2.3'}).model_dump(exclude={'binproviders_supported', 'loaded_binprovider', 'provider_overrides', 'loaded_abspaths'})) print(npm.load_or_install('postlight-parser', overrides={'packages': lambda: ['@postlight/parser@^2.2.3'], 'version': lambda: '2.2.3'}).model_dump(exclude={'binproviders_supported', 'loaded_binprovider', 'provider_overrides', 'loaded_abspaths'}))
from django.contrib.auth import get_user_model from django.contrib.auth import get_user_model
@ -1020,7 +1008,6 @@ def config(config_options_str: Optional[str]=None,
"""Get and set your ArchiveBox project configuration values""" """Get and set your ArchiveBox project configuration values"""
check_data_folder(CONFIG) check_data_folder(CONFIG)
if config_options and config_options_str: if config_options and config_options_str:
stderr( stderr(
'[X] You should either pass config values as an arguments ' '[X] You should either pass config values as an arguments '
@ -1096,7 +1083,6 @@ def config(config_options_str: Optional[str]=None,
elif reset: elif reset:
stderr('[X] This command is not implemented yet.', color='red') stderr('[X] This command is not implemented yet.', color='red')
stderr(' Please manually remove the relevant lines from your config file:') stderr(' Please manually remove the relevant lines from your config file:')
stderr(f' {CONFIG_FILE}')
raise SystemExit(2) raise SystemExit(2)
else: else:
stderr('[X] You must pass either --get or --set, or no arguments to get the whole config.', color='red') stderr('[X] You must pass either --get or --set, or no arguments to get the whole config.', color='red')
@ -1125,8 +1111,9 @@ def schedule(add: bool=False,
check_data_folder(CONFIG) check_data_folder(CONFIG)
setup_django_minimal() setup_django_minimal()
from plugins_pkg.pip.apps import ARCHIVEBOX_BINARY from plugins_pkg.pip.apps import ARCHIVEBOX_BINARY
from plugins_sys.config.apps import SHELL_CONFIG, CONSTANTS
Path(LOGS_DIR).mkdir(exist_ok=True) Path(CONSTANTS.LOGS_DIR).mkdir(exist_ok=True)
cron = CronTab(user=True) cron = CronTab(user=True)
cron = dedupe_cron_jobs(cron) cron = dedupe_cron_jobs(cron)
@ -1155,7 +1142,7 @@ def schedule(add: bool=False,
f'"{import_path}"', f'"{import_path}"',
] if import_path else ['update']), ] if import_path else ['update']),
'>>', '>>',
quoted(Path(LOGS_DIR) / 'schedule.log'), quoted(Path(CONSTANTS.LOGS_DIR) / 'schedule.log'),
'2>&1', '2>&1',
] ]
@ -1167,7 +1154,7 @@ def schedule(add: bool=False,
elif CronSlices.is_valid(every): elif CronSlices.is_valid(every):
new_job.setall(every) new_job.setall(every)
else: else:
stderr('{red}[X] Got invalid timeperiod for cron task.{reset}'.format(**ANSI)) stderr('{red}[X] Got invalid timeperiod for cron task.{reset}'.format(**SHELL_CONFIG.ANSI))
stderr(' It must be one of minute/hour/day/month') stderr(' It must be one of minute/hour/day/month')
stderr(' or a quoted cron-format schedule like:') stderr(' or a quoted cron-format schedule like:')
stderr(' archivebox init --every=day --depth=1 https://example.com/some/rss/feed.xml') stderr(' archivebox init --every=day --depth=1 https://example.com/some/rss/feed.xml')
@ -1181,11 +1168,11 @@ def schedule(add: bool=False,
existing_jobs = list(cron.find_comment(CRON_COMMENT)) existing_jobs = list(cron.find_comment(CRON_COMMENT))
print() print()
print('{green}[√] Scheduled new ArchiveBox cron job for user: {} ({} jobs are active).{reset}'.format(USER, len(existing_jobs), **ANSI)) print('{green}[√] Scheduled new ArchiveBox cron job for user: {} ({} jobs are active).{reset}'.format(SHELL_CONFIG.USER, len(existing_jobs), **SHELL_CONFIG.ANSI))
print('\n'.join(f' > {cmd}' if str(cmd) == str(new_job) else f' {cmd}' for cmd in existing_jobs)) print('\n'.join(f' > {cmd}' if str(cmd) == str(new_job) else f' {cmd}' for cmd in existing_jobs))
if total_runs > 60 and not quiet: if total_runs > 60 and not quiet:
stderr() stderr()
stderr('{lightyellow}[!] With the current cron config, ArchiveBox is estimated to run >{} times per year.{reset}'.format(total_runs, **ANSI)) stderr('{lightyellow}[!] With the current cron config, ArchiveBox is estimated to run >{} times per year.{reset}'.format(total_runs, **SHELL_CONFIG.ANSI))
stderr(' Congrats on being an enthusiastic internet archiver! 👌') stderr(' Congrats on being an enthusiastic internet archiver! 👌')
stderr() stderr()
stderr(' Make sure you have enough storage space available to hold all the data.') stderr(' Make sure you have enough storage space available to hold all the data.')
@ -1195,7 +1182,7 @@ def schedule(add: bool=False,
if existing_jobs: if existing_jobs:
print('\n'.join(str(cmd) for cmd in existing_jobs)) print('\n'.join(str(cmd) for cmd in existing_jobs))
else: else:
stderr('{red}[X] There are no ArchiveBox cron jobs scheduled for your user ({}).{reset}'.format(USER, **ANSI)) stderr('{red}[X] There are no ArchiveBox cron jobs scheduled for your user ({}).{reset}'.format(SHELL_CONFIG.USER, **SHELL_CONFIG.ANSI))
stderr(' To schedule a new job, run:') stderr(' To schedule a new job, run:')
stderr(' archivebox schedule --every=[timeperiod] --depth=1 https://example.com/some/rss/feed.xml') stderr(' archivebox schedule --every=[timeperiod] --depth=1 https://example.com/some/rss/feed.xml')
raise SystemExit(0) raise SystemExit(0)
@ -1206,11 +1193,11 @@ def schedule(add: bool=False,
if foreground or run_all: if foreground or run_all:
if not existing_jobs: if not existing_jobs:
stderr('{red}[X] You must schedule some jobs first before running in foreground mode.{reset}'.format(**ANSI)) stderr('{red}[X] You must schedule some jobs first before running in foreground mode.{reset}'.format(**SHELL_CONFIG.ANSI))
stderr(' archivebox schedule --every=hour --depth=1 https://example.com/some/rss/feed.xml') stderr(' archivebox schedule --every=hour --depth=1 https://example.com/some/rss/feed.xml')
raise SystemExit(1) raise SystemExit(1)
print('{green}[*] Running {} ArchiveBox jobs in foreground task scheduler...{reset}'.format(len(existing_jobs), **ANSI)) print('{green}[*] Running {} ArchiveBox jobs in foreground task scheduler...{reset}'.format(len(existing_jobs), **SHELL_CONFIG.ANSI))
if run_all: if run_all:
try: try:
for job in existing_jobs: for job in existing_jobs:
@ -1220,7 +1207,7 @@ def schedule(add: bool=False,
job.run() job.run()
sys.stdout.write(f'\r{job.command.split("/archivebox ")[-1]}\n') sys.stdout.write(f'\r{job.command.split("/archivebox ")[-1]}\n')
except KeyboardInterrupt: except KeyboardInterrupt:
print('\n{green}[√] Stopped.{reset}'.format(**ANSI)) print('\n{green}[√] Stopped.{reset}'.format(**SHELL_CONFIG.ANSI))
raise SystemExit(1) raise SystemExit(1)
if foreground: if foreground:
@ -1230,7 +1217,7 @@ def schedule(add: bool=False,
for result in cron.run_scheduler(): for result in cron.run_scheduler():
print(result) print(result)
except KeyboardInterrupt: except KeyboardInterrupt:
print('\n{green}[√] Stopped.{reset}'.format(**ANSI)) print('\n{green}[√] Stopped.{reset}'.format(**SHELL_CONFIG.ANSI))
raise SystemExit(1) raise SystemExit(1)
# if CAN_UPGRADE: # if CAN_UPGRADE:
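
archivebox version now decides whether to print the data-locations section by probing well-known paths instead of DATA_LOCATIONS['OUTPUT_DIR']['is_valid']. A sketch of that condition (the helper name is invented):

import archivebox

def looks_like_collection() -> bool:
    constants = archivebox.CONSTANTS
    return (
        constants.DATABASE_FILE.exists()
        or constants.ARCHIVE_DIR.exists()
        or constants.CONFIG_FILE.exists()
    )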


@ -5,51 +5,55 @@ __package__ = 'archivebox.misc'
from benedict import benedict from benedict import benedict
from pathlib import Path from pathlib import Path
from .logging import stderr, hint import archivebox
from .logging import stderr, hint, ANSI
def check_dependencies(config: benedict, show_help: bool=True) -> None: def check_dependencies(config: benedict, show_help: bool=True) -> None:
invalid_dependencies = [ # dont do this on startup anymore, it's too slow
(name, info) for name, info in config['DEPENDENCIES'].items() pass
if info['enabled'] and not info['is_valid'] # invalid_dependencies = [
] # (name, binary) for name, info in settings.BINARIES.items()
if invalid_dependencies and show_help: # if not binary.
stderr(f'[!] Warning: Missing {len(invalid_dependencies)} recommended dependencies', color='lightyellow') # ]
for dependency, info in invalid_dependencies: # if invalid_dependencies and show_help:
stderr( # stderr(f'[!] Warning: Missing {len(invalid_dependencies)} recommended dependencies', color='lightyellow')
' ! {}: {} ({})'.format( # for dependency, info in invalid_dependencies:
dependency, # stderr(
info['path'] or 'unable to find binary', # ' ! {}: {} ({})'.format(
info['version'] or 'unable to detect version', # dependency,
) # info['path'] or 'unable to find binary',
) # info['version'] or 'unable to detect version',
if dependency in ('YOUTUBEDL_BINARY', 'CHROME_BINARY', 'SINGLEFILE_BINARY', 'READABILITY_BINARY', 'MERCURY_BINARY'): # )
hint(('To install all packages automatically run: archivebox setup', # )
f'or to disable it and silence this warning: archivebox config --set SAVE_{dependency.rsplit("_", 1)[0]}=False', # if dependency in ('YOUTUBEDL_BINARY', 'CHROME_BINARY', 'SINGLEFILE_BINARY', 'READABILITY_BINARY', 'MERCURY_BINARY'):
''), prefix=' ') # hint(('To install all packages automatically run: archivebox setup',
stderr('') # f'or to disable it and silence this warning: archivebox config --set SAVE_{dependency.rsplit("_", 1)[0]}=False',
# ''), prefix=' ')
# stderr('')
def check_data_folder(config: benedict) -> None: def check_data_folder(config: benedict) -> None:
output_dir = config['OUTPUT_DIR'] output_dir = archivebox.DATA_DIR
archive_dir_exists = (Path(output_dir) / 'archive').exists() archive_dir_exists = (archivebox.CONSTANTS.ARCHIVE_DIR).exists()
if not archive_dir_exists: if not archive_dir_exists:
stderr('[X] No archivebox index found in the current directory.', color='red') stderr('[X] No archivebox index found in the current directory.', color='red')
stderr(f' {output_dir}', color='lightyellow') stderr(f' {output_dir}', color='lightyellow')
stderr() stderr()
stderr(' {lightred}Hint{reset}: Are you running archivebox in the right folder?'.format(**config['ANSI'])) stderr(' {lightred}Hint{reset}: Are you running archivebox in the right folder?'.format(**ANSI))
stderr(' cd path/to/your/archive/folder') stderr(' cd path/to/your/archive/folder')
stderr(' archivebox [command]') stderr(' archivebox [command]')
stderr() stderr()
stderr(' {lightred}Hint{reset}: To create a new archive collection or import existing data in this folder, run:'.format(**config['ANSI'])) stderr(' {lightred}Hint{reset}: To create a new archive collection or import existing data in this folder, run:'.format(**ANSI))
stderr(' archivebox init') stderr(' archivebox init')
raise SystemExit(2) raise SystemExit(2)
def check_migrations(config: benedict): def check_migrations(config: benedict):
output_dir = config['OUTPUT_DIR'] output_dir = archivebox.DATA_DIR
from ..index.sql import list_migrations from ..index.sql import list_migrations
@@ -63,8 +67,8 @@ def check_migrations(config: benedict):
stderr(' archivebox init') stderr(' archivebox init')
raise SystemExit(3) raise SystemExit(3)
(Path(output_dir) / config['SOURCES_DIR_NAME']).mkdir(exist_ok=True) archivebox.CONSTANTS.SOURCES_DIR.mkdir(exist_ok=True)
(Path(output_dir) / config['LOGS_DIR_NAME']).mkdir(exist_ok=True) archivebox.CONSTANTS.LOGS_DIR.mkdir(exist_ok=True)
(Path(output_dir) / config['CACHE_DIR_NAME']).mkdir(exist_ok=True) archivebox.CONSTANTS.CACHE_DIR.mkdir(exist_ok=True)
(Path(output_dir) / config['LIB_DIR_NAME'] / 'bin').mkdir(exist_ok=True, parents=True) (archivebox.CONSTANTS.LIB_DIR / 'bin').mkdir(exist_ok=True, parents=True)
(Path(output_dir) / config['PERSONAS_DIR_NAME'] / 'Default').mkdir(exist_ok=True, parents=True) (archivebox.CONSTANTS.PERSONAS_DIR / 'Default').mkdir(exist_ok=True, parents=True)
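For reference, a minimal sketch of how the relocated directory logic behaves after this change (assuming an initialized collection in the current working directory, and that CONSTANTS exposes these paths as pathlib.Path objects, as the hunk above implies):

import archivebox

print(archivebox.DATA_DIR)                        # user data dir == current working directory
print(archivebox.CONSTANTS.ARCHIVE_DIR.exists())  # the check that check_data_folder() now performs

# check_migrations() now creates the standard subdirectories straight from CONSTANTS:
for subdir in (archivebox.CONSTANTS.SOURCES_DIR, archivebox.CONSTANTS.LOGS_DIR, archivebox.CONSTANTS.CACHE_DIR):
    subdir.mkdir(exist_ok=True)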


@@ -8,8 +8,6 @@ from collections import defaultdict
from benedict import benedict from benedict import benedict
from rich.console import Console from rich.console import Console
from ..config_stubs import ConfigDict
# SETUP RICH CONSOLE / TTY detection / COLOR / PROGRESS BARS # SETUP RICH CONSOLE / TTY detection / COLOR / PROGRESS BARS
CONSOLE = Console() CONSOLE = Console()
IS_TTY = CONSOLE.is_interactive IS_TTY = CONSOLE.is_interactive
@@ -43,7 +41,7 @@ COLOR_DICT = defaultdict(lambda: [(0, 0, 0), (0, 0, 0)], {
}) })
# Logging Helpers # Logging Helpers
def stdout(*args, color: Optional[str]=None, prefix: str='', config: Optional[ConfigDict]=None) -> None: def stdout(*args, color: Optional[str]=None, prefix: str='', config: Optional[benedict]=None) -> None:
ansi = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI ansi = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI
if color: if color:
@@ -53,7 +51,7 @@ def stdout(*args, color: Optional[str]=None, prefix: str='', config: Optional[Co
sys.stdout.write(prefix + ''.join(strs)) sys.stdout.write(prefix + ''.join(strs))
def stderr(*args, color: Optional[str]=None, prefix: str='', config: Optional[ConfigDict]=None) -> None: def stderr(*args, color: Optional[str]=None, prefix: str='', config: Optional[benedict]=None) -> None:
ansi = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI ansi = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI
if color: if color:
@@ -63,7 +61,7 @@ def stderr(*args, color: Optional[str]=None, prefix: str='', config: Optional[Co
sys.stderr.write(prefix + ''.join(strs)) sys.stderr.write(prefix + ''.join(strs))
def hint(text: Union[Tuple[str, ...], List[str], str], prefix=' ', config: Optional[ConfigDict]=None) -> None: def hint(text: Union[Tuple[str, ...], List[str], str], prefix=' ', config: Optional[benedict]=None) -> None:
ansi = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI ansi = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI
if isinstance(text, str): if isinstance(text, str):
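A short usage sketch of the updated logging helpers, which now accept any benedict/dict-like config instead of the old ConfigDict stub (the import path and the USE_COLOR key are taken from the hunks above; everything else is illustrative):

from benedict import benedict
from archivebox.misc.logging import stderr, hint

cfg = benedict({'USE_COLOR': False})       # any mapping with a USE_COLOR key works
stderr('[!] Something went wrong', color='red', config=cfg)   # prints without ANSI codes
hint(('Try running: archivebox init',), config=cfg)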


@@ -2,25 +2,24 @@ __package__ = 'archivebox.parsers'
import re import re
import archivebox
from typing import IO, Iterable, Optional from typing import IO, Iterable, Optional
from configparser import ConfigParser from configparser import ConfigParser
from pathlib import Path
from pocket import Pocket from pocket import Pocket
from ..index.schema import Link from ..index.schema import Link
from ..util import enforce_types from ..util import enforce_types
from ..system import atomic_write from ..system import atomic_write
from ..config import ( from ..config import (
SOURCES_DIR,
POCKET_CONSUMER_KEY, POCKET_CONSUMER_KEY,
POCKET_ACCESS_TOKENS, POCKET_ACCESS_TOKENS,
) )
COUNT_PER_PAGE = 500 COUNT_PER_PAGE = 500
API_DB_PATH = Path(SOURCES_DIR) / 'pocket_api.db' API_DB_PATH = archivebox.DATA_DIR / 'sources' / 'pocket_api.db'
# search for broken protocols that sometimes come from the Pocket API # search for broken protocols that sometimes come from the Pocket API
_BROKEN_PROTOCOL_RE = re.compile('^(http[s]?)(:/(?!/))') _BROKEN_PROTOCOL_RE = re.compile('^(http[s]?)(:/(?!/))')


@@ -3,23 +3,19 @@ __package__ = "archivebox.parsers"
import re import re
import requests import requests
import archivebox
from datetime import datetime from datetime import datetime
from typing import IO, Iterable, Optional from typing import IO, Iterable, Optional
from configparser import ConfigParser from configparser import ConfigParser
from pathlib import Path
from ..index.schema import Link from ..index.schema import Link
from ..util import enforce_types from ..util import enforce_types
from ..system import atomic_write from ..system import atomic_write
from ..config import ( from ..config import READWISE_READER_TOKENS
SOURCES_DIR,
READWISE_READER_TOKENS,
)
API_DB_PATH = Path(SOURCES_DIR) / "readwise_reader_api.db" API_DB_PATH = archivebox.DATA_DIR / "sources" / "readwise_reader_api.db"
class ReadwiseReaderAPI: class ReadwiseReaderAPI:
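Both parser modules now derive their API cache DB location from archivebox.DATA_DIR instead of the old SOURCES_DIR config value; a small sketch of the equivalent paths (illustrative only):

import archivebox

pocket_db   = archivebox.DATA_DIR / 'sources' / 'pocket_api.db'
readwise_db = archivebox.DATA_DIR / 'sources' / 'readwise_reader_api.db'
print(pocket_db, readwise_db)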


@@ -17,6 +17,8 @@ from pydantic_pkgr import (
from django.conf import settings from django.conf import settings
import archivebox
from .base_hook import BaseHook, HookType from .base_hook import BaseHook, HookType
@@ -64,7 +66,9 @@ class BaseBinary(BaseHook, Binary):
super().register(settings, parent_plugin=parent_plugin) super().register(settings, parent_plugin=parent_plugin)
@staticmethod @staticmethod
def symlink_to_lib(binary, bin_dir=settings.CONFIG.BIN_DIR) -> None: def symlink_to_lib(binary, bin_dir=None) -> None:
bin_dir = bin_dir or archivebox.CONSTANTS.LIB_BIN_DIR
if not (binary.abspath and binary.abspath.exists()): if not (binary.abspath and binary.abspath.exists()):
return return
@@ -77,19 +81,19 @@ class BaseBinary(BaseHook, Binary):
@validate_call @validate_call
def load(self, **kwargs) -> Self: def load(self, **kwargs) -> Self:
binary = super().load(**kwargs) binary = super().load(**kwargs)
self.symlink_to_lib(binary=binary, bin_dir=settings.CONFIG.BIN_DIR) self.symlink_to_lib(binary=binary, bin_dir=archivebox.CONSTANTS.LIB_BIN_DIR)
return binary return binary
@validate_call @validate_call
def install(self, **kwargs) -> Self: def install(self, **kwargs) -> Self:
binary = super().install(**kwargs) binary = super().install(**kwargs)
self.symlink_to_lib(binary=binary, bin_dir=settings.CONFIG.BIN_DIR) self.symlink_to_lib(binary=binary, bin_dir=archivebox.CONSTANTS.LIB_BIN_DIR)
return binary return binary
@validate_call @validate_call
def load_or_install(self, **kwargs) -> Self: def load_or_install(self, **kwargs) -> Self:
binary = super().load_or_install(**kwargs) binary = super().load_or_install(**kwargs)
self.symlink_to_lib(binary=binary, bin_dir=settings.CONFIG.BIN_DIR) self.symlink_to_lib(binary=binary, bin_dir=archivebox.CONSTANTS.LIB_BIN_DIR)
return binary return binary
@property @property
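The bin_dir default is now resolved at call time rather than at import time; a hedged sketch of the same late-binding pattern (only the bin_dir handling appears in the hunk above, the symlink body below is an illustrative assumption):

import archivebox

def symlink_to_lib(binary, bin_dir=None) -> None:
    bin_dir = bin_dir or archivebox.CONSTANTS.LIB_BIN_DIR   # looked up lazily, not at import
    if not (binary.abspath and binary.abspath.exists()):
        return
    bin_dir.mkdir(parents=True, exist_ok=True)              # assumption: dir may not exist yet
    (bin_dir / binary.name).unlink(missing_ok=True)
    (bin_dir / binary.name).symlink_to(binary.abspath)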


@@ -123,6 +123,10 @@ class ArchiveBoxBaseConfig(BaseSettings):
validate_return=True, validate_return=True,
revalidate_instances="always", revalidate_instances="always",
) )
load_from_defaults: ClassVar[bool] = True
load_from_configfile: ClassVar[bool] = True
load_from_environment: ClassVar[bool] = True
@classmethod @classmethod
def settings_customise_sources( def settings_customise_sources(
@@ -140,20 +144,22 @@ class ArchiveBoxBaseConfig(BaseSettings):
# import ipdb; ipdb.set_trace() # import ipdb; ipdb.set_trace()
precedence_order = {}
# if ArchiveBox.conf does not exist yet, return defaults -> env order # if ArchiveBox.conf does not exist yet, return defaults -> env order
if not ARCHIVEBOX_CONFIG_FILE.is_file(): if not ARCHIVEBOX_CONFIG_FILE.is_file():
return ( precedence_order = {
init_settings, 'defaults': init_settings,
env_settings, 'environment': env_settings,
) }
# if ArchiveBox.conf exists and is in TOML format, return default -> TOML -> env order # if ArchiveBox.conf exists and is in TOML format, return default -> TOML -> env order
try: try:
return ( precedence_order = precedence_order or {
init_settings, 'defaults': init_settings,
FlatTomlConfigSettingsSource(settings_cls, toml_file=ARCHIVEBOX_CONFIG_FILE), 'configfile': FlatTomlConfigSettingsSource(settings_cls, toml_file=ARCHIVEBOX_CONFIG_FILE),
env_settings, 'environment': env_settings,
) }
except Exception as err: except Exception as err:
if err.__class__.__name__ != "TOMLDecodeError": if err.__class__.__name__ != "TOMLDecodeError":
raise raise
@@ -165,11 +171,20 @@ class ArchiveBoxBaseConfig(BaseSettings):
new_toml = ini_to_toml.convert(original_ini) new_toml = ini_to_toml.convert(original_ini)
ARCHIVEBOX_CONFIG_FILE.write_text(new_toml) ARCHIVEBOX_CONFIG_FILE.write_text(new_toml)
return ( precedence_order = {
init_settings, 'defaults': init_settings,
FlatTomlConfigSettingsSource(settings_cls, toml_file=ARCHIVEBOX_CONFIG_FILE), 'configfile': FlatTomlConfigSettingsSource(settings_cls, toml_file=ARCHIVEBOX_CONFIG_FILE),
env_settings, 'environment': env_settings,
) }
if not cls.load_from_environment:
precedence_order.pop('environment')
if not cls.load_from_configfile:
precedence_order.pop('configfile')
if not cls.load_from_defaults:
precedence_order.pop('defaults')
return tuple(precedence_order.values())
@model_validator(mode="after") @model_validator(mode="after")
def fill_defaults(self): def fill_defaults(self):
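A hedged illustration of how the new load_from_* class variables interact with the precedence_order logic above (the subclass and its name are hypothetical):

from typing import ClassVar

class DefaultsOnlyConfig(ArchiveBoxBaseConfig):
    # drop the configfile and environment sources, leaving only hard-coded field defaults
    load_from_configfile: ClassVar[bool] = False
    load_from_environment: ClassVar[bool] = False

# settings_customise_sources() then returns just (init_settings,),
# so DefaultsOnlyConfig() is built purely from its declared defaults.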


@@ -1,72 +1,72 @@
__package__ = 'archivebox.plugantic.management.commands' # __package__ = 'archivebox.plugantic.management.commands'
from django.core.management.base import BaseCommand # from django.core.management.base import BaseCommand
from django.conf import settings # from django.conf import settings
from pydantic_pkgr import Binary, BinProvider, BrewProvider, EnvProvider, SemVer # from pydantic_pkgr import Binary, BinProvider, BrewProvider, EnvProvider, SemVer
from pydantic_pkgr.binprovider import bin_abspath # from pydantic_pkgr.binprovider import bin_abspath
from ....config import NODE_BIN_PATH, bin_path # from ....config import bin_path
from ...base_binary import env # from ...base_binary import env
class Command(BaseCommand): # class Command(BaseCommand):
def handle(self, *args, method, **options): # def handle(self, *args, method, **options):
method(*args, **options) # method(*args, **options)
def add_arguments(self, parser): # def add_arguments(self, parser):
subparsers = parser.add_subparsers(title="sub-commands", required=True) # subparsers = parser.add_subparsers(title="sub-commands", required=True)
list_parser = subparsers.add_parser("list", help="List archivebox runtime dependencies.") # list_parser = subparsers.add_parser("list", help="List archivebox runtime dependencies.")
list_parser.set_defaults(method=self.list) # list_parser.set_defaults(method=self.list)
install_parser = subparsers.add_parser("install", help="Install archivebox runtime dependencies.") # install_parser = subparsers.add_parser("install", help="Install archivebox runtime dependencies.")
install_parser.add_argument("--update", action="store_true", help="Update dependencies to latest versions.") # install_parser.add_argument("--update", action="store_true", help="Update dependencies to latest versions.")
install_parser.add_argument("package_names", nargs="+", type=str) # install_parser.add_argument("package_names", nargs="+", type=str)
install_parser.set_defaults(method=self.install) # install_parser.set_defaults(method=self.install)
def list(self, *args, **options): # def list(self, *args, **options):
self.stdout.write('################# PLUGINS ####################') # self.stdout.write('################# PLUGINS ####################')
for plugin in settings.PLUGINS.values(): # for plugin in settings.PLUGINS.values():
self.stdout.write(f'{plugin.name}:') # self.stdout.write(f'{plugin.name}:')
for binary in plugin.binaries: # for binary in plugin.binaries:
try: # try:
binary = binary.load() # binary = binary.load()
except Exception as e: # except Exception as e:
# import ipdb; ipdb.set_trace() # # import ipdb; ipdb.set_trace()
raise # raise
self.stdout.write(f' {binary.name.ljust(14)} {str(binary.version).ljust(11)} {binary.binprovider.INSTALLER_BIN.ljust(5)} {binary.abspath}') # self.stdout.write(f' {binary.name.ljust(14)} {str(binary.version).ljust(11)} {binary.binprovider.INSTALLER_BIN.ljust(5)} {binary.abspath}')
self.stdout.write('\n################# LEGACY ####################') # self.stdout.write('\n################# LEGACY ####################')
for bin_key, dependency in settings.CONFIG.DEPENDENCIES.items(): # for bin_key, dependency in settings.CONFIG.DEPENDENCIES.items():
bin_name = settings.CONFIG[bin_key] # bin_name = settings.CONFIG[bin_key]
self.stdout.write(f'{bin_key}: {bin_name}') # self.stdout.write(f'{bin_key}: {bin_name}')
# binary = Binary(name=package_name, providers=[env]) # # binary = Binary(name=package_name, providers=[env])
# print(binary) # # print(binary)
# try: # # try:
# loaded_bin = binary.load() # # loaded_bin = binary.load()
# self.stdout.write( # # self.stdout.write(
# self.style.SUCCESS(f'Successfully loaded {package_name}:') + str(loaded_bin) # # self.style.SUCCESS(f'Successfully loaded {package_name}:') + str(loaded_bin)
# ) # # )
# except Exception as e: # # except Exception as e:
# self.stderr.write( # # self.stderr.write(
# self.style.ERROR(f"Error loading {package_name}: {e}") # # self.style.ERROR(f"Error loading {package_name}: {e}")
# ) # # )
def install(self, *args, bright, **options): # def install(self, *args, bright, **options):
for package_name in options["package_names"]: # for package_name in options["package_names"]:
binary = Binary(name=package_name, providers=[env]) # binary = Binary(name=package_name, providers=[env])
print(binary) # print(binary)
try: # try:
loaded_bin = binary.load() # loaded_bin = binary.load()
self.stdout.write( # self.stdout.write(
self.style.SUCCESS(f'Successfully loaded {package_name}:') + str(loaded_bin) # self.style.SUCCESS(f'Successfully loaded {package_name}:') + str(loaded_bin)
) # )
except Exception as e: # except Exception as e:
self.stderr.write( # self.stderr.write(
self.style.ERROR(f"Error loading {package_name}: {e}") # self.style.ERROR(f"Error loading {package_name}: {e}")
) # )


@@ -18,6 +18,8 @@ from pydantic_pkgr import (
bin_abspath, bin_abspath,
) )
import archivebox
# Depends on other Django apps: # Depends on other Django apps:
from plugantic.base_plugin import BasePlugin from plugantic.base_plugin import BasePlugin
from plugantic.base_configset import BaseConfigSet, ConfigSectionName from plugantic.base_configset import BaseConfigSet, ConfigSectionName
@@ -215,7 +217,7 @@ class ChromeBinary(BaseBinary):
} }
@staticmethod @staticmethod
def symlink_to_lib(binary, bin_dir=settings.CONFIG.BIN_DIR) -> None: def symlink_to_lib(binary, bin_dir=archivebox.CONSTANTS.LIB_BIN_DIR) -> None:
if not (binary.abspath and binary.abspath.exists()): if not (binary.abspath and binary.abspath.exists()):
return return


@@ -0,0 +1,103 @@
__package__ = 'archivebox.plugins_extractor.readability'
from pathlib import Path
from typing import List, Dict, Optional, ClassVar
# from typing_extensions import Self
from django.conf import settings
# Depends on other PyPI/vendor packages:
from pydantic import InstanceOf, Field, validate_call
from pydantic_pkgr import BinProvider, BinProviderName, ProviderLookupDict, BinName, ShallowBinary
# Depends on other Django apps:
from plugantic.base_plugin import BasePlugin
from plugantic.base_configset import BaseConfigSet, ConfigSectionName
from plugantic.base_binary import BaseBinary, env
from plugantic.base_extractor import BaseExtractor
from plugantic.base_hook import BaseHook
# Depends on Other Plugins:
from plugins_sys.config.apps import ARCHIVING_CONFIG
from plugins_pkg.npm.apps import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER
###################### Config ##########################
class ReadabilityConfig(BaseConfigSet):
section: ClassVar[ConfigSectionName] = 'ARCHIVING_CONFIG'
SAVE_READABILITY: bool = Field(default=True, alias='USE_READABILITY')
READABILITY_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
READABILITY_BINARY: str = Field(default='readability-extractor')
# READABILITY_EXTRA_ARGS: List[str] = [] # readability-extractor doesn't take any extra args
READABILITY_CONFIG = ReadabilityConfig()
READABILITY_PACKAGE_NAME = 'github:ArchiveBox/readability-extractor'
class ReadabilityBinary(BaseBinary):
name: BinName = READABILITY_CONFIG.READABILITY_BINARY
binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env]
provider_overrides: Dict[BinProviderName, ProviderLookupDict] = {
LIB_NPM_BINPROVIDER.name: {"packages": lambda: [READABILITY_PACKAGE_NAME]},
SYS_NPM_BINPROVIDER.name: {"packages": lambda: []}, # prevent modifying system global npm packages
}
@validate_call
def install(self, binprovider_name: Optional[BinProviderName]=None) -> ShallowBinary:
# force install to only use lib/npm provider, we never want to modify global NPM packages
return BaseBinary.install(self, binprovider_name=binprovider_name or LIB_NPM_BINPROVIDER.name)
@validate_call
def load_or_install(self, binprovider_name: Optional[BinProviderName] = None) -> ShallowBinary:
# force install to only use lib/npm provider, we never want to modify global NPM packages
try:
return self.load()
except Exception:
return BaseBinary.install(self, binprovider_name=binprovider_name or LIB_NPM_BINPROVIDER.name)
READABILITY_BINARY = ReadabilityBinary()
class ReadabilityExtractor(BaseExtractor):
name: str = 'readability'
binary: BinName = READABILITY_BINARY.name
def get_output_path(self, snapshot) -> Path:
return Path(snapshot.link_dir) / 'readability' / 'content.html'
READABILITY_BINARY = ReadabilityBinary()
READABILITY_EXTRACTOR = ReadabilityExtractor()
# class ReadabilityQueue(BaseQueue):
# name: str = 'singlefile'
# binaries: List[InstanceOf[BaseBinary]] = [READABILITY_BINARY]
# READABILITY_QUEUE = ReadabilityQueue()
class ReadabilityPlugin(BasePlugin):
app_label: str ='singlefile'
verbose_name: str = 'SingleFile'
hooks: List[InstanceOf[BaseHook]] = [
READABILITY_CONFIG,
READABILITY_BINARY,
READABILITY_EXTRACTOR,
# READABILITY_QUEUE,
]
PLUGIN = ReadabilityPlugin()
PLUGIN.register(settings)
DJANGO_APP = PLUGIN.AppConfig
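A brief usage sketch for the new extractor's output path helper (the snapshot object is hypothetical; it only needs a link_dir attribute, per get_output_path() above):

from pathlib import Path

class FakeSnapshot:
    link_dir = '/data/archive/1726600000.0'

out = READABILITY_EXTRACTOR.get_output_path(FakeSnapshot())
assert out == Path('/data/archive/1726600000.0/readability/content.html')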


@@ -34,7 +34,7 @@ class SinglefileConfig(BaseConfigSet):
SINGLEFILE_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY) SINGLEFILE_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
SINGLEFILE_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE) SINGLEFILE_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE)
SINGLEFILE_BINARY: str = Field(default='wget') SINGLEFILE_BINARY: str = Field(default='single-file')
SINGLEFILE_EXTRA_ARGS: List[str] = [] SINGLEFILE_EXTRA_ARGS: List[str] = []
@@ -46,17 +46,21 @@ SINGLEFILE_MAX_VERSION = '1.1.60'
class SinglefileBinary(BaseBinary): class SinglefileBinary(BaseBinary):
name: BinName = 'single-file' name: BinName = SINGLEFILE_CONFIG.SINGLEFILE_BINARY
binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env] binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env]
provider_overrides: Dict[BinProviderName, ProviderLookupDict] = { provider_overrides: Dict[BinProviderName, ProviderLookupDict] = {
env.name: { env.name: {
'abspath': lambda: 'abspath': lambda:
bin_abspath('single-file', PATH=env.PATH) or bin_abspath('single-file-node.js', PATH=env.PATH), bin_abspath(SINGLEFILE_CONFIG.SINGLEFILE_BINARY, PATH=env.PATH)
or bin_abspath('single-file', PATH=env.PATH)
or bin_abspath('single-file-node.js', PATH=env.PATH),
}, },
LIB_NPM_BINPROVIDER.name: { LIB_NPM_BINPROVIDER.name: {
"abspath": lambda: "abspath": lambda:
bin_abspath("single-file", PATH=LIB_NPM_BINPROVIDER.PATH) or bin_abspath("single-file-node.js", PATH=LIB_NPM_BINPROVIDER.PATH), bin_abspath(SINGLEFILE_CONFIG.SINGLEFILE_BINARY, PATH=env.PATH)
or bin_abspath("single-file", PATH=LIB_NPM_BINPROVIDER.PATH)
or bin_abspath("single-file-node.js", PATH=LIB_NPM_BINPROVIDER.PATH),
"packages": lambda: "packages": lambda:
[f"single-file-cli@>={SINGLEFILE_MIN_VERSION} <{SINGLEFILE_MAX_VERSION}"], [f"single-file-cli@>={SINGLEFILE_MIN_VERSION} <{SINGLEFILE_MAX_VERSION}"],
}, },


@@ -1,10 +1,13 @@
__package__ = 'archivebox.plugins_pkg.npm' __package__ = 'archivebox.plugins_pkg.npm'
import archivebox
from pathlib import Path from pathlib import Path
from typing import List, Optional from typing import List, Optional
from django.conf import settings from django.conf import settings
from pydantic import InstanceOf
from pydantic import InstanceOf, model_validator
from pydantic_pkgr import BinProvider, NpmProvider, BinName, PATHStr, BinProviderName from pydantic_pkgr import BinProvider, NpmProvider, BinName, PATHStr, BinProviderName
@@ -14,8 +17,6 @@ from plugantic.base_binary import BaseBinary, BaseBinProvider, env, apt, brew
from plugantic.base_hook import BaseHook from plugantic.base_hook import BaseHook
from ...config import CONFIG
###################### Config ########################## ###################### Config ##########################
@@ -35,17 +36,24 @@ DEFAULT_GLOBAL_CONFIG = {
NPM_CONFIG = NpmDependencyConfigs(**DEFAULT_GLOBAL_CONFIG) NPM_CONFIG = NpmDependencyConfigs(**DEFAULT_GLOBAL_CONFIG)
OLD_NODE_BIN_PATH = archivebox.DATA_DIR / 'node_modules' / '.bin'
NEW_NODE_BIN_PATH = archivebox.CONSTANTS.LIB_NPM_DIR / 'node_modules' / '.bin'
class SystemNpmProvider(NpmProvider, BaseBinProvider): class SystemNpmProvider(NpmProvider, BaseBinProvider):
name: BinProviderName = "sys_npm" name: BinProviderName = "sys_npm"
PATH: PATHStr = str(CONFIG.NODE_BIN_PATH)
npm_prefix: Optional[Path] = None npm_prefix: Optional[Path] = None
class LibNpmProvider(NpmProvider, BaseBinProvider): class LibNpmProvider(NpmProvider, BaseBinProvider):
name: BinProviderName = "lib_npm" name: BinProviderName = "lib_npm"
PATH: PATHStr = str(CONFIG.NODE_BIN_PATH) PATH: PATHStr = str(OLD_NODE_BIN_PATH)
npm_prefix: Optional[Path] = settings.CONFIG.LIB_DIR / 'npm' npm_prefix: Optional[Path] = archivebox.CONSTANTS.LIB_NPM_DIR
@model_validator(mode='after')
def validate_path(self):
assert self.npm_prefix == NEW_NODE_BIN_PATH.parent.parent
return self
SYS_NPM_BINPROVIDER = SystemNpmProvider() SYS_NPM_BINPROVIDER = SystemNpmProvider()


@@ -1,13 +1,14 @@
__package__ = 'archivebox.plugins_pkg.pip'
import os import os
import sys import sys
import inspect import inspect
import archivebox import archivebox
from pathlib import Path from pathlib import Path
from typing import List, Dict, Optional, ClassVar from typing import List, Dict, Optional, ClassVar
from pydantic import InstanceOf, Field from pydantic import InstanceOf, Field, model_validator
import django import django
from django.db.backends.sqlite3.base import Database as django_sqlite3 # type: ignore[import-type] from django.db.backends.sqlite3.base import Database as django_sqlite3 # type: ignore[import-type]
from django.core.checks import Error, Tags from django.core.checks import Error, Tags
from django.conf import settings from django.conf import settings
@@ -19,6 +20,8 @@ from plugantic.base_check import BaseCheck
from plugantic.base_binary import BaseBinary, BaseBinProvider, env, apt, brew from plugantic.base_binary import BaseBinary, BaseBinProvider, env, apt, brew
from plugantic.base_hook import BaseHook from plugantic.base_hook import BaseHook
from ...misc.logging import hint
###################### Config ########################## ###################### Config ##########################
@@ -66,7 +69,7 @@ class LibPipBinProvider(PipProvider, BaseBinProvider):
name: BinProviderName = "lib_pip" name: BinProviderName = "lib_pip"
INSTALLER_BIN: BinName = "pip" INSTALLER_BIN: BinName = "pip"
pip_venv: Optional[Path] = settings.CONFIG.OUTPUT_DIR / 'lib' / 'pip' / 'venv' pip_venv: Optional[Path] = archivebox.CONSTANTS.LIB_PIP_DIR / 'venv'
SYS_PIP_BINPROVIDER = SystemPipBinProvider() SYS_PIP_BINPROVIDER = SystemPipBinProvider()
PIPX_PIP_BINPROVIDER = SystemPipxBinProvider() PIPX_PIP_BINPROVIDER = SystemPipxBinProvider()
@@ -117,6 +120,20 @@ class SqliteBinary(BaseBinary):
"version": lambda: SemVer(django_sqlite3.version), "version": lambda: SemVer(django_sqlite3.version),
}, },
} }
@model_validator(mode='after')
def validate_json_extension_is_available(self):
# Check to make sure JSON extension is available in our Sqlite3 instance
try:
cursor = django_sqlite3.connect(':memory:').cursor()
cursor.execute('SELECT JSON(\'{"a": "b"}\')')
except django_sqlite3.OperationalError as exc:
print(f'[red][X] Your SQLite3 version is missing the required JSON1 extension: {exc}[/red]')
hint([
'Upgrade your Python version or install the extension manually:',
'https://code.djangoproject.com/wiki/JSON1Extension'
])
return self
SQLITE_BINARY = SqliteBinary() SQLITE_BINARY = SqliteBinary()
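The same JSON1 capability check can be reproduced standalone with the stdlib sqlite3 module (a minimal sketch, independent of Django's bundled driver):

import sqlite3

def sqlite_supports_json1() -> bool:
    try:
        cursor = sqlite3.connect(':memory:').cursor()
        cursor.execute('SELECT JSON(\'{"a": "b"}\')')
        return True
    except sqlite3.OperationalError:
        return False

print(sqlite_supports_json1())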


@@ -19,6 +19,8 @@ from pydantic_pkgr import (
DEFAULT_ENV_PATH, DEFAULT_ENV_PATH,
) )
import archivebox
# Depends on other Django apps: # Depends on other Django apps:
from plugantic.base_plugin import BasePlugin from plugantic.base_plugin import BasePlugin
from plugantic.base_configset import BaseConfigSet from plugantic.base_configset import BaseConfigSet
@@ -42,12 +44,10 @@ class PlaywrightConfigs(BaseConfigSet):
# PLAYWRIGHT_DEFAULT_ARGS: List[str] = ['--timeout={TIMEOUT-10}'] # PLAYWRIGHT_DEFAULT_ARGS: List[str] = ['--timeout={TIMEOUT-10}']
pass pass
DEFAULT_GLOBAL_CONFIG = {
}
PLAYWRIGHT_CONFIG = PlaywrightConfigs(**DEFAULT_GLOBAL_CONFIG) PLAYWRIGHT_CONFIG = PlaywrightConfigs()
LIB_DIR_BROWSERS = settings.CONFIG.OUTPUT_DIR / "lib" / "browsers" LIB_DIR_BROWSERS = archivebox.CONSTANTS.LIB_BROWSERS_DIR
@@ -65,12 +65,12 @@ class PlaywrightBinProvider(BaseBinProvider):
name: BinProviderName = "playwright" name: BinProviderName = "playwright"
INSTALLER_BIN: BinName = PLAYWRIGHT_BINARY.name INSTALLER_BIN: BinName = PLAYWRIGHT_BINARY.name
PATH: PATHStr = f"{settings.CONFIG.BIN_DIR}:{DEFAULT_ENV_PATH}" PATH: PATHStr = f"{archivebox.CONSTANTS.LIB_BIN_DIR}:{DEFAULT_ENV_PATH}"
puppeteer_browsers_dir: Optional[Path] = ( puppeteer_browsers_dir: Optional[Path] = (
Path("~/Library/Caches/ms-playwright").expanduser() Path("~/Library/Caches/ms-playwright").expanduser() # macos playwright cache dir
if OPERATING_SYSTEM == "darwin" else if OPERATING_SYSTEM == "darwin" else
Path("~/.cache/ms-playwright").expanduser() Path("~/.cache/ms-playwright").expanduser() # linux playwright cache dir
) )
puppeteer_install_args: List[str] = ["install"] # --with-deps puppeteer_install_args: List[str] = ["install"] # --with-deps


@@ -16,6 +16,8 @@ from pydantic_pkgr import (
HostBinPath, HostBinPath,
) )
import archivebox
# Depends on other Django apps: # Depends on other Django apps:
from plugantic.base_plugin import BasePlugin from plugantic.base_plugin import BasePlugin
from plugantic.base_configset import BaseConfigSet from plugantic.base_configset import BaseConfigSet
@@ -40,12 +42,10 @@ class PuppeteerConfigs(BaseConfigSet):
# PUPPETEER_DEFAULT_ARGS: List[str] = ['--timeout={TIMEOUT-10}'] # PUPPETEER_DEFAULT_ARGS: List[str] = ['--timeout={TIMEOUT-10}']
pass pass
DEFAULT_GLOBAL_CONFIG = {
}
PUPPETEER_CONFIG = PuppeteerConfigs(**DEFAULT_GLOBAL_CONFIG) PUPPETEER_CONFIG = PuppeteerConfigs()
LIB_DIR_BROWSERS = settings.CONFIG.OUTPUT_DIR / "lib" / "browsers" LIB_DIR_BROWSERS = archivebox.CONSTANTS.LIB_BROWSERS_DIR
class PuppeteerBinary(BaseBinary): class PuppeteerBinary(BaseBinary):
@@ -60,8 +60,8 @@ PUPPETEER_BINARY = PuppeteerBinary()
class PuppeteerBinProvider(BaseBinProvider): class PuppeteerBinProvider(BaseBinProvider):
name: BinProviderName = "puppeteer" name: BinProviderName = "puppeteer"
INSTALLER_BIN: BinName = "npx" INSTALLER_BIN: BinName = "npx"
PATH: PATHStr = str(settings.CONFIG.BIN_DIR) PATH: PATHStr = str(archivebox.CONSTANTS.LIB_BIN_DIR)
puppeteer_browsers_dir: Optional[Path] = LIB_DIR_BROWSERS puppeteer_browsers_dir: Optional[Path] = LIB_DIR_BROWSERS
puppeteer_install_args: List[str] = ["@puppeteer/browsers", "install", "--path", str(LIB_DIR_BROWSERS)] puppeteer_install_args: List[str] = ["@puppeteer/browsers", "install", "--path", str(LIB_DIR_BROWSERS)]
@@ -140,7 +140,7 @@ PUPPETEER_BINPROVIDER = PuppeteerBinProvider()
# ALTERNATIVE INSTALL METHOD using Ansible: # ALTERNATIVE INSTALL METHOD using Ansible:
# install_playbook = self.plugin_dir / 'install_puppeteer.yml' # install_playbook = self.plugin_dir / 'install_puppeteer.yml'
# chrome_bin = run_playbook(install_playbook, data_dir=settings.CONFIG.OUTPUT_DIR, quiet=quiet).BINARIES.chrome # chrome_bin = run_playbook(install_playbook, data_dir=archivebox.DATA_DIR, quiet=quiet).BINARIES.chrome
# return self.__class__.model_validate( # return self.__class__.model_validate(
# { # {
# **self.model_dump(), # **self.model_dump(),


@@ -1,18 +1,24 @@
__package__ = 'archivebox.plugins_sys.config'
import os import os
import sys import sys
import shutil
import platform import platform
import archivebox
from typing import List, ClassVar from typing import List, ClassVar, Dict, Optional
from datetime import datetime
from pathlib import Path from pathlib import Path
from pydantic import InstanceOf, Field, field_validator, model_validator from pydantic import InstanceOf, Field, field_validator, model_validator, computed_field
from benedict import benedict
from rich import print from rich import print
from django.conf import settings from django.conf import settings
from django.utils.crypto import get_random_string
from plugantic.base_plugin import BasePlugin from plugantic.base_plugin import BasePlugin
from plugantic.base_configset import BaseConfigSet, ConfigSectionName from plugantic.base_configset import BaseConfigSet, ConfigSectionName
from plugantic.base_hook import BaseHook from plugantic.base_hook import BaseHook
from .constants import CONSTANTS, CONSTANTS_CONFIG
###################### Config ########################## ###################### Config ##########################
@@ -24,17 +30,57 @@ class ShellConfig(BaseConfigSet):
IS_TTY: bool = Field(default=sys.stdout.isatty()) IS_TTY: bool = Field(default=sys.stdout.isatty())
USE_COLOR: bool = Field(default=lambda c: c.IS_TTY) USE_COLOR: bool = Field(default=lambda c: c.IS_TTY)
SHOW_PROGRESS: bool = Field(default=lambda c: (c.IS_TTY and platform.system() != 'darwin')) # progress bars are buggy on mac, disable for now SHOW_PROGRESS: bool = Field(default=lambda c: c.IS_TTY)
IN_DOCKER: bool = Field(default=False) IN_DOCKER: bool = Field(default=False)
IN_QEMU: bool = Field(default=False) IN_QEMU: bool = Field(default=False)
USER: str = Field(default=Path('~').expanduser().resolve().name)
PUID: int = Field(default=os.getuid()) PUID: int = Field(default=os.getuid())
PGID: int = Field(default=os.getgid()) PGID: int = Field(default=os.getgid())
PYTHON_ENCODING: str = Field(default=(sys.__stdout__ or sys.stdout or sys.__stderr__ or sys.stderr).encoding.upper().replace('UTF8', 'UTF-8')) PYTHON_ENCODING: str = Field(default=(sys.__stdout__ or sys.stdout or sys.__stderr__ or sys.stderr).encoding.upper().replace('UTF8', 'UTF-8'))
ANSI: Dict[str, str] = Field(default=lambda c: CONSTANTS.DEFAULT_CLI_COLORS if c.USE_COLOR else CONSTANTS.DISABLED_CLI_COLORS)
VERSIONS_AVAILABLE: bool = False # .check_for_update.get_versions_available_on_github(c)},
CAN_UPGRADE: bool = False # .check_for_update.can_upgrade(c)},
@computed_field
@property
def TERM_WIDTH(self) -> int:
return shutil.get_terminal_size((100, 10)).columns
@computed_field
@property
def COMMIT_HASH(self) -> Optional[str]:
try:
git_dir = archivebox.PACKAGE_DIR / '../.git'
ref = (git_dir / 'HEAD').read_text().strip().split(' ')[-1]
commit_hash = git_dir.joinpath(ref).read_text().strip()
return commit_hash
except Exception:
pass
try:
return list((archivebox.PACKAGE_DIR / '../.git/refs/heads/').glob('*'))[0].read_text().strip()
except Exception:
pass
return None
@computed_field
@property
def BUILD_TIME(self) -> str:
if self.IN_DOCKER:
docker_build_end_time = Path('/VERSION.txt').read_text().rsplit('BUILD_END_TIME=')[-1].split('\n', 1)[0]
return docker_build_end_time
src_last_modified_unix_timestamp = (archivebox.PACKAGE_DIR / 'config.py').stat().st_mtime
return datetime.fromtimestamp(src_last_modified_unix_timestamp).strftime('%Y-%m-%d %H:%M:%S %s')
@model_validator(mode='after') @model_validator(mode='after')
def validate_not_running_as_root(self): def validate_not_running_as_root(self):
attempted_command = ' '.join(sys.argv[:3]) attempted_command = ' '.join(sys.argv[:3])
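Note that the computed_field properties above are re-evaluated on every access (they are plain @property methods underneath); a hedged usage sketch, assuming SHELL_CONFIG is instantiated later in this module like the other config sets:

print(SHELL_CONFIG.TERM_WIDTH)    # current terminal width, re-read each time
print(SHELL_CONFIG.COMMIT_HASH)   # None when not running from a git checkout
print(SHELL_CONFIG.BUILD_TIME)    # Docker build timestamp, or mtime of config.py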
@@ -92,7 +138,7 @@ GENERAL_CONFIG = GeneralConfig()
class ServerConfig(BaseConfigSet): class ServerConfig(BaseConfigSet):
section: ClassVar[ConfigSectionName] = 'SERVER_CONFIG' section: ClassVar[ConfigSectionName] = 'SERVER_CONFIG'
SECRET_KEY: str = Field(default=None) SECRET_KEY: str = Field(default=lambda: get_random_string(50, 'abcdefghijklmnopqrstuvwxyz0123456789_'))
BIND_ADDR: str = Field(default=lambda: ['127.0.0.1:8000', '0.0.0.0:8000'][SHELL_CONFIG.IN_DOCKER]) BIND_ADDR: str = Field(default=lambda: ['127.0.0.1:8000', '0.0.0.0:8000'][SHELL_CONFIG.IN_DOCKER])
ALLOWED_HOSTS: str = Field(default='*') ALLOWED_HOSTS: str = Field(default='*')
CSRF_TRUSTED_ORIGINS: str = Field(default=lambda c: 'http://localhost:8000,http://127.0.0.1:8000,http://0.0.0.0:8000,http://{}'.format(c.BIND_ADDR)) CSRF_TRUSTED_ORIGINS: str = Field(default=lambda c: 'http://localhost:8000,http://127.0.0.1:8000,http://0.0.0.0:8000,http://{}'.format(c.BIND_ADDR))
@@ -179,7 +225,7 @@ SEARCH_BACKEND_CONFIG = SearchBackendConfig()
class ConfigPlugin(BasePlugin): class ConfigPlugin(BasePlugin):
app_label: str = 'config' app_label: str = 'CONFIG'
verbose_name: str = 'Configuration' verbose_name: str = 'Configuration'
hooks: List[InstanceOf[BaseHook]] = [ hooks: List[InstanceOf[BaseHook]] = [
@@ -190,6 +236,12 @@ class ConfigPlugin(BasePlugin):
ARCHIVING_CONFIG, ARCHIVING_CONFIG,
SEARCH_BACKEND_CONFIG, SEARCH_BACKEND_CONFIG,
] ]
# def register(self, settings, parent_plugin=None):
# try:
# super().register(settings, parent_plugin=parent_plugin)
# except Exception as e:
# print(f'[red][X] Error registering config plugin: {e}[/red]', file=sys.stderr)
PLUGIN = ConfigPlugin() PLUGIN = ConfigPlugin()


@@ -0,0 +1,47 @@
# def get_versions_available_on_github(config):
# """
# returns a dictionary containing the ArchiveBox GitHub release info for
# the recommended upgrade version and the currently installed version
# """
# # we only want to perform the (relatively expensive) check for new versions
# # when its most relevant, e.g. when the user runs a long-running command
# subcommand_run_by_user = sys.argv[3] if len(sys.argv) > 3 else 'help'
# long_running_commands = ('add', 'schedule', 'update', 'status', 'server')
# if subcommand_run_by_user not in long_running_commands:
# return None
# github_releases_api = "https://api.github.com/repos/ArchiveBox/ArchiveBox/releases"
# response = requests.get(github_releases_api)
# if response.status_code != 200:
# stderr(f'[!] Warning: GitHub API call to check for new ArchiveBox version failed! (status={response.status_code})', color='lightyellow', config=config)
# return None
# all_releases = response.json()
# installed_version = parse_version_string(config['VERSION'])
# # find current version or nearest older version (to link to)
# current_version = None
# for idx, release in enumerate(all_releases):
# release_version = parse_version_string(release['tag_name'])
# if release_version <= installed_version:
# current_version = release
# break
# current_version = current_version or all_releases[-1]
# # recommended version is whatever comes after current_version in the release list
# # (perhaps too conservative to only recommend upgrading one version at a time, but it's safest)
# try:
# recommended_version = all_releases[idx+1]
# except IndexError:
# recommended_version = None
# return {'recommended_version': recommended_version, 'current_version': current_version}
# def can_upgrade(config):
# if config['VERSIONS_AVAILABLE'] and config['VERSIONS_AVAILABLE']['recommended_version']:
# recommended_version = parse_version_string(config['VERSIONS_AVAILABLE']['recommended_version']['tag_name'])
# current_version = parse_version_string(config['VERSIONS_AVAILABLE']['current_version']['tag_name'])
# return recommended_version > current_version
# return False


@@ -0,0 +1 @@
from archivebox.constants import *


@@ -1,16 +1,13 @@
from pathlib import Path from pathlib import Path
from django.conf import settings
import archivebox
OUTPUT_DIR = archivebox.DATA_DIR
LOGS_DIR = archivebox.CONSTANTS.LOGS_DIR
OUTPUT_DIR = settings.CONFIG.OUTPUT_DIR TMP_DIR = archivebox.CONSTANTS.TMP_DIR
LOGS_DIR = settings.CONFIG.LOGS_DIR
TMP_DIR = OUTPUT_DIR / "tmp"
Path.mkdir(TMP_DIR, exist_ok=True) Path.mkdir(TMP_DIR, exist_ok=True)
CONFIG_FILE = TMP_DIR / "supervisord.conf" CONFIG_FILE = TMP_DIR / "supervisord.conf"
PID_FILE = TMP_DIR / "supervisord.pid" PID_FILE = TMP_DIR / "supervisord.pid"
SOCK_FILE = TMP_DIR / "supervisord.sock" SOCK_FILE = TMP_DIR / "supervisord.sock"


@@ -4,6 +4,7 @@ __package__ = 'archivebox'
import os import os
import signal import signal
import shutil import shutil
import getpass
from json import dump from json import dump
from pathlib import Path from pathlib import Path
@@ -229,3 +230,31 @@ class suppress_output(object):
if self.stderr: if self.stderr:
os.dup2(self.real_stderr, 2) os.dup2(self.real_stderr, 2)
os.close(self.null_stderr) os.close(self.null_stderr)
def get_system_user() -> str:
# some host OS's are unable to provide a username (k3s, Windows), making this complicated
# uid 999 is especially problematic and breaks many attempts
SYSTEM_USER = None
FALLBACK_USER_PLACHOLDER = f'user_{os.getuid()}'
# Option 1
try:
import pwd
SYSTEM_USER = SYSTEM_USER or pwd.getpwuid(os.geteuid()).pw_name
except (ModuleNotFoundError, Exception):
pass
# Option 2
try:
SYSTEM_USER = SYSTEM_USER or getpass.getuser()
except Exception:
pass
# Option 3
try:
SYSTEM_USER = SYSTEM_USER or os.getlogin()
except Exception:
pass
return SYSTEM_USER or FALLBACK_USER_PLACHOLDER
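A short usage sketch of the new helper (the import path is an assumption based on this file also containing the suppress_output class in archivebox/system.py; the result depends on the host OS):

from archivebox.system import get_system_user   # assumed module path

USER = get_system_user()
print(f'Running as: {USER}')   # falls back to e.g. 'user_999' on hosts with no passwd entry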