From bb65b2dbecbfeac908059bb8dfd9b7bf82ac1293 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Wed, 25 Sep 2024 05:10:09 -0700 Subject: [PATCH] move almost all config into new archivebox.CONSTANTS --- archivebox/__init__.py | 15 +- archivebox/config.py | 484 +++--------------- archivebox/constants.py | 249 +++++++++ archivebox/core/admin.py | 6 +- archivebox/core/settings.py | 72 ++- archivebox/extractors/__init__.py | 5 +- archivebox/extractors/htmltotext.py | 5 +- archivebox/extractors/readability.py | 54 +- archivebox/index/__init__.py | 67 ++- archivebox/index/html.py | 24 +- archivebox/index/json.py | 70 +-- archivebox/logging_util.py | 60 ++- archivebox/main.py | 113 ++-- archivebox/misc/checks.py | 64 +-- archivebox/misc/logging.py | 8 +- archivebox/parsers/pocket_api.py | 5 +- archivebox/parsers/readwise_reader_api.py | 10 +- archivebox/plugantic/base_binary.py | 12 +- archivebox/plugantic/base_configset.py | 43 +- .../plugantic/management/commands/pkg.py | 114 ++--- archivebox/plugins_extractor/chrome/apps.py | 4 +- .../plugins_extractor/readability/apps.py | 103 ++++ .../plugins_extractor/singlefile/apps.py | 12 +- archivebox/plugins_pkg/npm/apps.py | 20 +- archivebox/plugins_pkg/pip/apps.py | 23 +- archivebox/plugins_pkg/playwright/apps.py | 14 +- archivebox/plugins_pkg/puppeteer/apps.py | 14 +- archivebox/plugins_sys/config/apps.py | 64 ++- .../plugins_sys/config/check_for_update.py | 47 ++ archivebox/plugins_sys/config/constants.py | 1 + archivebox/queues/settings.py | 11 +- archivebox/system.py | 29 ++ 32 files changed, 982 insertions(+), 840 deletions(-) create mode 100644 archivebox/constants.py create mode 100644 archivebox/plugins_extractor/readability/apps.py create mode 100644 archivebox/plugins_sys/config/check_for_update.py create mode 100644 archivebox/plugins_sys/config/constants.py diff --git a/archivebox/__init__.py b/archivebox/__init__.py index e668db33..b7355260 100755 --- a/archivebox/__init__.py +++ b/archivebox/__init__.py @@ -1,14 +1,15 @@ __package__ = 'archivebox' -# print('INSTALLING MONKEY PATCHES') -from .monkey_patches import * +# print('INSTALLING MONKEY PATCHES') +from .monkey_patches import * # noqa +# print('DONE INSTALLING MONKEY PATCHES') + import os -import importlib +import importlib.metadata from pathlib import Path - PACKAGE_DIR = Path(__file__).resolve().parent # archivebox source code dir DATA_DIR = Path(os.curdir).resolve() # archivebox user data dir @@ -28,7 +29,9 @@ def _detect_installed_version(): raise Exception('Failed to detect installed archivebox version!') +VERSION = _detect_installed_version() -__version__ = _detect_installed_version() +__version__ = VERSION -# print('DONE INSTALLING MONKEY PATCHES') + +from .constants import CONSTANTS diff --git a/archivebox/config.py b/archivebox/config.py index e05fbe5a..0852da83 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -26,10 +26,7 @@ import io import re import sys import json -import inspect -import getpass import shutil -import requests import archivebox from hashlib import md5 @@ -38,7 +35,6 @@ from datetime import datetime, timezone from typing import Optional, Type, Tuple, Dict from subprocess import run, PIPE, DEVNULL, STDOUT, TimeoutExpired from configparser import ConfigParser -import importlib.metadata from pydantic_pkgr import SemVer from rich.progress import Progress @@ -49,7 +45,6 @@ from django.db.backends.sqlite3.base import Database as sqlite3 from .config_stubs import ( AttrDict, - SimpleConfigValueDict, ConfigValue, ConfigDict, ConfigDefaultValue, @@ 
-61,7 +56,7 @@ from .misc.logging import ( ANSI, COLOR_DICT, stderr, - hint, + hint, # noqa ) # print('STARTING CONFIG LOADING') @@ -165,8 +160,8 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = { 'MEDIA_MAX_SIZE': {'type': str, 'default': '750m'}, 'USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)'}, - 'CURL_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT'] + ' curl/{CURL_VERSION}'}, - 'WGET_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT'] + ' wget/{WGET_VERSION}'}, + 'CURL_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT']}, # + ' curl/{CURL_VERSION}'}, + 'WGET_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT']}, # + ' wget/{WGET_VERSION}'}, 'COOKIES_FILE': {'type': str, 'default': None}, @@ -254,12 +249,12 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = { 'CURL_BINARY': {'type': str, 'default': 'curl'}, 'GIT_BINARY': {'type': str, 'default': 'git'}, 'WGET_BINARY': {'type': str, 'default': 'wget'}, # also can accept wget2 - 'SINGLEFILE_BINARY': {'type': str, 'default': lambda c: bin_path('single-file')}, - 'READABILITY_BINARY': {'type': str, 'default': lambda c: bin_path('readability-extractor')}, 'MERCURY_BINARY': {'type': str, 'default': lambda c: bin_path('postlight-parser')}, - 'YOUTUBEDL_BINARY': {'type': str, 'default': 'yt-dlp'}, # also can accept youtube-dl 'NODE_BINARY': {'type': str, 'default': 'node'}, - 'RIPGREP_BINARY': {'type': str, 'default': 'rg'}, + # 'YOUTUBEDL_BINARY': {'type': str, 'default': 'yt-dlp'}, # also can accept youtube-dl + # 'SINGLEFILE_BINARY': {'type': str, 'default': lambda c: bin_path('single-file')}, + # 'READABILITY_BINARY': {'type': str, 'default': lambda c: bin_path('readability-extractor')}, + # 'RIPGREP_BINARY': {'type': str, 'default': 'rg'}, 'POCKET_CONSUMER_KEY': {'type': str, 'default': None}, 'POCKET_ACCESS_TOKENS': {'type': dict, 'default': {}}, @@ -308,212 +303,16 @@ CONFIG_FILENAME = 'ArchiveBox.conf' -STATICFILE_EXTENSIONS = { - # 99.999% of the time, URLs ending in these extensions are static files - # that can be downloaded as-is, not html pages that need to be rendered - 'gif', 'jpeg', 'jpg', 'png', 'tif', 'tiff', 'wbmp', 'ico', 'jng', 'bmp', - 'svg', 'svgz', 'webp', 'ps', 'eps', 'ai', - 'mp3', 'mp4', 'm4a', 'mpeg', 'mpg', 'mkv', 'mov', 'webm', 'm4v', - 'flv', 'wmv', 'avi', 'ogg', 'ts', 'm3u8', - 'pdf', 'txt', 'rtf', 'rtfd', 'doc', 'docx', 'ppt', 'pptx', 'xls', 'xlsx', - 'atom', 'rss', 'css', 'js', 'json', - 'dmg', 'iso', 'img', - 'rar', 'war', 'hqx', 'zip', 'gz', 'bz2', '7z', - - # Less common extensions to consider adding later - # jar, swf, bin, com, exe, dll, deb - # ear, hqx, eot, wmlc, kml, kmz, cco, jardiff, jnlp, run, msi, msp, msm, - # pl pm, prc pdb, rar, rpm, sea, sit, tcl tk, der, pem, crt, xpi, xspf, - # ra, mng, asx, asf, 3gpp, 3gp, mid, midi, kar, jad, wml, htc, mml - - # These are always treated as pages, not as static files, never add them: - # html, htm, shtml, xhtml, xml, aspx, php, cgi -} - -# When initializing archivebox in a new directory, we check to make sure the dir is -# actually empty so that we dont clobber someone's home directory or desktop by accident. -# These files are exceptions to the is_empty check when we're trying to init a new dir, -# as they could be from a previous archivebox version, system artifacts, dependencies, etc. 
-ALLOWED_IN_OUTPUT_DIR = { - ".gitignore", - "lost+found", - ".DS_Store", - ".venv", - "venv", - "virtualenv", - ".virtualenv", - "node_modules", - "package.json", - "package-lock.json", - "yarn.lock", - "static", - "sonic", - "search.sqlite3", - CRONTABS_DIR_NAME, - ARCHIVE_DIR_NAME, - SOURCES_DIR_NAME, - LOGS_DIR_NAME, - CACHE_DIR_NAME, - LIB_DIR_NAME, - PERSONAS_DIR_NAME, - SQL_INDEX_FILENAME, - f"{SQL_INDEX_FILENAME}-wal", - f"{SQL_INDEX_FILENAME}-shm", - "queue.sqlite3", - "queue.sqlite3-wal", - "queue.sqlite3-shm", - JSON_INDEX_FILENAME, - HTML_INDEX_FILENAME, - ROBOTS_TXT_FILENAME, - FAVICON_FILENAME, - CONFIG_FILENAME, - f"{CONFIG_FILENAME}.bak", - "static_index.json", -} - ALLOWDENYLIST_REGEX_FLAGS: int = re.IGNORECASE | re.UNICODE | re.MULTILINE -CONSTANTS = { - "PACKAGE_DIR_NAME": {'default': lambda c: PACKAGE_DIR_NAME}, - "LIB_DIR_NAME": {'default': lambda c: LIB_DIR_NAME}, - "TEMPLATES_DIR_NAME": {'default': lambda c: TEMPLATES_DIR_NAME}, - "ARCHIVE_DIR_NAME": {'default': lambda c: ARCHIVE_DIR_NAME}, - "SOURCES_DIR_NAME": {'default': lambda c: SOURCES_DIR_NAME}, - "LOGS_DIR_NAME": {'default': lambda c: LOGS_DIR_NAME}, - "CACHE_DIR_NAME": {'default': lambda c: CACHE_DIR_NAME}, - "PERSONAS_DIR_NAME": {'default': lambda c: PERSONAS_DIR_NAME}, - "CRONTABS_DIR_NAME": {'default': lambda c: CRONTABS_DIR_NAME}, - "SQL_INDEX_FILENAME": {'default': lambda c: SQL_INDEX_FILENAME}, - "JSON_INDEX_FILENAME": {'default': lambda c: JSON_INDEX_FILENAME}, - "HTML_INDEX_FILENAME": {'default': lambda c: HTML_INDEX_FILENAME}, - "ROBOTS_TXT_FILENAME": {'default': lambda c: ROBOTS_TXT_FILENAME}, - "FAVICON_FILENAME": {'default': lambda c: FAVICON_FILENAME}, - "CONFIG_FILENAME": {'default': lambda c: CONFIG_FILENAME}, - "DEFAULT_CLI_COLORS": {'default': lambda c: DEFAULT_CLI_COLORS}, - "ANSI": {'default': lambda c: ANSI}, - "COLOR_DICT": {'default': lambda c: COLOR_DICT}, - "STATICFILE_EXTENSIONS": {'default': lambda c: STATICFILE_EXTENSIONS}, - "ALLOWED_IN_OUTPUT_DIR": {'default': lambda c: ALLOWED_IN_OUTPUT_DIR}, - # "ALLOWDENYLIST_REGEX_FLAGS": {'default': lambda c: ALLOWDENYLIST_REGEX_FLAGS}, -} +CONSTANTS = archivebox.CONSTANTS._asdict() ############################## Version Config ################################## -def get_system_user() -> str: - # some host OS's are unable to provide a username (k3s, Windows), making this complicated - # uid 999 is especially problematic and breaks many attempts - SYSTEM_USER = None - FALLBACK_USER_PLACHOLDER = f'user_{os.getuid()}' - # Option 1 - try: - import pwd - SYSTEM_USER = SYSTEM_USER or pwd.getpwuid(os.geteuid()).pw_name - except (ModuleNotFoundError, Exception): - pass - # Option 2 - try: - SYSTEM_USER = SYSTEM_USER or getpass.getuser() - except Exception: - pass - - # Option 3 - try: - SYSTEM_USER = SYSTEM_USER or os.getlogin() - except Exception: - pass - - return SYSTEM_USER or FALLBACK_USER_PLACHOLDER - -def get_version(config): - try: - return importlib.metadata.version(__package__ or 'archivebox') - except importlib.metadata.PackageNotFoundError: - try: - pyproject_config = (config['PACKAGE_DIR'] / 'pyproject.toml').read_text() - for line in pyproject_config: - if line.startswith('version = '): - return line.split(' = ', 1)[-1].strip('"') - except FileNotFoundError: - # building docs, pyproject.toml is not available - return 'dev' - - raise Exception('Failed to detect installed archivebox version!') - -def get_commit_hash(config) -> Optional[str]: - try: - git_dir = config['PACKAGE_DIR'] / '../.git' - ref = (git_dir / 
'HEAD').read_text().strip().split(' ')[-1] - commit_hash = git_dir.joinpath(ref).read_text().strip() - return commit_hash - except Exception: - pass - - try: - return list((config['PACKAGE_DIR'] / '../.git/refs/heads/').glob('*'))[0].read_text().strip() - except Exception: - pass - - return None - -def get_build_time(config) -> str: - if config['IN_DOCKER']: - docker_build_end_time = Path('/VERSION.txt').read_text().rsplit('BUILD_END_TIME=')[-1].split('\n', 1)[0] - return docker_build_end_time - - src_last_modified_unix_timestamp = (config['PACKAGE_DIR'] / 'config.py').stat().st_mtime - return datetime.fromtimestamp(src_last_modified_unix_timestamp).strftime('%Y-%m-%d %H:%M:%S %s') - -def get_versions_available_on_github(config): - """ - returns a dictionary containing the ArchiveBox GitHub release info for - the recommended upgrade version and the currently installed version - """ - - # we only want to perform the (relatively expensive) check for new versions - # when its most relevant, e.g. when the user runs a long-running command - subcommand_run_by_user = sys.argv[3] if len(sys.argv) > 3 else 'help' - long_running_commands = ('add', 'schedule', 'update', 'status', 'server') - if subcommand_run_by_user not in long_running_commands: - return None - - github_releases_api = "https://api.github.com/repos/ArchiveBox/ArchiveBox/releases" - response = requests.get(github_releases_api) - if response.status_code != 200: - stderr(f'[!] Warning: GitHub API call to check for new ArchiveBox version failed! (status={response.status_code})', color='lightyellow', config=config) - return None - all_releases = response.json() - - installed_version = parse_version_string(config['VERSION']) - - # find current version or nearest older version (to link to) - current_version = None - for idx, release in enumerate(all_releases): - release_version = parse_version_string(release['tag_name']) - if release_version <= installed_version: - current_version = release - break - - current_version = current_version or all_releases[-1] - - # recommended version is whatever comes after current_version in the release list - # (perhaps too conservative to only recommend upgrading one version at a time, but it's safest) - try: - recommended_version = all_releases[idx+1] - except IndexError: - recommended_version = None - - return {'recommended_version': recommended_version, 'current_version': current_version} - -def can_upgrade(config): - if config['VERSIONS_AVAILABLE'] and config['VERSIONS_AVAILABLE']['recommended_version']: - recommended_version = parse_version_string(config['VERSIONS_AVAILABLE']['recommended_version']['tag_name']) - current_version = parse_version_string(config['VERSIONS_AVAILABLE']['current_version']['tag_name']) - return recommended_version > current_version - return False ############################## Derived Config ################################## @@ -523,55 +322,25 @@ def can_upgrade(config): # These are derived/computed values calculated *after* all user-provided config values are ingested # they appear in `archivebox config` output and are intended to be read-only for the user DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = { - **CONSTANTS, + **{ + key: {'default': lambda c: val} + for key, val in archivebox.CONSTANTS.items() + }, - 'TERM_WIDTH': {'default': lambda c: lambda: shutil.get_terminal_size((100, 10)).columns}, - 'USER': {'default': lambda c: get_system_user()}, - 'ANSI': {'default': lambda c: DEFAULT_CLI_COLORS if c['USE_COLOR'] else AttrDict({k: '' for k in DEFAULT_CLI_COLORS.keys()})}, - 
'PACKAGE_DIR': {'default': lambda c: Path(__file__).resolve().parent}, + 'PACKAGE_DIR': {'default': lambda c: archivebox.PACKAGE_DIR.resolve()}, 'TEMPLATES_DIR': {'default': lambda c: c['PACKAGE_DIR'] / TEMPLATES_DIR_NAME}, 'CUSTOM_TEMPLATES_DIR': {'default': lambda c: c['CUSTOM_TEMPLATES_DIR'] and Path(c['CUSTOM_TEMPLATES_DIR'])}, - 'OUTPUT_DIR': {'default': lambda c: Path(c['OUTPUT_DIR']).resolve() if c['OUTPUT_DIR'] else Path(os.curdir).resolve()}, - 'ARCHIVE_DIR': {'default': lambda c: c['OUTPUT_DIR'] / ARCHIVE_DIR_NAME}, - 'SOURCES_DIR': {'default': lambda c: c['OUTPUT_DIR'] / SOURCES_DIR_NAME}, - 'LOGS_DIR': {'default': lambda c: c['OUTPUT_DIR'] / LOGS_DIR_NAME}, - 'CACHE_DIR': {'default': lambda c: c['OUTPUT_DIR'] / CACHE_DIR_NAME}, - 'LIB_DIR': {'default': lambda c: c['OUTPUT_DIR'] / LIB_DIR_NAME}, - 'BIN_DIR': {'default': lambda c: c['OUTPUT_DIR'] / LIB_DIR_NAME / 'bin'}, - 'PERSONAS_DIR': {'default': lambda c: c['OUTPUT_DIR'] / PERSONAS_DIR_NAME}, - 'CONFIG_FILE': {'default': lambda c: Path(c['CONFIG_FILE']).resolve() if c['CONFIG_FILE'] else c['OUTPUT_DIR'] / CONFIG_FILENAME}, - 'COOKIES_FILE': {'default': lambda c: c['COOKIES_FILE'] and Path(c['COOKIES_FILE']).resolve()}, 'URL_DENYLIST_PTN': {'default': lambda c: c['URL_DENYLIST'] and re.compile(c['URL_DENYLIST'] or '', ALLOWDENYLIST_REGEX_FLAGS)}, 'URL_ALLOWLIST_PTN': {'default': lambda c: c['URL_ALLOWLIST'] and re.compile(c['URL_ALLOWLIST'] or '', ALLOWDENYLIST_REGEX_FLAGS)}, 'DIR_OUTPUT_PERMISSIONS': {'default': lambda c: c['OUTPUT_PERMISSIONS'].replace('6', '7').replace('4', '5')}, # exec is always needed to list directories - 'ARCHIVEBOX_BINARY': {'default': lambda c: sys.argv[0] or bin_path('archivebox')}, - 'NODE_BIN_PATH': {'default': lambda c: str((Path(c["OUTPUT_DIR"]).absolute() / 'node_modules' / '.bin'))}, - - 'VERSION': {'default': lambda c: get_version(c).split('+', 1)[0]}, # remove +editable from user-displayed version string - 'COMMIT_HASH': {'default': lambda c: get_commit_hash(c)}, # short git commit hash of codebase HEAD commit - 'BUILD_TIME': {'default': lambda c: get_build_time(c)}, # docker build completed time or python src last modified time - - 'VERSIONS_AVAILABLE': {'default': lambda c: False}, # get_versions_available_on_github(c)}, - 'CAN_UPGRADE': {'default': lambda c: False}, # can_upgrade(c)}, - - 'PYTHON_BINARY': {'default': lambda c: sys.executable}, - 'PYTHON_VERSION': {'default': lambda c: '{}.{}.{}'.format(*sys.version_info[:3])}, - - 'DJANGO_BINARY': {'default': lambda c: inspect.getfile(django)}, - 'DJANGO_VERSION': {'default': lambda c: '{}.{}.{}'.format(*django.VERSION[:3])}, - - 'SQLITE_BINARY': {'default': lambda c: inspect.getfile(sqlite3)}, - 'SQLITE_VERSION': {'default': lambda c: sqlite3.version}, - #'SQLITE_JOURNAL_MODE': {'default': lambda c: 'wal'}, # set at runtime below, interesting if changed later but unused for now because its always expected to be wal - #'SQLITE_OPTIONS': {'default': lambda c: ['JSON1']}, # set at runtime below 'USE_CURL': {'default': lambda c: c['USE_CURL'] and (c['SAVE_FAVICON'] or c['SAVE_TITLE'] or c['SAVE_ARCHIVE_DOT_ORG'])}, 'CURL_VERSION': {'default': lambda c: bin_version(c['CURL_BINARY']) if c['USE_CURL'] else None}, - 'CURL_USER_AGENT': {'default': lambda c: c['CURL_USER_AGENT'].format(**c)}, + # 'CURL_USER_AGENT': {'default': lambda c: c['CURL_USER_AGENT'].format(**c)}, 'CURL_ARGS': {'default': lambda c: c['CURL_ARGS'] or []}, 'CURL_EXTRA_ARGS': {'default': lambda c: c['CURL_EXTRA_ARGS'] or []}, 'SAVE_FAVICON': {'default': lambda c: 
c['USE_CURL'] and c['SAVE_FAVICON']}, @@ -580,23 +349,14 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = { 'USE_WGET': {'default': lambda c: c['USE_WGET'] and (c['SAVE_WGET'] or c['SAVE_WARC'])}, 'WGET_VERSION': {'default': lambda c: bin_version(c['WGET_BINARY']) if c['USE_WGET'] else None}, 'WGET_AUTO_COMPRESSION': {'default': lambda c: wget_supports_compression(c) if c['USE_WGET'] else False}, - 'WGET_USER_AGENT': {'default': lambda c: c['WGET_USER_AGENT'].format(**c)}, + # 'WGET_USER_AGENT': {'default': lambda c: c['WGET_USER_AGENT'].format(**c)}, 'SAVE_WGET': {'default': lambda c: c['USE_WGET'] and c['SAVE_WGET']}, 'SAVE_WARC': {'default': lambda c: c['USE_WGET'] and c['SAVE_WARC']}, 'WGET_ARGS': {'default': lambda c: c['WGET_ARGS'] or []}, 'WGET_EXTRA_ARGS': {'default': lambda c: c['WGET_EXTRA_ARGS'] or []}, - # 'RIPGREP_VERSION': {'default': lambda c: bin_version(c['RIPGREP_BINARY']) if c['USE_RIPGREP'] else None}, - - 'USE_SINGLEFILE': {'default': lambda c: c['USE_SINGLEFILE'] and c['SAVE_SINGLEFILE']}, - 'SINGLEFILE_VERSION': {'default': lambda c: bin_version(c['SINGLEFILE_BINARY']) if c['USE_SINGLEFILE'] else None}, - 'SINGLEFILE_ARGS': {'default': lambda c: c['SINGLEFILE_ARGS'] or []}, - 'SINGLEFILE_EXTRA_ARGS': {'default': lambda c: c['SINGLEFILE_EXTRA_ARGS'] or []}, - - 'USE_READABILITY': {'default': lambda c: c['USE_READABILITY'] and c['SAVE_READABILITY']}, - 'READABILITY_VERSION': {'default': lambda c: bin_version(c['READABILITY_BINARY']) if c['USE_READABILITY'] else None}, - 'USE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['SAVE_MERCURY']}, + 'SAVE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['USE_NODE']}, 'MERCURY_VERSION': {'default': lambda c: '1.0.0' if shutil.which(str(bin_path(c['MERCURY_BINARY']))) else None}, # mercury doesnt expose version info until this is merged https://github.com/postlight/parser/pull/750 'MERCURY_ARGS': {'default': lambda c: c['MERCURY_ARGS'] or []}, 'MERCURY_EXTRA_ARGS': {'default': lambda c: c['MERCURY_EXTRA_ARGS'] or []}, @@ -605,21 +365,12 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = { 'GIT_VERSION': {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None}, 'SAVE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']}, - 'USE_YOUTUBEDL': {'default': lambda c: c['USE_YOUTUBEDL'] and c['SAVE_MEDIA']}, - 'YOUTUBEDL_VERSION': {'default': lambda c: bin_version(c['YOUTUBEDL_BINARY']) if c['USE_YOUTUBEDL'] else None}, - 'SAVE_MEDIA': {'default': lambda c: c['USE_YOUTUBEDL'] and c['SAVE_MEDIA']}, - 'YOUTUBEDL_ARGS': {'default': lambda c: c['YOUTUBEDL_ARGS'] or []}, - 'YOUTUBEDL_EXTRA_ARGS': {'default': lambda c: c['YOUTUBEDL_EXTRA_ARGS'] or []}, - - 'SAVE_READABILITY': {'default': lambda c: c['USE_READABILITY'] and c['USE_NODE']}, - 'SAVE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['USE_NODE']}, - - 'USE_NODE': {'default': lambda c: c['USE_NODE'] and (c['SAVE_READABILITY'] or c['SAVE_SINGLEFILE'] or c['SAVE_MERCURY'])}, + 'USE_NODE': {'default': lambda c: True}, 'NODE_VERSION': {'default': lambda c: bin_version(c['NODE_BINARY']) if c['USE_NODE'] else None}, 'DEPENDENCIES': {'default': lambda c: get_dependency_info(c)}, - 'CODE_LOCATIONS': {'default': lambda c: get_code_locations(c)}, - 'DATA_LOCATIONS': {'default': lambda c: get_data_locations(c)}, + # 'CODE_LOCATIONS': {'default': lambda c: get_code_locations(c)}, + # 'DATA_LOCATIONS': {'default': lambda c: get_data_locations(c)}, 'SAVE_ALLOWLIST_PTN': {'default': lambda c: c['SAVE_ALLOWLIST'] and {re.compile(k, ALLOWDENYLIST_REGEX_FLAGS): v 
for k, v in c['SAVE_ALLOWLIST'].items()}}, 'SAVE_DENYLIST_PTN': {'default': lambda c: c['SAVE_DENYLIST'] and {re.compile(k, ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_DENYLIST'].items()}}, @@ -696,12 +447,10 @@ def load_config_val(key: str, raise Exception('Config values can only be str, bool, int, or json') -def load_config_file(out_dir: str | None=None) -> Optional[ConfigDict]: +def load_config_file(out_dir: str | None=archivebox.DATA_DIR) -> Optional[ConfigDict]: """load the ini-formatted config file from OUTPUT_DIR/Archivebox.conf""" - out_dir = out_dir or Path(os.getenv('OUTPUT_DIR', '.')).resolve() - assert out_dir and out_dir.is_dir() - config_path = Path(out_dir) / CONFIG_FILENAME + config_path = archivebox.CONSTANTS.CONFIG_FILE if config_path.exists(): config_file = ConfigParser() config_file.optionxform = str @@ -718,7 +467,7 @@ def load_config_file(out_dir: str | None=None) -> Optional[ConfigDict]: return None -def write_config_file(config: Dict[str, str], out_dir: str | None=None) -> ConfigDict: +def write_config_file(config: Dict[str, str], out_dir: str | None=archivebox.DATA_DIR) -> ConfigDict: """load the ini-formatted config file from OUTPUT_DIR/Archivebox.conf""" from .system import atomic_write @@ -737,8 +486,7 @@ def write_config_file(config: Dict[str, str], out_dir: str | None=None) -> Confi """) - out_dir = out_dir or Path(os.getenv('OUTPUT_DIR', '.')).resolve() - config_path = Path(out_dir) / CONFIG_FILENAME + config_path = archivebox.CONSTANTS.CONFIG_FILE if not config_path.exists(): atomic_write(config_path, CONFIG_HEADER) @@ -833,7 +581,7 @@ def load_config(defaults: ConfigDefaultDict, stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration') stderr() # raise - raise SystemExit(2) + # raise SystemExit(2) return AttrDict(extended_config) @@ -984,98 +732,6 @@ def wget_supports_compression(config): except (FileNotFoundError, OSError): return False -def get_code_locations(config: ConfigDict) -> SimpleConfigValueDict: - return { - 'PACKAGE_DIR': { - 'path': (config['PACKAGE_DIR']).resolve(), - 'enabled': True, - 'is_valid': (config['PACKAGE_DIR'] / '__main__.py').exists(), - }, - 'TEMPLATES_DIR': { - 'path': (config['TEMPLATES_DIR']).resolve(), - 'enabled': True, - 'is_valid': (config['TEMPLATES_DIR'] / 'static').exists(), - }, - 'LIB_DIR': { - 'path': (config['LIB_DIR']).resolve(), - 'enabled': True, - 'is_valid': config['LIB_DIR'].is_dir(), - }, - # 'NODE_MODULES_DIR': { - # 'path': , - # 'enabled': , - # 'is_valid': (...).exists(), - # }, - } - -def get_data_locations(config: ConfigDict) -> ConfigValue: - return { - # OLD: migrating to personas - # 'CHROME_USER_DATA_DIR': { - # 'path': os.path.abspath(config['CHROME_USER_DATA_DIR']), - # 'enabled': config['USE_CHROME'] and config['CHROME_USER_DATA_DIR'], - # 'is_valid': False if config['CHROME_USER_DATA_DIR'] is None else (Path(config['CHROME_USER_DATA_DIR']) / 'Default').exists(), - # }, - # 'COOKIES_FILE': { - # 'path': os.path.abspath(config['COOKIES_FILE']), - # 'enabled': config['USE_WGET'] and config['COOKIES_FILE'], - # 'is_valid': False if config['COOKIES_FILE'] is None else Path(config['COOKIES_FILE']).exists(), - # }, - "OUTPUT_DIR": { - "path": config["OUTPUT_DIR"].resolve(), - "enabled": True, - "is_valid": (config["OUTPUT_DIR"] / SQL_INDEX_FILENAME).exists(), - "is_mount": os.path.ismount(config["OUTPUT_DIR"].resolve()), - }, - "CONFIG_FILE": { - "path": config["CONFIG_FILE"].resolve(), - "enabled": True, - "is_valid": config["CONFIG_FILE"].exists(), - }, - "SQL_INDEX": { - "path": 
(config["OUTPUT_DIR"] / SQL_INDEX_FILENAME).resolve(), - "enabled": True, - "is_valid": (config["OUTPUT_DIR"] / SQL_INDEX_FILENAME).exists(), - "is_mount": os.path.ismount((config["OUTPUT_DIR"] / SQL_INDEX_FILENAME).resolve()), - }, - "ARCHIVE_DIR": { - "path": config["ARCHIVE_DIR"].resolve(), - "enabled": True, - "is_valid": config["ARCHIVE_DIR"].exists(), - "is_mount": os.path.ismount(config["ARCHIVE_DIR"].resolve()), - }, - "SOURCES_DIR": { - "path": config["SOURCES_DIR"].resolve(), - "enabled": True, - "is_valid": config["SOURCES_DIR"].exists(), - }, - "PERSONAS_DIR": { - "path": config["PERSONAS_DIR"].resolve(), - "enabled": True, - "is_valid": config["PERSONAS_DIR"].exists(), - }, - "LOGS_DIR": { - "path": config["LOGS_DIR"].resolve(), - "enabled": True, - "is_valid": config["LOGS_DIR"].exists(), - }, - "CACHE_DIR": { - "path": config["CACHE_DIR"].resolve(), - "enabled": True, - "is_valid": config["CACHE_DIR"].exists(), - }, - "CUSTOM_TEMPLATES_DIR": { - "path": config["CUSTOM_TEMPLATES_DIR"] and Path(config["CUSTOM_TEMPLATES_DIR"]).resolve(), - "enabled": bool(config["CUSTOM_TEMPLATES_DIR"]), - "is_valid": config["CUSTOM_TEMPLATES_DIR"] and Path(config["CUSTOM_TEMPLATES_DIR"]).exists(), - }, - # managed by bin/docker_entrypoint.sh and python-crontab: - # 'CRONTABS_DIR': { - # 'path': config['CRONTABS_DIR'].resolve(), - # 'enabled': True, - # 'is_valid': config['CRONTABS_DIR'].exists(), - # }, - } def get_dependency_info(config: ConfigDict) -> ConfigValue: return { @@ -1129,20 +785,6 @@ def get_dependency_info(config: ConfigDict) -> ConfigValue: 'enabled': config['USE_NODE'], 'is_valid': bool(config['NODE_VERSION']), }, - 'SINGLEFILE_BINARY': { - 'path': bin_path(config['SINGLEFILE_BINARY']), - 'version': config['SINGLEFILE_VERSION'], - 'hash': bin_hash(config['SINGLEFILE_BINARY']), - 'enabled': config['USE_SINGLEFILE'], - 'is_valid': bool(config['SINGLEFILE_VERSION']), - }, - 'READABILITY_BINARY': { - 'path': bin_path(config['READABILITY_BINARY']), - 'version': config['READABILITY_VERSION'], - 'hash': bin_hash(config['READABILITY_BINARY']), - 'enabled': config['USE_READABILITY'], - 'is_valid': bool(config['READABILITY_VERSION']), - }, 'MERCURY_BINARY': { 'path': bin_path(config['MERCURY_BINARY']), 'version': config['MERCURY_VERSION'], @@ -1157,13 +799,27 @@ def get_dependency_info(config: ConfigDict) -> ConfigValue: 'enabled': config['USE_GIT'], 'is_valid': bool(config['GIT_VERSION']), }, - 'YOUTUBEDL_BINARY': { - 'path': bin_path(config['YOUTUBEDL_BINARY']), - 'version': config['YOUTUBEDL_VERSION'], - 'hash': bin_hash(config['YOUTUBEDL_BINARY']), - 'enabled': config['USE_YOUTUBEDL'], - 'is_valid': bool(config['YOUTUBEDL_VERSION']), - }, + # 'SINGLEFILE_BINARY': { + # 'path': bin_path(config['SINGLEFILE_BINARY']), + # 'version': config['SINGLEFILE_VERSION'], + # 'hash': bin_hash(config['SINGLEFILE_BINARY']), + # 'enabled': config['USE_SINGLEFILE'], + # 'is_valid': bool(config['SINGLEFILE_VERSION']), + # }, + # 'READABILITY_BINARY': { + # 'path': bin_path(config['READABILITY_BINARY']), + # 'version': config['READABILITY_VERSION'], + # 'hash': bin_hash(config['READABILITY_BINARY']), + # 'enabled': config['USE_READABILITY'], + # 'is_valid': bool(config['READABILITY_VERSION']), + # }, + # 'YOUTUBEDL_BINARY': { + # 'path': bin_path(config['YOUTUBEDL_BINARY']), + # 'version': config['YOUTUBEDL_VERSION'], + # 'hash': bin_hash(config['YOUTUBEDL_BINARY']), + # 'enabled': config['USE_YOUTUBEDL'], + # 'is_valid': bool(config['YOUTUBEDL_VERSION']), + # }, # 'CHROME_BINARY': { # 'path': 
bin_path(config['CHROME_BINARY']), # 'version': config['CHROME_VERSION'], @@ -1227,10 +883,6 @@ assert TIMEZONE == 'UTC', 'The server timezone should always be set to UTC' # n os.environ["TZ"] = TIMEZONE # noqa: F821 os.umask(0o777 - int(DIR_OUTPUT_PERMISSIONS, base=8)) # noqa: F821 -# add ./node_modules/.bin to $PATH so we can use node scripts in extractors -sys.path.append(CONFIG.NODE_BIN_PATH) - - ########################### Config Validity Checkers ########################### if not CONFIG.USE_COLOR: @@ -1256,6 +908,7 @@ def bump_startup_progress_bar(): def setup_django_minimal(): sys.path.append(str(archivebox.PACKAGE_DIR)) + os.environ.setdefault('OUTPUT_DIR', str(archivebox.DATA_DIR)) os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings') django.setup() @@ -1267,29 +920,18 @@ def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG, with Progress(transient=True, expand=True, console=CONSOLE) as INITIAL_STARTUP_PROGRESS: INITIAL_STARTUP_PROGRESS_TASK = INITIAL_STARTUP_PROGRESS.add_task("[green]Loading modules...", total=25) - output_dir = out_dir or Path(config['OUTPUT_DIR']) + output_dir = out_dir or archivebox.DATA_DIR - assert isinstance(output_dir, Path) and isinstance(config['PACKAGE_DIR'], Path) + assert isinstance(output_dir, Path) and isinstance(archivebox.PACKAGE_DIR, Path) bump_startup_progress_bar() try: from django.core.management import call_command - sys.path.append(str(config['PACKAGE_DIR'])) - os.environ.setdefault('OUTPUT_DIR', str(output_dir)) - assert (config['PACKAGE_DIR'] / 'core' / 'settings.py').exists(), 'settings.py was not found at archivebox/core/settings.py' + sys.path.append(str(archivebox.PACKAGE_DIR)) + os.environ.setdefault('OUTPUT_DIR', str(archivebox.DATA_DIR)) + os.environ.setdefault("ARCHIVEBOX_DATABASE_NAME", ":memory:") os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings') - - # Check to make sure JSON extension is available in our Sqlite3 instance - try: - cursor = sqlite3.connect(':memory:').cursor() - cursor.execute('SELECT JSON(\'{"a": "b"}\')') - except sqlite3.OperationalError as exc: - stderr(f'[X] Your SQLite3 version is missing the required JSON1 extension: {exc}', color='red') - hint([ - 'Upgrade your Python version or install the extension manually:', - 'https://code.djangoproject.com/wiki/JSON1Extension' - ]) bump_startup_progress_bar() @@ -1310,28 +952,16 @@ def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG, bump_startup_progress_bar() from django.conf import settings + + from plugins_sys.config.apps import SHELL_CONFIG # log startup message to the error log with open(settings.ERROR_LOG, "a", encoding='utf-8') as f: command = ' '.join(sys.argv) ts = datetime.now(timezone.utc).strftime('%Y-%m-%d__%H:%M:%S') - f.write(f"\n> {command}; TS={ts} VERSION={config['VERSION']} IN_DOCKER={config['IN_DOCKER']} IS_TTY={config['IS_TTY']}\n") + f.write(f"\n> {command}; TS={ts} VERSION={archivebox.VERSION} IN_DOCKER={SHELL_CONFIG.IN_DOCKER} IS_TTY={SHELL_CONFIG.IS_TTY}\n") if check_db: - # Enable WAL mode in sqlite3 - from django.db import connection - with connection.cursor() as cursor: - - # Set Journal mode to WAL to allow for multiple writers - current_mode = cursor.execute("PRAGMA journal_mode") - if current_mode != 'wal': - cursor.execute("PRAGMA journal_mode=wal;") - - # Set max blocking delay for concurrent writes and write sync mode - # https://litestream.io/tips/#busy-timeout - cursor.execute("PRAGMA busy_timeout = 5000;") - cursor.execute("PRAGMA synchronous = 
NORMAL;") - # Create cache table in DB if needed try: from django.core.cache import cache @@ -1348,9 +978,9 @@ def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG, for conn in connections.all(): conn.close_if_unusable_or_obsolete() - sql_index_path = Path(output_dir) / SQL_INDEX_FILENAME + sql_index_path = archivebox.CONSTANTS.DATABASE_FILE assert sql_index_path.exists(), ( - f'No database file {SQL_INDEX_FILENAME} found in: {config["OUTPUT_DIR"]} (Are you in an ArchiveBox collection directory?)') + f'No database file {sql_index_path} found in: {archivebox.DATA_DIR} (Are you in an ArchiveBox collection directory?)') bump_startup_progress_bar() @@ -1363,7 +993,7 @@ def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG, logfire.configure() logfire.instrument_django(is_sql_commentor_enabled=True) - logfire.info(f'Started ArchiveBox v{CONFIG.VERSION}', argv=sys.argv) + logfire.info(f'Started ArchiveBox v{archivebox.VERSION}', argv=sys.argv) except KeyboardInterrupt: raise SystemExit(2) diff --git a/archivebox/constants.py b/archivebox/constants.py new file mode 100644 index 00000000..295a8f28 --- /dev/null +++ b/archivebox/constants.py @@ -0,0 +1,249 @@ +__package__ = 'archivebox' + + +import os +from types import MappingProxyType +from typing import Set, Dict, NamedTuple, Tuple +from pathlib import Path + +from benedict import benedict + +import archivebox + +from .misc.logging import DEFAULT_CLI_COLORS + +###################### Config ########################## + +class ConstantsConfig(NamedTuple): + + VERSION: str = archivebox.__version__ + + DEFAULT_CLI_COLORS: Dict[str, str] = DEFAULT_CLI_COLORS + DISABLED_CLI_COLORS: Dict[str, str] = benedict({k: '' for k in DEFAULT_CLI_COLORS}) + + PACKAGE_DIR: Path = archivebox.PACKAGE_DIR + PACKAGE_DIR_NAME: str = archivebox.PACKAGE_DIR.name + TEMPLATES_DIR_NAME: str = 'templates' + TEMPLATES_DIR: Path = archivebox.PACKAGE_DIR / TEMPLATES_DIR_NAME + STATIC_DIR: Path = TEMPLATES_DIR / 'static' + USER_PLUGINS_DIR_NAME: str = 'user_plugins' + CUSTOM_TEMPLATES_DIR_NAME: str = 'user_templates' + + DATA_DIR: Path = archivebox.DATA_DIR + ARCHIVE_DIR_NAME: str = 'archive' + SOURCES_DIR_NAME: str = 'sources' + PERSONAS_DIR_NAME: str = 'personas' + CRONTABS_DIR_NAME: str = 'crontabs' + CACHE_DIR_NAME: str = 'cache' + LOGS_DIR_NAME: str = 'logs' + LIB_DIR_NAME: str = 'lib' + TMP_DIR_NAME: str = 'tmp' + OUTPUT_DIR: Path = archivebox.DATA_DIR + ARCHIVE_DIR: Path = archivebox.DATA_DIR / ARCHIVE_DIR_NAME + SOURCES_DIR: Path = archivebox.DATA_DIR / SOURCES_DIR_NAME + PERSONAS_DIR: Path = archivebox.DATA_DIR / PERSONAS_DIR_NAME + CACHE_DIR: Path = archivebox.DATA_DIR / CACHE_DIR_NAME + LOGS_DIR: Path = archivebox.DATA_DIR / LOGS_DIR_NAME + LIB_DIR: Path = archivebox.DATA_DIR / LIB_DIR_NAME + TMP_DIR: Path = archivebox.DATA_DIR / TMP_DIR_NAME + CUSTOM_TEMPLATES_DIR: Path = archivebox.DATA_DIR / CUSTOM_TEMPLATES_DIR_NAME + USER_PLUGINS_DIR: Path = archivebox.DATA_DIR / USER_PLUGINS_DIR_NAME + + LIB_PIP_DIR: Path = LIB_DIR / 'pip' + LIB_NPM_DIR: Path = LIB_DIR / 'npm' + LIB_BROWSERS_DIR: Path = LIB_DIR / 'browsers' + LIB_BIN_DIR: Path = LIB_DIR / 'bin' + BIN_DIR: Path = LIB_BIN_DIR + + CONFIG_FILENAME: str = 'ArchiveBox.conf' + SQL_INDEX_FILENAME: str = 'index.sqlite3' + + CONFIG_FILE: Path = archivebox.DATA_DIR / CONFIG_FILENAME + DATABASE_FILE: Path = archivebox.DATA_DIR / SQL_INDEX_FILENAME + QUEUE_DATABASE_FILE: Path = archivebox.DATA_DIR / SQL_INDEX_FILENAME.replace('index.', 'queue.') + + JSON_INDEX_FILENAME: str 
= 'index.json' + HTML_INDEX_FILENAME: str = 'index.html' + ROBOTS_TXT_FILENAME: str = 'robots.txt' + FAVICON_FILENAME: str = 'favicon.ico' + + STATICFILE_EXTENSIONS: frozenset[str] = frozenset(( + # 99.999% of the time, URLs ending in these extensions are static files + # that can be downloaded as-is, not html pages that need to be rendered + 'gif', 'jpeg', 'jpg', 'png', 'tif', 'tiff', 'wbmp', 'ico', 'jng', 'bmp', + 'svg', 'svgz', 'webp', 'ps', 'eps', 'ai', + 'mp3', 'mp4', 'm4a', 'mpeg', 'mpg', 'mkv', 'mov', 'webm', 'm4v', + 'flv', 'wmv', 'avi', 'ogg', 'ts', 'm3u8', + 'pdf', 'txt', 'rtf', 'rtfd', 'doc', 'docx', 'ppt', 'pptx', 'xls', 'xlsx', + 'atom', 'rss', 'css', 'js', 'json', + 'dmg', 'iso', 'img', + 'rar', 'war', 'hqx', 'zip', 'gz', 'bz2', '7z', + + # Less common extensions to consider adding later + # jar, swf, bin, com, exe, dll, deb + # ear, hqx, eot, wmlc, kml, kmz, cco, jardiff, jnlp, run, msi, msp, msm, + # pl pm, prc pdb, rar, rpm, sea, sit, tcl tk, der, pem, crt, xpi, xspf, + # ra, mng, asx, asf, 3gpp, 3gp, mid, midi, kar, jad, wml, htc, mml + + # These are always treated as pages, not as static files, never add them: + # html, htm, shtml, xhtml, xml, aspx, php, cgi + )) + + INGORED_PATHS: frozenset[str] = frozenset(( + ".git", + ".svn", + ".DS_Store", + ".gitignore", + "lost+found", + ".DS_Store", + ".env", + "Dockerfile", + )) + PIP_RELATED_NAMES: frozenset[str] = frozenset(( + ".venv", + "venv", + "virtualenv", + ".virtualenv", + )) + NPM_RELATED_NAMES: frozenset[str] = frozenset(( + "node_modules", + "package.json", + "package-lock.json", + "yarn.lock", + )) + + DATA_DIR_NAMES: frozenset[str] = frozenset(( + ARCHIVE_DIR_NAME, + SOURCES_DIR_NAME, + LOGS_DIR_NAME, + CACHE_DIR_NAME, + LIB_DIR_NAME, + PERSONAS_DIR_NAME, + CUSTOM_TEMPLATES_DIR_NAME, + USER_PLUGINS_DIR_NAME, + )) + DATA_DIRS: frozenset[Path] = frozenset(archivebox.DATA_DIR / dirname for dirname in DATA_DIR_NAMES) + DATA_FILE_NAMES: frozenset[str] = frozenset(( + CONFIG_FILENAME, + SQL_INDEX_FILENAME, + f"{SQL_INDEX_FILENAME}-wal", + f"{SQL_INDEX_FILENAME}-shm", + "queue.sqlite3", + "queue.sqlite3-wal", + "queue.sqlite3-shm", + "search.sqlite3", + JSON_INDEX_FILENAME, + HTML_INDEX_FILENAME, + ROBOTS_TXT_FILENAME, + FAVICON_FILENAME, + CONFIG_FILENAME, + f"{CONFIG_FILENAME}.bak", + "static_index.json", + )) + + # When initializing archivebox in a new directory, we check to make sure the dir is + # actually empty so that we dont clobber someone's home directory or desktop by accident. + # These files are exceptions to the is_empty check when we're trying to init a new dir, + # as they could be from a previous archivebox version, system artifacts, dependencies, etc. 
+ ALLOWED_IN_OUTPUT_DIR: frozenset[str] = frozenset(( + *INGORED_PATHS, + *PIP_RELATED_NAMES, + *NPM_RELATED_NAMES, + *DATA_DIR_NAMES, + *DATA_FILE_NAMES, + "static", # created by old static exports Dict[str, Path]: - """{"plugins_pkg.pip": "/app/archivebox/plugins_pkg/pip", "user_plugins.other": "/data/user_plugins/other",...}""" return { f"{prefix}.{plugin_entrypoint.parent.name}": plugin_entrypoint.parent for plugin_entrypoint in sorted(plugins_dir.glob("*/apps.py")) # key=get_plugin_order # Someday enforcing plugin import order may be required, but right now it's not needed - } + } # "plugins_pkg.pip": "/app/archivebox/plugins_pkg/pip" PLUGIN_DIRS = { 'plugins_sys': PACKAGE_DIR / 'plugins_sys', 'plugins_pkg': PACKAGE_DIR / 'plugins_pkg', 'plugins_auth': PACKAGE_DIR / 'plugins_auth', - 'plugins_search': PACKAGE_DIR / 'plugins_search', + 'plugins_search': PACKAGE_DIR / 'plugins_search', 'plugins_extractor': PACKAGE_DIR / 'plugins_extractor', 'user_plugins': DATA_DIR / 'user_plugins', } @@ -59,17 +49,17 @@ for plugin_prefix, plugin_dir in PLUGIN_DIRS.items(): ### Plugins Globals (filled by plugin_type.pluginname.apps.PluginName.register() after Django startup) -PLUGINS = AttrDict({}) -HOOKS = AttrDict({}) +PLUGINS = benedict({}) +HOOKS = benedict({}) -# Created later by Hook.register(settings) when each Plugin.register(settings) is called -# CONFIGS = AttrDict({}) -# BINPROVIDERS = AttrDict({}) -# BINARIES = AttrDict({}) -# EXTRACTORS = AttrDict({}) -# REPLAYERS = AttrDict({}) -# CHECKS = AttrDict({}) -# ADMINDATAVIEWS = AttrDict({}) +# Created later by Plugin.register(settings) -> Hook.register(settings): +# CONFIGS = benedict({}) +# BINPROVIDERS = benedict({}) +# BINARIES = benedict({}) +# EXTRACTORS = benedict({}) +# REPLAYERS = benedict({}) +# CHECKS = benedict({}) +# ADMINDATAVIEWS = benedict({}) ################################################################################ @@ -113,7 +103,7 @@ INSTALLED_APPS = [ 'api', # Django-Ninja-based Rest API interfaces, config, APIToken model, etc. 
# ArchiveBox plugins - *INSTALLED_PLUGINS.keys(), # all plugin django-apps found in archivebox/*_plugins and data/user_plugins, + *INSTALLED_PLUGINS.keys(), # all plugin django-apps found in archivebox/plugins_* and data/user_plugins, # plugin.register(settings) is called at import of each plugin (in the order they are listed here), then plugin.ready() is called at AppConfig.ready() time # 3rd-party apps from PyPI that need to be loaded last @@ -164,7 +154,7 @@ if LDAP_CONFIG.LDAP_ENABLED: ################################################################################ STATIC_URL = '/static/' - +TEMPLATES_DIR_NAME = 'templates' STATICFILES_DIRS = [ *([str(CONFIG.CUSTOM_TEMPLATES_DIR / 'static')] if CONFIG.CUSTOM_TEMPLATES_DIR else []), *[ @@ -172,7 +162,7 @@ STATICFILES_DIRS = [ for plugin_dir in PLUGIN_DIRS.values() if (plugin_dir / 'static').is_dir() ], - str(PACKAGE_DIR / CONFIG.TEMPLATES_DIR_NAME / 'static'), + str(PACKAGE_DIR / TEMPLATES_DIR_NAME / 'static'), ] TEMPLATE_DIRS = [ @@ -182,9 +172,9 @@ TEMPLATE_DIRS = [ for plugin_dir in PLUGIN_DIRS.values() if (plugin_dir / 'templates').is_dir() ], - str(PACKAGE_DIR / CONFIG.TEMPLATES_DIR_NAME / 'core'), - str(PACKAGE_DIR / CONFIG.TEMPLATES_DIR_NAME / 'admin'), - str(PACKAGE_DIR / CONFIG.TEMPLATES_DIR_NAME), + str(PACKAGE_DIR / TEMPLATES_DIR_NAME / 'core'), + str(PACKAGE_DIR / TEMPLATES_DIR_NAME / 'admin'), + str(PACKAGE_DIR / TEMPLATES_DIR_NAME), ] TEMPLATES = [ @@ -208,13 +198,14 @@ TEMPLATES = [ ### External Service Settings ################################################################################ +from ..plugins_sys.config.constants import CONSTANTS -CACHE_DB_FILENAME = 'cache.sqlite3' -CACHE_DB_PATH = CONFIG.CACHE_DIR / CACHE_DB_FILENAME -CACHE_DB_TABLE = 'django_cache' +# CACHE_DB_FILENAME = 'cache.sqlite3' +# CACHE_DB_PATH = CONSTANTS.CACHE_DIR / CACHE_DB_FILENAME +# CACHE_DB_TABLE = 'django_cache' -DATABASE_FILE = DATA_DIR / CONFIG.SQL_INDEX_FILENAME -DATABASE_NAME = os.environ.get("ARCHIVEBOX_DATABASE_NAME", str(DATABASE_FILE)) +DATABASE_FILE = DATA_DIR / CONSTANTS.SQL_INDEX_FILENAME +DATABASE_NAME = os.environ.get("ARCHIVEBOX_DATABASE_NAME", str(CONSTANTS.DATABASE_FILE)) QUEUE_DATABASE_NAME = DATABASE_NAME.replace('index.sqlite3', 'queue.sqlite3') @@ -222,6 +213,7 @@ SQLITE_CONNECTION_OPTIONS = { "TIME_ZONE": CONFIG.TIMEZONE, "OPTIONS": { # https://gcollazo.com/optimal-sqlite-settings-for-django/ + # # https://litestream.io/tips/#busy-timeout "timeout": 5, "check_same_thread": False, "transaction_mode": "IMMEDIATE", @@ -345,7 +337,7 @@ STORAGES = { "BACKEND": "django.core.files.storage.FileSystemStorage", "OPTIONS": { "base_url": "/archive/", - "location": CONFIG.ARCHIVE_DIR, + "location": ARCHIVE_DIR, }, }, # "personas": { diff --git a/archivebox/extractors/__init__.py b/archivebox/extractors/__init__.py index 04ccc8d6..700aede7 100644 --- a/archivebox/extractors/__init__.py +++ b/archivebox/extractors/__init__.py @@ -14,7 +14,6 @@ from ..config import ( SAVE_ALLOWLIST_PTN, SAVE_DENYLIST_PTN, ) -from ..core.settings import ERROR_LOG from ..index.schema import ArchiveResult, Link from ..index.sql import write_link_to_sql_index from ..index import ( @@ -109,6 +108,8 @@ def ignore_methods(to_ignore: List[str]) -> Iterable[str]: def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None, created_by_id: int | None=None) -> Link: """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp""" + from django.conf import settings + from 
..search import write_search_index # TODO: Remove when the input is changed to be a snapshot. Suboptimal approach. @@ -169,7 +170,7 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s stats['skipped'] += 1 except Exception as e: # https://github.com/ArchiveBox/ArchiveBox/issues/984#issuecomment-1150541627 - with open(ERROR_LOG, "a", encoding='utf-8') as f: + with open(settings.ERROR_LOG, "a", encoding='utf-8') as f: command = ' '.join(sys.argv) ts = datetime.now(timezone.utc).strftime('%Y-%m-%d__%H:%M:%S') f.write(("\n" + 'Exception in archive_methods.save_{}(Link(url={})) command={}; ts={}'.format( diff --git a/archivebox/extractors/htmltotext.py b/archivebox/extractors/htmltotext.py index 1957579a..29591e69 100644 --- a/archivebox/extractors/htmltotext.py +++ b/archivebox/extractors/htmltotext.py @@ -1,5 +1,7 @@ __package__ = 'archivebox.extractors' +import archivebox + from html.parser import HTMLParser import io from pathlib import Path @@ -8,7 +10,6 @@ from typing import Optional from ..config import ( SAVE_HTMLTOTEXT, TIMEOUT, - VERSION, ) from ..index.schema import Link, ArchiveResult, ArchiveError from ..logging_util import TimedProgress @@ -153,7 +154,7 @@ def save_htmltotext(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO return ArchiveResult( cmd=cmd, pwd=str(out_dir), - cmd_version=VERSION, + cmd_version=archivebox.__version__, output=output, status=status, index_texts=[extracted_text] if extracted_text else [], diff --git a/archivebox/extractors/readability.py b/archivebox/extractors/readability.py index 155438d3..e45d9600 100644 --- a/archivebox/extractors/readability.py +++ b/archivebox/extractors/readability.py @@ -8,17 +8,7 @@ import json from ..index.schema import Link, ArchiveResult, ArchiveError from ..system import run, atomic_write -from ..util import ( - enforce_types, - is_static_file, -) -from ..config import ( - TIMEOUT, - CURL_BINARY, - SAVE_READABILITY, - DEPENDENCIES, - READABILITY_VERSION, -) +from ..util import enforce_types, is_static_file from ..logging_util import TimedProgress from .title import get_html @@ -31,22 +21,29 @@ def get_embed_path(archiveresult=None): @enforce_types def should_save_readability(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool: + from plugins_extractor.readability.apps import READABILITY_CONFIG + if is_static_file(link.url): return False - out_dir = out_dir or Path(link.link_dir) - if not overwrite and (out_dir / get_output_path()).exists(): + output_subdir = (Path(out_dir or link.link_dir) / get_output_path()) + if not overwrite and output_subdir.exists(): return False - return SAVE_READABILITY + return READABILITY_CONFIG.SAVE_READABILITY @enforce_types -def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult: +def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=0) -> ArchiveResult: """download reader friendly version using @mozilla/readability""" + + from plugins_extractor.readability.apps import READABILITY_CONFIG, READABILITY_BINARY + + READABILITY_BIN = READABILITY_BINARY.load() + assert READABILITY_BIN.abspath and READABILITY_BIN.version - out_dir = Path(out_dir or link.link_dir) - output_folder = out_dir.absolute() / get_output_path() + timeout = timeout or READABILITY_CONFIG.READABILITY_TIMEOUT + output_subdir = Path(out_dir or link.link_dir).absolute() / get_output_path() output = get_output_path() # Readability Docs: https://github.com/mozilla/readability @@ -54,13 +51,14 @@ 
def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO status = 'succeeded' # fake command to show the user so they have something to try debugging if get_html fails cmd = [ - CURL_BINARY, - link.url + str(READABILITY_BIN.abspath), + '{dom,singlefile}.html', + link.url, ] readability_content = None timer = TimedProgress(timeout, prefix=' ') try: - document = get_html(link, out_dir) + document = get_html(link, Path(out_dir or link.link_dir)) temp_doc = NamedTemporaryFile(delete=False) temp_doc.write(document.encode("utf-8")) temp_doc.close() @@ -69,26 +67,26 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO raise ArchiveError('Readability could not find HTML to parse for article text') cmd = [ - DEPENDENCIES['READABILITY_BINARY']['path'], + str(READABILITY_BIN.abspath), temp_doc.name, link.url, ] - result = run(cmd, cwd=out_dir, timeout=timeout) + result = run(cmd, cwd=out_dir, timeout=timeout, text=True) try: result_json = json.loads(result.stdout) assert result_json and 'content' in result_json, 'Readability output is not valid JSON' except json.JSONDecodeError: raise ArchiveError('Readability was not able to archive the page (invalid JSON)', result.stdout + result.stderr) - output_folder.mkdir(exist_ok=True) + output_subdir.mkdir(exist_ok=True) readability_content = result_json.pop("textContent") - atomic_write(str(output_folder / "content.html"), result_json.pop("content")) - atomic_write(str(output_folder / "content.txt"), readability_content) - atomic_write(str(output_folder / "article.json"), result_json) + atomic_write(str(output_subdir / "content.html"), result_json.pop("content")) + atomic_write(str(output_subdir / "content.txt"), readability_content) + atomic_write(str(output_subdir / "article.json"), result_json) output_tail = [ line.strip() - for line in (result.stdout + result.stderr).decode().rsplit('\n', 5)[-5:] + for line in (result.stdout + result.stderr).rsplit('\n', 5)[-5:] if line.strip() ] hints = ( @@ -111,7 +109,7 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO return ArchiveResult( cmd=cmd, pwd=str(out_dir), - cmd_version=READABILITY_VERSION, + cmd_version=str(READABILITY_BIN.version), output=output, status=status, index_texts=[readability_content] if readability_content else [], diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py index cb6e0e2a..9b9619e0 100644 --- a/archivebox/index/__init__.py +++ b/archivebox/index/__init__.py @@ -11,20 +11,19 @@ from contextlib import contextmanager from urllib.parse import urlparse from django.db.models import QuerySet, Q + +import archivebox + from ..util import ( scheme, enforce_types, ExtendedEncoder, ) +from ..misc.logging import stderr from ..config import ( - ARCHIVE_DIR_NAME, - SQL_INDEX_FILENAME, - JSON_INDEX_FILENAME, - OUTPUT_DIR, TIMEOUT, URL_DENYLIST_PTN, URL_ALLOWLIST_PTN, - stderr, OUTPUT_PERMISSIONS ) from ..logging_util import ( @@ -224,28 +223,28 @@ def timed_index_update(out_path: Path): @enforce_types -def write_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR, created_by_id: int | None=None) -> None: +def write_main_index(links: List[Link], out_dir: Path=archivebox.DATA_DIR, created_by_id: int | None=None) -> None: """Writes links to sqlite3 file for a given list of links""" log_indexing_process_started(len(links)) try: - with timed_index_update(out_dir / SQL_INDEX_FILENAME): + with timed_index_update(archivebox.CONSTANTS.DATABASE_FILE): write_sql_main_index(links, out_dir=out_dir, 
created_by_id=created_by_id) - os.chmod(out_dir / SQL_INDEX_FILENAME, int(OUTPUT_PERMISSIONS, base=8)) # set here because we don't write it with atomic writes + os.chmod(archivebox.CONSTANTS.DATABASE_FILE, int(OUTPUT_PERMISSIONS, base=8)) # set here because we don't write it with atomic writes except (KeyboardInterrupt, SystemExit): stderr('[!] Warning: Still writing index to disk...', color='lightyellow') stderr(' Run archivebox init to fix any inconsistencies from an ungraceful exit.') - with timed_index_update(out_dir / SQL_INDEX_FILENAME): + with timed_index_update(archivebox.CONSTANTS.DATABASE_FILE): write_sql_main_index(links, out_dir=out_dir, created_by_id=created_by_id) - os.chmod(out_dir / SQL_INDEX_FILENAME, int(OUTPUT_PERMISSIONS, base=8)) # set here because we don't write it with atomic writes + os.chmod(archivebox.CONSTANTS.DATABASE_FILE, int(OUTPUT_PERMISSIONS, base=8)) # set here because we don't write it with atomic writes raise SystemExit(0) log_indexing_process_finished() @enforce_types -def load_main_index(out_dir: Path=OUTPUT_DIR, warn: bool=True) -> List[Link]: +def load_main_index(out_dir: Path=archivebox.DATA_DIR, warn: bool=True) -> List[Link]: """parse and load existing index with any new links from import_path merged in""" from core.models import Snapshot try: @@ -255,8 +254,8 @@ def load_main_index(out_dir: Path=OUTPUT_DIR, warn: bool=True) -> List[Link]: raise SystemExit(0) @enforce_types -def load_main_index_meta(out_dir: Path=OUTPUT_DIR) -> Optional[dict]: - index_path = out_dir / JSON_INDEX_FILENAME +def load_main_index_meta(out_dir: Path=archivebox.DATA_DIR) -> Optional[dict]: + index_path = out_dir / archivebox.CONSTANTS.JSON_INDEX_FILENAME if index_path.exists(): with open(index_path, 'r', encoding='utf-8') as f: meta_dict = pyjson.load(f) @@ -407,7 +406,7 @@ def snapshot_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type return search_filter(snapshots, filter_patterns, filter_type) -def get_indexed_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]: +def get_indexed_folders(snapshots, out_dir: Path=archivebox.DATA_DIR) -> Dict[str, Optional[Link]]: """indexed links without checking archive status or data directory validity""" links = (snapshot.as_link() for snapshot in snapshots.iterator(chunk_size=500)) return { @@ -415,7 +414,7 @@ def get_indexed_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Option for link in links } -def get_archived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]: +def get_archived_folders(snapshots, out_dir: Path=archivebox.DATA_DIR) -> Dict[str, Optional[Link]]: """indexed links that are archived with a valid data directory""" links = (snapshot.as_link() for snapshot in snapshots.iterator(chunk_size=500)) return { @@ -423,7 +422,7 @@ def get_archived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optio for link in filter(is_archived, links) } -def get_unarchived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]: +def get_unarchived_folders(snapshots, out_dir: Path=archivebox.DATA_DIR) -> Dict[str, Optional[Link]]: """indexed links that are unarchived with no data directory or an empty data directory""" links = (snapshot.as_link() for snapshot in snapshots.iterator(chunk_size=500)) return { @@ -431,12 +430,12 @@ def get_unarchived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Opt for link in filter(is_unarchived, links) } -def get_present_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, 
Optional[Link]]: +def get_present_folders(snapshots, out_dir: Path=archivebox.DATA_DIR) -> Dict[str, Optional[Link]]: """dirs that actually exist in the archive/ folder""" all_folders = {} - for entry in (out_dir / ARCHIVE_DIR_NAME).iterdir(): + for entry in (out_dir / archivebox.CONSTANTS.ARCHIVE_DIR_NAME).iterdir(): if entry.is_dir(): link = None try: @@ -448,7 +447,7 @@ def get_present_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Option return all_folders -def get_valid_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]: +def get_valid_folders(snapshots, out_dir: Path=archivebox.DATA_DIR) -> Dict[str, Optional[Link]]: """dirs with a valid index matched to the main index and archived content""" links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator(chunk_size=500)] return { @@ -456,16 +455,16 @@ def get_valid_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional for link in filter(is_valid, links) } -def get_invalid_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]: +def get_invalid_folders(snapshots, out_dir: Path=archivebox.DATA_DIR) -> Dict[str, Optional[Link]]: """dirs that are invalid for any reason: corrupted/duplicate/orphaned/unrecognized""" - duplicate = get_duplicate_folders(snapshots, out_dir=OUTPUT_DIR) - orphaned = get_orphaned_folders(snapshots, out_dir=OUTPUT_DIR) - corrupted = get_corrupted_folders(snapshots, out_dir=OUTPUT_DIR) - unrecognized = get_unrecognized_folders(snapshots, out_dir=OUTPUT_DIR) + duplicate = get_duplicate_folders(snapshots, out_dir=out_dir) + orphaned = get_orphaned_folders(snapshots, out_dir=out_dir) + corrupted = get_corrupted_folders(snapshots, out_dir=out_dir) + unrecognized = get_unrecognized_folders(snapshots, out_dir=out_dir) return {**duplicate, **orphaned, **corrupted, **unrecognized} -def get_duplicate_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]: +def get_duplicate_folders(snapshots, out_dir: Path=archivebox.DATA_DIR) -> Dict[str, Optional[Link]]: """dirs that conflict with other directories that have the same link URL or timestamp""" by_url = {} by_timestamp = {} @@ -473,7 +472,7 @@ def get_duplicate_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Opti data_folders = ( str(entry) - for entry in (Path(out_dir) / ARCHIVE_DIR_NAME).iterdir() + for entry in archivebox.CONSTANTS.ARCHIVE_DIR.iterdir() if entry.is_dir() and not snapshots.filter(timestamp=entry.name).exists() ) @@ -499,11 +498,11 @@ def get_duplicate_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Opti duplicate_folders[path] = link return duplicate_folders -def get_orphaned_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]: +def get_orphaned_folders(snapshots, out_dir: Path=archivebox.DATA_DIR) -> Dict[str, Optional[Link]]: """dirs that contain a valid index but aren't listed in the main index""" orphaned_folders = {} - for entry in (Path(out_dir) / ARCHIVE_DIR_NAME).iterdir(): + for entry in archivebox.CONSTANTS.ARCHIVE_DIR.iterdir(): if entry.is_dir(): link = None try: @@ -517,7 +516,7 @@ def get_orphaned_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optio return orphaned_folders -def get_corrupted_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]: +def get_corrupted_folders(snapshots, out_dir: Path=archivebox.DATA_DIR) -> Dict[str, Optional[Link]]: """dirs that don't contain a valid index and aren't listed in the main index""" corrupted = {} for snapshot in 
snapshots.iterator(chunk_size=500): @@ -526,11 +525,11 @@ def get_corrupted_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Opti corrupted[link.link_dir] = link return corrupted -def get_unrecognized_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]: +def get_unrecognized_folders(snapshots, out_dir: Path=archivebox.DATA_DIR) -> Dict[str, Optional[Link]]: """dirs that don't contain recognizable archive data and aren't listed in the main index""" unrecognized_folders: Dict[str, Optional[Link]] = {} - for entry in (Path(out_dir) / ARCHIVE_DIR_NAME).iterdir(): + for entry in (Path(out_dir) / archivebox.CONSTANTS.ARCHIVE_DIR_NAME).iterdir(): if entry.is_dir(): index_exists = (entry / "index.json").exists() link = None @@ -595,10 +594,10 @@ def is_unarchived(link: Link) -> bool: return not link.is_archived -def fix_invalid_folder_locations(out_dir: Path=OUTPUT_DIR) -> Tuple[List[str], List[str]]: +def fix_invalid_folder_locations(out_dir: Path=archivebox.DATA_DIR) -> Tuple[List[str], List[str]]: fixed = [] cant_fix = [] - for entry in os.scandir(out_dir / ARCHIVE_DIR_NAME): + for entry in os.scandir(out_dir / archivebox.CONSTANTS.ARCHIVE_DIR_NAME): if entry.is_dir(follow_symlinks=True): if (Path(entry.path) / 'index.json').exists(): try: @@ -609,7 +608,7 @@ def fix_invalid_folder_locations(out_dir: Path=OUTPUT_DIR) -> Tuple[List[str], L continue if not entry.path.endswith(f'/{link.timestamp}'): - dest = out_dir / ARCHIVE_DIR_NAME / link.timestamp + dest = out_dir /archivebox.CONSTANTS.ARCHIVE_DIR_NAME / link.timestamp if dest.exists(): cant_fix.append(entry.path) else: diff --git a/archivebox/index/html.py b/archivebox/index/html.py index 8ea32446..747928c5 100644 --- a/archivebox/index/html.py +++ b/archivebox/index/html.py @@ -1,11 +1,12 @@ __package__ = 'archivebox.index' +import archivebox from pathlib import Path from datetime import datetime, timezone from collections import defaultdict from typing import List, Optional, Iterator, Mapping -from django.utils.html import format_html, mark_safe +from django.utils.html import format_html, mark_safe # type: ignore from django.core.cache import cache from .schema import Link @@ -19,10 +20,6 @@ from ..util import ( urldecode, ) from ..config import ( - OUTPUT_DIR, - VERSION, - FOOTER_INFO, - HTML_INDEX_FILENAME, SAVE_ARCHIVE_DOT_ORG, PREVIEW_ORIGINALS, ) @@ -36,10 +33,12 @@ TITLE_LOADING_MSG = 'Not yet archived...' 
### Main Links Index @enforce_types -def parse_html_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[str]: +def parse_html_main_index(out_dir: Path=archivebox.DATA_DIR) -> Iterator[str]: """parse an archive index html file and return the list of urls""" - index_path = Path(out_dir) / HTML_INDEX_FILENAME + from plugins_sys.config.constants import CONSTANTS + + index_path = Path(out_dir) / CONSTANTS.HTML_INDEX_FILENAME if index_path.exists(): with open(index_path, 'r', encoding='utf-8') as f: for line in f: @@ -59,14 +58,16 @@ def generate_index_from_links(links: List[Link], with_headers: bool): def main_index_template(links: List[Link], template: str=MAIN_INDEX_TEMPLATE) -> str: """render the template for the entire main index""" + from plugins_sys.config.apps import SHELL_CONFIG, SERVER_CONFIG + return render_django_template(template, { - 'version': VERSION, - 'git_sha': VERSION, # not used anymore, but kept for backwards compatibility + 'version': archivebox.VERSION, + 'git_sha': SHELL_CONFIG.COMMIT_HASH or archivebox.VERSION, 'num_links': str(len(links)), 'date_updated': datetime.now(timezone.utc).strftime('%Y-%m-%d'), 'time_updated': datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M'), 'links': [link._asdict(extended=True) for link in links], - 'FOOTER_INFO': FOOTER_INFO, + 'FOOTER_INFO': SERVER_CONFIG.FOOTER_INFO, }) @@ -74,10 +75,11 @@ def main_index_template(links: List[Link], template: str=MAIN_INDEX_TEMPLATE) -> @enforce_types def write_html_link_details(link: Link, out_dir: Optional[str]=None) -> None: + from plugins_sys.config.constants import CONSTANTS out_dir = out_dir or link.link_dir rendered_html = link_details_template(link) - atomic_write(str(Path(out_dir) / HTML_INDEX_FILENAME), rendered_html) + atomic_write(str(Path(out_dir) / CONSTANTS.HTML_INDEX_FILENAME), rendered_html) @enforce_types diff --git a/archivebox/index/json.py b/archivebox/index/json.py index 6585009d..06455053 100644 --- a/archivebox/index/json.py +++ b/archivebox/index/json.py @@ -8,38 +8,36 @@ from pathlib import Path from datetime import datetime, timezone from typing import List, Optional, Iterator, Any, Union +import archivebox + from .schema import Link from ..system import atomic_write from ..util import enforce_types -from ..config import ( - VERSION, - OUTPUT_DIR, - FOOTER_INFO, - DEPENDENCIES, - JSON_INDEX_FILENAME, - ARCHIVE_DIR_NAME, - ANSI -) -MAIN_INDEX_HEADER = { - 'info': 'This is an index of site data archived by ArchiveBox: The self-hosted web archive.', - 'schema': 'archivebox.index.json', - 'copyright_info': FOOTER_INFO, - 'meta': { - 'project': 'ArchiveBox', - 'version': VERSION, - 'git_sha': VERSION, # not used anymore, but kept for backwards compatibility - 'website': 'https://ArchiveBox.io', - 'docs': 'https://github.com/ArchiveBox/ArchiveBox/wiki', - 'source': 'https://github.com/ArchiveBox/ArchiveBox', - 'issues': 'https://github.com/ArchiveBox/ArchiveBox/issues', - 'dependencies': DEPENDENCIES, - }, -} @enforce_types def generate_json_index_from_links(links: List[Link], with_headers: bool): + from django.conf import settings + from plugins_sys.config.apps import SERVER_CONFIG + + MAIN_INDEX_HEADER = { + 'info': 'This is an index of site data archived by ArchiveBox: The self-hosted web archive.', + 'schema': 'archivebox.index.json', + 'copyright_info': SERVER_CONFIG.FOOTER_INFO, + 'meta': { + 'project': 'ArchiveBox', + 'version': archivebox.VERSION, + 'git_sha': archivebox.VERSION, # not used anymore, but kept for backwards compatibility + 'website': 'https://ArchiveBox.io', + 
'docs': 'https://github.com/ArchiveBox/ArchiveBox/wiki', + 'source': 'https://github.com/ArchiveBox/ArchiveBox', + 'issues': 'https://github.com/ArchiveBox/ArchiveBox/issues', + 'dependencies': settings.BINARIES.to_dict(), + }, + } + + if with_headers: output = { **MAIN_INDEX_HEADER, @@ -54,10 +52,12 @@ def generate_json_index_from_links(links: List[Link], with_headers: bool): @enforce_types -def parse_json_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[Link]: +def parse_json_main_index(out_dir: Path=archivebox.DATA_DIR) -> Iterator[Link]: """parse an archive index json file and return the list of links""" - index_path = Path(out_dir) / JSON_INDEX_FILENAME + from plugins_sys.config.constants import CONSTANTS + + index_path = Path(out_dir) / CONSTANTS.JSON_INDEX_FILENAME if index_path.exists(): with open(index_path, 'r', encoding='utf-8') as f: try: @@ -77,14 +77,14 @@ def parse_json_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[Link]: yield Link.from_json(link_json) except KeyError: try: - detail_index_path = Path(OUTPUT_DIR) / ARCHIVE_DIR_NAME / link_json['timestamp'] + detail_index_path = CONSTANTS.ARCHIVE_DIR / link_json['timestamp'] yield parse_json_link_details(str(detail_index_path)) except KeyError: # as a last effort, try to guess the missing values out of existing ones try: yield Link.from_json(link_json, guess=True) except KeyError: - print(" {lightyellow}! Failed to load the index.json from {}".format(detail_index_path, **ANSI)) + # print(" {lightyellow}! Failed to load the index.json from {}".format(detail_index_path, **ANSI)) continue return () @@ -94,15 +94,19 @@ def parse_json_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[Link]: def write_json_link_details(link: Link, out_dir: Optional[str]=None) -> None: """write a json file with some info about the link""" + from plugins_sys.config.constants import CONSTANTS + out_dir = out_dir or link.link_dir - path = Path(out_dir) / JSON_INDEX_FILENAME + path = Path(out_dir) / CONSTANTS.JSON_INDEX_FILENAME atomic_write(str(path), link._asdict(extended=True)) @enforce_types -def parse_json_link_details(out_dir: Union[Path, str], guess: Optional[bool]=False) -> Optional[Link]: +def parse_json_link_details(out_dir: Union[Path, str], guess: bool=False) -> Optional[Link]: """load the json link index from a given directory""" - existing_index = Path(out_dir) / JSON_INDEX_FILENAME + from plugins_sys.config.constants import CONSTANTS + + existing_index = Path(out_dir) / CONSTANTS.JSON_INDEX_FILENAME if existing_index.exists(): with open(existing_index, 'r', encoding='utf-8') as f: try: @@ -117,7 +121,9 @@ def parse_json_link_details(out_dir: Union[Path, str], guess: Optional[bool]=Fal def parse_json_links_details(out_dir: Union[Path, str]) -> Iterator[Link]: """read through all the archive data folders and return the parsed links""" - for entry in os.scandir(Path(out_dir) / ARCHIVE_DIR_NAME): + from plugins_sys.config.constants import CONSTANTS + + for entry in os.scandir(CONSTANTS.ARCHIVE_DIR): if entry.is_dir(follow_symlinks=True): if (Path(entry.path) / 'index.json').exists(): try: diff --git a/archivebox/logging_util.py b/archivebox/logging_util.py index 9563011e..b4e4f975 100644 --- a/archivebox/logging_util.py +++ b/archivebox/logging_util.py @@ -4,8 +4,11 @@ import re import os import sys import stat +import shutil import time import argparse +import archivebox + from math import log from multiprocessing import Process from pathlib import Path @@ -22,18 +25,7 @@ from rich.panel import Panel from .system import get_dir_size from 
.util import enforce_types -from .config import ( - ConfigDict, - OUTPUT_DIR, - VERSION, - ANSI, - IS_TTY, - IN_DOCKER, - TERM_WIDTH, - SHOW_PROGRESS, - SOURCES_DIR_NAME, - stderr, -) +from .misc.logging import ANSI, stderr @dataclass class RuntimeStats: @@ -102,7 +94,7 @@ def reject_stdin(caller: str, stdin: Optional[IO]=sys.stdin) -> None: if not stdin: return None - if IN_DOCKER: + if os.environ.get('IN_DOCKER') in ('1', 'true', 'True', 'TRUE', 'yes'): # when TTY is disabled in docker we cant tell if stdin is being piped in or not # if we try to read stdin when its not piped we will hang indefinitely waiting for it return None @@ -141,9 +133,14 @@ class TimedProgress: def __init__(self, seconds, prefix=''): - self.SHOW_PROGRESS = SHOW_PROGRESS + from plugins_sys.config.apps import SHELL_CONFIG + + self.SHOW_PROGRESS = SHELL_CONFIG.SHOW_PROGRESS + self.ANSI = SHELL_CONFIG.ANSI + self.TERM_WIDTH = lambda: shutil.get_terminal_size().columns # lambda so it live-updates when terminal is resized + if self.SHOW_PROGRESS: - self.p = Process(target=progress_bar, args=(seconds, prefix)) + self.p = Process(target=progress_bar, args=(seconds, prefix, self.ANSI)) self.p.start() self.stats = {'start_ts': datetime.now(timezone.utc), 'end_ts': None} @@ -172,7 +169,7 @@ class TimedProgress: # clear whole terminal line try: - sys.stdout.write('\r{}{}\r'.format((' ' * TERM_WIDTH()), ANSI['reset'])) + sys.stdout.write('\r{}{}\r'.format((' ' * self.TERM_WIDTH()), self.ANSI['reset'])) except (IOError, BrokenPipeError): # ignore when the parent proc has stopped listening to our stdout pass @@ -181,9 +178,10 @@ class TimedProgress: @enforce_types -def progress_bar(seconds: int, prefix: str='') -> None: +def progress_bar(seconds: int, prefix: str='', ANSI: Dict[str, str]=ANSI) -> None: """show timer in the form of progress bar, with percentage and seconds remaining""" - chunk = '█' if (sys.stdout or sys.__stdout__).encoding.upper() == 'UTF-8' else '#' + output_buf = (sys.stdout or sys.__stdout__ or sys.stderr or sys.__stderr__) + chunk = '█' if output_buf and output_buf.encoding.upper() == 'UTF-8' else '#' last_width = TERM_WIDTH() chunks = last_width - len(prefix) - 20 # number of progress chunks to show (aka max bar width) try: @@ -236,18 +234,15 @@ def log_cli_command(subcommand: str, subcommand_args: List[str], stdin: Optional args = ' '.join(subcommand_args) version_msg = '[dark_magenta]\\[i] [{now}] ArchiveBox v{VERSION}: [/dark_magenta][green4]archivebox [green3]{subcommand}[green2] {args}[/green2]'.format( now=datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S'), - VERSION=VERSION, + VERSION=archivebox.__version__, subcommand=subcommand, args=args, ) # stderr() # stderr('[bright_black] > {pwd}[/]'.format(pwd=pwd, **ANSI)) # stderr() - if SHOW_PROGRESS: - print(Panel(version_msg), file=sys.stderr) - else: - print(version_msg, file=sys.stderr) - + print(Panel(version_msg), file=sys.stderr) + ### Parsing Stage @@ -261,7 +256,8 @@ def log_importing_started(urls: Union[str, List[str]], depth: int, index_only: b )) def log_source_saved(source_file: str): - print(' > Saved verbatim input to {}/{}'.format(SOURCES_DIR_NAME, source_file.rsplit('/', 1)[-1])) + from plugins_sys.config.constants import CONSTANTS + print(' > Saved verbatim input to {}/{}'.format(CONSTANTS.SOURCES_DIR_NAME, source_file.rsplit('/', 1)[-1])) def log_parsing_finished(num_parsed: int, parser_name: str): _LAST_RUN_STATS.parse_end_ts = datetime.now(timezone.utc) @@ -293,12 +289,14 @@ def log_indexing_process_finished(): def 
log_indexing_started(out_path: str): - if IS_TTY: - sys.stdout.write(f' > ./{Path(out_path).relative_to(OUTPUT_DIR)}') + from plugins_sys.config.apps import SHELL_CONFIG + + if SHELL_CONFIG.IS_TTY: + sys.stdout.write(f' > ./{Path(out_path).relative_to(archivebox.DATA_DIR)}') def log_indexing_finished(out_path: str): - print(f'\r √ ./{Path(out_path).relative_to(OUTPUT_DIR)}') + print(f'\r √ ./{Path(out_path).relative_to(archivebox.DATA_DIR)}') ### Archiving Stage @@ -447,7 +445,7 @@ def log_archive_method_finished(result: "ArchiveResult"): ) docker_hints = () - if IN_DOCKER: + if os.environ.get('IN_DOCKER') in ('1', 'true', 'True', 'TRUE', 'yes'): docker_hints = ( ' docker run -it -v $PWD/data:/data archivebox/archivebox /bin/bash', ) @@ -534,7 +532,7 @@ def log_shell_welcome_msg(): ### Helpers @enforce_types -def pretty_path(path: Union[Path, str], pwd: Union[Path, str]=OUTPUT_DIR) -> str: +def pretty_path(path: Union[Path, str], pwd: Union[Path, str]=archivebox.DATA_DIR) -> str: """convert paths like .../ArchiveBox/archivebox/../output/abc into output/abc""" pwd = str(Path(pwd)) # .resolve() path = str(path) @@ -577,7 +575,7 @@ def printable_folders(folders: Dict[str, Optional["Link"]], @enforce_types -def printable_config(config: ConfigDict, prefix: str='') -> str: +def printable_config(config: dict, prefix: str='') -> str: return f'\n{prefix}'.join( f'{key}={val}' for key, val in config.items() diff --git a/archivebox/main.py b/archivebox/main.py index 0b6b83ca..10c96807 100755 --- a/archivebox/main.py +++ b/archivebox/main.py @@ -6,6 +6,8 @@ import shutil import platform import archivebox +CONSTANTS = archivebox.CONSTANTS + from typing import Dict, List, Optional, Iterable, IO, Union from pathlib import Path from datetime import date, datetime @@ -66,47 +68,25 @@ from .index.html import ( ) from .index.csv import links_to_csv from .extractors import archive_links, archive_link, ignore_methods -from .misc.logging import stderr, hint +from .misc.logging import stderr, hint, ANSI from .misc.checks import check_data_folder, check_dependencies from .config import ( setup_django_minimal, ConfigDict, - ANSI, IS_TTY, DEBUG, IN_DOCKER, IN_QEMU, PUID, PGID, - USER, TIMEZONE, - ENFORCE_ATOMIC_WRITES, - OUTPUT_PERMISSIONS, ONLY_NEW, - OUTPUT_DIR, - SOURCES_DIR, - ARCHIVE_DIR, - LOGS_DIR, - PACKAGE_DIR, - CONFIG_FILE, - ARCHIVE_DIR_NAME, JSON_INDEX_FILENAME, HTML_INDEX_FILENAME, SQL_INDEX_FILENAME, - ALLOWED_IN_OUTPUT_DIR, LDAP, write_config_file, - VERSION, - COMMIT_HASH, - BUILD_TIME, - CODE_LOCATIONS, - DATA_LOCATIONS, DEPENDENCIES, - YOUTUBEDL_BINARY, - YOUTUBEDL_VERSION, - SINGLEFILE_VERSION, - READABILITY_VERSION, - MERCURY_VERSION, load_all_config, CONFIG, USER_CONFIG, @@ -114,7 +94,6 @@ from .config import ( setup_django, ) from .logging_util import ( - TERM_WIDTH, TimedProgress, log_importing_started, log_crawl_started, @@ -129,9 +108,14 @@ from .logging_util import ( printable_dependency_version, ) +VERSION = archivebox.VERSION +PACKAGE_DIR = archivebox.PACKAGE_DIR +OUTPUT_DIR = archivebox.DATA_DIR +ARCHIVE_DIR = archivebox.DATA_DIR / 'archive' + @enforce_types -def help(out_dir: Path=OUTPUT_DIR) -> None: +def help(out_dir: Path=archivebox.DATA_DIR) -> None: """Print the ArchiveBox help message and usage""" all_subcommands = CLI_SUBCOMMANDS @@ -207,7 +191,7 @@ def version(quiet: bool=False, """Print the ArchiveBox version and dependency information""" setup_django_minimal() - from plugins_sys.config.apps import SEARCH_BACKEND_CONFIG, STORAGE_CONFIG, SHELL_CONFIG + from 
plugins_sys.config.apps import SEARCH_BACKEND_CONFIG, STORAGE_CONFIG, SHELL_CONFIG, CONSTANTS from plugins_auth.ldap.apps import LDAP_CONFIG from django.conf import settings @@ -223,8 +207,8 @@ def version(quiet: bool=False, p = platform.uname() print( 'ArchiveBox v{}'.format(archivebox.__version__), - f'COMMIT_HASH={COMMIT_HASH[:7] if COMMIT_HASH else "unknown"}', - f'BUILD_TIME={BUILD_TIME}', + f'COMMIT_HASH={SHELL_CONFIG.COMMIT_HASH[:7] if SHELL_CONFIG.COMMIT_HASH else "unknown"}', + f'BUILD_TIME={SHELL_CONFIG.BUILD_TIME}', ) print( f'IN_DOCKER={IN_DOCKER}', @@ -234,7 +218,7 @@ def version(quiet: bool=False, f'PLATFORM={platform.platform()}', f'PYTHON={sys.implementation.name.title()}', ) - OUTPUT_IS_REMOTE_FS = DATA_LOCATIONS['OUTPUT_DIR']['is_mount'] or DATA_LOCATIONS['ARCHIVE_DIR']['is_mount'] + OUTPUT_IS_REMOTE_FS = CONSTANTS.DATA_LOCATIONS['OUTPUT_DIR']['is_mount'] or CONSTANTS.DATA_LOCATIONS['ARCHIVE_DIR']['is_mount'] print( f'FS_ATOMIC={STORAGE_CONFIG.ENFORCE_ATOMIC_WRITES}', f'FS_REMOTE={OUTPUT_IS_REMOTE_FS}', @@ -268,17 +252,18 @@ def version(quiet: bool=False, except Exception as e: err = e loaded_bin = binary + raise print('', '√' if loaded_bin.is_valid else 'X', '', loaded_bin.name.ljust(21), str(loaded_bin.version).ljust(15), loaded_bin.abspath or str(err)) print() print('{white}[i] Source-code locations:{reset}'.format(**ANSI)) - for name, path in CODE_LOCATIONS.items(): + for name, path in CONSTANTS.CODE_LOCATIONS.items(): print(printable_folder_status(name, path)) print() - if DATA_LOCATIONS['OUTPUT_DIR']['is_valid']: + if CONSTANTS.DATABASE_FILE.exists() or CONSTANTS.ARCHIVE_DIR.exists() or CONSTANTS.CONFIG_FILE.exists(): print('{white}[i] Data locations:{reset}'.format(**ANSI)) - for name, path in DATA_LOCATIONS.items(): + for name, path in CONSTANTS.DATA_LOCATIONS.items(): print(printable_folder_status(name, path)) else: print() @@ -303,19 +288,19 @@ def run(subcommand: str, @enforce_types -def init(force: bool=False, quick: bool=False, setup: bool=False, out_dir: Path=OUTPUT_DIR) -> None: +def init(force: bool=False, quick: bool=False, setup: bool=False, out_dir: Path=archivebox.DATA_DIR) -> None: """Initialize a new ArchiveBox collection in the current directory""" from core.models import Snapshot out_dir.mkdir(exist_ok=True) - is_empty = not len(set(os.listdir(out_dir)) - ALLOWED_IN_OUTPUT_DIR) + is_empty = not len(set(os.listdir(out_dir)) - CONSTANTS.ALLOWED_IN_OUTPUT_DIR) - if (out_dir / JSON_INDEX_FILENAME).exists(): + if (out_dir / archivebox.CONSTANTS.JSON_INDEX_FILENAME).exists(): stderr("[!] This folder contains a JSON index. 
It is deprecated, and will no longer be kept up to date automatically.", color="lightyellow") stderr(" You can run `archivebox list --json --with-headers > static_index.json` to manually generate it.", color="lightyellow") - existing_index = (out_dir / SQL_INDEX_FILENAME).exists() + existing_index = archivebox.CONSTANTS.DATABASE_FILE.exists() if is_empty and not existing_index: print('{green}[+] Initializing a new ArchiveBox v{} collection...{reset}'.format(VERSION, **ANSI)) @@ -344,25 +329,24 @@ def init(force: bool=False, quick: bool=False, setup: bool=False, out_dir: Path= else: print('\n{green}[+] Building archive folder structure...{reset}'.format(**ANSI)) - print(f' + ./{ARCHIVE_DIR.relative_to(OUTPUT_DIR)}, ./{SOURCES_DIR.relative_to(OUTPUT_DIR)}, ./{LOGS_DIR.relative_to(OUTPUT_DIR)}...') - Path(SOURCES_DIR).mkdir(exist_ok=True) - Path(ARCHIVE_DIR).mkdir(exist_ok=True) - Path(LOGS_DIR).mkdir(exist_ok=True) - print(f' + ./{CONFIG_FILE.relative_to(OUTPUT_DIR)}...') + print(f' + ./{CONSTANTS.ARCHIVE_DIR.relative_to(OUTPUT_DIR)}, ./{CONSTANTS.SOURCES_DIR.relative_to(OUTPUT_DIR)}, ./{CONSTANTS.LOGS_DIR.relative_to(OUTPUT_DIR)}...') + Path(CONSTANTS.SOURCES_DIR).mkdir(exist_ok=True) + Path(CONSTANTS.ARCHIVE_DIR).mkdir(exist_ok=True) + Path(CONSTANTS.LOGS_DIR).mkdir(exist_ok=True) + print(f' + ./{CONSTANTS.CONFIG_FILE.relative_to(OUTPUT_DIR)}...') write_config_file({}, out_dir=out_dir) - if (out_dir / SQL_INDEX_FILENAME).exists(): + if CONSTANTS.DATABASE_FILE.exists(): print('\n{green}[*] Verifying main SQL index and running any migrations needed...{reset}'.format(**ANSI)) else: print('\n{green}[+] Building main SQL index and running initial migrations...{reset}'.format(**ANSI)) - DATABASE_FILE = out_dir / SQL_INDEX_FILENAME for migration_line in apply_migrations(out_dir): print(f' {migration_line}') - assert DATABASE_FILE.exists() + assert CONSTANTS.DATABASE_FILE.exists() print() - print(f' √ ./{DATABASE_FILE.relative_to(OUTPUT_DIR)}') + print(f' √ ./{CONSTANTS.DATABASE_FILE.relative_to(OUTPUT_DIR)}') # from django.contrib.auth.models import User # if IS_TTY and not User.objects.filter(is_superuser=True).exists(): @@ -477,7 +461,7 @@ def status(out_dir: Path=OUTPUT_DIR) -> None: check_data_folder(CONFIG) from core.models import Snapshot - from django.contrib.auth import get_user_model + from django.contrib.auth import get_user_mod, SHELL_CONFIG User = get_user_model() print('{green}[*] Scanning archive main index...{reset}'.format(**ANSI)) @@ -491,7 +475,7 @@ def status(out_dir: Path=OUTPUT_DIR) -> None: num_sql_links = links.count() num_link_details = sum(1 for link in parse_json_links_details(out_dir=out_dir)) print(f' > SQL Main Index: {num_sql_links} links'.ljust(36), f'(found in {SQL_INDEX_FILENAME})') - print(f' > JSON Link Details: {num_link_details} links'.ljust(36), f'(found in {ARCHIVE_DIR_NAME}/*/index.json)') + print(f' > JSON Link Details: {num_link_details} links'.ljust(36), f'(found in {ARCHIVE_DIR.name}/*/index.json)') print() print('{green}[*] Scanning archive data directories...{reset}'.format(**ANSI)) print(ANSI['lightyellow'], f' {ARCHIVE_DIR}/*', ANSI['reset']) @@ -539,7 +523,7 @@ def status(out_dir: Path=OUTPUT_DIR) -> None: print() print('{green}[*] Scanning recent archive changes and user logins:{reset}'.format(**ANSI)) - print(ANSI['lightyellow'], f' {LOGS_DIR}/*', ANSI['reset']) + print(ANSI['lightyellow'], f' {CONSTANTS.LOGS_DIR}/*', ANSI['reset']) users = get_admins().values_list('username', flat=True) print(f' UI users {len(users)}: {", ".join(users)}') 
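Note: the init()/status() hunks above replace the old module-level path globals (OUTPUT_DIR, SQL_INDEX_FILENAME, ARCHIVE_DIR_NAME, LOGS_DIR) with attributes on archivebox.CONSTANTS. A minimal sketch of the resulting call pattern, assuming only the CONSTANTS attributes already used in these hunks (the helper names themselves are hypothetical, for illustration only):

    import archivebox

    CONSTANTS = archivebox.CONSTANTS      # single source of truth for collection paths

    def collection_is_initialized() -> bool:
        # hypothetical helper mirroring the existence checks init() performs above
        return CONSTANTS.DATABASE_FILE.exists() and CONSTANTS.ARCHIVE_DIR.exists()

    def ensure_data_dirs() -> None:
        # hypothetical helper using the same directory-bootstrap pattern as init() above
        for dir_path in (CONSTANTS.SOURCES_DIR, CONSTANTS.ARCHIVE_DIR, CONSTANTS.LOGS_DIR):
            dir_path.mkdir(exist_ok=True)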
last_login = User.objects.order_by('last_login').last() @@ -564,7 +548,7 @@ def status(out_dir: Path=OUTPUT_DIR) -> None: f' > {str(snapshot.downloaded_at)[:16]} ' f'[{snapshot.num_outputs} {("X", "√")[snapshot.is_archived]} {printable_filesize(snapshot.archive_size)}] ' f'"{snapshot.title}": {snapshot.url}' - )[:TERM_WIDTH()], + )[:SHELL_CONFIG.TERM_WIDTH], ANSI['reset'], ) print(ANSI['black'], ' ...', ANSI['reset']) @@ -976,7 +960,7 @@ def setup(out_dir: Path=OUTPUT_DIR) -> None: from rich import print - if not (out_dir / ARCHIVE_DIR_NAME).exists(): + if not ARCHIVE_DIR.exists(): run_subcommand('init', stdin=None, pwd=out_dir) setup_django(out_dir=out_dir, check_db=True) @@ -992,9 +976,13 @@ def setup(out_dir: Path=OUTPUT_DIR) -> None: from plugins_extractor.singlefile.apps import SINGLEFILE_BINARY print(SINGLEFILE_BINARY.load_or_install().model_dump(exclude={'binproviders_supported', 'loaded_binprovider', 'provider_overrides', 'loaded_abspaths', 'bin_dir', 'loaded_respath'})) + from plugins_extractor.readability.apps import READABILITY_BINARY + print(READABILITY_BINARY.load_or_install().model_dump(exclude={'binproviders_supported', 'loaded_binprovider', 'provider_overrides', 'loaded_abspaths', 'bin_dir', 'loaded_respath'})) + + from plugins_pkg.npm.apps import npm - print(npm.load_or_install('readability-extractor', overrides={'packages': lambda: ['github:ArchiveBox/readability-extractor']}).model_dump(exclude={'binproviders_supported', 'loaded_binprovider', 'provider_overrides', 'loaded_abspaths', 'bin_dir', 'loaded_respath'})) + # TODO: move these to their own plugin binaries print(npm.load_or_install('postlight-parser', overrides={'packages': lambda: ['@postlight/parser@^2.2.3'], 'version': lambda: '2.2.3'}).model_dump(exclude={'binproviders_supported', 'loaded_binprovider', 'provider_overrides', 'loaded_abspaths'})) from django.contrib.auth import get_user_model @@ -1020,7 +1008,6 @@ def config(config_options_str: Optional[str]=None, """Get and set your ArchiveBox project configuration values""" check_data_folder(CONFIG) - if config_options and config_options_str: stderr( '[X] You should either pass config values as an arguments ' @@ -1096,7 +1083,6 @@ def config(config_options_str: Optional[str]=None, elif reset: stderr('[X] This command is not implemented yet.', color='red') stderr(' Please manually remove the relevant lines from your config file:') - stderr(f' {CONFIG_FILE}') raise SystemExit(2) else: stderr('[X] You must pass either --get or --set, or no arguments to get the whole config.', color='red') @@ -1125,8 +1111,9 @@ def schedule(add: bool=False, check_data_folder(CONFIG) setup_django_minimal() from plugins_pkg.pip.apps import ARCHIVEBOX_BINARY + from plugins_sys.config.apps import SHELL_CONFIG, CONSTANTS - Path(LOGS_DIR).mkdir(exist_ok=True) + Path(CONSTANTS.LOGS_DIR).mkdir(exist_ok=True) cron = CronTab(user=True) cron = dedupe_cron_jobs(cron) @@ -1155,7 +1142,7 @@ def schedule(add: bool=False, f'"{import_path}"', ] if import_path else ['update']), '>>', - quoted(Path(LOGS_DIR) / 'schedule.log'), + quoted(Path(CONSTANTS.LOGS_DIR) / 'schedule.log'), '2>&1', ] @@ -1167,7 +1154,7 @@ def schedule(add: bool=False, elif CronSlices.is_valid(every): new_job.setall(every) else: - stderr('{red}[X] Got invalid timeperiod for cron task.{reset}'.format(**ANSI)) + stderr('{red}[X] Got invalid timeperiod for cron task.{reset}'.format(**SHELL_CONFIG.ANSI)) stderr(' It must be one of minute/hour/day/month') stderr(' or a quoted cron-format schedule like:') stderr(' archivebox init 
--every=day --depth=1 https://example.com/some/rss/feed.xml') @@ -1181,11 +1168,11 @@ def schedule(add: bool=False, existing_jobs = list(cron.find_comment(CRON_COMMENT)) print() - print('{green}[√] Scheduled new ArchiveBox cron job for user: {} ({} jobs are active).{reset}'.format(USER, len(existing_jobs), **ANSI)) + print('{green}[√] Scheduled new ArchiveBox cron job for user: {} ({} jobs are active).{reset}'.format(SHELL_CONFIG.USER, len(existing_jobs), **SHELL_CONFIG.ANSI)) print('\n'.join(f' > {cmd}' if str(cmd) == str(new_job) else f' {cmd}' for cmd in existing_jobs)) if total_runs > 60 and not quiet: stderr() - stderr('{lightyellow}[!] With the current cron config, ArchiveBox is estimated to run >{} times per year.{reset}'.format(total_runs, **ANSI)) + stderr('{lightyellow}[!] With the current cron config, ArchiveBox is estimated to run >{} times per year.{reset}'.format(total_runs, **SHELL_CONFIG.ANSI)) stderr(' Congrats on being an enthusiastic internet archiver! 👌') stderr() stderr(' Make sure you have enough storage space available to hold all the data.') @@ -1195,7 +1182,7 @@ def schedule(add: bool=False, if existing_jobs: print('\n'.join(str(cmd) for cmd in existing_jobs)) else: - stderr('{red}[X] There are no ArchiveBox cron jobs scheduled for your user ({}).{reset}'.format(USER, **ANSI)) + stderr('{red}[X] There are no ArchiveBox cron jobs scheduled for your user ({}).{reset}'.format(SHELL_CONFIG.USER, **SHELL_CONFIG.ANSI)) stderr(' To schedule a new job, run:') stderr(' archivebox schedule --every=[timeperiod] --depth=1 https://example.com/some/rss/feed.xml') raise SystemExit(0) @@ -1206,11 +1193,11 @@ def schedule(add: bool=False, if foreground or run_all: if not existing_jobs: - stderr('{red}[X] You must schedule some jobs first before running in foreground mode.{reset}'.format(**ANSI)) + stderr('{red}[X] You must schedule some jobs first before running in foreground mode.{reset}'.format(**SHELL_CONFIG.ANSI)) stderr(' archivebox schedule --every=hour --depth=1 https://example.com/some/rss/feed.xml') raise SystemExit(1) - print('{green}[*] Running {} ArchiveBox jobs in foreground task scheduler...{reset}'.format(len(existing_jobs), **ANSI)) + print('{green}[*] Running {} ArchiveBox jobs in foreground task scheduler...{reset}'.format(len(existing_jobs), **SHELL_CONFIG.ANSI)) if run_all: try: for job in existing_jobs: @@ -1220,7 +1207,7 @@ def schedule(add: bool=False, job.run() sys.stdout.write(f'\r √ {job.command.split("/archivebox ")[-1]}\n') except KeyboardInterrupt: - print('\n{green}[√] Stopped.{reset}'.format(**ANSI)) + print('\n{green}[√] Stopped.{reset}'.format(**SHELL_CONFIG.ANSI)) raise SystemExit(1) if foreground: @@ -1230,7 +1217,7 @@ def schedule(add: bool=False, for result in cron.run_scheduler(): print(result) except KeyboardInterrupt: - print('\n{green}[√] Stopped.{reset}'.format(**ANSI)) + print('\n{green}[√] Stopped.{reset}'.format(**SHELL_CONFIG.ANSI)) raise SystemExit(1) # if CAN_UPGRADE: diff --git a/archivebox/misc/checks.py b/archivebox/misc/checks.py index 3d0e4493..e2964bcf 100644 --- a/archivebox/misc/checks.py +++ b/archivebox/misc/checks.py @@ -5,51 +5,55 @@ __package__ = 'archivebox.misc' from benedict import benedict from pathlib import Path -from .logging import stderr, hint +import archivebox + +from .logging import stderr, hint, ANSI def check_dependencies(config: benedict, show_help: bool=True) -> None: - invalid_dependencies = [ - (name, info) for name, info in config['DEPENDENCIES'].items() - if info['enabled'] and not info['is_valid'] - ] 
- if invalid_dependencies and show_help: - stderr(f'[!] Warning: Missing {len(invalid_dependencies)} recommended dependencies', color='lightyellow') - for dependency, info in invalid_dependencies: - stderr( - ' ! {}: {} ({})'.format( - dependency, - info['path'] or 'unable to find binary', - info['version'] or 'unable to detect version', - ) - ) - if dependency in ('YOUTUBEDL_BINARY', 'CHROME_BINARY', 'SINGLEFILE_BINARY', 'READABILITY_BINARY', 'MERCURY_BINARY'): - hint(('To install all packages automatically run: archivebox setup', - f'or to disable it and silence this warning: archivebox config --set SAVE_{dependency.rsplit("_", 1)[0]}=False', - ''), prefix=' ') - stderr('') + # dont do this on startup anymore, it's too slow + pass + # invalid_dependencies = [ + # (name, binary) for name, info in settings.BINARIES.items() + # if not binary. + # ] + # if invalid_dependencies and show_help: + # stderr(f'[!] Warning: Missing {len(invalid_dependencies)} recommended dependencies', color='lightyellow') + # for dependency, info in invalid_dependencies: + # stderr( + # ' ! {}: {} ({})'.format( + # dependency, + # info['path'] or 'unable to find binary', + # info['version'] or 'unable to detect version', + # ) + # ) + # if dependency in ('YOUTUBEDL_BINARY', 'CHROME_BINARY', 'SINGLEFILE_BINARY', 'READABILITY_BINARY', 'MERCURY_BINARY'): + # hint(('To install all packages automatically run: archivebox setup', + # f'or to disable it and silence this warning: archivebox config --set SAVE_{dependency.rsplit("_", 1)[0]}=False', + # ''), prefix=' ') + # stderr('') def check_data_folder(config: benedict) -> None: - output_dir = config['OUTPUT_DIR'] + output_dir = archivebox.DATA_DIR - archive_dir_exists = (Path(output_dir) / 'archive').exists() + archive_dir_exists = (archivebox.CONSTANTS.ARCHIVE_DIR).exists() if not archive_dir_exists: stderr('[X] No archivebox index found in the current directory.', color='red') stderr(f' {output_dir}', color='lightyellow') stderr() - stderr(' {lightred}Hint{reset}: Are you running archivebox in the right folder?'.format(**config['ANSI'])) + stderr(' {lightred}Hint{reset}: Are you running archivebox in the right folder?'.format(**ANSI)) stderr(' cd path/to/your/archive/folder') stderr(' archivebox [command]') stderr() - stderr(' {lightred}Hint{reset}: To create a new archive collection or import existing data in this folder, run:'.format(**config['ANSI'])) + stderr(' {lightred}Hint{reset}: To create a new archive collection or import existing data in this folder, run:'.format(**ANSI)) stderr(' archivebox init') raise SystemExit(2) def check_migrations(config: benedict): - output_dir = config['OUTPUT_DIR'] + output_dir = archivebox.DATA_DIR from ..index.sql import list_migrations @@ -63,8 +67,8 @@ def check_migrations(config: benedict): stderr(' archivebox init') raise SystemExit(3) - (Path(output_dir) / config['SOURCES_DIR_NAME']).mkdir(exist_ok=True) - (Path(output_dir) / config['LOGS_DIR_NAME']).mkdir(exist_ok=True) - (Path(output_dir) / config['CACHE_DIR_NAME']).mkdir(exist_ok=True) - (Path(output_dir) / config['LIB_DIR_NAME'] / 'bin').mkdir(exist_ok=True, parents=True) - (Path(output_dir) / config['PERSONAS_DIR_NAME'] / 'Default').mkdir(exist_ok=True, parents=True) + archivebox.CONSTANTS.SOURCES_DIR.mkdir(exist_ok=True) + archivebox.CONSTANTS.LOGS_DIR.mkdir(exist_ok=True) + archivebox.CONSTANTS.CACHE_DIR.mkdir(exist_ok=True) + (archivebox.CONSTANTS.LIB_DIR / 'bin').mkdir(exist_ok=True, parents=True) + (archivebox.CONSTANTS.PERSONAS_DIR / 
'Default').mkdir(exist_ok=True, parents=True) diff --git a/archivebox/misc/logging.py b/archivebox/misc/logging.py index 5abdfee7..00dacab4 100644 --- a/archivebox/misc/logging.py +++ b/archivebox/misc/logging.py @@ -8,8 +8,6 @@ from collections import defaultdict from benedict import benedict from rich.console import Console -from ..config_stubs import ConfigDict - # SETUP RICH CONSOLE / TTY detection / COLOR / PROGRESS BARS CONSOLE = Console() IS_TTY = CONSOLE.is_interactive @@ -43,7 +41,7 @@ COLOR_DICT = defaultdict(lambda: [(0, 0, 0), (0, 0, 0)], { }) # Logging Helpers -def stdout(*args, color: Optional[str]=None, prefix: str='', config: Optional[ConfigDict]=None) -> None: +def stdout(*args, color: Optional[str]=None, prefix: str='', config: Optional[benedict]=None) -> None: ansi = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI if color: @@ -53,7 +51,7 @@ def stdout(*args, color: Optional[str]=None, prefix: str='', config: Optional[Co sys.stdout.write(prefix + ''.join(strs)) -def stderr(*args, color: Optional[str]=None, prefix: str='', config: Optional[ConfigDict]=None) -> None: +def stderr(*args, color: Optional[str]=None, prefix: str='', config: Optional[benedict]=None) -> None: ansi = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI if color: @@ -63,7 +61,7 @@ def stderr(*args, color: Optional[str]=None, prefix: str='', config: Optional[Co sys.stderr.write(prefix + ''.join(strs)) -def hint(text: Union[Tuple[str, ...], List[str], str], prefix=' ', config: Optional[ConfigDict]=None) -> None: +def hint(text: Union[Tuple[str, ...], List[str], str], prefix=' ', config: Optional[benedict]=None) -> None: ansi = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI if isinstance(text, str): diff --git a/archivebox/parsers/pocket_api.py b/archivebox/parsers/pocket_api.py index 3415f35e..429f4a9d 100644 --- a/archivebox/parsers/pocket_api.py +++ b/archivebox/parsers/pocket_api.py @@ -2,25 +2,24 @@ __package__ = 'archivebox.parsers' import re +import archivebox from typing import IO, Iterable, Optional from configparser import ConfigParser -from pathlib import Path from pocket import Pocket from ..index.schema import Link from ..util import enforce_types from ..system import atomic_write from ..config import ( - SOURCES_DIR, POCKET_CONSUMER_KEY, POCKET_ACCESS_TOKENS, ) COUNT_PER_PAGE = 500 -API_DB_PATH = Path(SOURCES_DIR) / 'pocket_api.db' +API_DB_PATH = archivebox.DATA_DIR / 'sources' / 'pocket_api.db' # search for broken protocols that sometimes come from the Pocket API _BROKEN_PROTOCOL_RE = re.compile('^(http[s]?)(:/(?!/))') diff --git a/archivebox/parsers/readwise_reader_api.py b/archivebox/parsers/readwise_reader_api.py index 7dd44267..b676dfe8 100644 --- a/archivebox/parsers/readwise_reader_api.py +++ b/archivebox/parsers/readwise_reader_api.py @@ -3,23 +3,19 @@ __package__ = "archivebox.parsers" import re import requests +import archivebox from datetime import datetime from typing import IO, Iterable, Optional from configparser import ConfigParser -from pathlib import Path - from ..index.schema import Link from ..util import enforce_types from ..system import atomic_write -from ..config import ( - SOURCES_DIR, - READWISE_READER_TOKENS, -) +from ..config import READWISE_READER_TOKENS -API_DB_PATH = Path(SOURCES_DIR) / "readwise_reader_api.db" +API_DB_PATH = archivebox.DATA_DIR / "sources" / "readwise_reader_api.db" class ReadwiseReaderAPI: diff --git a/archivebox/plugantic/base_binary.py b/archivebox/plugantic/base_binary.py index 
cafccae8..e3a995ef 100644 --- a/archivebox/plugantic/base_binary.py +++ b/archivebox/plugantic/base_binary.py @@ -17,6 +17,8 @@ from pydantic_pkgr import ( from django.conf import settings +import archivebox + from .base_hook import BaseHook, HookType @@ -64,7 +66,9 @@ class BaseBinary(BaseHook, Binary): super().register(settings, parent_plugin=parent_plugin) @staticmethod - def symlink_to_lib(binary, bin_dir=settings.CONFIG.BIN_DIR) -> None: + def symlink_to_lib(binary, bin_dir=None) -> None: + bin_dir = bin_dir or archivebox.CONSTANTS.LIB_BIN_DIR + if not (binary.abspath and binary.abspath.exists()): return @@ -77,19 +81,19 @@ class BaseBinary(BaseHook, Binary): @validate_call def load(self, **kwargs) -> Self: binary = super().load(**kwargs) - self.symlink_to_lib(binary=binary, bin_dir=settings.CONFIG.BIN_DIR) + self.symlink_to_lib(binary=binary, bin_dir=archivebox.CONSTANTS.LIB_BIN_DIR) return binary @validate_call def install(self, **kwargs) -> Self: binary = super().install(**kwargs) - self.symlink_to_lib(binary=binary, bin_dir=settings.CONFIG.BIN_DIR) + self.symlink_to_lib(binary=binary, bin_dir=archivebox.CONSTANTS.LIB_BIN_DIR) return binary @validate_call def load_or_install(self, **kwargs) -> Self: binary = super().load_or_install(**kwargs) - self.symlink_to_lib(binary=binary, bin_dir=settings.CONFIG.BIN_DIR) + self.symlink_to_lib(binary=binary, bin_dir=archivebox.CONSTANTS.LIB_BIN_DIR) return binary @property diff --git a/archivebox/plugantic/base_configset.py b/archivebox/plugantic/base_configset.py index 01f9d12d..11ca16ef 100644 --- a/archivebox/plugantic/base_configset.py +++ b/archivebox/plugantic/base_configset.py @@ -123,6 +123,10 @@ class ArchiveBoxBaseConfig(BaseSettings): validate_return=True, revalidate_instances="always", ) + + load_from_defaults: ClassVar[bool] = True + load_from_configfile: ClassVar[bool] = True + load_from_environment: ClassVar[bool] = True @classmethod def settings_customise_sources( @@ -140,20 +144,22 @@ class ArchiveBoxBaseConfig(BaseSettings): # import ipdb; ipdb.set_trace() + precedence_order = {} + # if ArchiveBox.conf does not exist yet, return defaults -> env order if not ARCHIVEBOX_CONFIG_FILE.is_file(): - return ( - init_settings, - env_settings, - ) + precedence_order = { + 'defaults': init_settings, + 'environment': env_settings, + } # if ArchiveBox.conf exists and is in TOML format, return default -> TOML -> env order try: - return ( - init_settings, - FlatTomlConfigSettingsSource(settings_cls, toml_file=ARCHIVEBOX_CONFIG_FILE), - env_settings, - ) + precedence_order = precedence_order or { + 'defaults': init_settings, + 'configfile': FlatTomlConfigSettingsSource(settings_cls, toml_file=ARCHIVEBOX_CONFIG_FILE), + 'environment': env_settings, + } except Exception as err: if err.__class__.__name__ != "TOMLDecodeError": raise @@ -165,11 +171,20 @@ class ArchiveBoxBaseConfig(BaseSettings): new_toml = ini_to_toml.convert(original_ini) ARCHIVEBOX_CONFIG_FILE.write_text(new_toml) - return ( - init_settings, - FlatTomlConfigSettingsSource(settings_cls, toml_file=ARCHIVEBOX_CONFIG_FILE), - env_settings, - ) + precedence_order = { + 'defaults': init_settings, + 'configfile': FlatTomlConfigSettingsSource(settings_cls, toml_file=ARCHIVEBOX_CONFIG_FILE), + 'environment': env_settings, + } + + if not cls.load_from_environment: + precedence_order.pop('environment') + if not cls.load_from_configfile: + precedence_order.pop('configfile') + if not cls.load_from_defaults: + precedence_order.pop('defaults') + + return tuple(precedence_order.values()) 
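Note: the settings_customise_sources() change above keeps the same three value sources as before (pydantic init defaults, the ArchiveBox.conf TOML file, and environment variables) but collects them in a precedence_order dict so the new load_from_defaults / load_from_configfile / load_from_environment class flags can drop individual sources before the tuple is returned. A minimal sketch of how a config set might use one of these flags, following the BaseConfigSet conventions used elsewhere in this patch (the class and field names here are hypothetical, for illustration only):

    from typing import ClassVar

    from pydantic import Field
    from plugantic.base_configset import BaseConfigSet, ConfigSectionName

    class ExampleConfig(BaseConfigSet):                          # hypothetical config set
        section: ClassVar[ConfigSectionName] = 'ARCHIVING_CONFIG'

        # ignore os.environ for this config set; values come only from defaults + ArchiveBox.conf
        load_from_environment: ClassVar[bool] = False

        EXAMPLE_TIMEOUT: int = Field(default=60)                 # hypothetical option

    EXAMPLE_CONFIG = ExampleConfig()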
@model_validator(mode="after") def fill_defaults(self): diff --git a/archivebox/plugantic/management/commands/pkg.py b/archivebox/plugantic/management/commands/pkg.py index 6718baf1..2621021b 100644 --- a/archivebox/plugantic/management/commands/pkg.py +++ b/archivebox/plugantic/management/commands/pkg.py @@ -1,72 +1,72 @@ -__package__ = 'archivebox.plugantic.management.commands' +# __package__ = 'archivebox.plugantic.management.commands' -from django.core.management.base import BaseCommand -from django.conf import settings +# from django.core.management.base import BaseCommand +# from django.conf import settings -from pydantic_pkgr import Binary, BinProvider, BrewProvider, EnvProvider, SemVer -from pydantic_pkgr.binprovider import bin_abspath +# from pydantic_pkgr import Binary, BinProvider, BrewProvider, EnvProvider, SemVer +# from pydantic_pkgr.binprovider import bin_abspath -from ....config import NODE_BIN_PATH, bin_path -from ...base_binary import env +# from ....config import bin_path +# from ...base_binary import env -class Command(BaseCommand): - def handle(self, *args, method, **options): - method(*args, **options) +# class Command(BaseCommand): +# def handle(self, *args, method, **options): +# method(*args, **options) - def add_arguments(self, parser): - subparsers = parser.add_subparsers(title="sub-commands", required=True) +# def add_arguments(self, parser): +# subparsers = parser.add_subparsers(title="sub-commands", required=True) - list_parser = subparsers.add_parser("list", help="List archivebox runtime dependencies.") - list_parser.set_defaults(method=self.list) +# list_parser = subparsers.add_parser("list", help="List archivebox runtime dependencies.") +# list_parser.set_defaults(method=self.list) - install_parser = subparsers.add_parser("install", help="Install archivebox runtime dependencies.") - install_parser.add_argument("--update", action="store_true", help="Update dependencies to latest versions.") - install_parser.add_argument("package_names", nargs="+", type=str) - install_parser.set_defaults(method=self.install) +# install_parser = subparsers.add_parser("install", help="Install archivebox runtime dependencies.") +# install_parser.add_argument("--update", action="store_true", help="Update dependencies to latest versions.") +# install_parser.add_argument("package_names", nargs="+", type=str) +# install_parser.set_defaults(method=self.install) - def list(self, *args, **options): - self.stdout.write('################# PLUGINS ####################') - for plugin in settings.PLUGINS.values(): - self.stdout.write(f'{plugin.name}:') - for binary in plugin.binaries: - try: - binary = binary.load() - except Exception as e: - # import ipdb; ipdb.set_trace() - raise - self.stdout.write(f' {binary.name.ljust(14)} {str(binary.version).ljust(11)} {binary.binprovider.INSTALLER_BIN.ljust(5)} {binary.abspath}') +# def list(self, *args, **options): +# self.stdout.write('################# PLUGINS ####################') +# for plugin in settings.PLUGINS.values(): +# self.stdout.write(f'{plugin.name}:') +# for binary in plugin.binaries: +# try: +# binary = binary.load() +# except Exception as e: +# # import ipdb; ipdb.set_trace() +# raise +# self.stdout.write(f' {binary.name.ljust(14)} {str(binary.version).ljust(11)} {binary.binprovider.INSTALLER_BIN.ljust(5)} {binary.abspath}') - self.stdout.write('\n################# LEGACY ####################') - for bin_key, dependency in settings.CONFIG.DEPENDENCIES.items(): - bin_name = settings.CONFIG[bin_key] +# 
self.stdout.write('\n################# LEGACY ####################') +# for bin_key, dependency in settings.CONFIG.DEPENDENCIES.items(): +# bin_name = settings.CONFIG[bin_key] - self.stdout.write(f'{bin_key}: {bin_name}') +# self.stdout.write(f'{bin_key}: {bin_name}') - # binary = Binary(name=package_name, providers=[env]) - # print(binary) +# # binary = Binary(name=package_name, providers=[env]) +# # print(binary) - # try: - # loaded_bin = binary.load() - # self.stdout.write( - # self.style.SUCCESS(f'Successfully loaded {package_name}:') + str(loaded_bin) - # ) - # except Exception as e: - # self.stderr.write( - # self.style.ERROR(f"Error loading {package_name}: {e}") - # ) +# # try: +# # loaded_bin = binary.load() +# # self.stdout.write( +# # self.style.SUCCESS(f'Successfully loaded {package_name}:') + str(loaded_bin) +# # ) +# # except Exception as e: +# # self.stderr.write( +# # self.style.ERROR(f"Error loading {package_name}: {e}") +# # ) - def install(self, *args, bright, **options): - for package_name in options["package_names"]: - binary = Binary(name=package_name, providers=[env]) - print(binary) +# def install(self, *args, bright, **options): +# for package_name in options["package_names"]: +# binary = Binary(name=package_name, providers=[env]) +# print(binary) - try: - loaded_bin = binary.load() - self.stdout.write( - self.style.SUCCESS(f'Successfully loaded {package_name}:') + str(loaded_bin) - ) - except Exception as e: - self.stderr.write( - self.style.ERROR(f"Error loading {package_name}: {e}") - ) +# try: +# loaded_bin = binary.load() +# self.stdout.write( +# self.style.SUCCESS(f'Successfully loaded {package_name}:') + str(loaded_bin) +# ) +# except Exception as e: +# self.stderr.write( +# self.style.ERROR(f"Error loading {package_name}: {e}") +# ) diff --git a/archivebox/plugins_extractor/chrome/apps.py b/archivebox/plugins_extractor/chrome/apps.py index 61405e0f..fa295c37 100644 --- a/archivebox/plugins_extractor/chrome/apps.py +++ b/archivebox/plugins_extractor/chrome/apps.py @@ -18,6 +18,8 @@ from pydantic_pkgr import ( bin_abspath, ) +import archivebox + # Depends on other Django apps: from plugantic.base_plugin import BasePlugin from plugantic.base_configset import BaseConfigSet, ConfigSectionName @@ -215,7 +217,7 @@ class ChromeBinary(BaseBinary): } @staticmethod - def symlink_to_lib(binary, bin_dir=settings.CONFIG.BIN_DIR) -> None: + def symlink_to_lib(binary, bin_dir=archivebox.CONSTANTS.LIB_BIN_DIR) -> None: if not (binary.abspath and binary.abspath.exists()): return diff --git a/archivebox/plugins_extractor/readability/apps.py b/archivebox/plugins_extractor/readability/apps.py new file mode 100644 index 00000000..40347845 --- /dev/null +++ b/archivebox/plugins_extractor/readability/apps.py @@ -0,0 +1,103 @@ +__package__ = 'archivebox.plugins_extractor.readability' + +from pathlib import Path +from typing import List, Dict, Optional, ClassVar +# from typing_extensions import Self + +from django.conf import settings + +# Depends on other PyPI/vendor packages: +from pydantic import InstanceOf, Field, validate_call +from pydantic_pkgr import BinProvider, BinProviderName, ProviderLookupDict, BinName, ShallowBinary + +# Depends on other Django apps: +from plugantic.base_plugin import BasePlugin +from plugantic.base_configset import BaseConfigSet, ConfigSectionName +from plugantic.base_binary import BaseBinary, env +from plugantic.base_extractor import BaseExtractor +from plugantic.base_hook import BaseHook + +# Depends on Other Plugins: +from plugins_sys.config.apps 
import ARCHIVING_CONFIG +from plugins_pkg.npm.apps import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER + +###################### Config ########################## + +class ReadabilityConfig(BaseConfigSet): + section: ClassVar[ConfigSectionName] = 'ARCHIVING_CONFIG' + + SAVE_READABILITY: bool = Field(default=True, alias='USE_READABILITY') + + READABILITY_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT) + + READABILITY_BINARY: str = Field(default='readability-extractor') + # READABILITY_EXTRA_ARGS: List[str] = [] # readability-extractor doesn't take any extra args + + +READABILITY_CONFIG = ReadabilityConfig() + + +READABILITY_PACKAGE_NAME = 'github:ArchiveBox/readability-extractor' + +class ReadabilityBinary(BaseBinary): + name: BinName = READABILITY_CONFIG.READABILITY_BINARY + binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env] + + provider_overrides: Dict[BinProviderName, ProviderLookupDict] = { + LIB_NPM_BINPROVIDER.name: {"packages": lambda: [READABILITY_PACKAGE_NAME]}, + SYS_NPM_BINPROVIDER.name: {"packages": lambda: []}, # prevent modifying system global npm packages + } + + @validate_call + def install(self, binprovider_name: Optional[BinProviderName]=None) -> ShallowBinary: + # force install to only use lib/npm provider, we never want to modify global NPM packages + return BaseBinary.install(self, binprovider_name=binprovider_name or LIB_NPM_BINPROVIDER.name) + + @validate_call + def load_or_install(self, binprovider_name: Optional[BinProviderName] = None) -> ShallowBinary: + # force install to only use lib/npm provider, we never want to modify global NPM packages + try: + return self.load() + except Exception: + return BaseBinary.install(self, binprovider_name=binprovider_name or LIB_NPM_BINPROVIDER.name) + + + + +READABILITY_BINARY = ReadabilityBinary() + + +class ReadabilityExtractor(BaseExtractor): + name: str = 'readability' + binary: BinName = READABILITY_BINARY.name + + def get_output_path(self, snapshot) -> Path: + return Path(snapshot.link_dir) / 'readability' / 'content.html' + + +READABILITY_BINARY = ReadabilityBinary() +READABILITY_EXTRACTOR = ReadabilityExtractor() + +# class ReadabilityQueue(BaseQueue): +# name: str = 'singlefile' + +# binaries: List[InstanceOf[BaseBinary]] = [READABILITY_BINARY] + +# READABILITY_QUEUE = ReadabilityQueue() + +class ReadabilityPlugin(BasePlugin): + app_label: str ='singlefile' + verbose_name: str = 'SingleFile' + + hooks: List[InstanceOf[BaseHook]] = [ + READABILITY_CONFIG, + READABILITY_BINARY, + READABILITY_EXTRACTOR, + # READABILITY_QUEUE, + ] + + + +PLUGIN = ReadabilityPlugin() +PLUGIN.register(settings) +DJANGO_APP = PLUGIN.AppConfig diff --git a/archivebox/plugins_extractor/singlefile/apps.py b/archivebox/plugins_extractor/singlefile/apps.py index b937b7db..b7741213 100644 --- a/archivebox/plugins_extractor/singlefile/apps.py +++ b/archivebox/plugins_extractor/singlefile/apps.py @@ -34,7 +34,7 @@ class SinglefileConfig(BaseConfigSet): SINGLEFILE_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY) SINGLEFILE_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE) - SINGLEFILE_BINARY: str = Field(default='wget') + SINGLEFILE_BINARY: str = Field(default='single-file') SINGLEFILE_EXTRA_ARGS: List[str] = [] @@ -46,17 +46,21 @@ SINGLEFILE_MAX_VERSION = '1.1.60' class SinglefileBinary(BaseBinary): - name: BinName = 'single-file' + name: BinName = SINGLEFILE_CONFIG.SINGLEFILE_BINARY binproviders_supported: 
List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env]
     provider_overrides: Dict[BinProviderName, ProviderLookupDict] = {
         env.name: {
             'abspath': lambda:
-                bin_abspath('single-file', PATH=env.PATH) or bin_abspath('single-file-node.js', PATH=env.PATH),
+                bin_abspath(SINGLEFILE_CONFIG.SINGLEFILE_BINARY, PATH=env.PATH)
+                or bin_abspath('single-file', PATH=env.PATH)
+                or bin_abspath('single-file-node.js', PATH=env.PATH),
         },
         LIB_NPM_BINPROVIDER.name: {
             "abspath": lambda:
-                bin_abspath("single-file", PATH=LIB_NPM_BINPROVIDER.PATH) or bin_abspath("single-file-node.js", PATH=LIB_NPM_BINPROVIDER.PATH),
+                bin_abspath(SINGLEFILE_CONFIG.SINGLEFILE_BINARY, PATH=env.PATH)
+                or bin_abspath("single-file", PATH=LIB_NPM_BINPROVIDER.PATH)
+                or bin_abspath("single-file-node.js", PATH=LIB_NPM_BINPROVIDER.PATH),
             "packages": lambda: [f"single-file-cli@>={SINGLEFILE_MIN_VERSION} <{SINGLEFILE_MAX_VERSION}"],
         },
diff --git a/archivebox/plugins_pkg/npm/apps.py b/archivebox/plugins_pkg/npm/apps.py
index 51eb1214..ea2db87e 100644
--- a/archivebox/plugins_pkg/npm/apps.py
+++ b/archivebox/plugins_pkg/npm/apps.py
@@ -1,10 +1,13 @@
 __package__ = 'archivebox.plugins_pkg.npm'
 
+import archivebox
+
 from pathlib import Path
 from typing import List, Optional
 
 from django.conf import settings
-from pydantic import InstanceOf
+
+from pydantic import InstanceOf, model_validator
 
 from pydantic_pkgr import BinProvider, NpmProvider, BinName, PATHStr, BinProviderName
 
@@ -14,8 +17,6 @@ from plugantic.base_binary import BaseBinary, BaseBinProvider, env, apt, brew
 from plugantic.base_hook import BaseHook
 
-from ...config import CONFIG
-
 
 ###################### Config ##########################
 
@@ -35,17 +36,24 @@ DEFAULT_GLOBAL_CONFIG = {
 NPM_CONFIG = NpmDependencyConfigs(**DEFAULT_GLOBAL_CONFIG)
 
 
+OLD_NODE_BIN_PATH = archivebox.DATA_DIR / 'node_modules' / '.bin'
+NEW_NODE_BIN_PATH = archivebox.CONSTANTS.LIB_NPM_DIR / 'node_modules' / '.bin'
+
 class SystemNpmProvider(NpmProvider, BaseBinProvider):
     name: BinProviderName = "sys_npm"
-    PATH: PATHStr = str(CONFIG.NODE_BIN_PATH)
 
     npm_prefix: Optional[Path] = None
 
 class LibNpmProvider(NpmProvider, BaseBinProvider):
     name: BinProviderName = "lib_npm"
-    PATH: PATHStr = str(CONFIG.NODE_BIN_PATH)
+    PATH: PATHStr = str(OLD_NODE_BIN_PATH)
 
-    npm_prefix: Optional[Path] = settings.CONFIG.LIB_DIR / 'npm'
+    npm_prefix: Optional[Path] = archivebox.CONSTANTS.LIB_NPM_DIR
+
+    @model_validator(mode='after')
+    def validate_path(self):
+        assert self.npm_prefix == NEW_NODE_BIN_PATH.parent.parent
+        return self
 
 
 SYS_NPM_BINPROVIDER = SystemNpmProvider()
diff --git a/archivebox/plugins_pkg/pip/apps.py b/archivebox/plugins_pkg/pip/apps.py
index 4da87a69..45be3374 100644
--- a/archivebox/plugins_pkg/pip/apps.py
+++ b/archivebox/plugins_pkg/pip/apps.py
@@ -1,13 +1,14 @@
+__package__ = 'archivebox.plugins_pkg.pip'
+
 import os
 import sys
 import inspect
 import archivebox
 from pathlib import Path
 from typing import List, Dict, Optional, ClassVar
-from pydantic import InstanceOf, Field
+from pydantic import InstanceOf, Field, model_validator
 
 import django
-
 from django.db.backends.sqlite3.base import Database as django_sqlite3  # type: ignore[import-type]
 from django.core.checks import Error, Tags
 from django.conf import settings
@@ -19,6 +20,8 @@ from plugantic.base_check import BaseCheck
 from plugantic.base_binary import BaseBinary, BaseBinProvider, env, apt, brew
 from plugantic.base_hook import BaseHook
 
+from ...misc.logging import hint
+
 
 ###################### Config ##########################
 
@@ -66,7 +69,7 @@ class LibPipBinProvider(PipProvider, BaseBinProvider):
     name: BinProviderName = "lib_pip"
     INSTALLER_BIN: BinName = "pip"
 
-    pip_venv: Optional[Path] = settings.CONFIG.OUTPUT_DIR / 'lib' / 'pip' / 'venv'
+    pip_venv: Optional[Path] = archivebox.CONSTANTS.LIB_PIP_DIR / 'venv'
 
 SYS_PIP_BINPROVIDER = SystemPipBinProvider()
 PIPX_PIP_BINPROVIDER = SystemPipxBinProvider()
@@ -117,6 +120,20 @@ class SqliteBinary(BaseBinary):
             "version": lambda: SemVer(django_sqlite3.version),
         },
     }
+
+    @model_validator(mode='after')
+    def validate_json_extension_is_available(self):
+        # Check to make sure JSON extension is available in our Sqlite3 instance
+        try:
+            cursor = django_sqlite3.connect(':memory:').cursor()
+            cursor.execute('SELECT JSON(\'{"a": "b"}\')')
+        except django_sqlite3.OperationalError as exc:
+            print(f'[red][X] Your SQLite3 version is missing the required JSON1 extension: {exc}[/red]')
+            hint([
+                'Upgrade your Python version or install the extension manually:',
+                'https://code.djangoproject.com/wiki/JSON1Extension'
+            ])
+        return self
 
 SQLITE_BINARY = SqliteBinary()
diff --git a/archivebox/plugins_pkg/playwright/apps.py b/archivebox/plugins_pkg/playwright/apps.py
index 830e4139..dabb8ec8 100644
--- a/archivebox/plugins_pkg/playwright/apps.py
+++ b/archivebox/plugins_pkg/playwright/apps.py
@@ -19,6 +19,8 @@ from pydantic_pkgr import (
     DEFAULT_ENV_PATH,
 )
 
+import archivebox
+
 # Depends on other Django apps:
 from plugantic.base_plugin import BasePlugin
 from plugantic.base_configset import BaseConfigSet
@@ -42,12 +44,10 @@ class PlaywrightConfigs(BaseConfigSet):
     # PLAYWRIGHT_DEFAULT_ARGS: List[str] = ['--timeout={TIMEOUT-10}']
     pass
 
-DEFAULT_GLOBAL_CONFIG = {
-}
-PLAYWRIGHT_CONFIG = PlaywrightConfigs(**DEFAULT_GLOBAL_CONFIG)
+PLAYWRIGHT_CONFIG = PlaywrightConfigs()
 
 
-LIB_DIR_BROWSERS = settings.CONFIG.OUTPUT_DIR / "lib" / "browsers"
+LIB_DIR_BROWSERS = archivebox.CONSTANTS.LIB_BROWSERS_DIR
 
 
@@ -65,12 +65,12 @@ class PlaywrightBinProvider(BaseBinProvider):
     name: BinProviderName = "playwright"
     INSTALLER_BIN: BinName = PLAYWRIGHT_BINARY.name
 
-    PATH: PATHStr = f"{settings.CONFIG.BIN_DIR}:{DEFAULT_ENV_PATH}"
+    PATH: PATHStr = f"{archivebox.CONSTANTS.LIB_BIN_DIR}:{DEFAULT_ENV_PATH}"
 
     puppeteer_browsers_dir: Optional[Path] = (
-        Path("~/Library/Caches/ms-playwright").expanduser()
+        Path("~/Library/Caches/ms-playwright").expanduser()    # macos playwright cache dir
         if OPERATING_SYSTEM == "darwin" else
-        Path("~/.cache/ms-playwright").expanduser()
+        Path("~/.cache/ms-playwright").expanduser()            # linux playwright cache dir
     )
     puppeteer_install_args: List[str] = ["install"]  # --with-deps
diff --git a/archivebox/plugins_pkg/puppeteer/apps.py b/archivebox/plugins_pkg/puppeteer/apps.py
index c6faf324..2677ac06 100644
--- a/archivebox/plugins_pkg/puppeteer/apps.py
+++ b/archivebox/plugins_pkg/puppeteer/apps.py
@@ -16,6 +16,8 @@ from pydantic_pkgr import (
     HostBinPath,
 )
 
+import archivebox
+
 # Depends on other Django apps:
 from plugantic.base_plugin import BasePlugin
 from plugantic.base_configset import BaseConfigSet
@@ -40,12 +42,10 @@ class PuppeteerConfigs(BaseConfigSet):
     # PUPPETEER_DEFAULT_ARGS: List[str] = ['--timeout={TIMEOUT-10}']
     pass
 
-DEFAULT_GLOBAL_CONFIG = {
-}
-PUPPETEER_CONFIG = PuppeteerConfigs(**DEFAULT_GLOBAL_CONFIG)
+PUPPETEER_CONFIG = PuppeteerConfigs()
 
 
-LIB_DIR_BROWSERS = settings.CONFIG.OUTPUT_DIR / "lib" / "browsers"
+LIB_DIR_BROWSERS = archivebox.CONSTANTS.LIB_BROWSERS_DIR
 
 
 class PuppeteerBinary(BaseBinary):
@@ -60,8 +60,8 @@ PUPPETEER_BINARY = PuppeteerBinary()
 class PuppeteerBinProvider(BaseBinProvider):
     name: BinProviderName = "puppeteer"
     INSTALLER_BIN: BinName = "npx"
-
-    PATH: PATHStr = str(settings.CONFIG.BIN_DIR)
+
+    PATH: PATHStr = str(archivebox.CONSTANTS.LIB_BIN_DIR)
 
     puppeteer_browsers_dir: Optional[Path] = LIB_DIR_BROWSERS
     puppeteer_install_args: List[str] = ["@puppeteer/browsers", "install", "--path", str(LIB_DIR_BROWSERS)]
@@ -140,7 +140,7 @@ PUPPETEER_BINPROVIDER = PuppeteerBinProvider()
 
 # ALTERNATIVE INSTALL METHOD using Ansible:
 # install_playbook = self.plugin_dir / 'install_puppeteer.yml'
-# chrome_bin = run_playbook(install_playbook, data_dir=settings.CONFIG.OUTPUT_DIR, quiet=quiet).BINARIES.chrome
+# chrome_bin = run_playbook(install_playbook, data_dir=archivebox.DATA_DIR, quiet=quiet).BINARIES.chrome
 # return self.__class__.model_validate(
 #     {
 #         **self.model_dump(),
diff --git a/archivebox/plugins_sys/config/apps.py b/archivebox/plugins_sys/config/apps.py
index d5c68d25..ac893ab2 100644
--- a/archivebox/plugins_sys/config/apps.py
+++ b/archivebox/plugins_sys/config/apps.py
@@ -1,18 +1,24 @@
+__package__ = 'archivebox.plugins_sys.config'
 import os
 import sys
+import shutil
 import platform
+import archivebox
 
-from typing import List, ClassVar
+from typing import List, ClassVar, Dict, Optional
+from datetime import datetime
 from pathlib import Path
 
-from pydantic import InstanceOf, Field, field_validator, model_validator
+from pydantic import InstanceOf, Field, field_validator, model_validator, computed_field
+from benedict import benedict
 from rich import print
 from django.conf import settings
-
+from django.utils.crypto import get_random_string
 
 from plugantic.base_plugin import BasePlugin
 from plugantic.base_configset import BaseConfigSet, ConfigSectionName
 from plugantic.base_hook import BaseHook
 
+from .constants import CONSTANTS, CONSTANTS_CONFIG
 
 ###################### Config ##########################
 
@@ -24,17 +30,57 @@ class ShellConfig(BaseConfigSet):
     IS_TTY: bool = Field(default=sys.stdout.isatty())
     USE_COLOR: bool = Field(default=lambda c: c.IS_TTY)
-    SHOW_PROGRESS: bool = Field(default=lambda c: (c.IS_TTY and platform.system() != 'darwin'))  # progress bars are buggy on mac, disable for now
+    SHOW_PROGRESS: bool = Field(default=lambda c: c.IS_TTY)
 
     IN_DOCKER: bool = Field(default=False)
     IN_QEMU: bool = Field(default=False)
+    USER: str = Field(default=Path('~').expanduser().resolve().name)
     PUID: int = Field(default=os.getuid())
     PGID: int = Field(default=os.getgid())
 
     PYTHON_ENCODING: str = Field(default=(sys.__stdout__ or sys.stdout or sys.__stderr__ or sys.stderr).encoding.upper().replace('UTF8', 'UTF-8'))
+    ANSI: Dict[str, str] = Field(default=lambda c: CONSTANTS.DEFAULT_CLI_COLORS if c.USE_COLOR else CONSTANTS.DISABLED_CLI_COLORS)
+
+    VERSIONS_AVAILABLE: bool = False    # .check_for_update.get_versions_available_on_github(c)},
+    CAN_UPGRADE: bool = False           # .check_for_update.can_upgrade(c)},
+
+    @computed_field
+    @property
+    def TERM_WIDTH(self) -> int:
+        return shutil.get_terminal_size((100, 10)).columns
+
+    @computed_field
+    @property
+    def COMMIT_HASH(self) -> Optional[str]:
+        try:
+            git_dir = archivebox.PACKAGE_DIR / '../.git'
+            ref = (git_dir / 'HEAD').read_text().strip().split(' ')[-1]
+            commit_hash = git_dir.joinpath(ref).read_text().strip()
+            return commit_hash
+        except Exception:
+            pass
+
+        try:
+            return list((archivebox.PACKAGE_DIR / '../.git/refs/heads/').glob('*'))[0].read_text().strip()
+        except Exception:
+            pass
+
+        return None
+
+    @computed_field
+    @property
+    def BUILD_TIME(self) -> str:
+        if self.IN_DOCKER:
+            docker_build_end_time = Path('/VERSION.txt').read_text().rsplit('BUILD_END_TIME=')[-1].split('\n', 1)[0]
+            return docker_build_end_time
+
+        src_last_modified_unix_timestamp = (archivebox.PACKAGE_DIR / 'config.py').stat().st_mtime
+        return datetime.fromtimestamp(src_last_modified_unix_timestamp).strftime('%Y-%m-%d %H:%M:%S %s')
+
+
     @model_validator(mode='after')
     def validate_not_running_as_root(self):
         attempted_command = ' '.join(sys.argv[:3])
@@ -92,7 +138,7 @@ GENERAL_CONFIG = GeneralConfig()
 class ServerConfig(BaseConfigSet):
     section: ClassVar[ConfigSectionName] = 'SERVER_CONFIG'
 
-    SECRET_KEY: str = Field(default=None)
+    SECRET_KEY: str = Field(default=lambda: get_random_string(50, 'abcdefghijklmnopqrstuvwxyz0123456789_'))
     BIND_ADDR: str = Field(default=lambda: ['127.0.0.1:8000', '0.0.0.0:8000'][SHELL_CONFIG.IN_DOCKER])
     ALLOWED_HOSTS: str = Field(default='*')
     CSRF_TRUSTED_ORIGINS: str = Field(default=lambda c: 'http://localhost:8000,http://127.0.0.1:8000,http://0.0.0.0:8000,http://{}'.format(c.BIND_ADDR))
@@ -179,7 +225,7 @@ SEARCH_BACKEND_CONFIG = SearchBackendConfig()
 
 
 class ConfigPlugin(BasePlugin):
-    app_label: str = 'config'
+    app_label: str = 'CONFIG'
     verbose_name: str = 'Configuration'
 
     hooks: List[InstanceOf[BaseHook]] = [
@@ -190,6 +236,12 @@ class ConfigPlugin(BasePlugin):
         ARCHIVING_CONFIG,
         SEARCH_BACKEND_CONFIG,
     ]
+
+    # def register(self, settings, parent_plugin=None):
+    #     try:
+    #         super().register(settings, parent_plugin=parent_plugin)
+    #     except Exception as e:
+    #         print(f'[red][X] Error registering config plugin: {e}[/red]', file=sys.stderr)
 
 
 PLUGIN = ConfigPlugin()
diff --git a/archivebox/plugins_sys/config/check_for_update.py b/archivebox/plugins_sys/config/check_for_update.py
new file mode 100644
index 00000000..a725522a
--- /dev/null
+++ b/archivebox/plugins_sys/config/check_for_update.py
@@ -0,0 +1,47 @@
+# def get_versions_available_on_github(config):
+#     """
+#     returns a dictionary containing the ArchiveBox GitHub release info for
+#     the recommended upgrade version and the currently installed version
+#     """
+
+#     # we only want to perform the (relatively expensive) check for new versions
+#     # when its most relevant, e.g. when the user runs a long-running command
+#     subcommand_run_by_user = sys.argv[3] if len(sys.argv) > 3 else 'help'
+#     long_running_commands = ('add', 'schedule', 'update', 'status', 'server')
+#     if subcommand_run_by_user not in long_running_commands:
+#         return None
+
+#     github_releases_api = "https://api.github.com/repos/ArchiveBox/ArchiveBox/releases"
+#     response = requests.get(github_releases_api)
+#     if response.status_code != 200:
+#         stderr(f'[!] Warning: GitHub API call to check for new ArchiveBox version failed! (status={response.status_code})', color='lightyellow', config=config)
+#         return None
+#     all_releases = response.json()
+
+#     installed_version = parse_version_string(config['VERSION'])
+
+#     # find current version or nearest older version (to link to)
+#     current_version = None
+#     for idx, release in enumerate(all_releases):
+#         release_version = parse_version_string(release['tag_name'])
+#         if release_version <= installed_version:
+#             current_version = release
+#             break
+
+#     current_version = current_version or all_releases[-1]
+
+#     # recommended version is whatever comes after current_version in the release list
+#     # (perhaps too conservative to only recommend upgrading one version at a time, but it's safest)
+#     try:
+#         recommended_version = all_releases[idx+1]
+#     except IndexError:
+#         recommended_version = None
+
+#     return {'recommended_version': recommended_version, 'current_version': current_version}
+
+# def can_upgrade(config):
+#     if config['VERSIONS_AVAILABLE'] and config['VERSIONS_AVAILABLE']['recommended_version']:
+#         recommended_version = parse_version_string(config['VERSIONS_AVAILABLE']['recommended_version']['tag_name'])
+#         current_version = parse_version_string(config['VERSIONS_AVAILABLE']['current_version']['tag_name'])
+#         return recommended_version > current_version
+#     return False
diff --git a/archivebox/plugins_sys/config/constants.py b/archivebox/plugins_sys/config/constants.py
new file mode 100644
index 00000000..7a5c63e0
--- /dev/null
+++ b/archivebox/plugins_sys/config/constants.py
@@ -0,0 +1 @@
+from archivebox.constants import *
diff --git a/archivebox/queues/settings.py b/archivebox/queues/settings.py
index 50df38e7..50a60ce2 100644
--- a/archivebox/queues/settings.py
+++ b/archivebox/queues/settings.py
@@ -1,16 +1,13 @@
 from pathlib import Path
 
-from django.conf import settings
+import archivebox
 
+OUTPUT_DIR = archivebox.DATA_DIR
+LOGS_DIR = archivebox.CONSTANTS.LOGS_DIR
 
-OUTPUT_DIR = settings.CONFIG.OUTPUT_DIR
-LOGS_DIR = settings.CONFIG.LOGS_DIR
-
-TMP_DIR = OUTPUT_DIR / "tmp"
+TMP_DIR = archivebox.CONSTANTS.TMP_DIR
 
 Path.mkdir(TMP_DIR, exist_ok=True)
-
-
 CONFIG_FILE = TMP_DIR / "supervisord.conf"
 PID_FILE = TMP_DIR / "supervisord.pid"
 SOCK_FILE = TMP_DIR / "supervisord.sock"
diff --git a/archivebox/system.py b/archivebox/system.py
index 65aca12d..cae487e5 100644
--- a/archivebox/system.py
+++ b/archivebox/system.py
@@ -4,6 +4,7 @@ __package__ = 'archivebox'
 import os
 import signal
 import shutil
+import getpass
 
 from json import dump
 from pathlib import Path
@@ -229,3 +230,31 @@ class suppress_output(object):
         if self.stderr:
             os.dup2(self.real_stderr, 2)
             os.close(self.null_stderr)
+
+
+def get_system_user() -> str:
+    # some host OS's are unable to provide a username (k3s, Windows), making this complicated
+    # uid 999 is especially problematic and breaks many attempts
+    SYSTEM_USER = None
+    FALLBACK_USER_PLACHOLDER = f'user_{os.getuid()}'
+
+    # Option 1
+    try:
+        import pwd
+        SYSTEM_USER = SYSTEM_USER or pwd.getpwuid(os.geteuid()).pw_name
+    except (ModuleNotFoundError, Exception):
+        pass
+
+    # Option 2
+    try:
+        SYSTEM_USER = SYSTEM_USER or getpass.getuser()
+    except Exception:
+        pass
+
+    # Option 3
+    try:
+        SYSTEM_USER = SYSTEM_USER or os.getlogin()
+    except Exception:
+        pass
+
+    return SYSTEM_USER or FALLBACK_USER_PLACHOLDER