diff --git a/archivebox/__init__.py b/archivebox/__init__.py index 4d53f3d5..e668db33 100755 --- a/archivebox/__init__.py +++ b/archivebox/__init__.py @@ -1,5 +1,7 @@ __package__ = 'archivebox' +# print('INSTALLING MONKEY PATCHES') + from .monkey_patches import * import os @@ -28,3 +30,5 @@ def _detect_installed_version(): __version__ = _detect_installed_version() + +# print('DONE INSTALLING MONKEY PATCHES') diff --git a/archivebox/cli/__init__.py b/archivebox/cli/__init__.py index 2b59dcba..50be46de 100644 --- a/archivebox/cli/__init__.py +++ b/archivebox/cli/__init__.py @@ -1,16 +1,20 @@ __package__ = 'archivebox.cli' __command__ = 'archivebox' -import os import sys import argparse import threading -from time import sleep +import archivebox -from typing import Optional, Dict, List, IO, Union, Iterable +from time import sleep +from collections.abc import Mapping + +from typing import Optional, List, IO, Union, Iterable from pathlib import Path -from ..config import OUTPUT_DIR, check_data_folder, check_migrations, stderr + +from ..misc.checks import check_data_folder, check_migrations +from ..misc.logging import stderr from importlib import import_module @@ -18,13 +22,46 @@ BUILTIN_LIST = list CLI_DIR = Path(__file__).resolve().parent -# these common commands will appear sorted before any others for ease-of-use -meta_cmds = ('help', 'version') # dont require valid data folder at all -main_cmds = ('init', 'config', 'setup') # dont require existing db present -archive_cmds = ('add', 'remove', 'update', 'list', 'status') # require existing db present -fake_db = ("oneshot",) # use fake in-memory db -display_first = (*meta_cmds, *main_cmds, *archive_cmds) +# def list_subcommands() -> Dict[str, str]: +# """find and import all valid archivebox_.py files in CLI_DIR""" +# COMMANDS = [] +# for filename in os.listdir(CLI_DIR): +# if is_cli_module(filename): +# subcommand = filename.replace('archivebox_', '').replace('.py', '') +# module = import_module('.archivebox_{}'.format(subcommand), __package__) +# assert is_valid_cli_module(module, subcommand) +# COMMANDS.append((subcommand, module.main.__doc__)) +# globals()[subcommand] = module.main +# display_order = lambda cmd: ( +# display_first.index(cmd[0]) +# if cmd[0] in display_first else +# 100 + len(cmd[0]) +# ) +# return dict(sorted(COMMANDS, key=display_order)) + +# just define it statically, it's much faster: +SUBCOMMAND_MODULES = { + 'help': 'archivebox_help', + 'version': 'archivebox_version' , + + 'init': 'archivebox_init', + 'config': 'archivebox_config', + 'setup': 'archivebox_setup', + + 'add': 'archivebox_add', + 'remove': 'archivebox_remove', + 'update': 'archivebox_update', + 'list': 'archivebox_list', + 'status': 'archivebox_status', + + 'schedule': 'archivebox_schedule', + 'server': 'archivebox_server', + 'shell': 'archivebox_shell', + 'manage': 'archivebox_manage', + + 'oneshot': 'archivebox_oneshot', +} # every imported command module must have these properties in order to be valid required_attrs = ('__package__', '__command__', 'main') @@ -36,6 +73,38 @@ is_valid_cli_module = lambda module, subcommand: ( and module.__command__.split(' ')[-1] == subcommand ) +class LazySubcommands(Mapping): + def keys(self): + return SUBCOMMAND_MODULES.keys() + + def values(self): + return [self[key] for key in self.keys()] + + def items(self): + return [(key, self[key]) for key in self.keys()] + + def __getitem__(self, key): + module = import_module(f'.{SUBCOMMAND_MODULES[key]}', __package__) + assert is_valid_cli_module(module, key) + return 
module.main + + def __iter__(self): + return iter(SUBCOMMAND_MODULES.keys()) + + def __len__(self): + return len(SUBCOMMAND_MODULES) + +CLI_SUBCOMMANDS = LazySubcommands() + + +# these common commands will appear sorted before any others for ease-of-use +meta_cmds = ('help', 'version') # dont require valid data folder at all +main_cmds = ('init', 'config', 'setup') # dont require existing db present +archive_cmds = ('add', 'remove', 'update', 'list', 'status') # require existing db present +fake_db = ("oneshot",) # use fake in-memory db + +display_first = (*meta_cmds, *main_cmds, *archive_cmds) + IGNORED_BG_THREADS = ('MainThread', 'ThreadPoolExecutor', 'IPythonHistorySavingThread', 'Scheduler') # threads we dont have to wait for before exiting @@ -71,29 +140,9 @@ def wait_for_bg_threads_to_exit(thread_names: Iterable[str]=(), ignore_names: It raise Exception(f'Background threads failed to exit after {tries}s: {threads_summary}') -def list_subcommands() -> Dict[str, str]: - """find and import all valid archivebox_.py files in CLI_DIR""" - - COMMANDS = [] - for filename in os.listdir(CLI_DIR): - if is_cli_module(filename): - subcommand = filename.replace('archivebox_', '').replace('.py', '') - module = import_module('.archivebox_{}'.format(subcommand), __package__) - assert is_valid_cli_module(module, subcommand) - COMMANDS.append((subcommand, module.main.__doc__)) - globals()[subcommand] = module.main - - display_order = lambda cmd: ( - display_first.index(cmd[0]) - if cmd[0] in display_first else - 100 + len(cmd[0]) - ) - - return dict(sorted(COMMANDS, key=display_order)) - def run_subcommand(subcommand: str, - subcommand_args: List[str]=None, + subcommand_args: List[str] | None = None, stdin: Optional[IO]=None, pwd: Union[Path, str, None]=None) -> None: """Run a given ArchiveBox subcommand with the given list of args""" @@ -101,18 +150,18 @@ def run_subcommand(subcommand: str, subcommand_args = subcommand_args or [] if subcommand not in meta_cmds: - from ..config import setup_django + from ..config import setup_django, CONFIG cmd_requires_db = subcommand in archive_cmds init_pending = '--init' in subcommand_args or '--quick-init' in subcommand_args if cmd_requires_db: - check_data_folder(pwd) + check_data_folder(CONFIG) setup_django(in_memory_db=subcommand in fake_db, check_db=cmd_requires_db and not init_pending) if cmd_requires_db: - check_migrations() + check_migrations(CONFIG) module = import_module('.archivebox_{}'.format(subcommand), __package__) module.main(args=subcommand_args, stdin=stdin, pwd=pwd) # type: ignore @@ -121,17 +170,28 @@ def run_subcommand(subcommand: str, wait_for_bg_threads_to_exit(timeout=60) -SUBCOMMANDS = list_subcommands() + + class NotProvided: - pass + def __len__(self): + return 0 + def __bool__(self): + return False + def __repr__(self): + return '' + +Omitted = Union[None, NotProvided] + +OMITTED = NotProvided() -def main(args: Optional[List[str]]=NotProvided, stdin: Optional[IO]=NotProvided, pwd: Optional[str]=None) -> None: - args = sys.argv[1:] if args is NotProvided else args - stdin = sys.stdin if stdin is NotProvided else stdin +def main(args: List[str] | Omitted=OMITTED, stdin: IO | Omitted=OMITTED, pwd: str | None=None) -> None: + # print('STARTING CLI MAIN ENTRYPOINT') + + args = sys.argv[1:] if args is OMITTED else args + stdin = sys.stdin if stdin is OMITTED else stdin - subcommands = list_subcommands() parser = argparse.ArgumentParser( prog=__command__, description='ArchiveBox: The self-hosted internet archive', @@ -141,19 +201,19 @@ def 
main(args: Optional[List[str]]=NotProvided, stdin: Optional[IO]=NotProvided, group.add_argument( '--help', '-h', action='store_true', - help=subcommands['help'], + help=CLI_SUBCOMMANDS['help'].__doc__, ) group.add_argument( '--version', action='store_true', - help=subcommands['version'], + help=CLI_SUBCOMMANDS['version'].__doc__, ) group.add_argument( "subcommand", type=str, help= "The name of the subcommand to run", nargs='?', - choices=subcommands.keys(), + choices=CLI_SUBCOMMANDS.keys(), default=None, ) parser.add_argument( @@ -174,23 +234,13 @@ def main(args: Optional[List[str]]=NotProvided, stdin: Optional[IO]=NotProvided, log_cli_command( subcommand=command.subcommand, subcommand_args=command.subcommand_args, - stdin=stdin, - pwd=pwd or OUTPUT_DIR + stdin=stdin or None, + pwd=pwd or archivebox.DATA_DIR, ) run_subcommand( subcommand=command.subcommand, subcommand_args=command.subcommand_args, - stdin=stdin, - pwd=pwd or OUTPUT_DIR, + stdin=stdin or None, + pwd=pwd or archivebox.DATA_DIR, ) - - -__all__ = ( - 'SUBCOMMANDS', - 'list_subcommands', - 'run_subcommand', - *SUBCOMMANDS.keys(), -) - - diff --git a/archivebox/config.py b/archivebox/config.py index 53c23b2e..c85fca72 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -28,21 +28,19 @@ import sys import json import inspect import getpass -import platform import shutil import requests from hashlib import md5 from pathlib import Path -from benedict import benedict from datetime import datetime, timezone -from typing import Optional, Type, Tuple, Dict, Union, List +from typing import Optional, Type, Tuple, Dict from subprocess import run, PIPE, DEVNULL, STDOUT, TimeoutExpired from configparser import ConfigParser -from collections import defaultdict import importlib.metadata from pydantic_pkgr import SemVer +from rich.progress import Progress import django from django.db.backends.sqlite3.base import Database as sqlite3 @@ -56,6 +54,17 @@ from .config_stubs import ( ConfigDefaultDict, ) +from .misc.logging import ( + CONSOLE, + SHOW_PROGRESS, + DEFAULT_CLI_COLORS, + ANSI, + COLOR_DICT, + stderr, + hint, +) +from .misc.checks import check_system_config + # print('STARTING CONFIG LOADING') # load fallback libraries from vendor dir @@ -70,7 +79,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = { 'SHELL_CONFIG': { 'IS_TTY': {'type': bool, 'default': lambda _: sys.stdout.isatty()}, 'USE_COLOR': {'type': bool, 'default': lambda c: c['IS_TTY']}, - 'SHOW_PROGRESS': {'type': bool, 'default': lambda c: (c['IS_TTY'] and platform.system() != 'Darwin')}, # progress bars are buggy on mac, disable for now + 'SHOW_PROGRESS': {'type': bool, 'default': lambda c: c['IS_TTY']}, # progress bars are buggy on mac, disable for now 'IN_DOCKER': {'type': bool, 'default': False}, 'IN_QEMU': {'type': bool, 'default': False}, 'PUID': {'type': int, 'default': os.getuid()}, @@ -306,32 +315,7 @@ ROBOTS_TXT_FILENAME = 'robots.txt' FAVICON_FILENAME = 'favicon.ico' CONFIG_FILENAME = 'ArchiveBox.conf' -DEFAULT_CLI_COLORS = benedict( - { - "reset": "\033[00;00m", - "lightblue": "\033[01;30m", - "lightyellow": "\033[01;33m", - "lightred": "\033[01;35m", - "red": "\033[01;31m", - "green": "\033[01;32m", - "blue": "\033[01;34m", - "white": "\033[01;37m", - "black": "\033[01;30m", - } -) -ANSI = AttrDict({k: '' for k in DEFAULT_CLI_COLORS.keys()}) -COLOR_DICT = defaultdict(lambda: [(0, 0, 0), (0, 0, 0)], { - '00': [(0, 0, 0), (0, 0, 0)], - '30': [(0, 0, 0), (0, 0, 0)], - '31': [(255, 0, 0), (128, 0, 0)], - '32': [(0, 200, 0), (0, 128, 0)], - '33': [(255, 255, 
0), (128, 128, 0)], - '34': [(0, 0, 255), (0, 0, 128)], - '35': [(255, 0, 255), (128, 0, 128)], - '36': [(0, 255, 255), (0, 128, 128)], - '37': [(255, 255, 255), (255, 255, 255)], -}) STATICFILE_EXTENSIONS = { # 99.999% of the time, URLs ending in these extensions are static files @@ -880,37 +864,6 @@ def parse_version_string(version: str) -> Tuple[int, int, int]: return tuple(int(part) for part in base.split('.'))[:3] -# Logging Helpers -def stdout(*args, color: Optional[str]=None, prefix: str='', config: Optional[ConfigDict]=None) -> None: - ansi = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI - - if color: - strs = [ansi[color], ' '.join(str(a) for a in args), ansi['reset'], '\n'] - else: - strs = [' '.join(str(a) for a in args), '\n'] - - sys.stdout.write(prefix + ''.join(strs)) - -def stderr(*args, color: Optional[str]=None, prefix: str='', config: Optional[ConfigDict]=None) -> None: - ansi = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI - - if color: - strs = [ansi[color], ' '.join(str(a) for a in args), ansi['reset'], '\n'] - else: - strs = [' '.join(str(a) for a in args), '\n'] - - sys.stderr.write(prefix + ''.join(strs)) - -def hint(text: Union[Tuple[str, ...], List[str], str], prefix=' ', config: Optional[ConfigDict]=None) -> None: - ansi = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI - - if isinstance(text, str): - stderr('{}{lightred}Hint:{reset} {}'.format(prefix, text, **ansi)) - else: - stderr('{}{lightred}Hint:{reset} {}'.format(prefix, text[0], **ansi)) - for line in text[1:]: - stderr('{} {}'.format(prefix, line)) - # Dependency Metadata Helpers def bin_version(binary: Optional[str], cmd: Optional[str]=None, timeout: int=3) -> Optional[str]: @@ -919,6 +872,10 @@ def bin_version(binary: Optional[str], cmd: Optional[str]=None, timeout: int=3) abspath = bin_path(binary) if not binary or not abspath: return None + + return '999.999.999' + + # Now handled by new BinProvider plugin system, no longer needed: try: bin_env = os.environ | {'LANG': 'C'} @@ -960,6 +917,9 @@ def bin_path(binary: Optional[str]) -> Optional[str]: return shutil.which(str(Path(binary).expanduser())) or shutil.which(str(binary)) or binary def bin_hash(binary: Optional[str]) -> Optional[str]: + return 'UNUSED' + # DEPRECATED: now handled by new BinProvider plugin system, no longer needed: + if binary is None: return None abs_path = bin_path(binary) @@ -1329,246 +1289,123 @@ if not CONFIG['CHECK_SSL_VALIDITY']: ########################### Config Validity Checkers ########################### +INITIAL_STARTUP_PROGRESS = None +INITIAL_STARTUP_PROGRESS_TASK = 0 -def check_system_config(config: ConfigDict=CONFIG) -> None: - ### Check system environment - if config['USER'] == 'root' or str(config['PUID']) == "0": - stderr('[!] 
ArchiveBox should never be run as root!', color='red') - stderr(' For more information, see the security overview documentation:') - stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#do-not-run-as-root') - - if config['IN_DOCKER']: - attempted_command = ' '.join(sys.argv[:3]) - stderr('') - stderr(' {lightred}Hint{reset}: When using Docker, you must run commands with {green}docker run{reset} instead of {lightyellow}docker exec{reset}, e.g.:'.format(**config['ANSI'])) - stderr(f' docker compose run archivebox {attempted_command}') - stderr(f' docker run -it -v $PWD/data:/data archivebox/archivebox {attempted_command}') - stderr(' or:') - stderr(f' docker compose exec --user=archivebox archivebox /bin/bash -c "archivebox {attempted_command}"') - stderr(f' docker exec -it --user=archivebox /bin/bash -c "archivebox {attempted_command}"') - - raise SystemExit(2) - - ### Check Python environment - if sys.version_info[:3] < (3, 7, 0): - stderr(f'[X] Python version is not new enough: {config["PYTHON_VERSION"]} (>3.6 is required)', color='red') - stderr(' See https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting#python for help upgrading your Python installation.') - raise SystemExit(2) - - if int(CONFIG['DJANGO_VERSION'].split('.')[0]) < 3: - stderr(f'[X] Django version is not new enough: {config["DJANGO_VERSION"]} (>3.0 is required)', color='red') - stderr(' Upgrade django using pip or your system package manager: pip3 install --upgrade django') - raise SystemExit(2) - - if config['PYTHON_ENCODING'] not in ('UTF-8', 'UTF8'): - stderr(f'[X] Your system is running python3 scripts with a bad locale setting: {config["PYTHON_ENCODING"]} (it should be UTF-8).', color='red') - stderr(' To fix it, add the line "export PYTHONIOENCODING=UTF-8" to your ~/.bashrc file (without quotes)') - stderr(' Or if you\'re using ubuntu/debian, run "dpkg-reconfigure locales"') - stderr('') - stderr(' Confirm that it\'s fixed by opening a new shell and running:') - stderr(' python3 -c "import sys; print(sys.stdout.encoding)" # should output UTF-8') - raise SystemExit(2) - - # stderr('[i] Using Chrome binary: {}'.format(shutil.which(CHROME_BINARY) or CHROME_BINARY)) - # stderr('[i] Using Chrome data dir: {}'.format(os.path.abspath(CHROME_USER_DATA_DIR))) - if config['CHROME_USER_DATA_DIR'] is not None and Path(config['CHROME_USER_DATA_DIR']).exists(): - if not (Path(config['CHROME_USER_DATA_DIR']) / 'Default').exists(): - stderr('[X] Could not find profile "Default" in CHROME_USER_DATA_DIR.', color='red') - stderr(f' {config["CHROME_USER_DATA_DIR"]}') - stderr(' Make sure you set it to a Chrome user data directory containing a Default profile folder.') - stderr(' For more info see:') - stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#CHROME_USER_DATA_DIR') - if '/Default' in str(config['CHROME_USER_DATA_DIR']): - stderr() - stderr(' Try removing /Default from the end e.g.:') - stderr(' CHROME_USER_DATA_DIR="{}"'.format(str(config['CHROME_USER_DATA_DIR']).split('/Default')[0])) - - # hard error is too annoying here, instead just set it to nothing - # raise SystemExit(2) - config['CHROME_USER_DATA_DIR'] = None - else: - config['CHROME_USER_DATA_DIR'] = None - - -def check_dependencies(config: ConfigDict=CONFIG, show_help: bool=True) -> None: - invalid_dependencies = [ - (name, info) for name, info in config['DEPENDENCIES'].items() - if info['enabled'] and not info['is_valid'] - ] - if invalid_dependencies and show_help: - stderr(f'[!] 
Warning: Missing {len(invalid_dependencies)} recommended dependencies', color='lightyellow') - for dependency, info in invalid_dependencies: - stderr( - ' ! {}: {} ({})'.format( - dependency, - info['path'] or 'unable to find binary', - info['version'] or 'unable to detect version', - ) - ) - if dependency in ('YOUTUBEDL_BINARY', 'CHROME_BINARY', 'SINGLEFILE_BINARY', 'READABILITY_BINARY', 'MERCURY_BINARY'): - hint(('To install all packages automatically run: archivebox setup', - f'or to disable it and silence this warning: archivebox config --set SAVE_{dependency.rsplit("_", 1)[0]}=False', - ''), prefix=' ') - stderr('') - - if config['TIMEOUT'] < 5: - stderr(f'[!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={config["TIMEOUT"]} seconds)', color='red') - stderr(' You must allow *at least* 5 seconds for indexing and archive methods to run succesfully.') - stderr(' (Setting it to somewhere between 30 and 3000 seconds is recommended)') - stderr() - stderr(' If you want to make ArchiveBox run faster, disable specific archive methods instead:') - stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles') - stderr() - - elif config['USE_CHROME'] and config['TIMEOUT'] < 15: - stderr(f'[!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={config["TIMEOUT"]} seconds)', color='red') - stderr(' Chrome will fail to archive all sites if set to less than ~15 seconds.') - stderr(' (Setting it to somewhere between 30 and 300 seconds is recommended)') - stderr() - stderr(' If you want to make ArchiveBox run faster, disable specific archive methods instead:') - stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles') - stderr() - - if config['USE_YOUTUBEDL'] and config['MEDIA_TIMEOUT'] < 20: - stderr(f'[!] Warning: MEDIA_TIMEOUT is set too low! 
(currently set to MEDIA_TIMEOUT={config["MEDIA_TIMEOUT"]} seconds)', color='red') - stderr(' youtube-dl/yt-dlp will fail to archive any media if set to less than ~20 seconds.') - stderr(' (Setting it somewhere over 60 seconds is recommended)') - stderr() - stderr(' If you want to disable media archiving entirely, set SAVE_MEDIA=False instead:') - stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#save_media') - stderr() - - -def check_data_folder(out_dir: Union[str, Path, None]=None, config: ConfigDict=CONFIG) -> None: - output_dir = out_dir or config['OUTPUT_DIR'] - assert isinstance(output_dir, (str, Path)) - - archive_dir_exists = (Path(output_dir) / ARCHIVE_DIR_NAME).exists() - if not archive_dir_exists: - stderr('[X] No archivebox index found in the current directory.', color='red') - stderr(f' {output_dir}', color='lightyellow') - stderr() - stderr(' {lightred}Hint{reset}: Are you running archivebox in the right folder?'.format(**config['ANSI'])) - stderr(' cd path/to/your/archive/folder') - stderr(' archivebox [command]') - stderr() - stderr(' {lightred}Hint{reset}: To create a new archive collection or import existing data in this folder, run:'.format(**config['ANSI'])) - stderr(' archivebox init') - raise SystemExit(2) - - -def check_migrations(out_dir: Union[str, Path, None]=None, config: ConfigDict=CONFIG): - output_dir = out_dir or config['OUTPUT_DIR'] - from .index.sql import list_migrations - - pending_migrations = [name for status, name in list_migrations() if not status] - - if pending_migrations: - stderr('[X] This collection was created with an older version of ArchiveBox and must be upgraded first.', color='lightyellow') - stderr(f' {output_dir}') - stderr() - stderr(f' To upgrade it to the latest version and apply the {len(pending_migrations)} pending migrations, run:') - stderr(' archivebox init') - raise SystemExit(3) - - (Path(output_dir) / SOURCES_DIR_NAME).mkdir(exist_ok=True) - (Path(output_dir) / LOGS_DIR_NAME).mkdir(exist_ok=True) - (Path(output_dir) / CACHE_DIR_NAME).mkdir(exist_ok=True) - (Path(output_dir) / LIB_DIR_NAME / 'bin').mkdir(exist_ok=True, parents=True) - (Path(output_dir) / PERSONAS_DIR_NAME / 'Default').mkdir(exist_ok=True, parents=True) - - +def bump_startup_progress_bar(): + global INITIAL_STARTUP_PROGRESS + global INITIAL_STARTUP_PROGRESS_TASK + if INITIAL_STARTUP_PROGRESS: + INITIAL_STARTUP_PROGRESS.update(INITIAL_STARTUP_PROGRESS_TASK, advance=1) # type: ignore def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG, in_memory_db=False) -> None: - check_system_config() + global INITIAL_STARTUP_PROGRESS + global INITIAL_STARTUP_PROGRESS_TASK + + with Progress(transient=True, expand=True, console=CONSOLE) as INITIAL_STARTUP_PROGRESS: + INITIAL_STARTUP_PROGRESS_TASK = INITIAL_STARTUP_PROGRESS.add_task("[green]Loading modules...", total=25) + check_system_config(config) - output_dir = out_dir or Path(config['OUTPUT_DIR']) + output_dir = out_dir or Path(config['OUTPUT_DIR']) - assert isinstance(output_dir, Path) and isinstance(config['PACKAGE_DIR'], Path) + assert isinstance(output_dir, Path) and isinstance(config['PACKAGE_DIR'], Path) - try: - from django.core.management import call_command - - sys.path.append(str(config['PACKAGE_DIR'])) - os.environ.setdefault('OUTPUT_DIR', str(output_dir)) - assert (config['PACKAGE_DIR'] / 'core' / 'settings.py').exists(), 'settings.py was not found at archivebox/core/settings.py' - os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings') - - # Check to make sure 
JSON extension is available in our Sqlite3 instance + bump_startup_progress_bar() try: - cursor = sqlite3.connect(':memory:').cursor() - cursor.execute('SELECT JSON(\'{"a": "b"}\')') - except sqlite3.OperationalError as exc: - stderr(f'[X] Your SQLite3 version is missing the required JSON1 extension: {exc}', color='red') - hint([ - 'Upgrade your Python version or install the extension manually:', - 'https://code.djangoproject.com/wiki/JSON1Extension' - ]) + from django.core.management import call_command - if in_memory_db: - # some commands (e.g. oneshot) dont store a long-lived sqlite3 db file on disk. - # in those cases we create a temporary in-memory db and run the migrations - # immediately to get a usable in-memory-database at startup - os.environ.setdefault("ARCHIVEBOX_DATABASE_NAME", ":memory:") - django.setup() - call_command("migrate", interactive=False, verbosity=0) - else: - # Otherwise use default sqlite3 file-based database and initialize django - # without running migrations automatically (user runs them manually by calling init) - django.setup() + sys.path.append(str(config['PACKAGE_DIR'])) + os.environ.setdefault('OUTPUT_DIR', str(output_dir)) + assert (config['PACKAGE_DIR'] / 'core' / 'settings.py').exists(), 'settings.py was not found at archivebox/core/settings.py' + os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings') - from django.conf import settings - - # log startup message to the error log - with open(settings.ERROR_LOG, "a", encoding='utf-8') as f: - command = ' '.join(sys.argv) - ts = datetime.now(timezone.utc).strftime('%Y-%m-%d__%H:%M:%S') - f.write(f"\n> {command}; TS={ts} VERSION={config['VERSION']} IN_DOCKER={config['IN_DOCKER']} IS_TTY={config['IS_TTY']}\n") - - if check_db: - # Enable WAL mode in sqlite3 - from django.db import connection - with connection.cursor() as cursor: - - # Set Journal mode to WAL to allow for multiple writers - current_mode = cursor.execute("PRAGMA journal_mode") - if current_mode != 'wal': - cursor.execute("PRAGMA journal_mode=wal;") - - # Set max blocking delay for concurrent writes and write sync mode - # https://litestream.io/tips/#busy-timeout - cursor.execute("PRAGMA busy_timeout = 5000;") - cursor.execute("PRAGMA synchronous = NORMAL;") - - # Create cache table in DB if needed + # Check to make sure JSON extension is available in our Sqlite3 instance try: - from django.core.cache import cache - cache.get('test', None) - except django.db.utils.OperationalError: - call_command("createcachetable", verbosity=0) + cursor = sqlite3.connect(':memory:').cursor() + cursor.execute('SELECT JSON(\'{"a": "b"}\')') + except sqlite3.OperationalError as exc: + stderr(f'[X] Your SQLite3 version is missing the required JSON1 extension: {exc}', color='red') + hint([ + 'Upgrade your Python version or install the extension manually:', + 'https://code.djangoproject.com/wiki/JSON1Extension' + ]) + + bump_startup_progress_bar() - # if archivebox gets imported multiple times, we have to close - # the sqlite3 whenever we init from scratch to avoid multiple threads - # sharing the same connection by accident - from django.db import connections - for conn in connections.all(): - conn.close_if_unusable_or_obsolete() + if in_memory_db: + # some commands (e.g. oneshot) dont store a long-lived sqlite3 db file on disk. 
+ # in those cases we create a temporary in-memory db and run the migrations + # immediately to get a usable in-memory-database at startup + os.environ.setdefault("ARCHIVEBOX_DATABASE_NAME", ":memory:") + django.setup() + + bump_startup_progress_bar() + call_command("migrate", interactive=False, verbosity=0) + else: + # Otherwise use default sqlite3 file-based database and initialize django + # without running migrations automatically (user runs them manually by calling init) + django.setup() + + bump_startup_progress_bar() - sql_index_path = Path(output_dir) / SQL_INDEX_FILENAME - assert sql_index_path.exists(), ( - f'No database file {SQL_INDEX_FILENAME} found in: {config["OUTPUT_DIR"]} (Are you in an ArchiveBox collection directory?)') + from django.conf import settings + # log startup message to the error log + with open(settings.ERROR_LOG, "a", encoding='utf-8') as f: + command = ' '.join(sys.argv) + ts = datetime.now(timezone.utc).strftime('%Y-%m-%d__%H:%M:%S') + f.write(f"\n> {command}; TS={ts} VERSION={config['VERSION']} IN_DOCKER={config['IN_DOCKER']} IS_TTY={config['IS_TTY']}\n") - # https://docs.pydantic.dev/logfire/integrations/django/ Logfire Debugging - if settings.DEBUG_LOGFIRE: - from opentelemetry.instrumentation.sqlite3 import SQLite3Instrumentor - SQLite3Instrumentor().instrument() + if check_db: + # Enable WAL mode in sqlite3 + from django.db import connection + with connection.cursor() as cursor: - import logfire + # Set Journal mode to WAL to allow for multiple writers + current_mode = cursor.execute("PRAGMA journal_mode") + if current_mode != 'wal': + cursor.execute("PRAGMA journal_mode=wal;") - logfire.configure() - logfire.instrument_django(is_sql_commentor_enabled=True) - logfire.info(f'Started ArchiveBox v{CONFIG.VERSION}', argv=sys.argv) + # Set max blocking delay for concurrent writes and write sync mode + # https://litestream.io/tips/#busy-timeout + cursor.execute("PRAGMA busy_timeout = 5000;") + cursor.execute("PRAGMA synchronous = NORMAL;") - except KeyboardInterrupt: - raise SystemExit(2) + # Create cache table in DB if needed + try: + from django.core.cache import cache + cache.get('test', None) + except django.db.utils.OperationalError: + call_command("createcachetable", verbosity=0) + + bump_startup_progress_bar() + + # if archivebox gets imported multiple times, we have to close + # the sqlite3 whenever we init from scratch to avoid multiple threads + # sharing the same connection by accident + from django.db import connections + for conn in connections.all(): + conn.close_if_unusable_or_obsolete() + + sql_index_path = Path(output_dir) / SQL_INDEX_FILENAME + assert sql_index_path.exists(), ( + f'No database file {SQL_INDEX_FILENAME} found in: {config["OUTPUT_DIR"]} (Are you in an ArchiveBox collection directory?)') + + bump_startup_progress_bar() + + # https://docs.pydantic.dev/logfire/integrations/django/ Logfire Debugging + if settings.DEBUG_LOGFIRE: + from opentelemetry.instrumentation.sqlite3 import SQLite3Instrumentor + SQLite3Instrumentor().instrument() + + import logfire + + logfire.configure() + logfire.instrument_django(is_sql_commentor_enabled=True) + logfire.info(f'Started ArchiveBox v{CONFIG.VERSION}', argv=sys.argv) + + except KeyboardInterrupt: + raise SystemExit(2) + + INITIAL_STARTUP_PROGRESS = None + INITIAL_STARTUP_PROGRESS_TASK = None diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py index c394494a..31c084de 100644 --- a/archivebox/core/settings.py +++ b/archivebox/core/settings.py @@ -170,6 +170,7 @@ 
STATICFILES_DIRS = [ *[ str(plugin_dir / 'static') for plugin_dir in PLUGIN_DIRS.values() + if (plugin_dir / 'static').is_dir() ], str(PACKAGE_DIR / CONFIG.TEMPLATES_DIR_NAME / 'static'), ] @@ -179,6 +180,7 @@ TEMPLATE_DIRS = [ *[ str(plugin_dir / 'templates') for plugin_dir in PLUGIN_DIRS.values() + if (plugin_dir / 'templates').is_dir() ], str(PACKAGE_DIR / CONFIG.TEMPLATES_DIR_NAME / 'core'), str(PACKAGE_DIR / CONFIG.TEMPLATES_DIR_NAME / 'admin'), diff --git a/archivebox/core/settings_logging.py b/archivebox/core/settings_logging.py index 3a012a9d..10cc23e4 100644 --- a/archivebox/core/settings_logging.py +++ b/archivebox/core/settings_logging.py @@ -141,18 +141,22 @@ SETTINGS_LOGGING = { "api": { "handlers": ["default", "logfile"], "level": "DEBUG", + "propagate": False, }, "checks": { "handlers": ["default", "logfile"], "level": "DEBUG", + "propagate": False, }, "core": { "handlers": ["default", "logfile"], "level": "DEBUG", + "propagate": False, }, "plugins_extractor": { "handlers": ["default", "logfile"], "level": "DEBUG", + "propagate": False, }, "httpx": { "handlers": ["outbound_webhooks"], @@ -164,6 +168,7 @@ SETTINGS_LOGGING = { "handlers": ["default", "logfile"], "level": "INFO", "filters": ["noisyrequestsfilter"], + "propagate": False, }, "django.utils.autoreload": { "propagate": False, diff --git a/archivebox/logging_util.py b/archivebox/logging_util.py index 09f52c72..b2468d03 100644 --- a/archivebox/logging_util.py +++ b/archivebox/logging_util.py @@ -230,7 +230,7 @@ def progress_bar(seconds: int, prefix: str='') -> None: print() -def log_cli_command(subcommand: str, subcommand_args: List[str], stdin: Optional[str], pwd: str): +def log_cli_command(subcommand: str, subcommand_args: List[str], stdin: Optional[str | IO], pwd: str): cmd = ' '.join(('archivebox', subcommand, *subcommand_args)) stderr('{black}[i] [{now}] ArchiveBox v{VERSION}: {cmd}{reset}'.format( now=datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S'), @@ -526,11 +526,11 @@ def log_removal_finished(all_links: int, to_remove: int): def log_shell_welcome_msg(): - from .cli import list_subcommands + from .cli import CLI_SUBCOMMANDS print('{green}# ArchiveBox Imports{reset}'.format(**ANSI)) print('{green}from core.models import Snapshot, ArchiveResult, Tag, User{reset}'.format(**ANSI)) - print('{green}from cli import *\n {}{reset}'.format("\n ".join(list_subcommands().keys()), **ANSI)) + print('{green}from cli import *\n {}{reset}'.format("\n ".join(CLI_SUBCOMMANDS.keys()), **ANSI)) print() print('[i] Welcome to the ArchiveBox Shell!') print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#Shell-Usage') diff --git a/archivebox/main.py b/archivebox/main.py index ab2b0c9e..a7c52705 100755 --- a/archivebox/main.py +++ b/archivebox/main.py @@ -16,7 +16,7 @@ from django.db.models import QuerySet from django.utils import timezone from .cli import ( - list_subcommands, + CLI_SUBCOMMANDS, run_subcommand, display_first, meta_cmds, @@ -66,9 +66,9 @@ from .index.html import ( ) from .index.csv import links_to_csv from .extractors import archive_links, archive_link, ignore_methods +from .misc.logging import stderr, hint +from .misc.checks import check_data_folder, check_dependencies from .config import ( - stderr, - hint, ConfigDict, ANSI, IS_TTY, @@ -98,8 +98,6 @@ from .config import ( SEARCH_BACKEND_ENGINE, LDAP, get_version, - check_dependencies, - check_data_folder, write_config_file, VERSION, VERSIONS_AVAILABLE, @@ -146,7 +144,7 @@ from .logging_util import ( def help(out_dir: Path=OUTPUT_DIR) -> None: """Print 
the ArchiveBox help message and usage""" - all_subcommands = list_subcommands() + all_subcommands = CLI_SUBCOMMANDS COMMANDS_HELP_TEXT = '\n '.join( f'{cmd.ljust(20)} {summary}' for cmd, summary in all_subcommands.items() @@ -281,7 +279,7 @@ def version(quiet: bool=False, print('{white}[i] Data locations:{reset} (not in a data directory)'.format(**ANSI)) print() - check_dependencies() + check_dependencies(CONFIG) @enforce_types @@ -469,7 +467,7 @@ def init(force: bool=False, quick: bool=False, setup: bool=False, out_dir: Path= def status(out_dir: Path=OUTPUT_DIR) -> None: """Print out some info and statistics about the archive collection""" - check_data_folder(out_dir=out_dir) + check_data_folder(CONFIG) from core.models import Snapshot from django.contrib.auth import get_user_model @@ -609,8 +607,8 @@ def add(urls: Union[str, List[str]], run_subcommand('init', stdin=None, pwd=out_dir) # Load list of links from the existing index - check_data_folder(out_dir=out_dir) - check_dependencies() + check_data_folder(CONFIG) + check_dependencies(CONFIG) new_links: List[Link] = [] all_links = load_main_index(out_dir=out_dir) @@ -705,7 +703,7 @@ def remove(filter_str: Optional[str]=None, out_dir: Path=OUTPUT_DIR) -> List[Link]: """Remove the specified URLs from the archive""" - check_data_folder(out_dir=out_dir) + check_data_folder(CONFIG) if snapshots is None: if filter_str and filter_patterns: @@ -792,8 +790,8 @@ def update(resume: Optional[float]=None, from core.models import ArchiveResult from .search import index_links - check_data_folder(out_dir=out_dir) - check_dependencies() + check_data_folder(CONFIG) + check_dependencies(CONFIG) new_links: List[Link] = [] # TODO: Remove input argument: only_new extractors = extractors.split(",") if extractors else [] @@ -863,7 +861,7 @@ def list_all(filter_patterns_str: Optional[str]=None, out_dir: Path=OUTPUT_DIR) -> Iterable[Link]: """List, filter, and export information about archive entries""" - check_data_folder(out_dir=out_dir) + check_data_folder(CONFIG) if filter_patterns and filter_patterns_str: stderr( @@ -911,7 +909,7 @@ def list_links(snapshots: Optional[QuerySet]=None, before: Optional[float]=None, out_dir: Path=OUTPUT_DIR) -> Iterable[Link]: - check_data_folder(out_dir=out_dir) + check_data_folder(CONFIG) if snapshots: all_snapshots = snapshots @@ -935,7 +933,7 @@ def list_folders(links: List[Link], status: str, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]: - check_data_folder(out_dir=out_dir) + check_data_folder(CONFIG) STATUS_FUNCTIONS = { "indexed": get_indexed_folders, @@ -1080,7 +1078,7 @@ def config(config_options_str: Optional[str]=None, out_dir: Path=OUTPUT_DIR) -> None: """Get and set your ArchiveBox project configuration values""" - check_data_folder(out_dir=out_dir) + check_data_folder(CONFIG) if config_options and config_options_str: stderr( @@ -1183,7 +1181,7 @@ def schedule(add: bool=False, out_dir: Path=OUTPUT_DIR): """Set ArchiveBox to regularly import URLs at specific times using cron""" - check_data_folder(out_dir=out_dir) + check_data_folder(CONFIG) Path(LOGS_DIR).mkdir(exist_ok=True) @@ -1324,7 +1322,7 @@ def server(runserver_args: Optional[List[str]]=None, config.SHOW_PROGRESS = False config.DEBUG = config.DEBUG or debug - check_data_folder(out_dir=out_dir) + check_data_folder(CONFIG) from django.core.management import call_command from django.contrib.auth.models import User @@ -1417,7 +1415,7 @@ def server(runserver_args: Optional[List[str]]=None, def manage(args: Optional[List[str]]=None, out_dir: 
Path=OUTPUT_DIR) -> None: """Run an ArchiveBox Django management command""" - check_data_folder(out_dir=out_dir) + check_data_folder(CONFIG) from django.core.management import execute_from_command_line if (args and "createsuperuser" in args) and (IN_DOCKER and not IS_TTY): @@ -1432,7 +1430,7 @@ def manage(args: Optional[List[str]]=None, out_dir: Path=OUTPUT_DIR) -> None: def shell(out_dir: Path=OUTPUT_DIR) -> None: """Enter an interactive ArchiveBox Django shell""" - check_data_folder(out_dir=out_dir) + check_data_folder(CONFIG) from django.core.management import call_command call_command("shell_plus") diff --git a/archivebox/manage.py b/archivebox/manage.py index 195a0ec1..37d436a9 100755 --- a/archivebox/manage.py +++ b/archivebox/manage.py @@ -7,7 +7,7 @@ if __name__ == '__main__': # versions of ./manage.py commands whenever possible. When that's not possible # (e.g. makemigrations), you can comment out this check temporarily - allowed_commands = ['makemigrations', 'migrate', 'startapp','squashmigrations', 'generate_stubs'] + allowed_commands = ['makemigrations', 'migrate', 'startapp','squashmigrations', 'generate_stubs', 'test'] if not any(cmd in sys.argv for cmd in allowed_commands): print("[X] Don't run ./manage.py directly (unless you are a developer running makemigrations):") diff --git a/archivebox/misc/__init__.py b/archivebox/misc/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/archivebox/misc/checks.py b/archivebox/misc/checks.py new file mode 100644 index 00000000..e0b7016a --- /dev/null +++ b/archivebox/misc/checks.py @@ -0,0 +1,159 @@ +__package__ = 'archivebox.misc' + +# TODO: migrate all of these to new plugantic/base_check.py Check system + +import sys +from benedict import benedict +from pathlib import Path + +from .logging import stderr, hint + + +def check_system_config(config: benedict) -> None: + ### Check system environment + if config['USER'] == 'root' or str(config['PUID']) == "0": + stderr('[!] 
ArchiveBox should never be run as root!', color='red') + stderr(' For more information, see the security overview documentation:') + stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#do-not-run-as-root') + + if config['IN_DOCKER']: + attempted_command = ' '.join(sys.argv[:3]) + stderr('') + stderr(' {lightred}Hint{reset}: When using Docker, you must run commands with {green}docker run{reset} instead of {lightyellow}docker exec{reset}, e.g.:'.format(**config['ANSI'])) + stderr(f' docker compose run archivebox {attempted_command}') + stderr(f' docker run -it -v $PWD/data:/data archivebox/archivebox {attempted_command}') + stderr(' or:') + stderr(f' docker compose exec --user=archivebox archivebox /bin/bash -c "archivebox {attempted_command}"') + stderr(f' docker exec -it --user=archivebox /bin/bash -c "archivebox {attempted_command}"') + + raise SystemExit(2) + + ### Check Python environment + if sys.version_info[:3] < (3, 7, 0): + stderr(f'[X] Python version is not new enough: {config["PYTHON_VERSION"]} (>3.6 is required)', color='red') + stderr(' See https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting#python for help upgrading your Python installation.') + raise SystemExit(2) + + if int(config['DJANGO_VERSION'].split('.')[0]) < 3: + stderr(f'[X] Django version is not new enough: {config["DJANGO_VERSION"]} (>3.0 is required)', color='red') + stderr(' Upgrade django using pip or your system package manager: pip3 install --upgrade django') + raise SystemExit(2) + + if config['PYTHON_ENCODING'] not in ('UTF-8', 'UTF8'): + stderr(f'[X] Your system is running python3 scripts with a bad locale setting: {config["PYTHON_ENCODING"]} (it should be UTF-8).', color='red') + stderr(' To fix it, add the line "export PYTHONIOENCODING=UTF-8" to your ~/.bashrc file (without quotes)') + stderr(' Or if you\'re using ubuntu/debian, run "dpkg-reconfigure locales"') + stderr('') + stderr(' Confirm that it\'s fixed by opening a new shell and running:') + stderr(' python3 -c "import sys; print(sys.stdout.encoding)" # should output UTF-8') + raise SystemExit(2) + + # stderr('[i] Using Chrome binary: {}'.format(shutil.which(CHROME_BINARY) or CHROME_BINARY)) + # stderr('[i] Using Chrome data dir: {}'.format(os.path.abspath(CHROME_USER_DATA_DIR))) + if config['CHROME_USER_DATA_DIR'] is not None and Path(config['CHROME_USER_DATA_DIR']).exists(): + if not (Path(config['CHROME_USER_DATA_DIR']) / 'Default').exists(): + stderr('[X] Could not find profile "Default" in CHROME_USER_DATA_DIR.', color='red') + stderr(f' {config["CHROME_USER_DATA_DIR"]}') + stderr(' Make sure you set it to a Chrome user data directory containing a Default profile folder.') + stderr(' For more info see:') + stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#CHROME_USER_DATA_DIR') + if '/Default' in str(config['CHROME_USER_DATA_DIR']): + stderr() + stderr(' Try removing /Default from the end e.g.:') + stderr(' CHROME_USER_DATA_DIR="{}"'.format(str(config['CHROME_USER_DATA_DIR']).split('/Default')[0])) + + # hard error is too annoying here, instead just set it to nothing + # raise SystemExit(2) + config['CHROME_USER_DATA_DIR'] = None + else: + config['CHROME_USER_DATA_DIR'] = None + + +def check_dependencies(config: benedict, show_help: bool=True) -> None: + invalid_dependencies = [ + (name, info) for name, info in config['DEPENDENCIES'].items() + if info['enabled'] and not info['is_valid'] + ] + if invalid_dependencies and show_help: + stderr(f'[!] 
Warning: Missing {len(invalid_dependencies)} recommended dependencies', color='lightyellow') + for dependency, info in invalid_dependencies: + stderr( + ' ! {}: {} ({})'.format( + dependency, + info['path'] or 'unable to find binary', + info['version'] or 'unable to detect version', + ) + ) + if dependency in ('YOUTUBEDL_BINARY', 'CHROME_BINARY', 'SINGLEFILE_BINARY', 'READABILITY_BINARY', 'MERCURY_BINARY'): + hint(('To install all packages automatically run: archivebox setup', + f'or to disable it and silence this warning: archivebox config --set SAVE_{dependency.rsplit("_", 1)[0]}=False', + ''), prefix=' ') + stderr('') + + if config['TIMEOUT'] < 5: + stderr(f'[!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={config["TIMEOUT"]} seconds)', color='red') + stderr(' You must allow *at least* 5 seconds for indexing and archive methods to run succesfully.') + stderr(' (Setting it to somewhere between 30 and 3000 seconds is recommended)') + stderr() + stderr(' If you want to make ArchiveBox run faster, disable specific archive methods instead:') + stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles') + stderr() + + elif config['USE_CHROME'] and config['TIMEOUT'] < 15: + stderr(f'[!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={config["TIMEOUT"]} seconds)', color='red') + stderr(' Chrome will fail to archive all sites if set to less than ~15 seconds.') + stderr(' (Setting it to somewhere between 30 and 300 seconds is recommended)') + stderr() + stderr(' If you want to make ArchiveBox run faster, disable specific archive methods instead:') + stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles') + stderr() + + if config['USE_YOUTUBEDL'] and config['MEDIA_TIMEOUT'] < 20: + stderr(f'[!] Warning: MEDIA_TIMEOUT is set too low! 
(currently set to MEDIA_TIMEOUT={config["MEDIA_TIMEOUT"]} seconds)', color='red') + stderr(' youtube-dl/yt-dlp will fail to archive any media if set to less than ~20 seconds.') + stderr(' (Setting it somewhere over 60 seconds is recommended)') + stderr() + stderr(' If you want to disable media archiving entirely, set SAVE_MEDIA=False instead:') + stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#save_media') + stderr() + + + + +def check_data_folder(config: benedict) -> None: + output_dir = config['OUTPUT_DIR'] + + archive_dir_exists = (Path(output_dir) / 'archive').exists() + if not archive_dir_exists: + stderr('[X] No archivebox index found in the current directory.', color='red') + stderr(f' {output_dir}', color='lightyellow') + stderr() + stderr(' {lightred}Hint{reset}: Are you running archivebox in the right folder?'.format(**config['ANSI'])) + stderr(' cd path/to/your/archive/folder') + stderr(' archivebox [command]') + stderr() + stderr(' {lightred}Hint{reset}: To create a new archive collection or import existing data in this folder, run:'.format(**config['ANSI'])) + stderr(' archivebox init') + raise SystemExit(2) + + +def check_migrations(config: benedict): + output_dir = config['OUTPUT_DIR'] + + from ..index.sql import list_migrations + + pending_migrations = [name for status, name in list_migrations() if not status] + + if pending_migrations: + stderr('[X] This collection was created with an older version of ArchiveBox and must be upgraded first.', color='lightyellow') + stderr(f' {output_dir}') + stderr() + stderr(f' To upgrade it to the latest version and apply the {len(pending_migrations)} pending migrations, run:') + stderr(' archivebox init') + raise SystemExit(3) + + (Path(output_dir) / config['SOURCES_DIR_NAME']).mkdir(exist_ok=True) + (Path(output_dir) / config['LOGS_DIR_NAME']).mkdir(exist_ok=True) + (Path(output_dir) / config['CACHE_DIR_NAME']).mkdir(exist_ok=True) + (Path(output_dir) / config['LIB_DIR_NAME'] / 'bin').mkdir(exist_ok=True, parents=True) + (Path(output_dir) / config['PERSONAS_DIR_NAME'] / 'Default').mkdir(exist_ok=True, parents=True) diff --git a/archivebox/misc/debugging.py b/archivebox/misc/debugging.py new file mode 100644 index 00000000..d92109bf --- /dev/null +++ b/archivebox/misc/debugging.py @@ -0,0 +1,30 @@ +from functools import wraps +from time import time + +def timed_function(func): + """ + Very simple profiling decorator for debugging. + Usage: + @timed_function + def my_func(): + ... 
+ + More advanced alternatives: + - viztracer ../.venv/bin/archivebox manage check # https://viztracer.readthedocs.io/en/latest/filter.html + - python -m cProfile -o archivebox.prof ../.venv/bin/archivebox manage check; snakeviz archivebox.prof + - Django Debug Toolbar + django-debug-toolbar-flamegraph + + Django Requests Tracker (requests-tracker) + """ + @wraps(func) + def wrap(*args, **kwargs): + if args and hasattr(args[0], '__module__'): + module = args[0].__module__ + else: + module = func.__module__ + ts_start = time() + result = func(*args, **kwargs) + ts_end = time() + ms_elapsed = int((ts_end-ts_start) * 1000) + print(f'[DEBUG][{ms_elapsed}ms] {module}.{func.__name__}(...)') + return result + return wrap diff --git a/archivebox/misc/logging.py b/archivebox/misc/logging.py new file mode 100644 index 00000000..aee43254 --- /dev/null +++ b/archivebox/misc/logging.py @@ -0,0 +1,77 @@ +__package__ = 'archivebox.misc' + +# TODO: merge/dedupe this file with archivebox/logging_util.py + +import os +import sys +from typing import Optional, Union, Tuple, List +from collections import defaultdict +from benedict import benedict +from rich.console import Console + +from ..config_stubs import ConfigDict + +SHOW_PROGRESS = None +if os.environ.get('SHOW_PROGRESS', 'None') in ('True', '1', 'true', 'yes'): + SHOW_PROGRESS = True + +CONSOLE = Console(force_interactive=SHOW_PROGRESS) +SHOW_PROGRESS = CONSOLE.is_interactive if SHOW_PROGRESS is None else SHOW_PROGRESS + +DEFAULT_CLI_COLORS = benedict( + { + "reset": "\033[00;00m", + "lightblue": "\033[01;30m", + "lightyellow": "\033[01;33m", + "lightred": "\033[01;35m", + "red": "\033[01;31m", + "green": "\033[01;32m", + "blue": "\033[01;34m", + "white": "\033[01;37m", + "black": "\033[01;30m", + } +) +ANSI = benedict({k: '' for k in DEFAULT_CLI_COLORS.keys()}) + +COLOR_DICT = defaultdict(lambda: [(0, 0, 0), (0, 0, 0)], { + '00': [(0, 0, 0), (0, 0, 0)], + '30': [(0, 0, 0), (0, 0, 0)], + '31': [(255, 0, 0), (128, 0, 0)], + '32': [(0, 200, 0), (0, 128, 0)], + '33': [(255, 255, 0), (128, 128, 0)], + '34': [(0, 0, 255), (0, 0, 128)], + '35': [(255, 0, 255), (128, 0, 128)], + '36': [(0, 255, 255), (0, 128, 128)], + '37': [(255, 255, 255), (255, 255, 255)], +}) + +# Logging Helpers +def stdout(*args, color: Optional[str]=None, prefix: str='', config: Optional[ConfigDict]=None) -> None: + ansi = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI + + if color: + strs = [ansi[color], ' '.join(str(a) for a in args), ansi['reset'], '\n'] + else: + strs = [' '.join(str(a) for a in args), '\n'] + + sys.stdout.write(prefix + ''.join(strs)) + +def stderr(*args, color: Optional[str]=None, prefix: str='', config: Optional[ConfigDict]=None) -> None: + ansi = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI + + if color: + strs = [ansi[color], ' '.join(str(a) for a in args), ansi['reset'], '\n'] + else: + strs = [' '.join(str(a) for a in args), '\n'] + + sys.stderr.write(prefix + ''.join(strs)) + +def hint(text: Union[Tuple[str, ...], List[str], str], prefix=' ', config: Optional[ConfigDict]=None) -> None: + ansi = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI + + if isinstance(text, str): + stderr('{}{lightred}Hint:{reset} {}'.format(prefix, text, **ansi)) + else: + stderr('{}{lightred}Hint:{reset} {}'.format(prefix, text[0], **ansi)) + for line in text[1:]: + stderr('{} {}'.format(prefix, line)) diff --git a/archivebox/monkey_patches.py b/archivebox/monkey_patches.py index 423df148..c68be8fd 100644 --- 
a/archivebox/monkey_patches.py +++ b/archivebox/monkey_patches.py @@ -10,7 +10,6 @@ import datetime from django.utils import timezone timezone.utc = datetime.timezone.utc - # monkey patch django-signals-webhooks to change how it shows up in Admin UI # from signal_webhooks.apps import DjangoSignalWebhooksConfig # DjangoSignalWebhooksConfig.verbose_name = 'API' diff --git a/archivebox/package-lock.json b/archivebox/package-lock.json index db0ac368..396e69c1 100644 --- a/archivebox/package-lock.json +++ b/archivebox/package-lock.json @@ -371,9 +371,9 @@ "license": "Apache-2.0" }, "node_modules/bare-events": { - "version": "2.4.2", - "resolved": "https://registry.npmjs.org/bare-events/-/bare-events-2.4.2.tgz", - "integrity": "sha512-qMKFd2qG/36aA4GwvKq8MxnPgCQAmBWmSyLWsJcbn8v03wvIPQ/hG1Ms8bPzndZxMDoHpxez5VOS+gC9Yi24/Q==", + "version": "2.5.0", + "resolved": "https://registry.npmjs.org/bare-events/-/bare-events-2.5.0.tgz", + "integrity": "sha512-/E8dDe9dsbLyh2qrZ64PEPadOQ0F4gbl1sUJOrmph7xOiIxfY8vwab/4bFLh4Y88/Hk/ujKcrQKc+ps0mv873A==", "license": "Apache-2.0", "optional": true }, diff --git a/archivebox/plugantic/base_binary.py b/archivebox/plugantic/base_binary.py index 4aa96ac2..cafccae8 100644 --- a/archivebox/plugantic/base_binary.py +++ b/archivebox/plugantic/base_binary.py @@ -3,6 +3,7 @@ __package__ = "archivebox.plugantic" from typing import Dict, List from typing_extensions import Self +from benedict import benedict from pydantic import Field, InstanceOf, validate_call from pydantic_pkgr import ( Binary, @@ -17,7 +18,6 @@ from pydantic_pkgr import ( from django.conf import settings from .base_hook import BaseHook, HookType -from ..config_stubs import AttrDict class BaseBinProvider(BaseHook, BinProvider): @@ -38,7 +38,7 @@ class BaseBinProvider(BaseHook, BinProvider): def register(self, settings, parent_plugin=None): # self._plugin = parent_plugin # for debugging only, never rely on this! - settings.BINPROVIDERS = getattr(settings, "BINPROVIDERS", None) or AttrDict({}) + settings.BINPROVIDERS = getattr(settings, "BINPROVIDERS", None) or benedict({}) settings.BINPROVIDERS[self.id] = self super().register(settings, parent_plugin=parent_plugin) @@ -58,7 +58,7 @@ class BaseBinary(BaseHook, Binary): def register(self, settings, parent_plugin=None): # self._plugin = parent_plugin # for debugging only, never rely on this! - settings.BINARIES = getattr(settings, "BINARIES", None) or AttrDict({}) + settings.BINARIES = getattr(settings, "BINARIES", None) or benedict({}) settings.BINARIES[self.id] = self super().register(settings, parent_plugin=parent_plugin) diff --git a/archivebox/plugantic/base_check.py b/archivebox/plugantic/base_check.py index e650df42..3f3deda4 100644 --- a/archivebox/plugantic/base_check.py +++ b/archivebox/plugantic/base_check.py @@ -28,7 +28,7 @@ class BaseCheck(BaseHook): def register(self, settings, parent_plugin=None): # self._plugin = parent_plugin # backref to parent is for debugging only, never rely on this! 
- self.register_with_django_check_system() # (SIDE EFFECT) + self.register_with_django_check_system(settings) # (SIDE EFFECT) # install hook into settings.CHECKS settings.CHECKS = getattr(settings, "CHECKS", None) or AttrDict({}) @@ -37,12 +37,9 @@ class BaseCheck(BaseHook): # record installed hook in settings.HOOKS super().register(settings, parent_plugin=parent_plugin) - def register_with_django_check_system(self): - + def register_with_django_check_system(self, settings): def run_check(app_configs, **kwargs) -> List[Warning]: - from django.conf import settings import logging - return self.check(settings, logging.getLogger("checks")) run_check.__name__ = self.id diff --git a/archivebox/plugantic/base_hook.py b/archivebox/plugantic/base_hook.py index 866b5119..a847ca1c 100644 --- a/archivebox/plugantic/base_hook.py +++ b/archivebox/plugantic/base_hook.py @@ -96,14 +96,13 @@ class BaseHook(BaseModel): # e.g. /admin/environment/config/LdapConfig/ return f"/admin/environment/{self.hook_type.lower()}/{self.id}/" - def register(self, settings, parent_plugin=None): """Load a record of an installed hook into global Django settings.HOOKS at runtime.""" self._plugin = parent_plugin # for debugging only, never rely on this! # assert json.dumps(self.model_json_schema(), indent=4), f"Hook {self.hook_module} has invalid JSON schema." - print(' -', self.hook_module, '.register()') + # print(' -', self.hook_module, '.register()') # record installed hook in settings.HOOKS settings.HOOKS[self.id] = self @@ -118,7 +117,7 @@ class BaseHook(BaseModel): def ready(self, settings): """Runs any runtime code needed when AppConfig.ready() is called (after all models are imported).""" - print(' -', self.hook_module, '.ready()') + # print(' -', self.hook_module, '.ready()') assert self.id in settings.HOOKS, f"Tried to ready hook {self.hook_module} but it is not registered in settings.HOOKS." diff --git a/archivebox/plugantic/base_plugin.py b/archivebox/plugantic/base_plugin.py index a890f961..24683fab 100644 --- a/archivebox/plugantic/base_plugin.py +++ b/archivebox/plugantic/base_plugin.py @@ -1,6 +1,5 @@ __package__ = 'archivebox.plugantic' -import json import inspect from pathlib import Path @@ -18,10 +17,11 @@ from pydantic import ( computed_field, validate_call, ) +from benedict import benedict from .base_hook import BaseHook, HookType -from ..config import AttrDict +from ..config import bump_startup_progress_bar class BasePlugin(BaseModel): @@ -90,7 +90,8 @@ class BasePlugin(BaseModel): assert self.app_label and self.app_label and self.verbose_name, f'{self.__class__.__name__} is missing .name or .app_label or .verbose_name' - assert json.dumps(self.model_json_schema(), indent=4), f"Plugin {self.plugin_module} has invalid JSON schema." + # assert json.dumps(self.model_json_schema(), indent=4), f"Plugin {self.plugin_module} has invalid JSON schema." 
+ return self @property @@ -114,13 +115,13 @@ class BasePlugin(BaseModel): @property def HOOKS_BY_ID(self) -> Dict[str, InstanceOf[BaseHook]]: - return AttrDict({hook.id: hook for hook in self.hooks}) + return benedict({hook.id: hook for hook in self.hooks}) @property def HOOKS_BY_TYPE(self) -> Dict[HookType, Dict[str, InstanceOf[BaseHook]]]: - hooks = AttrDict({}) + hooks = benedict({}) for hook in self.hooks: - hooks[hook.hook_type] = hooks.get(hook.hook_type) or AttrDict({}) + hooks[hook.hook_type] = hooks.get(hook.hook_type) or benedict({}) hooks[hook.hook_type][hook.id] = hook return hooks @@ -131,10 +132,10 @@ class BasePlugin(BaseModel): from django.conf import settings as django_settings settings = django_settings - print() - print(self.plugin_module_full, '.register()') + # print() + # print(self.plugin_module_full, '.register()') - assert json.dumps(self.model_json_schema(), indent=4), f'Plugin {self.plugin_module} has invalid JSON schema.' + # assert json.dumps(self.model_json_schema(), indent=4), f'Plugin {self.plugin_module} has invalid JSON schema.' assert self.id not in settings.PLUGINS, f'Tried to register plugin {self.plugin_module} but it conflicts with existing plugin of the same name ({self.app_label}).' @@ -149,6 +150,7 @@ class BasePlugin(BaseModel): settings.PLUGINS[self.id]._is_registered = True # print('√ REGISTERED PLUGIN:', self.plugin_module) + bump_startup_progress_bar() def ready(self, settings=None): """Runs any runtime code needed when AppConfig.ready() is called (after all models are imported).""" @@ -157,8 +159,8 @@ class BasePlugin(BaseModel): from django.conf import settings as django_settings settings = django_settings - print() - print(self.plugin_module_full, '.ready()') + # print() + # print(self.plugin_module_full, '.ready()') assert ( self.id in settings.PLUGINS and settings.PLUGINS[self.id]._is_registered @@ -171,6 +173,7 @@ class BasePlugin(BaseModel): hook.ready(settings) settings.PLUGINS[self.id]._is_ready = True + bump_startup_progress_bar() # @validate_call # def install_binaries(self) -> Self: diff --git a/archivebox/plugantic/ini_to_toml.py b/archivebox/plugantic/ini_to_toml.py index 415b99aa..48bd90c6 100644 --- a/archivebox/plugantic/ini_to_toml.py +++ b/archivebox/plugantic/ini_to_toml.py @@ -83,338 +83,3 @@ class JSONSchemaWithLambdas(GenerateJsonSchema): # for computed_field properties render them like this instead: # inspect.getsource(field.wrapped_property.fget).split('def ', 1)[-1].split('\n', 1)[-1].strip().strip('return '), - - -### Basic Assertions - -# test_input = """ -# [SERVER_CONFIG] -# IS_TTY=False -# USE_COLOR=False -# SHOW_PROGRESS=False -# IN_DOCKER=False -# IN_QEMU=False -# PUID=501 -# PGID=20 -# OUTPUT_DIR=/opt/archivebox/data -# CONFIG_FILE=/opt/archivebox/data/ArchiveBox.conf -# ONLY_NEW=True -# TIMEOUT=60 -# MEDIA_TIMEOUT=3600 -# OUTPUT_PERMISSIONS=644 -# RESTRICT_FILE_NAMES=windows -# URL_DENYLIST=\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$ -# URL_ALLOWLIST=None -# ADMIN_USERNAME=None -# ADMIN_PASSWORD=None -# ENFORCE_ATOMIC_WRITES=True -# TAG_SEPARATOR_PATTERN=[,] -# SECRET_KEY=xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx -# BIND_ADDR=127.0.0.1:8000 -# ALLOWED_HOSTS=* -# DEBUG=False -# PUBLIC_INDEX=True -# PUBLIC_SNAPSHOTS=True -# PUBLIC_ADD_VIEW=False -# FOOTER_INFO=Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests. 
-# SNAPSHOTS_PER_PAGE=40 -# CUSTOM_TEMPLATES_DIR=None -# TIME_ZONE=UTC -# TIMEZONE=UTC -# REVERSE_PROXY_USER_HEADER=Remote-User -# REVERSE_PROXY_WHITELIST= -# LOGOUT_REDIRECT_URL=/ -# PREVIEW_ORIGINALS=True -# LDAP=False -# LDAP_SERVER_URI=None -# LDAP_BIND_DN=None -# LDAP_BIND_PASSWORD=None -# LDAP_USER_BASE=None -# LDAP_USER_FILTER=None -# LDAP_USERNAME_ATTR=None -# LDAP_FIRSTNAME_ATTR=None -# LDAP_LASTNAME_ATTR=None -# LDAP_EMAIL_ATTR=None -# LDAP_CREATE_SUPERUSER=False -# SAVE_TITLE=True -# SAVE_FAVICON=True -# SAVE_WGET=True -# SAVE_WGET_REQUISITES=True -# SAVE_SINGLEFILE=True -# SAVE_READABILITY=True -# SAVE_MERCURY=True -# SAVE_HTMLTOTEXT=True -# SAVE_PDF=True -# SAVE_SCREENSHOT=True -# SAVE_DOM=True -# SAVE_HEADERS=True -# SAVE_WARC=True -# SAVE_GIT=True -# SAVE_MEDIA=True -# SAVE_ARCHIVE_DOT_ORG=True -# RESOLUTION=1440,2000 -# GIT_DOMAINS=github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht -# CHECK_SSL_VALIDITY=True -# MEDIA_MAX_SIZE=750m -# USER_AGENT=None -# CURL_USER_AGENT=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/0.8.0 (+https://github.com/ArchiveBox/ArchiveBox/) curl/curl 8.4.0 (x86_64-apple-darwin23.0) -# WGET_USER_AGENT=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/0.8.0 (+https://github.com/ArchiveBox/ArchiveBox/) wget/GNU Wget 1.24.5 -# CHROME_USER_AGENT=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/0.8.0 (+https://github.com/ArchiveBox/ArchiveBox/) -# COOKIES_FILE=None -# CHROME_USER_DATA_DIR=None -# CHROME_TIMEOUT=0 -# CHROME_HEADLESS=True -# CHROME_SANDBOX=True -# CHROME_EXTRA_ARGS=[] -# YOUTUBEDL_ARGS=['--restrict-filenames', '--trim-filenames', '128', '--write-description', '--write-info-json', '--write-annotations', '--write-thumbnail', '--no-call-home', '--write-sub', '--write-auto-subs', '--convert-subs=srt', '--yes-playlist', '--continue', '--no-abort-on-error', '--ignore-errors', '--geo-bypass', '--add-metadata', '--format=(bv*+ba/b)[filesize<=750m][filesize_approx<=?750m]/(bv*+ba/b)'] -# YOUTUBEDL_EXTRA_ARGS=[] -# WGET_ARGS=['--no-verbose', '--adjust-extension', '--convert-links', '--force-directories', '--backup-converted', '--span-hosts', '--no-parent', '-e', 'robots=off'] -# WGET_EXTRA_ARGS=[] -# CURL_ARGS=['--silent', '--location', '--compressed'] -# CURL_EXTRA_ARGS=[] -# GIT_ARGS=['--recursive'] -# SINGLEFILE_ARGS=[] -# SINGLEFILE_EXTRA_ARGS=[] -# MERCURY_ARGS=['--format=text'] -# MERCURY_EXTRA_ARGS=[] -# FAVICON_PROVIDER=https://www.google.com/s2/favicons?domain={} -# USE_INDEXING_BACKEND=True -# USE_SEARCHING_BACKEND=True -# SEARCH_BACKEND_ENGINE=ripgrep -# SEARCH_BACKEND_HOST_NAME=localhost -# SEARCH_BACKEND_PORT=1491 -# SEARCH_BACKEND_PASSWORD=SecretPassword -# SEARCH_PROCESS_HTML=True -# SONIC_COLLECTION=archivebox -# SONIC_BUCKET=snapshots -# SEARCH_BACKEND_TIMEOUT=90 -# FTS_SEPARATE_DATABASE=True -# FTS_TOKENIZERS=porter unicode61 remove_diacritics 2 -# FTS_SQLITE_MAX_LENGTH=1000000000 -# USE_CURL=True -# USE_WGET=True -# USE_SINGLEFILE=True -# USE_READABILITY=True -# USE_MERCURY=True -# USE_GIT=True -# USE_CHROME=True -# USE_NODE=True -# USE_YOUTUBEDL=True -# USE_RIPGREP=True -# CURL_BINARY=curl -# GIT_BINARY=git -# WGET_BINARY=wget -# SINGLEFILE_BINARY=single-file -# READABILITY_BINARY=readability-extractor -# MERCURY_BINARY=postlight-parser -# 
YOUTUBEDL_BINARY=yt-dlp -# NODE_BINARY=node -# RIPGREP_BINARY=rg -# CHROME_BINARY=chrome -# POCKET_CONSUMER_KEY=None -# USER=squash -# PACKAGE_DIR=/opt/archivebox/archivebox -# TEMPLATES_DIR=/opt/archivebox/archivebox/templates -# ARCHIVE_DIR=/opt/archivebox/data/archive -# SOURCES_DIR=/opt/archivebox/data/sources -# LOGS_DIR=/opt/archivebox/data/logs -# PERSONAS_DIR=/opt/archivebox/data/personas -# URL_DENYLIST_PTN=re.compile('\\.(css|js|otf|ttf|woff|woff2|gstatic\\.com|googleapis\\.com/css)(\\?.*)?$', re.IGNORECASE|re.MULTILINE) -# URL_ALLOWLIST_PTN=None -# DIR_OUTPUT_PERMISSIONS=755 -# ARCHIVEBOX_BINARY=/opt/archivebox/.venv/bin/archivebox -# VERSION=0.8.0 -# COMMIT_HASH=102e87578c6036bb0132dd1ebd17f8f05ffc880f -# BUILD_TIME=2024-05-15 03:28:05 1715768885 -# VERSIONS_AVAILABLE=None -# CAN_UPGRADE=False -# PYTHON_BINARY=/opt/archivebox/.venv/bin/python3.10 -# PYTHON_ENCODING=UTF-8 -# PYTHON_VERSION=3.10.14 -# DJANGO_BINARY=/opt/archivebox/.venv/lib/python3.10/site-packages/django/__init__.py -# DJANGO_VERSION=5.0.6 final (0) -# SQLITE_BINARY=/opt/homebrew/Cellar/python@3.10/3.10.14/Frameworks/Python.framework/Versions/3.10/lib/python3.10/sqlite3/dbapi2.py -# SQLITE_VERSION=2.6.0 -# CURL_VERSION=curl 8.4.0 (x86_64-apple-darwin23.0) -# WGET_VERSION=GNU Wget 1.24.5 -# WGET_AUTO_COMPRESSION=True -# RIPGREP_VERSION=ripgrep 14.1.0 -# SINGLEFILE_VERSION=None -# READABILITY_VERSION=None -# MERCURY_VERSION=None -# GIT_VERSION=git version 2.44.0 -# YOUTUBEDL_VERSION=2024.04.09 -# CHROME_VERSION=Google Chrome 124.0.6367.207 -# NODE_VERSION=v21.7.3 -# """ - - -# expected_output = TOML_HEADER + '''[SERVER_CONFIG] -# IS_TTY = false -# USE_COLOR = false -# SHOW_PROGRESS = false -# IN_DOCKER = false -# IN_QEMU = false -# PUID = 501 -# PGID = 20 -# OUTPUT_DIR = "/opt/archivebox/data" -# CONFIG_FILE = "/opt/archivebox/data/ArchiveBox.conf" -# ONLY_NEW = true -# TIMEOUT = 60 -# MEDIA_TIMEOUT = 3600 -# OUTPUT_PERMISSIONS = 644 -# RESTRICT_FILE_NAMES = "windows" -# URL_DENYLIST = "\\\\.(css|js|otf|ttf|woff|woff2|gstatic\\\\.com|googleapis\\\\.com/css)(\\\\?.*)?$" -# URL_ALLOWLIST = null -# ADMIN_USERNAME = null -# ADMIN_PASSWORD = null -# ENFORCE_ATOMIC_WRITES = true -# TAG_SEPARATOR_PATTERN = "[,]" -# SECRET_KEY = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" -# BIND_ADDR = "127.0.0.1:8000" -# ALLOWED_HOSTS = "*" -# DEBUG = false -# PUBLIC_INDEX = true -# PUBLIC_SNAPSHOTS = true -# PUBLIC_ADD_VIEW = false -# FOOTER_INFO = "Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests." 
-# SNAPSHOTS_PER_PAGE = 40 -# CUSTOM_TEMPLATES_DIR = null -# TIME_ZONE = "UTC" -# TIMEZONE = "UTC" -# REVERSE_PROXY_USER_HEADER = "Remote-User" -# REVERSE_PROXY_WHITELIST = "" -# LOGOUT_REDIRECT_URL = "/" -# PREVIEW_ORIGINALS = true -# LDAP = false -# LDAP_SERVER_URI = null -# LDAP_BIND_DN = null -# LDAP_BIND_PASSWORD = null -# LDAP_USER_BASE = null -# LDAP_USER_FILTER = null -# LDAP_USERNAME_ATTR = null -# LDAP_FIRSTNAME_ATTR = null -# LDAP_LASTNAME_ATTR = null -# LDAP_EMAIL_ATTR = null -# LDAP_CREATE_SUPERUSER = false -# SAVE_TITLE = true -# SAVE_FAVICON = true -# SAVE_WGET = true -# SAVE_WGET_REQUISITES = true -# SAVE_SINGLEFILE = true -# SAVE_READABILITY = true -# SAVE_MERCURY = true -# SAVE_HTMLTOTEXT = true -# SAVE_PDF = true -# SAVE_SCREENSHOT = true -# SAVE_DOM = true -# SAVE_HEADERS = true -# SAVE_WARC = true -# SAVE_GIT = true -# SAVE_MEDIA = true -# SAVE_ARCHIVE_DOT_ORG = true -# RESOLUTION = [1440, 2000] -# GIT_DOMAINS = "github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht" -# CHECK_SSL_VALIDITY = true -# MEDIA_MAX_SIZE = "750m" -# USER_AGENT = null -# CURL_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/0.8.0 (+https://github.com/ArchiveBox/ArchiveBox/) curl/curl 8.4.0 (x86_64-apple-darwin23.0)" -# WGET_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/0.8.0 (+https://github.com/ArchiveBox/ArchiveBox/) wget/GNU Wget 1.24.5" -# CHROME_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/0.8.0 (+https://github.com/ArchiveBox/ArchiveBox/)" -# COOKIES_FILE = null -# CHROME_USER_DATA_DIR = null -# CHROME_TIMEOUT = false -# CHROME_HEADLESS = true -# CHROME_SANDBOX = true -# CHROME_EXTRA_ARGS = [] -# YOUTUBEDL_ARGS = ["--restrict-filenames", "--trim-filenames", "128", "--write-description", "--write-info-json", "--write-annotations", "--write-thumbnail", "--no-call-home", "--write-sub", "--write-auto-subs", "--convert-subs=srt", "--yes-playlist", "--continue", "--no-abort-on-error", "--ignore-errors", "--geo-bypass", "--add-metadata", "--format=(bv*+ba/b)[filesize<=750m][filesize_approx<=?750m]/(bv*+ba/b)"] -# YOUTUBEDL_EXTRA_ARGS = [] -# WGET_ARGS = ["--no-verbose", "--adjust-extension", "--convert-links", "--force-directories", "--backup-converted", "--span-hosts", "--no-parent", "-e", "robots=off"] -# WGET_EXTRA_ARGS = [] -# CURL_ARGS = ["--silent", "--location", "--compressed"] -# CURL_EXTRA_ARGS = [] -# GIT_ARGS = ["--recursive"] -# SINGLEFILE_ARGS = [] -# SINGLEFILE_EXTRA_ARGS = [] -# MERCURY_ARGS = ["--format=text"] -# MERCURY_EXTRA_ARGS = [] -# FAVICON_PROVIDER = "https://www.google.com/s2/favicons?domain={}" -# USE_INDEXING_BACKEND = true -# USE_SEARCHING_BACKEND = true -# SEARCH_BACKEND_ENGINE = "ripgrep" -# SEARCH_BACKEND_HOST_NAME = "localhost" -# SEARCH_BACKEND_PORT = 1491 -# SEARCH_BACKEND_PASSWORD = "SecretPassword" -# SEARCH_PROCESS_HTML = true -# SONIC_COLLECTION = "archivebox" -# SONIC_BUCKET = "snapshots" -# SEARCH_BACKEND_TIMEOUT = 90 -# FTS_SEPARATE_DATABASE = true -# FTS_TOKENIZERS = "porter unicode61 remove_diacritics 2" -# FTS_SQLITE_MAX_LENGTH = 1000000000 -# USE_CURL = true -# USE_WGET = true -# USE_SINGLEFILE = true -# USE_READABILITY = true -# USE_MERCURY = true -# USE_GIT = true -# USE_CHROME = true -# USE_NODE = true -# USE_YOUTUBEDL = true -# 
USE_RIPGREP = true -# CURL_BINARY = "curl" -# GIT_BINARY = "git" -# WGET_BINARY = "wget" -# SINGLEFILE_BINARY = "single-file" -# READABILITY_BINARY = "readability-extractor" -# MERCURY_BINARY = "postlight-parser" -# YOUTUBEDL_BINARY = "yt-dlp" -# NODE_BINARY = "node" -# RIPGREP_BINARY = "rg" -# CHROME_BINARY = "chrome" -# POCKET_CONSUMER_KEY = null -# USER = "squash" -# PACKAGE_DIR = "/opt/archivebox/archivebox" -# TEMPLATES_DIR = "/opt/archivebox/archivebox/templates" -# ARCHIVE_DIR = "/opt/archivebox/data/archive" -# SOURCES_DIR = "/opt/archivebox/data/sources" -# LOGS_DIR = "/opt/archivebox/data/logs" -# PERSONAS_DIR = "/opt/archivebox/data/personas" -# URL_DENYLIST_PTN = "re.compile(\'\\\\.(css|js|otf|ttf|woff|woff2|gstatic\\\\.com|googleapis\\\\.com/css)(\\\\?.*)?$\', re.IGNORECASE|re.MULTILINE)" -# URL_ALLOWLIST_PTN = null -# DIR_OUTPUT_PERMISSIONS = 755 -# ARCHIVEBOX_BINARY = "/opt/archivebox/.venv/bin/archivebox" -# VERSION = "0.8.0" -# COMMIT_HASH = "102e87578c6036bb0132dd1ebd17f8f05ffc880f" -# BUILD_TIME = "2024-05-15 03:28:05 1715768885" -# VERSIONS_AVAILABLE = null -# CAN_UPGRADE = false -# PYTHON_BINARY = "/opt/archivebox/.venv/bin/python3.10" -# PYTHON_ENCODING = "UTF-8" -# PYTHON_VERSION = "3.10.14" -# DJANGO_BINARY = "/opt/archivebox/.venv/lib/python3.10/site-packages/django/__init__.py" -# DJANGO_VERSION = "5.0.6 final (0)" -# SQLITE_BINARY = "/opt/homebrew/Cellar/python@3.10/3.10.14/Frameworks/Python.framework/Versions/3.10/lib/python3.10/sqlite3/dbapi2.py" -# SQLITE_VERSION = "2.6.0" -# CURL_VERSION = "curl 8.4.0 (x86_64-apple-darwin23.0)" -# WGET_VERSION = "GNU Wget 1.24.5" -# WGET_AUTO_COMPRESSION = true -# RIPGREP_VERSION = "ripgrep 14.1.0" -# SINGLEFILE_VERSION = null -# READABILITY_VERSION = null -# MERCURY_VERSION = null -# GIT_VERSION = "git version 2.44.0" -# YOUTUBEDL_VERSION = "2024.04.09" -# CHROME_VERSION = "Google Chrome 124.0.6367.207" -# NODE_VERSION = "v21.7.3"''' - - -# first_output = convert(test_input) # make sure ini -> toml parses correctly -# second_output = convert(first_output) # make sure toml -> toml parses/dumps consistently -# assert first_output == second_output == expected_output # make sure parsing is indempotent - -# # DEBUGGING -# import sys -# import difflib -# sys.stdout.writelines(difflib.context_diff(first_output, second_output, fromfile='first', tofile='second')) -# print(repr(second_output)) diff --git a/archivebox/plugins_extractor/chrome/apps.py b/archivebox/plugins_extractor/chrome/apps.py index 5c9a71b5..6f2aa94a 100644 --- a/archivebox/plugins_extractor/chrome/apps.py +++ b/archivebox/plugins_extractor/chrome/apps.py @@ -1,3 +1,5 @@ +__package__ = 'archivebox.plugins_extractor.chrome' + import platform from pathlib import Path from typing import List, Optional, Dict, ClassVar @@ -77,40 +79,16 @@ def create_macos_app_symlink(target: Path, shortcut: Path): ###################### Config ########################## -class ChromeDependencyConfigs(BaseConfigSet): +class ChromeConfig(BaseConfigSet): section: ClassVar[ConfigSectionName] = "DEPENDENCY_CONFIG" - CHROME_BINARY: str = Field(default='chrome') - CHROME_ARGS: Optional[List[str]] = Field(default=None) - CHROME_EXTRA_ARGS: List[str] = [] - CHROME_DEFAULT_ARGS: List[str] = ['--timeout={TIMEOUT-10}'] - - # def load(self) -> Self: - # # for each field in the model, load its value - # # load from each source in order of precedence (lowest to highest): - # # - schema default - # # - ArchiveBox.conf INI file - # # - environment variables - # # - command-line arguments - - # 
LOADED_VALUES: Dict[str, Any] = {}
+    CHROME_BINARY: str = Field(default='chrome')
+    CHROME_ARGS: List[str] | None = Field(default=None)
+    CHROME_EXTRA_ARGS: List[str] = Field(default=[])
+    CHROME_DEFAULT_ARGS: List[str] = Field(default_factory=lambda: ['--timeout={TIMEOUT-10}'])
 
-    #     for field_name, field in self.__fields__.items():
-    #         def_value = field.default_factory() if field.default_factory else field.default
-    #         ini_value = settings.INI_CONFIG.get_value(field_name)
-    #         env_value = settings.ENV_CONFIG.get_value(field_name)
-    #         cli_value = settings.CLI_CONFIG.get_value(field_name)
-    #         run_value = settings.RUN_CONFIG.get_value(field_name)
-    #         value = run_value or cli_value or env_value or ini_value or def_value
 
-class ChromeConfigs(ChromeDependencyConfigs):
-    # section: ConfigSectionName = 'ALL_CONFIGS'
-    pass
-
-DEFAULT_GLOBAL_CONFIG = {
-}
-
-CHROME_CONFIG = ChromeConfigs(**DEFAULT_GLOBAL_CONFIG)
+CHROME_CONFIG = ChromeConfig()
 
 
 class ChromeBinary(BaseBinary):
@@ -133,6 +111,7 @@ class ChromeBinary(BaseBinary):
     def symlink_to_lib(binary, bin_dir=settings.CONFIG.BIN_DIR) -> None:
         if not (binary.abspath and binary.abspath.exists()):
             return
+        bin_dir.mkdir(parents=True, exist_ok=True)
 
         symlink = bin_dir / binary.name
 
@@ -146,7 +125,6 @@ class ChromeBinary(BaseBinary):
 
 
 CHROME_BINARY = ChromeBinary()
-PLUGIN_BINARIES = [CHROME_BINARY]
 
 class ChromePlugin(BasePlugin):
     app_label: str = 'chrome'
diff --git a/archivebox/plugins_pkg/pip/apps.py b/archivebox/plugins_pkg/pip/apps.py
index 5ea84c3a..f11c8645 100644
--- a/archivebox/plugins_pkg/pip/apps.py
+++ b/archivebox/plugins_pkg/pip/apps.py
@@ -149,6 +149,7 @@ class CheckUserIsNotRoot(BaseCheck):
             )
         logger.debug('[√] UID is not root')
         return errors
+
 
 class CheckPipEnvironment(BaseCheck):
     label: str = "CheckPipEnvironment"
diff --git a/archivebox/search/__init__.py b/archivebox/search/__init__.py
index 2419e3fe..befbd675 100644
--- a/archivebox/search/__init__.py
+++ b/archivebox/search/__init__.py
@@ -14,7 +14,7 @@ from .utils import get_indexable_content, log_index_started
 
 
 def import_backend():
-    for backend in settings.SEARCH_BACKENDS:
+    for backend in settings.SEARCH_BACKENDS.values():
         if backend.name == settings.CONFIGS.SearchBackendConfig.SEARCH_BACKEND_ENGINE:
             return backend
     raise Exception(f'Could not load {settings.CONFIGS.SearchBackendConfig.SEARCH_BACKEND_ENGINE} as search backend')
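
For reference, a minimal sketch of why the import_backend() hunk above switches to iterating settings.SEARCH_BACKENDS.values(): assuming SEARCH_BACKENDS is a dict-like registry mapping hook ids to backend objects, iterating the mapping itself yields only the id strings, so backend.name would raise AttributeError. The FakeBackend class and sample data below are hypothetical stand-ins for illustration, not ArchiveBox's real classes.

# A minimal sketch, assuming SEARCH_BACKENDS behaves like a plain dict of
# {hook_id: backend_instance}. FakeBackend is a made-up stand-in, not a real ArchiveBox class.
from dataclasses import dataclass

@dataclass
class FakeBackend:
    name: str

SEARCH_BACKENDS = {'ripgrep_search_backend': FakeBackend(name='ripgrep')}

def import_backend(engine: str = 'ripgrep') -> FakeBackend:
    # .values() yields the backend objects; iterating the dict directly would
    # yield only the key strings, which have no .name attribute.
    for backend in SEARCH_BACKENDS.values():
        if backend.name == engine:
            return backend
    raise Exception(f'Could not load {engine} as search backend')

assert import_backend('ripgrep').name == 'ripgrep'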