diff --git a/archivebox/abx/archivebox/base_binary.py b/archivebox/abx/archivebox/base_binary.py
index 2c9a8116..afa4f192 100644
--- a/archivebox/abx/archivebox/base_binary.py
+++ b/archivebox/abx/archivebox/base_binary.py
@@ -37,7 +37,8 @@ class BaseBinary(Binary):
 
     @staticmethod
     def symlink_to_lib(binary, bin_dir=None) -> None:
-        bin_dir = bin_dir or CONSTANTS.LIB_BIN_DIR
+        from archivebox.config.common import STORAGE_CONFIG
+        bin_dir = bin_dir or STORAGE_CONFIG.LIB_DIR / 'bin'
 
         if not (binary.abspath and os.access(binary.abspath, os.R_OK)):
             return
@@ -55,9 +56,10 @@ class BaseBinary(Binary):
 
     @validate_call
     def load(self, fresh=False, **kwargs) -> Self:
+        from archivebox.config.common import STORAGE_CONFIG
         if fresh:
             binary = super().load(**kwargs)
-            self.symlink_to_lib(binary=binary, bin_dir=CONSTANTS.LIB_BIN_DIR)
+            self.symlink_to_lib(binary=binary, bin_dir=STORAGE_CONFIG.LIB_DIR / 'bin')
         else:
             # get cached binary from db
             try:
@@ -72,16 +74,18 @@ class BaseBinary(Binary):
 
     @validate_call
     def install(self, **kwargs) -> Self:
+        from archivebox.config.common import STORAGE_CONFIG
         binary = super().install(**kwargs)
-        self.symlink_to_lib(binary=binary, bin_dir=CONSTANTS.LIB_BIN_DIR)
+        self.symlink_to_lib(binary=binary, bin_dir=STORAGE_CONFIG.LIB_DIR / 'bin')
         return binary
 
     @validate_call
     def load_or_install(self, fresh=False, **kwargs) -> Self:
+        from archivebox.config.common import STORAGE_CONFIG
         try:
             binary = self.load(fresh=fresh)
             if binary and binary.version:
-                self.symlink_to_lib(binary=binary, bin_dir=CONSTANTS.LIB_BIN_DIR)
+                self.symlink_to_lib(binary=binary, bin_dir=STORAGE_CONFIG.LIB_DIR / 'bin')
                 return binary
         except Exception:
             pass
diff --git a/archivebox/abx/archivebox/base_configset.py b/archivebox/abx/archivebox/base_configset.py
index 5e2871d4..700d7caa 100644
--- a/archivebox/abx/archivebox/base_configset.py
+++ b/archivebox/abx/archivebox/base_configset.py
@@ -1,8 +1,13 @@
 __package__ = 'abx.archivebox'
 
 import os
+import sys
+import re
 from pathlib import Path
-from typing import Type, Tuple, Callable, ClassVar
+from typing import Type, Tuple, Callable, ClassVar, Dict, Any
+
+import toml
+from rich import print
 
 from benedict import benedict
 from pydantic import model_validator, TypeAdapter
@@ -18,6 +23,11 @@ from . import toml_util
 PACKAGE_DIR = Path(__file__).resolve().parent.parent
 DATA_DIR = Path(os.getcwd()).resolve()
 
+ARCHIVEBOX_CONFIG_FILE = DATA_DIR / "ArchiveBox.conf"
+ARCHIVEBOX_CONFIG_FILE_BAK = ARCHIVEBOX_CONFIG_FILE.parent / ".ArchiveBox.conf.bak"
+
+AUTOFIXES_HEADER = "[AUTOFIXES]"
+AUTOFIXES_SUBHEADER = "# The following config was added automatically to fix problems detected at startup:"
 
 class FlatTomlConfigSettingsSource(TomlConfigSettingsSource):
@@ -53,7 +63,7 @@ class FlatTomlConfigSettingsSource(TomlConfigSettingsSource):
         super(TomlConfigSettingsSource, self).__init__(settings_cls, self.toml_data)
 
 
-class ArchiveBoxBaseConfig(BaseSettings):
+class BaseConfigSet(BaseSettings):
     """
     This is the base class for an ArchiveBox ConfigSet.
     It handles loading values from schema defaults, ArchiveBox.conf TOML config, and environment variables.
@@ -83,7 +93,7 @@ class ArchiveBoxBaseConfig(BaseSettings):
         loc_by_alias=False,
         validate_assignment=True,
         validate_return=True,
-        revalidate_instances="always",
+        revalidate_instances="subclass-instances",
     )
 
     load_from_defaults: ClassVar[bool] = True
@@ -101,9 +111,6 @@ class ArchiveBoxBaseConfig(BaseSettings):
     ) -> Tuple[PydanticBaseSettingsSource, ...]:
         """Defines the config precedence order: Schema defaults -> ArchiveBox.conf (TOML) -> Environment variables"""
 
-        ARCHIVEBOX_CONFIG_FILE = DATA_DIR / "ArchiveBox.conf"
-        ARCHIVEBOX_CONFIG_FILE_BAK = ARCHIVEBOX_CONFIG_FILE.parent / ".ArchiveBox.conf.bak"
-
         # import ipdb; ipdb.set_trace()
 
         precedence_order = {}
@@ -152,27 +159,36 @@ class ArchiveBoxBaseConfig(BaseSettings):
     def fill_defaults(self):
         """Populate any unset values using function provided as their default"""
 
-        for key, field in self.model_fields.items():
-            value = getattr(self, key)
-
-            if isinstance(value, Callable):
-                # if value is a function, execute it to get the actual value, passing existing config as a dict arg if expected
-                if func_takes_args_or_kwargs(value):
-                    # assemble dict of existing field values to pass to default factory functions
-                    config_so_far = benedict(self.model_dump(include=set(self.model_fields.keys()), warnings=False))
-                    computed_default = field.default(config_so_far)
-                else:
-                    # otherwise it's a pure function with no args, just call it
-                    computed_default = field.default()
-
-                # coerce/check to make sure default factory return value matches type annotation
-                TypeAdapter(field.annotation).validate_python(computed_default)
-
-                # set generated default value as final validated value
-                setattr(self, key, computed_default)
+        for key in self.model_fields.keys():
+            if isinstance(getattr(self, key), Callable):
+                if self.load_from_defaults:
+                    computed_default = self.get_default_value(key)
+                    # set generated default value as final validated value
+                    setattr(self, key, computed_default)
         return self
 
-    def update_in_place(self, warn=True, **kwargs):
+    def get_default_value(self, key):
+        """Get the default value for a given config key"""
+        field = self.model_fields[key]
+        value = getattr(self, key)
+
+        if isinstance(value, Callable):
+            # if value is a function, execute it to get the actual value, passing existing config as a dict arg if expected
+            if func_takes_args_or_kwargs(value):
+                # assemble dict of existing field values to pass to default factory functions
+                config_so_far = benedict(self.model_dump(include=set(self.model_fields.keys()), warnings=False))
+                computed_default = field.default(config_so_far)
+            else:
+                # otherwise it's a pure function with no args, just call it
+                computed_default = field.default()
+
+            # coerce/check to make sure default factory return value matches type annotation
+            TypeAdapter(field.annotation).validate_python(computed_default)
+
+            return computed_default
+        return value
+
+    def update_in_place(self, warn=True, persist=False, hint='', **kwargs):
         """
         Update the config with new values. Use this sparingly! We should almost never be updating config at runtime.
         Sets them in the environment so they propagate to spawned subprocesses / across future re-__init__()s and reload from environment
 
         Example acceptable use case: user config says SEARCH_BACKEND_ENGINE=sonic but sonic_client pip library is not installed so we cannot use it.
         SEARCH_BACKEND_CONFIG.update_in_place(SEARCH_BACKEND_ENGINE='ripgrep') can be used to reset it back to ripgrep so we can continue.
         """
+        from archivebox.misc.toml_util import CustomTOMLEncoder
+
         if warn:
-            print('[!] WARNING: Some of the provided user config values cannot be used, temporarily ignoring them:')
+            fix_scope = 'in ArchiveBox.conf' if persist else 'just for current run'
+            print(f'[yellow]:warning: WARNING: Some config cannot be used as-is, fixing automatically {fix_scope}:[/yellow] {hint}', file=sys.stderr)
+
+        # set the new values in the environment
         for key, value in kwargs.items():
             os.environ[key] = str(value)
             original_value = getattr(self, key)
             if warn:
                 print(f'    {key}={original_value} -> {value}')
+
+        # if persist=True, write config changes to data/ArchiveBox.conf [AUTOFIXES] section
+        try:
+            if persist and ARCHIVEBOX_CONFIG_FILE.is_file():
+                autofixes_to_add = benedict(kwargs).to_toml(encoder=CustomTOMLEncoder())
+
+                existing_config = ARCHIVEBOX_CONFIG_FILE.read_text().split(AUTOFIXES_HEADER, 1)[0].strip()
+                if AUTOFIXES_HEADER in existing_config:
+                    existing_autofixes = existing_config.split(AUTOFIXES_HEADER, 1)[-1].strip().replace(AUTOFIXES_SUBHEADER, '').replace(AUTOFIXES_HEADER, '').strip()
+                else:
+                    existing_autofixes = ''
+
+                new_config = '\n'.join(line for line in [
+                    existing_config,
+                    '\n' + AUTOFIXES_HEADER,
+                    AUTOFIXES_SUBHEADER,
+                    existing_autofixes,
+                    autofixes_to_add,
+                ] if line.strip()).strip() + '\n'
+                ARCHIVEBOX_CONFIG_FILE.write_text(new_config)
+        except Exception:
+            pass
 
         self.__init__()
+        if warn:
+            print(file=sys.stderr)
+
         return self
 
-    def as_legacy_config_schema(self):
+    @property
+    def toml_section_header(self):
+        """Convert the class name to a TOML section header e.g. ShellConfig -> SHELL_CONFIG"""
+        class_name = self.__class__.__name__
+        return re.sub('([A-Z]+)', r'_\1', class_name).upper().strip('_')
+
+
+    def from_defaults(self) -> Dict[str, Any]:
+        """Get the dictionary of {key: value} config loaded from the default values"""
+        class OnlyDefaultsConfig(self.__class__):
+            load_from_defaults = True
+            load_from_configfile = False
+            load_from_environment = False
+        return benedict(OnlyDefaultsConfig().model_dump(exclude_unset=False, exclude_defaults=False, exclude=set(self.model_computed_fields.keys())))
+
+    def from_configfile(self) -> Dict[str, Any]:
+        """Get the dictionary of {key: value} config loaded from the configfile ArchiveBox.conf"""
+        class OnlyConfigFileConfig(self.__class__):
+            load_from_defaults = False
+            load_from_configfile = True
+            load_from_environment = False
+        return benedict(OnlyConfigFileConfig().model_dump(exclude_unset=True, exclude_defaults=False, exclude=set(self.model_computed_fields.keys())))
+
+    def from_environment(self) -> Dict[str, Any]:
+        """Get the dictionary of {key: value} config loaded from the environment variables"""
+        class OnlyEnvironmentConfig(self.__class__):
+            load_from_defaults = False
+            load_from_configfile = False
+            load_from_environment = True
+        return benedict(OnlyEnvironmentConfig().model_dump(exclude_unset=True, exclude_defaults=False, exclude=set(self.model_computed_fields.keys())))
+
+    def from_computed(self) -> Dict[str, Any]:
+        """Get the dictionary of {key: value} config loaded from the computed fields"""
+        return benedict(self.model_dump(include=set(self.model_computed_fields.keys())))
+
+
+    def to_toml_dict(self, defaults=False) -> Dict[str, Any]:
+        """Get the current config as a TOML-ready dict"""
+        config_dict = {}
+        for key, value in benedict(self).items():
+            if defaults or value != self.get_default_value(key):
+                config_dict[key] = value
+
+        return benedict({self.toml_section_header: config_dict})
+
+    def to_toml_str(self, defaults=False) -> str:
+        """Get the current config as a TOML string"""
+        from archivebox.misc.toml_util import CustomTOMLEncoder
+
+        toml_dict = self.to_toml_dict(defaults=defaults)
+        if not toml_dict[self.toml_section_header]:
+            # if the section is empty, don't write it
+            toml_dict.pop(self.toml_section_header)
+
+        return toml.dumps(toml_dict, encoder=CustomTOMLEncoder())
+
+    def as_legacy_config_schema(self) -> Dict[str, Any]:
         # shim for backwards compatibility with old config schema style
         model_values = self.model_dump()
         return benedict({
             key: {'type': field.annotation, 'default': model_values[key]}
             for key, field in self.model_fields.items()
         })
-
-
-class BaseConfigSet(ArchiveBoxBaseConfig):    # type: ignore[type-arg]
-
-    pass
diff --git a/archivebox/abx/archivebox/hookspec.py b/archivebox/abx/archivebox/hookspec.py
index 54bf1113..bfcb93b8 100644
--- a/archivebox/abx/archivebox/hookspec.py
+++ b/archivebox/abx/archivebox/hookspec.py
@@ -18,13 +18,7 @@ def get_PLUGIN() -> Dict[str, Dict[str, Any]]:
 def get_CONFIG() -> Dict[str, BaseConfigSet]:
     return {}
 
-@hookspec
-def get_BINARIES() -> Dict[str, BaseBinary]:
-    return {}
 
-@hookspec
-def get_BINPROVIDERS() -> Dict[str, BaseBinProvider]:
-    return {}
 
 @hookspec
 def get_EXTRACTORS() -> Dict[str, BaseExtractor]:
@@ -45,3 +39,14 @@ def get_SEARCHBACKENDS() -> Dict[str, BaseSearchBackend]:
 # @hookspec
 # def get_QUEUES():
 #     return {}
+
+
+##############################################################
+# provided by abx.pydantic_pkgr.hookspec:
+# @hookspec
+# def get_BINARIES() -> Dict[str, BaseBinary]:
+#     return {}
+
+# @hookspec
+# def get_BINPROVIDERS() -> Dict[str, BaseBinProvider]:
+#     return {}
diff --git a/archivebox/abx/archivebox/reads.py b/archivebox/abx/archivebox/reads.py
index 5653a7fd..4b12b560 100644
--- a/archivebox/abx/archivebox/reads.py
+++ b/archivebox/abx/archivebox/reads.py
@@ -131,9 +131,12 @@ def get_SEARCHBACKENDS() -> Dict[str, 'BaseSearchBackend']:
 
 
 
-def get_scope_config(defaults=settings.CONFIG, persona=None, seed=None, crawl=None, snapshot=None, archiveresult=None, extra_config=None):
+def get_scope_config(defaults: benedict | None = None, persona=None, seed=None, crawl=None, snapshot=None, archiveresult=None, extra_config=None):
     """Get all the relevant config for the given scope, in correct precedence order"""
 
+    from django.conf import settings
+    default_config: benedict = defaults or settings.CONFIG
+
     snapshot = snapshot or (archiveresult and archiveresult.snapshot)
     crawl = crawl or (snapshot and snapshot.crawl)
     seed = seed or (crawl and crawl.seed)
@@ -147,7 +150,7 @@ def get_scope_config(defaults=settings.CONFIG, persona=None, seed=None, crawl=No
     extra_config = extra_config or {}
 
     return {
-        **defaults,                # defaults / config file / environment variables
+        **default_config,          # defaults / config file / environment variables
         **persona_config,          # lowest precedence
         **seed_config,
         **crawl_config,
diff --git a/archivebox/cli/__init__.py b/archivebox/cli/__init__.py
index ab532a04..57750918 100644
--- a/archivebox/cli/__init__.py
+++ b/archivebox/cli/__init__.py
@@ -164,13 +164,18 @@ def run_subcommand(subcommand: str,
             # print('DATA_DIR is', DATA_DIR)
             # print('pwd is', os.getcwd())
 
-    cmd_requires_db = subcommand in archive_cmds
+    cmd_requires_db = (subcommand in archive_cmds)
     init_pending = '--init' in subcommand_args or '--quick-init' in subcommand_args
 
     check_db = cmd_requires_db and not init_pending
 
    setup_django(in_memory_db=subcommand in fake_db, check_db=check_db)
 
+    for ignore_pattern in ('help', '-h', '--help', 'version', '--version'):
+        if ignore_pattern in sys.argv[:4]:
+            cmd_requires_db = False
+            break
+
     if subcommand in archive_cmds:
         if cmd_requires_db:
             check_migrations()
diff --git a/archivebox/config/common.py b/archivebox/config/common.py
index b17fde09..e9903d41 100644
--- a/archivebox/config/common.py
+++ b/archivebox/config/common.py
@@ -1,18 +1,18 @@
 __package__ = 'archivebox.config'
 
+import os
 import sys
 import shutil
-
+import tempfile
 from typing import Dict, Optional
 from pathlib import Path
 
 from rich import print
-from pydantic import Field, field_validator, computed_field
+from pydantic import Field, field_validator, computed_field, model_validator
 from django.utils.crypto import get_random_string
 
 from abx.archivebox.base_configset import BaseConfigSet
-
 from .constants import CONSTANTS
 from .version import get_COMMIT_HASH, get_BUILD_TIME
 from .permissions import IN_DOCKER
@@ -35,7 +35,6 @@ class ShellConfig(BaseConfigSet):
     VERSIONS_AVAILABLE: bool = False             # .check_for_update.get_versions_available_on_github(c)},
     CAN_UPGRADE: bool = False                    # .check_for_update.can_upgrade(c)},
 
-
     @computed_field
     @property
     def TERM_WIDTH(self) -> int:
@@ -57,6 +56,16 @@ SHELL_CONFIG = ShellConfig()
 
 
 class StorageConfig(BaseConfigSet):
+    # TMP_DIR must be a local, fast, readable/writable dir by archivebox user,
+    # must be a short path due to unix path length restrictions for socket files (<100 chars)
+    # must be a local SSD/tmpfs for speed and because bind mounts/network mounts/FUSE dont support unix sockets
+    TMP_DIR: Path = Field(default=CONSTANTS.DEFAULT_TMP_DIR)
+
+    # LIB_DIR must be a local, fast, readable/writable dir by archivebox user,
+    # must be able to contain executable binaries (up to 5GB size)
+    # should not be a remote/network/FUSE mount for speed reasons, otherwise extractors will be slow
+    LIB_DIR: Path = Field(default=CONSTANTS.DEFAULT_LIB_DIR)
+
     OUTPUT_PERMISSIONS: str = Field(default='644')
     RESTRICT_FILE_NAMES: str = Field(default='windows')
     ENFORCE_ATOMIC_WRITES: bool = Field(default=True)
diff --git a/archivebox/config/constants.py b/archivebox/config/constants.py
index e8ea9958..b8019f99 100644
--- a/archivebox/config/constants.py
+++ b/archivebox/config/constants.py
@@ -1,6 +1,5 @@
 __package__ = 'archivebox.config'
 
-import os
 import re
 import sys
 
@@ -97,14 +96,10 @@ class ConstantsDict(Mapping):
     # Runtime dirs
     TMP_DIR_NAME: str = 'tmp'
-    TMP_DIR: Path = DATA_DIR / TMP_DIR_NAME / MACHINE_ID
+    DEFAULT_TMP_DIR: Path = DATA_DIR / TMP_DIR_NAME / MACHINE_ID           # ./data/tmp/abc3244323
+
     LIB_DIR_NAME: str = 'lib'
-    LIB_DIR: Path = DATA_DIR / LIB_DIR_NAME / MACHINE_TYPE
-    LIB_PIP_DIR: Path = LIB_DIR / 'pip'
-    LIB_NPM_DIR: Path = LIB_DIR / 'npm'
-    LIB_BROWSERS_DIR: Path = LIB_DIR / 'browsers'
-    LIB_BIN_DIR: Path = LIB_DIR / 'bin'
-    BIN_DIR: Path = LIB_BIN_DIR
+    DEFAULT_LIB_DIR: Path = DATA_DIR / LIB_DIR_NAME / MACHINE_TYPE         # ./data/lib/arm64-linux-docker
 
     # Config constants
     TIMEZONE: str = 'UTC'
@@ -198,91 +193,7 @@ class ConstantsDict(Mapping):
         ".archivebox_id",
         "Dockerfile",
     ))
-
-    CODE_LOCATIONS = benedict({
-        'PACKAGE_DIR': {
-            'path': (PACKAGE_DIR).resolve(),
-            'enabled': True,
-            'is_valid': os.access(PACKAGE_DIR / '__main__.py', os.X_OK),    # executable
-        },
-        'TEMPLATES_DIR': {
-            'path': TEMPLATES_DIR.resolve(),
-            'enabled': True,
-            'is_valid': os.access(STATIC_DIR, os.R_OK) and os.access(STATIC_DIR, os.X_OK),    # read + list
-        },
-        'CUSTOM_TEMPLATES_DIR': {
-            'path': CUSTOM_TEMPLATES_DIR.resolve(),
-            'enabled': os.path.isdir(CUSTOM_TEMPLATES_DIR),
-            'is_valid': os.path.isdir(CUSTOM_TEMPLATES_DIR) and os.access(CUSTOM_TEMPLATES_DIR, os.R_OK),    # read
-        },
-        'USER_PLUGINS_DIR': {
-            'path': USER_PLUGINS_DIR.resolve(),
-            'enabled': os.path.isdir(USER_PLUGINS_DIR),
-            'is_valid': os.path.isdir(USER_PLUGINS_DIR) and os.access(USER_PLUGINS_DIR, os.R_OK),    # read
-        },
-        'LIB_DIR': {
-            'path': LIB_DIR.resolve(),
-            'enabled': True,
-            'is_valid': os.path.isdir(LIB_DIR) and os.access(LIB_DIR, os.R_OK) and os.access(LIB_DIR, os.W_OK),    # read + write
-        },
-    })
-    DATA_LOCATIONS = benedict({
-        "DATA_DIR": {
-            "path": DATA_DIR.resolve(),
-            "enabled": True,
-            "is_valid": os.path.isdir(DATA_DIR) and os.access(DATA_DIR, os.R_OK) and os.access(DATA_DIR, os.W_OK),
-            "is_mount": os.path.ismount(DATA_DIR.resolve()),
-        },
-        "CONFIG_FILE": {
-            "path": CONFIG_FILE.resolve(),
-            "enabled": True,
-            "is_valid": os.path.isfile(CONFIG_FILE) and os.access(CONFIG_FILE, os.R_OK) and os.access(CONFIG_FILE, os.W_OK),
-        },
-        "SQL_INDEX": {
-            "path": DATABASE_FILE.resolve(),
-            "enabled": True,
-            "is_valid": os.path.isfile(DATABASE_FILE) and os.access(DATABASE_FILE, os.R_OK) and os.access(DATABASE_FILE, os.W_OK),
-            "is_mount": os.path.ismount(DATABASE_FILE.resolve()),
-        },
-        "QUEUE_DATABASE": {
-            "path": QUEUE_DATABASE_FILE.resolve(),
-            "enabled": True,
-            "is_valid": os.path.isfile(QUEUE_DATABASE_FILE) and os.access(QUEUE_DATABASE_FILE, os.R_OK) and os.access(QUEUE_DATABASE_FILE, os.W_OK),
-            "is_mount": os.path.ismount(QUEUE_DATABASE_FILE.resolve()),
-        },
-        "ARCHIVE_DIR": {
-            "path": ARCHIVE_DIR.resolve(),
-            "enabled": True,
-            "is_valid": os.path.isdir(ARCHIVE_DIR) and os.access(ARCHIVE_DIR, os.R_OK) and os.access(ARCHIVE_DIR, os.W_OK),
-            "is_mount": os.path.ismount(ARCHIVE_DIR.resolve()),
-        },
-        "SOURCES_DIR": {
-            "path": SOURCES_DIR.resolve(),
-            "enabled": True,
-            "is_valid": os.path.isdir(SOURCES_DIR) and os.access(SOURCES_DIR, os.R_OK) and os.access(SOURCES_DIR, os.W_OK),
-        },
-        "PERSONAS_DIR": {
-            "path": PERSONAS_DIR.resolve(),
-            "enabled": os.path.isdir(PERSONAS_DIR),
-            "is_valid": os.path.isdir(PERSONAS_DIR) and os.access(PERSONAS_DIR, os.R_OK) and os.access(PERSONAS_DIR, os.W_OK),    # read + write
-        },
-        "LOGS_DIR": {
-            "path": LOGS_DIR.resolve(),
-            "enabled": True,
-            "is_valid": os.path.isdir(LOGS_DIR) and os.access(LOGS_DIR, os.R_OK) and os.access(LOGS_DIR, os.W_OK),    # read + write
-        },
-        'TMP_DIR': {
-            'path': TMP_DIR.resolve(),
-            'enabled': True,
-            'is_valid': os.path.isdir(TMP_DIR) and os.access(TMP_DIR, os.R_OK) and os.access(TMP_DIR, os.W_OK),    # read + write
-        },
-        # "CACHE_DIR": {
-        #     "path": CACHE_DIR.resolve(),
-        #     "enabled": True,
-        #     "is_valid": os.access(CACHE_DIR, os.R_OK) and os.access(CACHE_DIR, os.W_OK),    # read + write
-        # },
-    })
 
     @classmethod
     def __getitem__(cls, key: str):
diff --git a/archivebox/config/legacy.py b/archivebox/config/legacy.py
index 27f09345..99b497ca 100644
--- a/archivebox/config/legacy.py
+++ b/archivebox/config/legacy.py
@@ -258,6 +258,9 @@ def load_config_val(key: str,
 
     elif type is list or type is dict:
         return json.loads(val)
+
+    elif type is Path:
+        return Path(val)
 
     raise Exception('Config values can only be str, bool, int, or json')
 
@@ -574,7 +577,7 @@ def setup_django(out_dir: Path | None=None, check_db=False, config: benedict=CON
         with SudoPermission(uid=0):
             # running as root is a special case where it's ok to be a bit slower
             # make sure data dir is always owned by the correct user
-            os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{CONSTANTS.DATA_DIR}"')
+            os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{CONSTANTS.DATA_DIR}" 2>/dev/null')
             os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{CONSTANTS.DATA_DIR}"/* 2>/dev/null')
 
     bump_startup_progress_bar()
diff --git a/archivebox/config/paths.py b/archivebox/config/paths.py
index 12864f7c..1f582881 100644
--- a/archivebox/config/paths.py
+++ b/archivebox/config/paths.py
@@ -1,12 +1,16 @@
 __package__ = 'archivebox.config'
 
 import os
+import socket
 import hashlib
+import tempfile
 import platform
 from pathlib import Path
 from functools import cache
 from datetime import datetime
 
+from benedict import benedict
+
 from .permissions import SudoPermission, IS_ROOT, ARCHIVEBOX_USER
 
 #############################################################################################
@@ -88,7 +92,7 @@ def get_machine_type() -> str:
     return LIB_DIR_SCOPE
 
 
-def dir_is_writable(dir_path: Path, uid: int | None = None, gid: int | None = None, fallback=True) -> bool:
+def dir_is_writable(dir_path: Path, uid: int | None = None, gid: int | None = None, fallback=True, chown=True) -> bool:
     """Check if a given directory is writable by a specific user and group (fallback=try as current user is unable to check with provided uid)"""
     current_uid, current_gid = os.geteuid(), os.getegid()
     uid, gid = uid or current_uid, gid or current_gid
@@ -101,10 +105,197 @@ def dir_is_writable(dir_path: Path, uid: int | None = None, gid: int | None = No
             test_file.unlink()
             return True
     except (IOError, OSError, PermissionError):
-        pass
-
+        if chown:
+            # try fixing it using sudo permissions
+            with SudoPermission(uid=uid, fallback=fallback):
+                os.system(f'chown {uid}:{gid} "{dir_path}" 2>/dev/null')
+            return dir_is_writable(dir_path, uid=uid, gid=gid, fallback=fallback, chown=False)
     return False
 
+def assert_dir_can_contain_unix_sockets(dir_path: Path) -> bool:
+    """Check if a given directory can contain unix sockets (e.g. /tmp/supervisord.sock)"""
+    from archivebox.logging_util import pretty_path
+
+    try:
+        socket_path = str(dir_path / '.test_socket.sock')
+        s = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
+        try:
+            os.remove(socket_path)
+        except OSError:
+            pass
+        s.bind(socket_path)
+        s.close()
+        try:
+            os.remove(socket_path)
+        except OSError:
+            pass
+    except Exception as e:
+        raise Exception(f'ArchiveBox failed to create a test UNIX socket file in {pretty_path(dir_path, color=False)}') from e
+
+    return True
+
+
+def create_and_chown_dir(dir_path: Path) -> None:
+    with SudoPermission(uid=0, fallback=True):
+        dir_path.mkdir(parents=True, exist_ok=True)
+        os.system(f'chown {ARCHIVEBOX_USER} "{dir_path}" 2>/dev/null')
+        os.system(f'chown {ARCHIVEBOX_USER} "{dir_path}"/* 2>/dev/null &')
+
+@cache
+def get_or_create_working_tmp_dir(autofix=True, quiet=False):
+    from archivebox import CONSTANTS
+    from archivebox.config.common import STORAGE_CONFIG
+    from archivebox.misc.checks import check_tmp_dir
+
+    # try a few potential directories in order of preference
+    CANDIDATES = [
+        STORAGE_CONFIG.TMP_DIR,                                                # <user-specified>
+        CONSTANTS.DEFAULT_TMP_DIR,                                             # ./data/tmp/<machine_id>
+        Path('/var/run/archivebox') / get_collection_id(),                     # /var/run/archivebox/abc5d8512
+        Path('/tmp') / 'archivebox' / get_collection_id(),                     # /tmp/archivebox/abc5d8512
+        Path('~/.tmp/archivebox').expanduser() / get_collection_id(),          # ~/.tmp/archivebox/abc5d8512
+        Path(tempfile.gettempdir()) / 'archivebox' / get_collection_id(),      # /var/folders/qy/6tpfrpx100j1t4l312nz683m0000gn/T/archivebox/abc5d8512
+        Path(tempfile.gettempdir()) / 'archivebox' / get_collection_id()[:4],  # /var/folders/qy/6tpfrpx100j1t4l312nz683m0000gn/T/archivebox/abc5d
+        Path(tempfile.gettempdir()) / 'abx' / get_collection_id()[:4],         # /var/folders/qy/6tpfrpx100j1t4l312nz683m0000gn/T/abx/abc5
+    ]
+    for candidate in CANDIDATES:
+        try:
+            create_and_chown_dir(candidate)
+        except Exception:
+            pass
+        if check_tmp_dir(candidate, throw=False, quiet=True, must_exist=True):
+            if autofix and STORAGE_CONFIG.TMP_DIR != candidate:
+                STORAGE_CONFIG.update_in_place(TMP_DIR=candidate, warn=not quiet)
+            return candidate
+
+    if not quiet:
+        raise OSError(f'ArchiveBox is unable to find a writable TMP_DIR, tried {CANDIDATES}!')

+@cache
+def get_or_create_working_lib_dir(autofix=True, quiet=False):
+    from archivebox import CONSTANTS
+    from archivebox.config.common import STORAGE_CONFIG
+    from archivebox.misc.checks import check_lib_dir
+
+    # try a few potential directories in order of preference
+    CANDIDATES = [
+        STORAGE_CONFIG.LIB_DIR,                                                # <user-specified>
+        CONSTANTS.DEFAULT_LIB_DIR,                                             # ./data/lib/arm64-linux-docker
+        Path('/usr/local/share/archivebox') / get_collection_id(),             # /usr/local/share/archivebox/abc5
+        *([Path('/opt/homebrew/share/archivebox') / get_collection_id()] if os.path.isfile('/opt/homebrew/bin/archivebox') else []),  # /opt/homebrew/share/archivebox/abc5
+        Path('~/.local/share/archivebox').expanduser() / get_collection_id(),  # ~/.local/share/archivebox/abc5
+    ]
+
+    for candidate in CANDIDATES:
+        try:
+            create_and_chown_dir(candidate)
+        except Exception:
+            pass
+        if check_lib_dir(candidate, throw=False, quiet=True, must_exist=True):
+            if autofix and STORAGE_CONFIG.LIB_DIR != candidate:
+                STORAGE_CONFIG.update_in_place(LIB_DIR=candidate, warn=not quiet)
+            return candidate
+
+    if not quiet:
+        raise OSError(f'ArchiveBox is unable to find a writable LIB_DIR, tried {CANDIDATES}!')
+
+
+
+@cache
+def get_data_locations():
+    from archivebox.config import CONSTANTS
+    from archivebox.config.common import STORAGE_CONFIG
+
+    return benedict({
+        "DATA_DIR": {
+            "path": DATA_DIR.resolve(),
+            "enabled": True,
+            "is_valid": os.path.isdir(DATA_DIR) and os.access(DATA_DIR, os.R_OK) and os.access(DATA_DIR, os.W_OK),
+            "is_mount": os.path.ismount(DATA_DIR.resolve()),
+        },
+        "CONFIG_FILE": {
+            "path": CONSTANTS.CONFIG_FILE.resolve(),
+            "enabled": True,
+            "is_valid": os.path.isfile(CONSTANTS.CONFIG_FILE) and os.access(CONSTANTS.CONFIG_FILE, os.R_OK) and os.access(CONSTANTS.CONFIG_FILE, os.W_OK),
+        },
+        "SQL_INDEX": {
+            "path": DATABASE_FILE.resolve(),
+            "enabled": True,
+            "is_valid": os.path.isfile(DATABASE_FILE) and os.access(DATABASE_FILE, os.R_OK) and os.access(DATABASE_FILE, os.W_OK),
+            "is_mount": os.path.ismount(DATABASE_FILE.resolve()),
+        },
+        "QUEUE_DATABASE": {
+            "path": CONSTANTS.QUEUE_DATABASE_FILE,
+            "enabled": True,
+            "is_valid": os.path.isfile(CONSTANTS.QUEUE_DATABASE_FILE) and os.access(CONSTANTS.QUEUE_DATABASE_FILE, os.R_OK) and os.access(CONSTANTS.QUEUE_DATABASE_FILE, os.W_OK),
+            "is_mount": os.path.ismount(CONSTANTS.QUEUE_DATABASE_FILE),
+        },
+        "ARCHIVE_DIR": {
+            "path": ARCHIVE_DIR.resolve(),
+            "enabled": True,
+            "is_valid": os.path.isdir(ARCHIVE_DIR) and os.access(ARCHIVE_DIR, os.R_OK) and os.access(ARCHIVE_DIR, os.W_OK),
+            "is_mount": os.path.ismount(ARCHIVE_DIR.resolve()),
+        },
+        "SOURCES_DIR": {
+            "path": CONSTANTS.SOURCES_DIR.resolve(),
+            "enabled": True,
+            "is_valid": os.path.isdir(CONSTANTS.SOURCES_DIR) and os.access(CONSTANTS.SOURCES_DIR, os.R_OK) and os.access(CONSTANTS.SOURCES_DIR, os.W_OK),
+        },
+        "PERSONAS_DIR": {
+            "path": CONSTANTS.PERSONAS_DIR.resolve(),
+            "enabled": os.path.isdir(CONSTANTS.PERSONAS_DIR),
+            "is_valid": os.path.isdir(CONSTANTS.PERSONAS_DIR) and os.access(CONSTANTS.PERSONAS_DIR, os.R_OK) and os.access(CONSTANTS.PERSONAS_DIR, os.W_OK),    # read + write
+        },
+        "LOGS_DIR": {
+            "path": CONSTANTS.LOGS_DIR.resolve(),
+            "enabled": True,
+            "is_valid": os.path.isdir(CONSTANTS.LOGS_DIR) and os.access(CONSTANTS.LOGS_DIR, os.R_OK) and os.access(CONSTANTS.LOGS_DIR, os.W_OK),    # read + write
+        },
+        'TMP_DIR': {
+            'path': STORAGE_CONFIG.TMP_DIR.resolve(),
+            'enabled': True,
+            'is_valid': os.path.isdir(STORAGE_CONFIG.TMP_DIR) and os.access(STORAGE_CONFIG.TMP_DIR, os.R_OK) and os.access(STORAGE_CONFIG.TMP_DIR, os.W_OK),    # read + write
+        },
+        # "CACHE_DIR": {
+        #     "path": CACHE_DIR.resolve(),
+        #     "enabled": True,
+        #     "is_valid": os.access(CACHE_DIR, os.R_OK) and os.access(CACHE_DIR, os.W_OK),    # read + write
+        # },
+    })
+
+@cache
+def get_code_locations():
+    from archivebox.config import CONSTANTS
+    from archivebox.config.common import STORAGE_CONFIG
+
+    return benedict({
+        'PACKAGE_DIR': {
+            'path': (PACKAGE_DIR).resolve(),
+            'enabled': True,
+            'is_valid': os.access(PACKAGE_DIR / '__main__.py', os.X_OK),    # executable
+        },
+        'TEMPLATES_DIR': {
+            'path': CONSTANTS.TEMPLATES_DIR.resolve(),
+            'enabled': True,
+            'is_valid': os.access(CONSTANTS.STATIC_DIR, os.R_OK) and os.access(CONSTANTS.STATIC_DIR, os.X_OK),    # read + list
+        },
+        'CUSTOM_TEMPLATES_DIR': {
+            'path': CONSTANTS.CUSTOM_TEMPLATES_DIR.resolve(),
+            'enabled': os.path.isdir(CONSTANTS.CUSTOM_TEMPLATES_DIR),
+            'is_valid': os.path.isdir(CONSTANTS.CUSTOM_TEMPLATES_DIR) and os.access(CONSTANTS.CUSTOM_TEMPLATES_DIR, os.R_OK),    # read
+        },
+        'USER_PLUGINS_DIR': {
+            'path': CONSTANTS.USER_PLUGINS_DIR.resolve(),
+            'enabled': os.path.isdir(CONSTANTS.USER_PLUGINS_DIR),
+            'is_valid': os.path.isdir(CONSTANTS.USER_PLUGINS_DIR) and os.access(CONSTANTS.USER_PLUGINS_DIR, os.R_OK),    # read
+        },
+        'LIB_DIR': {
+            'path': STORAGE_CONFIG.LIB_DIR.resolve(),
+            'enabled': True,
+            'is_valid': os.path.isdir(STORAGE_CONFIG.LIB_DIR) and os.access(STORAGE_CONFIG.LIB_DIR, os.R_OK) and os.access(STORAGE_CONFIG.LIB_DIR, os.W_OK),    # read + write
+        },
+    })
 
 
 # @cache
diff --git a/archivebox/logging_util.py b/archivebox/logging_util.py
index f4503a1f..33ab0766 100644
--- a/archivebox/logging_util.py
+++ b/archivebox/logging_util.py
@@ -510,7 +510,7 @@ def log_removal_finished(all_links: int, to_remove: int):
 ### Helpers
 
 @enforce_types
-def pretty_path(path: Union[Path, str], pwd: Union[Path, str]=DATA_DIR) -> str:
+def pretty_path(path: Union[Path, str], pwd: Union[Path, str]=DATA_DIR, color: bool=True) -> str:
     """convert paths like .../ArchiveBox/archivebox/../output/abc into output/abc"""
     pwd = str(Path(pwd))  # .resolve()
     path = str(path)
@@ -520,7 +520,10 @@ def pretty_path(path: Union[Path, str], pwd: Union[Path, str]=DATA_DIR) -> str:
 
     # replace long absolute paths with ./ relative ones to save on terminal output width
     if path.startswith(pwd) and (pwd != '/') and path != pwd:
-        path = path.replace(pwd, '[light_slate_blue].[/light_slate_blue]', 1)
+        if color:
+            path = path.replace(pwd, '[light_slate_blue].[/light_slate_blue]', 1)
+        else:
+            path = path.replace(pwd, '.', 1)
 
     # quote paths containing spaces
     if ' ' in path:
diff --git a/archivebox/main.py b/archivebox/main.py
index 3d2a5472..5ed3973f 100755
--- a/archivebox/main.py
+++ b/archivebox/main.py
@@ -189,6 +189,7 @@ def version(quiet: bool=False,
     if quiet or '--version' in sys.argv:
         return
 
+    from rich.panel import Panel
     from rich.console import Console
     console = Console()
     prnt = console.print
@@ -197,6 +198,7 @@ def version(quiet: bool=False,
     from django.conf import settings
     from archivebox.config.version import get_COMMIT_HASH, get_BUILD_TIME
     from archivebox.config.permissions import ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, RUNNING_AS_UID, RUNNING_AS_GID
+    from archivebox.config.paths import get_data_locations, get_code_locations
 
     from abx.archivebox.base_binary import BaseBinary, apt, brew, env
 
@@ -221,7 +223,7 @@ def version(quiet: bool=False,
         f'PLATFORM={platform.platform()}',
         f'PYTHON={sys.implementation.name.title()}' + (' (venv)' if CONSTANTS.IS_INSIDE_VENV else ''),
     )
-    OUTPUT_IS_REMOTE_FS = CONSTANTS.DATA_LOCATIONS.DATA_DIR.is_mount or CONSTANTS.DATA_LOCATIONS.ARCHIVE_DIR.is_mount
+    OUTPUT_IS_REMOTE_FS = get_data_locations().DATA_DIR.is_mount or get_data_locations().ARCHIVE_DIR.is_mount
     DATA_DIR_STAT = CONSTANTS.DATA_DIR.stat()
     prnt(
         f'EUID={os.geteuid()}:{os.getegid()} UID={RUNNING_AS_UID}:{RUNNING_AS_GID} PUID={ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}',
@@ -240,6 +242,21 @@ def version(quiet: bool=False,
         #f'DB=django.db.backends.sqlite3 (({CONFIG["SQLITE_JOURNAL_MODE"]})',  # add this if we have more useful info to show eventually
     )
     prnt()
+
+    if not (os.access(CONSTANTS.ARCHIVE_DIR, os.R_OK) and os.access(CONSTANTS.CONFIG_FILE, os.R_OK)):
+        PANEL_TEXT = '\n'.join((
+            # '',
+            # f'[yellow]CURRENT DIR =[/yellow] [red]{os.getcwd()}[/red]',
+            '',
+            '[violet]Hint:[/violet] [green]cd[/green] into a collection [blue]DATA_DIR[/blue] and run [green]archivebox version[/green] again...',
+            '      [grey53]OR[/grey53] run [green]archivebox init[/green] to create a new collection in the current dir.',
+            '',
+            '      [i][grey53](this is [red]REQUIRED[/red] if you are opening a Github Issue to get help)[/grey53][/i]',
+            '',
+        ))
+        prnt(Panel(PANEL_TEXT, expand=False, border_style='grey53', title='[red]:exclamation: No collection [blue]DATA_DIR[/blue] is currently active[/red]', subtitle='Full version info is only available when inside a collection [light_slate_blue]DATA DIR[/light_slate_blue]'))
+        prnt()
+        return
 
     prnt('[pale_green1][i] Binary Dependencies:[/pale_green1]')
     failures = []
@@ -299,13 +316,13 @@ def version(quiet: bool=False,
 
     prnt()
     prnt('[deep_sky_blue3][i] Code locations:[/deep_sky_blue3]')
-    for name, path in CONSTANTS.CODE_LOCATIONS.items():
+    for name, path in get_code_locations().items():
         prnt(printable_folder_status(name, path), overflow='ignore', crop=False)
 
     prnt()
     if os.access(CONSTANTS.ARCHIVE_DIR, os.R_OK) or os.access(CONSTANTS.CONFIG_FILE, os.R_OK):
         prnt('[bright_yellow][i] Data locations:[/bright_yellow]')
-        for name, path in CONSTANTS.DATA_LOCATIONS.items():
+        for name, path in get_data_locations().items():
             prnt(printable_folder_status(name, path), overflow='ignore', crop=False)
 
         from archivebox.misc.checks import check_data_dir_permissions
@@ -395,7 +412,7 @@ def init(force: bool=False, quick: bool=False, install: bool=False, out_dir: Pat
         print(f'    √ ./{CONSTANTS.DATABASE_FILE.relative_to(DATA_DIR)}')
 
     # from django.contrib.auth.models import User
-    # if SHELL_CONFIG.IS_TTY and not User.objects.filter(is_superuser=True).exists():
+    # if SHELL_CONFIG.IS_TTY and not User.objects.filter(is_superuser=True).exclude(username='system').exists():
    #     print('{green}[+] Creating admin user account...{reset}'.format(**SHELL_CONFIG.ANSI))
     #     call_command("createsuperuser", interactive=True)
 
@@ -486,9 +503,13 @@ def init(force: bool=False, quick: bool=False, install: bool=False, out_dir: Pat
                 html_index.rename(f"{index_name}.html")
 
     CONSTANTS.PERSONAS_DIR.mkdir(parents=True, exist_ok=True)
-    CONSTANTS.TMP_DIR.mkdir(parents=True, exist_ok=True)
-    CONSTANTS.LIB_DIR.mkdir(parents=True, exist_ok=True)
-
+    CONSTANTS.DEFAULT_TMP_DIR.mkdir(parents=True, exist_ok=True)
+    CONSTANTS.DEFAULT_LIB_DIR.mkdir(parents=True, exist_ok=True)
+
+    from archivebox.config.common import STORAGE_CONFIG
+    STORAGE_CONFIG.TMP_DIR.mkdir(parents=True, exist_ok=True)
+    STORAGE_CONFIG.LIB_DIR.mkdir(parents=True, exist_ok=True)
+
     if install:
         run_subcommand('install', pwd=out_dir)
@@ -1115,7 +1136,7 @@ def install(out_dir: Path=DATA_DIR, binproviders: Optional[List[str]]=None, bina
 
     from django.contrib.auth import get_user_model
     User = get_user_model()
 
-    if not User.objects.filter(is_superuser=True).exists():
+    if not User.objects.filter(is_superuser=True).exclude(username='system').exists():
         stderr('\n[+] Don\'t forget to create a new admin user for the Web UI...', color='green')
         stderr('    archivebox manage createsuperuser')
         # run_subcommand('manage', subcommand_args=['createsuperuser'], pwd=out_dir)
@@ -1399,46 +1420,43 @@ def server(runserver_args: Optional[List[str]]=None,
     from django.core.management import call_command
     from django.contrib.auth.models import User
 
+    if not User.objects.filter(is_superuser=True).exclude(username='system').exists():
+        print()
+        # print('[yellow][!] No admin accounts exist, you must create one to be able to log in to the Admin UI![/yellow]')
+        print('[violet]Hint:[/violet] To create an [bold]admin username & password[/bold] for the [deep_sky_blue3][underline][link=http://{host}:{port}/admin]Admin UI[/link][/underline][/deep_sky_blue3], run:')
+        print('        [green]archivebox manage createsuperuser[/green]')
+        print()
 
-    print('[green][+] Starting ArchiveBox webserver...[/green]')
-    print('    > Logging errors to ./logs/errors.log')
-    if not User.objects.filter(is_superuser=True).exists():
-        print('[yellow][!] No admin users exist yet, you will not be able to edit links in the UI.[/yellow]')
-        print()
-        print('    [violet]Hint:[/violet] To create an admin user, run:')
-        print('        archivebox manage createsuperuser')
-        print()
+    host = '127.0.0.1'
+    port = '8000'
+
+    try:
+        host_and_port = [arg for arg in runserver_args if arg.replace('.', '').replace(':', '').isdigit()][0]
+        if ':' in host_and_port:
+            host, port = host_and_port.split(':')
+        else:
+            if '.' in host_and_port:
+                host = host_and_port
+            else:
+                port = host_and_port
+    except IndexError:
+        pass
+
+    print('[green][+] Starting ArchiveBox webserver...[/green]')
+    print(f'    [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
+    print(f'    [green]>[/green] Log in to ArchiveBox Admin UI on [deep_sky_blue3][link=http://{host}:{port}/admin]http://{host}:{port}/admin[/link][/deep_sky_blue3]')
+    print('    > Writing ArchiveBox error log to ./logs/errors.log')
 
     if SHELL_CONFIG.DEBUG:
         if not reload:
             runserver_args.append('--noreload')  # '--insecure'
         call_command("runserver", *runserver_args)
     else:
-        host = '127.0.0.1'
-        port = '8000'
-
-        try:
-            host_and_port = [arg for arg in runserver_args if arg.replace('.', '').replace(':', '').isdigit()][0]
-            if ':' in host_and_port:
-                host, port = host_and_port.split(':')
-            else:
-                if '.' in host_and_port:
-                    host = host_and_port
-                else:
-                    port = host_and_port
-        except IndexError:
-            pass
-
-        print(f'    [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
-
         from queues.supervisor_util import start_server_workers
 
         print()
-
         start_server_workers(host=host, port=port, daemonize=False)
-
         print("\n[i][green][🟩] ArchiveBox server shut down gracefully.[/green][/i]")
diff --git a/archivebox/misc/checks.py b/archivebox/misc/checks.py
index 0c4f9d66..5fe02055 100644
--- a/archivebox/misc/checks.py
+++ b/archivebox/misc/checks.py
@@ -5,16 +5,24 @@ import sys
 from pathlib import Path
 
 from rich import print
+from rich.panel import Panel
 
-# DO NOT ADD ANY TOP-LEVEL IMPORTS HERE
+# DO NOT ADD ANY TOP-LEVEL IMPORTS HERE to anything other than builtin python libraries
 # this file is imported by archivebox/__init__.py
 # and any imports here will be imported by EVERYTHING else
 # so this file should only be used for pure python checks
 # that don't need to import other parts of ArchiveBox
 
+# if a check needs to import other parts of ArchiveBox,
+# the imports should be done inside the check function
+# and you should make sure if you need to import any django stuff
+# that the check is called after django.setup() has been called
+
 
 def check_data_folder() -> None:
     from archivebox import DATA_DIR, ARCHIVE_DIR
+    from archivebox.config import CONSTANTS
+    from archivebox.config.paths import create_and_chown_dir, get_or_create_working_tmp_dir, get_or_create_working_lib_dir
 
     archive_dir_exists = os.access(ARCHIVE_DIR, os.R_OK) and ARCHIVE_DIR.is_dir()
     if not archive_dir_exists:
@@ -30,8 +38,21 @@ def check_data_folder() -> None:
         raise SystemExit(2)
 
+    # Create data dir subdirs
+    create_and_chown_dir(CONSTANTS.SOURCES_DIR)
+    create_and_chown_dir(CONSTANTS.PERSONAS_DIR / 'Default')
+    create_and_chown_dir(CONSTANTS.LOGS_DIR)
+    # create_and_chown_dir(CONSTANTS.CACHE_DIR)
+
+    # Create /tmp and /lib dirs if they don't exist
+    get_or_create_working_tmp_dir(autofix=True, quiet=False)
+    get_or_create_working_lib_dir(autofix=True, quiet=False)
+
+    # Check data dir permissions, /tmp, and /lib permissions
+    check_data_dir_permissions()
+
 def check_migrations():
-    from archivebox import DATA_DIR, CONSTANTS
+    from archivebox import DATA_DIR
     from ..index.sql import list_migrations
 
     pending_migrations = [name for status, name in list_migrations() if not status]
@@ -45,13 +66,6 @@ def check_migrations():
         print('    archivebox init', file=sys.stderr)
         raise SystemExit(3)
 
-    CONSTANTS.SOURCES_DIR.mkdir(exist_ok=True)
-    CONSTANTS.LOGS_DIR.mkdir(exist_ok=True)
-    # CONSTANTS.CACHE_DIR.mkdir(exist_ok=True)
-    (CONSTANTS.LIB_DIR / 'bin').mkdir(exist_ok=True, parents=True)
-    (CONSTANTS.PERSONAS_DIR / 'Default').mkdir(exist_ok=True, parents=True)
-
-
 def check_io_encoding():
     PYTHON_ENCODING = (sys.__stdout__ or sys.stdout or sys.__stderr__ or sys.stderr).encoding.upper().replace('UTF8', 'UTF-8')
@@ -128,3 +142,98 @@ def check_data_dir_permissions():
         STDERR.print('        [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#permissions]https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#permissions[/link]')
         STDERR.print('        [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#puid--pgid]https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#puid--pgid[/link]')
         STDERR.print('        [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting#filesystem-doesnt-support-fsync-eg-network-mounts]https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting#filesystem-doesnt-support-fsync-eg-network-mounts[/link]')
+
+    from archivebox.config.common import STORAGE_CONFIG
+
+    # Check /tmp dir permissions
+    check_tmp_dir(STORAGE_CONFIG.TMP_DIR, throw=False, must_exist=True)
+
+    # Check /lib dir permissions
+    check_lib_dir(STORAGE_CONFIG.LIB_DIR, throw=False, must_exist=True)
+
+
+def check_tmp_dir(tmp_dir=None, throw=False, quiet=False, must_exist=True):
+    from archivebox.config.paths import assert_dir_can_contain_unix_sockets, dir_is_writable, get_or_create_working_tmp_dir
+    from archivebox.misc.logging import STDERR
+    from archivebox.config.permissions import ARCHIVEBOX_USER, ARCHIVEBOX_GROUP
+    from archivebox.config.common import STORAGE_CONFIG
+    from archivebox.logging_util import pretty_path
+
+    tmp_dir = tmp_dir or STORAGE_CONFIG.TMP_DIR
+    socket_file = tmp_dir.absolute().resolve() / "supervisord.sock"
+
+    if not must_exist and not os.path.isdir(tmp_dir):
+        # just check that its viable based on its length (because dir may not exist yet, we cant check if its writable)
+        return len(f'file://{socket_file}') <= 96
+
+    tmp_is_valid = False
+    try:
+        tmp_is_valid = dir_is_writable(tmp_dir)
+        tmp_is_valid = tmp_is_valid and assert_dir_can_contain_unix_sockets(tmp_dir)
+        assert tmp_is_valid, f'ArchiveBox user PUID={ARCHIVEBOX_USER} PGID={ARCHIVEBOX_GROUP} is unable to write to TMP_DIR={tmp_dir}'
+        assert len(f'file://{socket_file}') <= 96, f'ArchiveBox TMP_DIR={tmp_dir} is too long, dir containing unix socket files must be <90 chars.'
+ return True + except Exception as e: + if not quiet: + STDERR.print() + ERROR_TEXT = '\n'.join(( + '', + f'[red]:cross_mark: ArchiveBox is unable to use TMP_DIR={pretty_path(tmp_dir)}[/red]', + f' [yellow]{e}[/yellow]', + '', + '[blue]Info:[/blue] [grey53]The TMP_DIR is used for the supervisord unix socket file and other temporary files.', + ' - It [red]must[/red] be on a local drive (not inside a docker volume, remote network drive, or FUSE mount).', + f' - It [red]must[/red] be readable and writable by the ArchiveBox user (PUID={ARCHIVEBOX_USER}, PGID={ARCHIVEBOX_GROUP}).', + ' - It [red]must[/red] be a *short* path (less than 90 characters) due to UNIX path length restrictions for sockets.', + ' - It [yellow]should[/yellow] be able to hold at least 200MB of data (in-progress downloads can be large).[/grey53]', + '', + '[violet]Hint:[/violet] Fix it by setting TMP_DIR to a path that meets these requirements, e.g.:', + f' [green]archivebox config --set TMP_DIR={get_or_create_working_tmp_dir(autofix=False, quiet=True) or "/tmp/archivebox"}[/green]', + '', + )) + STDERR.print(Panel(ERROR_TEXT, expand=False, border_style='red', title='[red]:cross_mark: Error with configured TMP_DIR[/red]', subtitle='Background workers may fail to start until fixed.')) + STDERR.print() + if throw: + raise OSError(f'TMP_DIR={tmp_dir} is invalid, ArchiveBox is unable to use it and the server will fail to start!') from e + return False + + +def check_lib_dir(lib_dir: Path | None = None, throw=False, quiet=False, must_exist=True): + from archivebox.config.permissions import ARCHIVEBOX_USER, ARCHIVEBOX_GROUP + from archivebox.misc.logging import STDERR + from archivebox.config.paths import dir_is_writable, get_or_create_working_lib_dir + from archivebox.config.common import STORAGE_CONFIG + from archivebox.logging_util import pretty_path + + lib_dir = lib_dir or STORAGE_CONFIG.LIB_DIR + + if not must_exist and not os.path.isdir(lib_dir): + return True + + lib_is_valid = False + try: + lib_is_valid = dir_is_writable(lib_dir) + assert lib_is_valid, f'ArchiveBox user PUID={ARCHIVEBOX_USER} PGID={ARCHIVEBOX_GROUP} is unable to write to LIB_DIR={lib_dir}' + return True + except Exception as e: + if not quiet: + STDERR.print() + ERROR_TEXT = '\n'.join(( + '', + f'[red]:cross_mark: ArchiveBox is unable to use LIB_DIR={pretty_path(lib_dir)}[/red]', + f' [yellow]{e}[/yellow]', + '', + '[blue]Info:[/blue] [grey53]The LIB_DIR is used to store ArchiveBox auto-installed plugin library and binary dependencies.', + f' - It [red]must[/red] be readable and writable by the ArchiveBox user (PUID={ARCHIVEBOX_USER}, PGID={ARCHIVEBOX_GROUP}).', + ' - It [yellow]should[/yellow] be on a local (ideally fast) drive like an SSD or HDD (not on a network drive or external HDD).', + ' - It [yellow]should[/yellow] be able to hold at least 1GB of data (some dependencies like Chrome can be large).[/grey53]', + '', + '[violet]Hint:[/violet] Fix it by setting LIB_DIR to a path that meets these requirements, e.g.:', + f' [green]archivebox config --set LIB_DIR={get_or_create_working_lib_dir(autofix=False, quiet=True) or "/usr/local/share/archivebox"}[/green]', + '', + )) + STDERR.print(Panel(ERROR_TEXT, expand=False, border_style='red', title='[red]:cross_mark: Error with configured LIB_DIR[/red]', subtitle='[yellow]Dependencies may not auto-install properly until fixed.[/yellow]')) + STDERR.print() + if throw: + raise OSError(f'LIB_DIR={lib_dir} is invalid, ArchiveBox is unable to use it and dependencies will fail to install.') from e + return 
False diff --git a/archivebox/misc/shell_welcome_message.py b/archivebox/misc/shell_welcome_message.py index b1ed1b58..5b85e6bd 100644 --- a/archivebox/misc/shell_welcome_message.py +++ b/archivebox/misc/shell_welcome_message.py @@ -49,7 +49,7 @@ if __name__ == '__main__': prnt('[i] :heavy_dollar_sign: Welcome to the ArchiveBox Shell!') prnt(' [deep_sky_blue4]Docs:[/deep_sky_blue4] [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#Shell-Usage]https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#Shell-Usage[/link]') - prnt(' [link=https://docs.archivebox.io/en/latest/modules.html]https://docs.archivebox.io/en/latest/modules.html[/link]') + prnt(' [link=https://docs.archivebox.io/en/dev/apidocs/archivebox/archivebox.html]https://docs.archivebox.io/en/dev/apidocs/archivebox/archivebox.html[/link]') prnt() prnt(' :grey_question: [violet]Hint[/] [i]Here are some examples to get started:[/]') prnt(' add[blink][deep_sky_blue4]?[/deep_sky_blue4][/blink] [grey53]# add ? after anything to get help[/]') diff --git a/archivebox/misc/toml_util.py b/archivebox/misc/toml_util.py index d4784335..9dd51d1b 100644 --- a/archivebox/misc/toml_util.py +++ b/archivebox/misc/toml_util.py @@ -82,10 +82,10 @@ class JSONSchemaWithLambdas(GenerateJsonSchema): if isinstance(default, Callable): return '{{lambda ' + inspect.getsource(default).split('=lambda ')[-1].strip()[:-1] + '}}' return to_jsonable_python( - default, - timedelta_mode=config.ser_json_timedelta, - bytes_mode=config.ser_json_bytes, - serialize_unknown=True + default, + timedelta_mode=config.ser_json_timedelta, + bytes_mode=config.ser_json_bytes, + serialize_unknown=True ) # for computed_field properties render them like this instead: diff --git a/archivebox/plugins_extractor/chrome/binaries.py b/archivebox/plugins_extractor/chrome/binaries.py index 7e17d822..d2ece7c5 100644 --- a/archivebox/plugins_extractor/chrome/binaries.py +++ b/archivebox/plugins_extractor/chrome/binaries.py @@ -104,7 +104,10 @@ class ChromeBinary(BaseBinary): } @staticmethod - def symlink_to_lib(binary, bin_dir=CONSTANTS.LIB_BIN_DIR) -> None: + def symlink_to_lib(binary, bin_dir=None) -> None: + from archivebox.config.common import STORAGE_CONFIG + bin_dir = bin_dir or STORAGE_CONFIG.LIB_DIR / 'bin' + if not (binary.abspath and os.access(binary.abspath, os.F_OK)): return diff --git a/archivebox/plugins_pkg/npm/binproviders.py b/archivebox/plugins_pkg/npm/binproviders.py index 3e4adff7..b1b83168 100644 --- a/archivebox/plugins_pkg/npm/binproviders.py +++ b/archivebox/plugins_pkg/npm/binproviders.py @@ -3,8 +3,6 @@ __package__ = 'plugins_pkg.npm' from pathlib import Path from typing import Optional -from pydantic import model_validator - from pydantic_pkgr import NpmProvider, PATHStr, BinProviderName from archivebox.config import DATA_DIR, CONSTANTS @@ -14,7 +12,7 @@ from abx.archivebox.base_binary import BaseBinProvider OLD_NODE_BIN_PATH = DATA_DIR / 'node_modules' / '.bin' -NEW_NODE_BIN_PATH = CONSTANTS.LIB_NPM_DIR / 'node_modules' / '.bin' +NEW_NODE_BIN_PATH = CONSTANTS.DEFAULT_LIB_DIR / 'npm' / 'node_modules' / '.bin' class SystemNpmBinProvider(NpmProvider, BaseBinProvider): @@ -27,12 +25,16 @@ class LibNpmBinProvider(NpmProvider, BaseBinProvider): name: BinProviderName = "lib_npm" PATH: PATHStr = f'{NEW_NODE_BIN_PATH}:{OLD_NODE_BIN_PATH}' - npm_prefix: Optional[Path] = CONSTANTS.LIB_NPM_DIR + npm_prefix: Optional[Path] = CONSTANTS.DEFAULT_LIB_DIR / 'npm' - @model_validator(mode='after') - def validate_path(self): - assert self.npm_prefix == 
NEW_NODE_BIN_PATH.parent.parent - return self + def setup(self) -> None: + # update paths from config if they arent the default + from archivebox.config.common import STORAGE_CONFIG + if STORAGE_CONFIG.LIB_DIR != CONSTANTS.DEFAULT_LIB_DIR: + self.npm_prefix = STORAGE_CONFIG.LIB_DIR / 'npm' + self.PATH = f'{STORAGE_CONFIG.LIB_DIR / "npm" / "node_modules" / ".bin"}:{NEW_NODE_BIN_PATH}:{OLD_NODE_BIN_PATH}' + + super().setup() SYS_NPM_BINPROVIDER = SystemNpmBinProvider() diff --git a/archivebox/plugins_pkg/pip/binproviders.py b/archivebox/plugins_pkg/pip/binproviders.py index 5395205e..e51dc780 100644 --- a/archivebox/plugins_pkg/pip/binproviders.py +++ b/archivebox/plugins_pkg/pip/binproviders.py @@ -49,7 +49,15 @@ class LibPipBinProvider(PipProvider, BaseBinProvider): name: BinProviderName = "lib_pip" INSTALLER_BIN: BinName = "pip" - pip_venv: Optional[Path] = CONSTANTS.LIB_PIP_DIR / 'venv' + pip_venv: Optional[Path] = CONSTANTS.DEFAULT_LIB_DIR / 'pip' / 'venv' + + def setup(self) -> None: + # update paths from config if they arent the default + from archivebox.config.common import STORAGE_CONFIG + if STORAGE_CONFIG.LIB_DIR != CONSTANTS.DEFAULT_LIB_DIR: + self.pip_venv = STORAGE_CONFIG.LIB_DIR / 'pip' / 'venv' + + super().setup() SYS_PIP_BINPROVIDER = SystemPipBinProvider() PIPX_PIP_BINPROVIDER = SystemPipxBinProvider() diff --git a/archivebox/plugins_pkg/playwright/binproviders.py b/archivebox/plugins_pkg/playwright/binproviders.py index a5c35e0a..68e62bb5 100644 --- a/archivebox/plugins_pkg/playwright/binproviders.py +++ b/archivebox/plugins_pkg/playwright/binproviders.py @@ -35,7 +35,7 @@ class PlaywrightBinProvider(BaseBinProvider): name: BinProviderName = "playwright" INSTALLER_BIN: BinName = PLAYWRIGHT_BINARY.name - PATH: PATHStr = f"{CONSTANTS.LIB_BIN_DIR}:{DEFAULT_ENV_PATH}" + PATH: PATHStr = f"{CONSTANTS.DEFAULT_LIB_DIR / 'bin'}:{DEFAULT_ENV_PATH}" playwright_browsers_dir: Path = ( MACOS_PLAYWRIGHT_CACHE_DIR.expanduser() @@ -56,6 +56,11 @@ class PlaywrightBinProvider(BaseBinProvider): return PLAYWRIGHT_BINARY.load().abspath def setup(self) -> None: + # update paths from config if they arent the default + from archivebox.config.common import STORAGE_CONFIG + if STORAGE_CONFIG.LIB_DIR != CONSTANTS.DEFAULT_LIB_DIR: + self.PATH = f"{STORAGE_CONFIG.LIB_DIR / 'bin'}:{DEFAULT_ENV_PATH}" + assert SYS_PIP_BINPROVIDER.INSTALLER_BIN_ABSPATH, "Pip bin provider not initialized" if self.playwright_browsers_dir: diff --git a/archivebox/plugins_pkg/puppeteer/binproviders.py b/archivebox/plugins_pkg/puppeteer/binproviders.py index 54903019..2ef0eb7a 100644 --- a/archivebox/plugins_pkg/puppeteer/binproviders.py +++ b/archivebox/plugins_pkg/puppeteer/binproviders.py @@ -23,19 +23,16 @@ from abx.archivebox.base_binary import BaseBinProvider from plugins_pkg.npm.binproviders import SYS_NPM_BINPROVIDER -LIB_DIR_BROWSERS = CONSTANTS.LIB_BROWSERS_DIR - - class PuppeteerBinProvider(BaseBinProvider): name: BinProviderName = "puppeteer" INSTALLER_BIN: BinName = "npx" - PATH: PATHStr = str(CONSTANTS.LIB_BIN_DIR) + PATH: PATHStr = str(CONSTANTS.DEFAULT_LIB_DIR / 'bin') euid: Optional[int] = ARCHIVEBOX_USER - puppeteer_browsers_dir: Path = LIB_DIR_BROWSERS - puppeteer_install_args: List[str] = ['--yes', "@puppeteer/browsers", "install", "--path", str(LIB_DIR_BROWSERS)] + puppeteer_browsers_dir: Path = CONSTANTS.DEFAULT_LIB_DIR / 'browsers' + puppeteer_install_args: List[str] = ['--yes', "@puppeteer/browsers", "install"] packages_handler: BinProviderOverrides = Field(default={ "chrome": lambda: @@ -45,6 +42,11 @@ 
class PuppeteerBinProvider(BaseBinProvider): _browser_abspaths: ClassVar[Dict[str, HostBinPath]] = {} def setup(self) -> None: + # update paths from config + from archivebox.config.common import STORAGE_CONFIG + self.puppeteer_browsers_dir = STORAGE_CONFIG.LIB_DIR / 'browsers' + self.PATH = str(STORAGE_CONFIG.LIB_DIR / 'bin') + assert SYS_NPM_BINPROVIDER.INSTALLER_BIN_ABSPATH, "NPM bin provider not initialized" if self.puppeteer_browsers_dir: @@ -90,7 +92,7 @@ class PuppeteerBinProvider(BaseBinProvider): # print(f'[*] {self.__class__.__name__}: Installing {bin_name}: {self.INSTALLER_BIN_ABSPATH} install {packages}') - install_args = [*self.puppeteer_install_args] + install_args = [*self.puppeteer_install_args, "--path", str(self.puppeteer_browsers_dir)] proc = self.exec(bin_name=self.INSTALLER_BIN_ABSPATH, cmd=[*install_args, *packages]) diff --git a/archivebox/queues/settings.py b/archivebox/queues/settings.py deleted file mode 100644 index 85dfb869..00000000 --- a/archivebox/queues/settings.py +++ /dev/null @@ -1,40 +0,0 @@ -import tempfile -from pathlib import Path -from functools import cache - -from archivebox.config import CONSTANTS -from archivebox.config.paths import get_collection_id - -DATA_DIR = CONSTANTS.DATA_DIR -LOGS_DIR = CONSTANTS.LOGS_DIR -TMP_DIR = CONSTANTS.TMP_DIR - -SUPERVISORD_CONFIG_FILE = TMP_DIR / "supervisord.conf" -PID_FILE = TMP_DIR / "supervisord.pid" -SOCK_FILE = TMP_DIR / "supervisord.sock" -LOG_FILE = TMP_DIR / "supervisord.log" -WORKERS_DIR = TMP_DIR / "workers" - -@cache -def get_sock_file(): - """Get the path to the supervisord socket file, symlinking to a shorter path if needed due to unix path length limits""" - TMP_DIR.mkdir(parents=True, exist_ok=True) - - if len(f'file://{SOCK_FILE.absolute().resolve()}') > 98: - # socket absolute paths cannot be longer than 104 bytes on macos, and 108 bytes on linux - # symlink it to a shorter path and use that instead - - # place the actual socket file in a shorter tmp dir - # /var/folders/qy/6tpfrpx100j1t4l312nz683m0000gn/T/archivebox_supervisord_3d1e544e.sock - shorter_sock_file = Path(tempfile.gettempdir()) / f"archivebox_supervisord_{get_collection_id()}.sock" - - # symlink ./data/tmp/<collection_id>/supervisord.sock -> /var/folders/qy/abc234235/T/archivebox_supervisord_3d1e544e.sock - # for convenience/consistency - symlink = SOCK_FILE - symlink.unlink(missing_ok=True) - symlink.symlink_to(shorter_sock_file) - - assert len(f'file://{shorter_sock_file}') <= 98, f'Failed to create supervisord SOCK_FILE, system tmp dir location is too long {shorter_sock_file} (unix only allows 108 characters for socket paths)' - return shorter_sock_file - - return SOCK_FILE diff --git a/archivebox/queues/supervisor_util.py b/archivebox/queues/supervisor_util.py index 1dc87395..f181da08 100644 --- a/archivebox/queues/supervisor_util.py +++ b/archivebox/queues/supervisor_util.py @@ -1,23 +1,39 @@ __package__ = 'archivebox.queues' +import sys import time import signal import psutil import shutil import subprocess + +from typing import Dict, cast, Iterator from pathlib import Path +from functools import cache + from rich import print - -from typing import Dict, cast - from supervisor.xmlrpc import SupervisorTransport from xmlrpc.client import ServerProxy +from archivebox.config import CONSTANTS +from archivebox.config.paths import get_or_create_working_tmp_dir from archivebox.config.permissions import ARCHIVEBOX_USER +from archivebox.misc.logging import STDERR +from archivebox.logging_util import pretty_path -from .settings import 
+LOG_FILE_NAME = "supervisord.log"
+CONFIG_FILE_NAME = "supervisord.conf"
+PID_FILE_NAME = "supervisord.pid"
+WORKERS_DIR_NAME = "workers"
 
-from typing import Iterator
 
+@cache
+def get_sock_file():
+    """Get the path to the supervisord socket file, symlinking to a shorter path if needed due to unix path length limits"""
+    TMP_DIR = get_or_create_working_tmp_dir(autofix=True, quiet=False)
+    assert TMP_DIR, "Failed to find or create a writable TMP_DIR!"
+    socket_file = TMP_DIR / "supervisord.sock"
+
+    return socket_file
 
 def follow(file, sleep_sec=0.1) -> Iterator[str]:
     """ Yield each line from a file as they are written.
@@ -35,24 +51,30 @@ def follow(file, sleep_sec=0.1) -> Iterator[str]:
 
 def create_supervisord_config():
+    SOCK_FILE = get_sock_file()
+    WORKERS_DIR = SOCK_FILE.parent / WORKERS_DIR_NAME
+    CONFIG_FILE = SOCK_FILE.parent / CONFIG_FILE_NAME
+    PID_FILE = SOCK_FILE.parent / PID_FILE_NAME
+    LOG_FILE = CONSTANTS.LOGS_DIR / LOG_FILE_NAME
+
     config_content = f"""
 [supervisord]
 nodaemon = true
 environment = IS_SUPERVISORD_PARENT="true"
-pidfile = {TMP_DIR}/{PID_FILE.name}
-logfile = {LOGS_DIR}/{LOG_FILE.name}
-childlogdir = {LOGS_DIR}
-directory = {DATA_DIR}
+pidfile = {PID_FILE}
+logfile = {LOG_FILE}
+childlogdir = {CONSTANTS.LOGS_DIR}
+directory = {CONSTANTS.DATA_DIR}
 strip_ansi = true
 nocleanup = true
 user = {ARCHIVEBOX_USER}
 
 [unix_http_server]
-file = {get_sock_file()}
+file = {SOCK_FILE}
 chmod = 0700
 
 [supervisorctl]
-serverurl = unix://{get_sock_file()}
+serverurl = unix://{SOCK_FILE}
 
 [rpcinterface:supervisor]
 supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface
@@ -61,9 +83,14 @@ supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface
 files = {WORKERS_DIR}/*.conf
 """
 
-    SUPERVISORD_CONFIG_FILE.write_text(config_content)
+    CONFIG_FILE.write_text(config_content)
+    Path.mkdir(WORKERS_DIR, exist_ok=True)
+    (WORKERS_DIR / 'initial_startup.conf').write_text('')   # hides error about "no files found to include" when supervisord starts
 
 def create_worker_config(daemon):
+    SOCK_FILE = get_sock_file()
+    WORKERS_DIR = SOCK_FILE.parent / WORKERS_DIR_NAME
+
     Path.mkdir(WORKERS_DIR, exist_ok=True)
 
     name = daemon['name']
@@ -80,13 +107,14 @@ def create_worker_config(daemon):
 
 def get_existing_supervisord_process():
+    SOCK_FILE = get_sock_file()
     try:
-        transport = SupervisorTransport(None, None, f"unix://{get_sock_file()}")
+        transport = SupervisorTransport(None, None, f"unix://{SOCK_FILE}")
         server = ServerProxy("http://localhost", transport=transport)
         current_state = cast(Dict[str, int | str], server.supervisor.getState())
         if current_state["statename"] == "RUNNING":
             pid = server.supervisor.getPID()
-            print(f"[🦸‍♂️] Supervisord connected (pid={pid}) via unix://{str(get_sock_file()).replace(str(DATA_DIR), '.')}.")
+            print(f"[🦸‍♂️] Supervisord connected (pid={pid}) via unix://{pretty_path(SOCK_FILE)}.")
             return server.supervisor
     except FileNotFoundError:
         return None
@@ -95,58 +123,83 @@ def get_existing_supervisord_process():
         return None
 
 def stop_existing_supervisord_process():
+    SOCK_FILE = get_sock_file()
+    PID_FILE = SOCK_FILE.parent / PID_FILE_NAME
+
     try:
-        pid = int(PID_FILE.read_text())
-    except FileNotFoundError:
-        return
-    except ValueError:
-        PID_FILE.unlink()
-        return
+        try:
+            pid = int(PID_FILE.read_text())
+        except (FileNotFoundError, ValueError):
+            return
 
-    try:
-        print(f"[🦸‍♂️] Stopping supervisord process (pid={pid})...")
-        proc = psutil.Process(pid)
-        proc.terminate()
-        proc.wait()
-    except Exception:
-        pass
-    try:
-        PID_FILE.unlink()
-    except FileNotFoundError:
-        pass
+        try:
+            print(f"[🦸‍♂️] Stopping supervisord process (pid={pid})...")
+            proc = psutil.Process(pid)
+            proc.terminate()
+            proc.wait()
+        except (Exception, BrokenPipeError, IOError):
+            pass
+    finally:
+        try:
+            # clear PID file and socket file
+            PID_FILE.unlink(missing_ok=True)
+            get_sock_file().unlink(missing_ok=True)
+        except Exception:
+            pass
 
 def start_new_supervisord_process(daemonize=False):
+    SOCK_FILE = get_sock_file()
+    WORKERS_DIR = SOCK_FILE.parent / WORKERS_DIR_NAME
+    LOG_FILE = CONSTANTS.LOGS_DIR / LOG_FILE_NAME
+    CONFIG_FILE = SOCK_FILE.parent / CONFIG_FILE_NAME
+    PID_FILE = SOCK_FILE.parent / PID_FILE_NAME
+
     print(f"[🦸‍♂️] Supervisord starting{' in background' if daemonize else ''}...")
-    # Create a config file in the current working directory
+    pretty_log_path = pretty_path(LOG_FILE)
+    print(f"    > Writing supervisord logs to: {pretty_log_path}")
+    print(f"    > Writing task worker logs to: {pretty_log_path.replace('supervisord.log', 'worker_*.log')}")
+    print(f'    > Using supervisord config file: {pretty_path(CONFIG_FILE)}')
+    print(f"    > Using supervisord UNIX socket: {pretty_path(SOCK_FILE)}")
+    print()
 
     # clear out existing stale state files
     shutil.rmtree(WORKERS_DIR, ignore_errors=True)
     PID_FILE.unlink(missing_ok=True)
     get_sock_file().unlink(missing_ok=True)
-    SUPERVISORD_CONFIG_FILE.unlink(missing_ok=True)
+    CONFIG_FILE.unlink(missing_ok=True)
 
+    # create the supervisord config file
     create_supervisord_config()
 
     # Start supervisord
+    # panel = Panel(f"Starting supervisord with config: {SUPERVISORD_CONFIG_FILE}")
+    # with Live(panel, refresh_per_second=1) as live:
+
     subprocess.Popen(
-        f"supervisord --configuration={SUPERVISORD_CONFIG_FILE}",
+        f"supervisord --configuration={CONFIG_FILE}",
         stdin=None,
         shell=True,
        start_new_session=daemonize,
     )
 
     def exit_signal_handler(signum, frame):
-        if signum != 13:
-            print(f"\n[🦸‍♂️] Supervisord got stop signal ({signal.strsignal(signum)}). Terminating child processes...")
+        if signum == 2:
+            STDERR.print("\n[🛑] Got Ctrl+C. Terminating child processes...")
+        elif signum != 13:
+            STDERR.print(f"\n[🦸‍♂️] Supervisord got stop signal ({signal.strsignal(signum)}). Terminating child processes...")
         stop_existing_supervisord_process()
         raise SystemExit(0)
 
     # Monitor for termination signals and cleanup child processes
     if not daemonize:
-        signal.signal(signal.SIGINT, exit_signal_handler)
-        signal.signal(signal.SIGHUP, exit_signal_handler)
-        signal.signal(signal.SIGPIPE, exit_signal_handler)
-        signal.signal(signal.SIGTERM, exit_signal_handler)
+        try:
+            signal.signal(signal.SIGINT, exit_signal_handler)
+            signal.signal(signal.SIGHUP, exit_signal_handler)
+            signal.signal(signal.SIGPIPE, exit_signal_handler)
+            signal.signal(signal.SIGTERM, exit_signal_handler)
+        except Exception:
+            # signal handlers only work in the main thread
+            pass
     # otherwise supervisord will continue in the background even if the parent proc ends (aka daemon mode)
 
     time.sleep(2)
@@ -154,14 +207,32 @@ def start_new_supervisord_process(daemonize=False):
     return get_existing_supervisord_process()
 
 def get_or_create_supervisord_process(daemonize=False):
+    SOCK_FILE = get_sock_file()
+    WORKERS_DIR = SOCK_FILE.parent / WORKERS_DIR_NAME
+
     supervisor = get_existing_supervisord_process()
     if supervisor is None:
         stop_existing_supervisord_process()
         supervisor = start_new_supervisord_process(daemonize=daemonize)
         time.sleep(0.5)
+        # wait up to 5s in case supervisord is slow to start
+        if not supervisor:
+            for _ in range(10):
+                if supervisor is not None:
+                    print()
+                    break
+                sys.stdout.write('.')
+                sys.stdout.flush()
+                time.sleep(0.5)
+                supervisor = get_existing_supervisord_process()
+            else:
+                print()
 
     assert supervisor, "Failed to start supervisord or connect to it!"
     supervisor.getPID()  # make sure it doesn't throw an exception
+
+    (WORKERS_DIR / 'initial_startup.conf').unlink(missing_ok=True)
 
     return supervisor
@@ -242,9 +313,9 @@ def tail_worker_logs(log_path: str):
             for line in follow(f):
                 if '://' in line:
                     live.console.print(f"Working on: {line.strip()}")
-                table.add_row("123124234", line.strip())
-    except KeyboardInterrupt:
-        print("\n[🛑] Got Ctrl+C, stopping gracefully...")
+                # table.add_row("123124234", line.strip())
+    except (KeyboardInterrupt, BrokenPipeError, IOError):
+        STDERR.print("\n[🛑] Got Ctrl+C, stopping gracefully...")
     except SystemExit:
         pass
@@ -321,12 +392,12 @@ def start_server_workers(host='0.0.0.0', port='8000', daemonize=False):
     if not daemonize:
         try:
             watch_worker(supervisor, "worker_daphne")
-        except KeyboardInterrupt:
-            print("\n[🛑] Got Ctrl+C, stopping gracefully...")
+        except (KeyboardInterrupt, BrokenPipeError, IOError):
+            STDERR.print("\n[🛑] Got Ctrl+C, stopping gracefully...")
         except SystemExit:
             pass
         except BaseException as e:
-            print(f"\n[🛑] Got {e.__class__.__name__} exception, stopping web server gracefully...")
+            STDERR.print(f"\n[🛑] Got {e.__class__.__name__} exception, stopping web server gracefully...")
             raise
         finally:
             stop_worker(supervisor, "worker_daphne")
@@ -350,12 +421,12 @@ def start_cli_workers(watch=False):
     if watch:
         try:
             watch_worker(supervisor, "worker_system_tasks")
-        except KeyboardInterrupt:
-            print("\n[🛑] Got Ctrl+C, stopping gracefully...")
+        except (KeyboardInterrupt, BrokenPipeError, IOError):
+            STDERR.print("\n[🛑] Got Ctrl+C, stopping gracefully...")
         except SystemExit:
             pass
         except BaseException as e:
-            print(f"\n[🛑] Got {e.__class__.__name__} exception, stopping web server gracefully...")
+            STDERR.print(f"\n[🛑] Got {e.__class__.__name__} exception, stopping CLI workers gracefully...")
             raise
         finally:
             stop_worker(supervisor, "worker_system_tasks")
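
Note on the recurring setup() overrides in the binprovider diffs above: the class-attribute defaults (PATH, pip_venv, puppeteer_browsers_dir) are evaluated at import time, before ArchiveBox.conf and environment config are loaded, so each provider re-resolves its paths from STORAGE_CONFIG at setup() time. A minimal standalone sketch of that pattern, using hypothetical stand-ins for CONSTANTS.DEFAULT_LIB_DIR and STORAGE_CONFIG rather than the real config objects:

    from pathlib import Path

    # hypothetical stand-ins, for illustration only
    DEFAULT_LIB_DIR = Path('/usr/local/share/archivebox/lib')

    class FakeStorageConfig:
        LIB_DIR = Path('/data/lib')   # pretend the user overrode LIB_DIR

    STORAGE_CONFIG = FakeStorageConfig()

    class SomeBinProvider:
        # evaluated at import time, before user config is loaded:
        PATH: str = str(DEFAULT_LIB_DIR / 'bin')

        def setup(self) -> None:
            # re-resolved at runtime, after user config is loaded:
            if STORAGE_CONFIG.LIB_DIR != DEFAULT_LIB_DIR:
                self.PATH = str(STORAGE_CONFIG.LIB_DIR / 'bin')

    provider = SomeBinProvider()
    provider.setup()
    print(provider.PATH)   # -> /data/lib/bin

Deferring path resolution to setup() lets a user-configured LIB_DIR take effect on the instance without re-importing or redefining the provider classes.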
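The new get_sock_file() keeps the supervisord socket in a short per-collection tmp dir because AF_UNIX socket paths are capped at roughly 104 bytes on macOS and 108 bytes on Linux, as the deleted archivebox/queues/settings.py noted. A standalone sketch of that length check, with a hardcoded collection id standing in for archivebox.config.paths.get_collection_id():

    import tempfile
    from pathlib import Path

    COLLECTION_ID = "3d1e544e"   # hypothetical; normally derived from the collection

    def pick_sock_file(tmp_dir: Path, max_len: int = 98) -> Path:
        """Return a supervisord socket path short enough for AF_UNIX limits."""
        sock_file = tmp_dir / "supervisord.sock"
        if len(f"file://{sock_file.absolute()}") <= max_len:
            return sock_file
        # fall back to the (usually much shorter) system tmp dir,
        # e.g. /tmp/archivebox_supervisord_3d1e544e.sock
        return Path(tempfile.gettempdir()) / f"archivebox_supervisord_{COLLECTION_ID}.sock"

    print(pick_sock_file(Path.cwd() / "data" / "tmp" / COLLECTION_ID))

In the diff itself this responsibility moves into get_or_create_working_tmp_dir(), so get_sock_file() only has to join "supervisord.sock" onto an already-validated short directory.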