fix LIB_DIR and TMP_DIR loading when primary option isnt available

This commit is contained in:
Nick Sweeting 2024-10-21 00:35:25 -07:00
parent deb116eed4
commit a211461ffc
No known key found for this signature in database
21 changed files with 712 additions and 303 deletions

View file

@ -37,7 +37,8 @@ class BaseBinary(Binary):
@staticmethod
def symlink_to_lib(binary, bin_dir=None) -> None:
bin_dir = bin_dir or CONSTANTS.LIB_BIN_DIR
from archivebox.config.common import STORAGE_CONFIG
bin_dir = bin_dir or STORAGE_CONFIG.LIB_DIR / 'bin'
if not (binary.abspath and os.access(binary.abspath, os.R_OK)):
return
@ -55,9 +56,10 @@ class BaseBinary(Binary):
@validate_call
def load(self, fresh=False, **kwargs) -> Self:
from archivebox.config.common import STORAGE_CONFIG
if fresh:
binary = super().load(**kwargs)
self.symlink_to_lib(binary=binary, bin_dir=CONSTANTS.LIB_BIN_DIR)
self.symlink_to_lib(binary=binary, bin_dir=STORAGE_CONFIG.LIB_DIR / 'bin')
else:
# get cached binary from db
try:
@ -72,16 +74,18 @@ class BaseBinary(Binary):
@validate_call
def install(self, **kwargs) -> Self:
from archivebox.config.common import STORAGE_CONFIG
binary = super().install(**kwargs)
self.symlink_to_lib(binary=binary, bin_dir=CONSTANTS.LIB_BIN_DIR)
self.symlink_to_lib(binary=binary, bin_dir=STORAGE_CONFIG.LIB_DIR / 'bin')
return binary
@validate_call
def load_or_install(self, fresh=False, **kwargs) -> Self:
from archivebox.config.common import STORAGE_CONFIG
try:
binary = self.load(fresh=fresh)
if binary and binary.version:
self.symlink_to_lib(binary=binary, bin_dir=CONSTANTS.LIB_BIN_DIR)
self.symlink_to_lib(binary=binary, bin_dir=STORAGE_CONFIG.LIB_DIR / 'bin')
return binary
except Exception:
pass

View file

@ -1,8 +1,13 @@
__package__ = 'abx.archivebox'
import os
import sys
import re
from pathlib import Path
from typing import Type, Tuple, Callable, ClassVar
from typing import Type, Tuple, Callable, ClassVar, Dict, Any
import toml
from rich import print
from benedict import benedict
from pydantic import model_validator, TypeAdapter
@ -18,6 +23,11 @@ from . import toml_util
PACKAGE_DIR = Path(__file__).resolve().parent.parent
DATA_DIR = Path(os.getcwd()).resolve()
ARCHIVEBOX_CONFIG_FILE = DATA_DIR / "ArchiveBox.conf"
ARCHIVEBOX_CONFIG_FILE_BAK = ARCHIVEBOX_CONFIG_FILE.parent / ".ArchiveBox.conf.bak"
AUTOFIXES_HEADER = "[AUTOFIXES]"
AUTOFIXES_SUBHEADER = "# The following config was added automatically to fix problems detected at startup:"
class FlatTomlConfigSettingsSource(TomlConfigSettingsSource):
@ -53,7 +63,7 @@ class FlatTomlConfigSettingsSource(TomlConfigSettingsSource):
super(TomlConfigSettingsSource, self).__init__(settings_cls, self.toml_data)
class ArchiveBoxBaseConfig(BaseSettings):
class BaseConfigSet(BaseSettings):
"""
This is the base class for an ArchiveBox ConfigSet.
It handles loading values from schema defaults, ArchiveBox.conf TOML config, and environment variables.
@ -83,7 +93,7 @@ class ArchiveBoxBaseConfig(BaseSettings):
loc_by_alias=False,
validate_assignment=True,
validate_return=True,
revalidate_instances="always",
revalidate_instances="subclass-instances",
)
load_from_defaults: ClassVar[bool] = True
@ -101,9 +111,6 @@ class ArchiveBoxBaseConfig(BaseSettings):
) -> Tuple[PydanticBaseSettingsSource, ...]:
"""Defines the config precedence order: Schema defaults -> ArchiveBox.conf (TOML) -> Environment variables"""
ARCHIVEBOX_CONFIG_FILE = DATA_DIR / "ArchiveBox.conf"
ARCHIVEBOX_CONFIG_FILE_BAK = ARCHIVEBOX_CONFIG_FILE.parent / ".ArchiveBox.conf.bak"
# import ipdb; ipdb.set_trace()
precedence_order = {}
@ -152,27 +159,36 @@ class ArchiveBoxBaseConfig(BaseSettings):
def fill_defaults(self):
"""Populate any unset values using function provided as their default"""
for key, field in self.model_fields.items():
value = getattr(self, key)
if isinstance(value, Callable):
# if value is a function, execute it to get the actual value, passing existing config as a dict arg if expected
if func_takes_args_or_kwargs(value):
# assemble dict of existing field values to pass to default factory functions
config_so_far = benedict(self.model_dump(include=set(self.model_fields.keys()), warnings=False))
computed_default = field.default(config_so_far)
else:
# otherwise it's a pure function with no args, just call it
computed_default = field.default()
# coerce/check to make sure default factory return value matches type annotation
TypeAdapter(field.annotation).validate_python(computed_default)
# set generated default value as final validated value
setattr(self, key, computed_default)
for key in self.model_fields.keys():
if isinstance(getattr(self, key), Callable):
if self.load_from_defaults:
computed_default = self.get_default_value(key)
# set generated default value as final validated value
setattr(self, key, computed_default)
return self
def update_in_place(self, warn=True, **kwargs):
def get_default_value(self, key):
"""Get the default value for a given config key"""
field = self.model_fields[key]
value = getattr(self, key)
if isinstance(value, Callable):
# if value is a function, execute it to get the actual value, passing existing config as a dict arg if expected
if func_takes_args_or_kwargs(value):
# assemble dict of existing field values to pass to default factory functions
config_so_far = benedict(self.model_dump(include=set(self.model_fields.keys()), warnings=False))
computed_default = field.default(config_so_far)
else:
# otherwise it's a pure function with no args, just call it
computed_default = field.default()
# coerce/check to make sure default factory return value matches type annotation
TypeAdapter(field.annotation).validate_python(computed_default)
return computed_default
return value
def update_in_place(self, warn=True, persist=False, hint='', **kwargs):
"""
Update the config with new values. Use this sparingly! We should almost never be updating config at runtime.
Sets them in the environment so they propagate to spawned subprocesses / across future re-__init__()s and reload from environment
@ -180,25 +196,106 @@ class ArchiveBoxBaseConfig(BaseSettings):
Example acceptable use case: user config says SEARCH_BACKEND_ENGINE=sonic but sonic_client pip library is not installed so we cannot use it.
SEARCH_BACKEND_CONFIG.update_in_place(SEARCH_BACKEND_ENGINE='ripgrep') can be used to reset it back to ripgrep so we can continue.
"""
from archivebox.misc.toml_util import CustomTOMLEncoder
if warn:
print('[!] WARNING: Some of the provided user config values cannot be used, temporarily ignoring them:')
fix_scope = 'in ArchiveBox.conf' if persist else 'just for current run'
print(f'[yellow]:warning: WARNING: Some config cannot be used as-is, fixing automatically {fix_scope}:[/yellow] {hint}', file=sys.stderr)
# set the new values in the environment
for key, value in kwargs.items():
os.environ[key] = str(value)
original_value = getattr(self, key)
if warn:
print(f' {key}={original_value} -> {value}')
# if persist=True, write config changes to data/ArchiveBox.conf [AUTOFIXES] section
try:
if persist and ARCHIVEBOX_CONFIG_FILE.is_file():
autofixes_to_add = benedict(kwargs).to_toml(encoder=CustomTOMLEncoder())
existing_config = ARCHIVEBOX_CONFIG_FILE.read_text().split(AUTOFIXES_HEADER, 1)[0].strip()
if AUTOFIXES_HEADER in existing_config:
existing_autofixes = existing_config.split(AUTOFIXES_HEADER, 1)[-1].strip().replace(AUTOFIXES_SUBHEADER, '').replace(AUTOFIXES_HEADER, '').strip()
else:
existing_autofixes = ''
new_config = '\n'.join(line for line in [
existing_config,
'\n' + AUTOFIXES_HEADER,
AUTOFIXES_SUBHEADER,
existing_autofixes,
autofixes_to_add,
] if line.strip()).strip() + '\n'
ARCHIVEBOX_CONFIG_FILE.write_text(new_config)
except Exception:
pass
self.__init__()
if warn:
print(file=sys.stderr)
return self
def as_legacy_config_schema(self):
@property
def toml_section_header(self):
"""Convert the class name to a TOML section header e.g. ShellConfig -> SHELL_CONFIG"""
class_name = self.__class__.__name__
return re.sub('([A-Z]+)', r'_\1', class_name).upper().strip('_')
def from_defaults(self) -> Dict[str, Any]:
"""Get the dictionary of {key: value} config loaded from the default values"""
class OnlyDefaultsConfig(self.__class__):
load_from_defaults = True
load_from_configfile = False
load_from_environment = False
return benedict(OnlyDefaultsConfig().model_dump(exclude_unset=False, exclude_defaults=False, exclude=set(self.model_computed_fields.keys())))
def from_configfile(self) -> Dict[str, Any]:
"""Get the dictionary of {key: value} config loaded from the configfile ArchiveBox.conf"""
class OnlyConfigFileConfig(self.__class__):
load_from_defaults = False
load_from_configfile = True
load_from_environment = False
return benedict(OnlyConfigFileConfig().model_dump(exclude_unset=True, exclude_defaults=False, exclude=set(self.model_computed_fields.keys())))
def from_environment(self) -> Dict[str, Any]:
"""Get the dictionary of {key: value} config loaded from the environment variables"""
class OnlyEnvironmentConfig(self.__class__):
load_from_defaults = False
load_from_configfile = False
load_from_environment = True
return benedict(OnlyEnvironmentConfig().model_dump(exclude_unset=True, exclude_defaults=False, exclude=set(self.model_computed_fields.keys())))
def from_computed(self) -> Dict[str, Any]:
"""Get the dictionary of {key: value} config loaded from the computed fields"""
return benedict(self.model_dump(include=set(self.model_computed_fields.keys())))
def to_toml_dict(self, defaults=False) -> Dict[str, Any]:
"""Get the current config as a TOML-ready dict"""
config_dict = {}
for key, value in benedict(self).items():
if defaults or value != self.get_default_value(key):
config_dict[key] = value
return benedict({self.toml_section_header: config_dict})
def to_toml_str(self, defaults=False) -> str:
"""Get the current config as a TOML string"""
from archivebox.misc.toml_util import CustomTOMLEncoder
toml_dict = self.to_toml_dict(defaults=defaults)
if not toml_dict[self.toml_section_header]:
# if the section is empty, don't write it
toml_dict.pop(self.toml_section_header)
return toml.dumps(toml_dict, encoder=CustomTOMLEncoder())
def as_legacy_config_schema(self) -> Dict[str, Any]:
# shim for backwards compatibility with old config schema style
model_values = self.model_dump()
return benedict({
key: {'type': field.annotation, 'default': model_values[key]}
for key, field in self.model_fields.items()
})
class BaseConfigSet(ArchiveBoxBaseConfig): # type: ignore[type-arg]
pass

View file

@ -18,13 +18,7 @@ def get_PLUGIN() -> Dict[str, Dict[str, Any]]:
def get_CONFIG() -> Dict[str, BaseConfigSet]:
return {}
@hookspec
def get_BINARIES() -> Dict[str, BaseBinary]:
return {}
@hookspec
def get_BINPROVIDERS() -> Dict[str, BaseBinProvider]:
return {}
@hookspec
def get_EXTRACTORS() -> Dict[str, BaseExtractor]:
@ -45,3 +39,14 @@ def get_SEARCHBACKENDS() -> Dict[str, BaseSearchBackend]:
# @hookspec
# def get_QUEUES():
# return {}
##############################################################
# provided by abx.pydantic_pkgr.hookspec:
# @hookspec
# def get_BINARIES() -> Dict[str, BaseBinary]:
# return {}
# @hookspec
# def get_BINPROVIDERS() -> Dict[str, BaseBinProvider]:
# return {}

View file

@ -131,9 +131,12 @@ def get_SEARCHBACKENDS() -> Dict[str, 'BaseSearchBackend']:
def get_scope_config(defaults=settings.CONFIG, persona=None, seed=None, crawl=None, snapshot=None, archiveresult=None, extra_config=None):
def get_scope_config(defaults: benedict | None = None, persona=None, seed=None, crawl=None, snapshot=None, archiveresult=None, extra_config=None):
"""Get all the relevant config for the given scope, in correct precedence order"""
from django.conf import settings
default_config: benedict = defaults or settings.CONFIG
snapshot = snapshot or (archiveresult and archiveresult.snapshot)
crawl = crawl or (snapshot and snapshot.crawl)
seed = seed or (crawl and crawl.seed)
@ -147,7 +150,7 @@ def get_scope_config(defaults=settings.CONFIG, persona=None, seed=None, crawl=No
extra_config = extra_config or {}
return {
**defaults, # defaults / config file / environment variables
**default_config, # defaults / config file / environment variables
**persona_config, # lowest precedence
**seed_config,
**crawl_config,