mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-24 19:54:25 -04:00
fix LIB_DIR and TMP_DIR loading when primary option isn't available
This commit is contained in:
parent
deb116eed4
commit
a211461ffc
21 changed files with 712 additions and 303 deletions
|
@ -37,7 +37,8 @@ class BaseBinary(Binary):
|
|||
|
||||
@staticmethod
|
||||
def symlink_to_lib(binary, bin_dir=None) -> None:
|
||||
bin_dir = bin_dir or CONSTANTS.LIB_BIN_DIR
|
||||
from archivebox.config.common import STORAGE_CONFIG
|
||||
bin_dir = bin_dir or STORAGE_CONFIG.LIB_DIR / 'bin'
|
||||
|
||||
if not (binary.abspath and os.access(binary.abspath, os.R_OK)):
|
||||
return
|
||||
|
@ -55,9 +56,10 @@ class BaseBinary(Binary):
|
|||
|
||||
@validate_call
|
||||
def load(self, fresh=False, **kwargs) -> Self:
|
||||
from archivebox.config.common import STORAGE_CONFIG
|
||||
if fresh:
|
||||
binary = super().load(**kwargs)
|
||||
self.symlink_to_lib(binary=binary, bin_dir=CONSTANTS.LIB_BIN_DIR)
|
||||
self.symlink_to_lib(binary=binary, bin_dir=STORAGE_CONFIG.LIB_DIR / 'bin')
|
||||
else:
|
||||
# get cached binary from db
|
||||
try:
|
||||
|
@ -72,16 +74,18 @@ class BaseBinary(Binary):
|
|||
|
||||
@validate_call
|
||||
def install(self, **kwargs) -> Self:
|
||||
from archivebox.config.common import STORAGE_CONFIG
|
||||
binary = super().install(**kwargs)
|
||||
self.symlink_to_lib(binary=binary, bin_dir=CONSTANTS.LIB_BIN_DIR)
|
||||
self.symlink_to_lib(binary=binary, bin_dir=STORAGE_CONFIG.LIB_DIR / 'bin')
|
||||
return binary
|
||||
|
||||
@validate_call
|
||||
def load_or_install(self, fresh=False, **kwargs) -> Self:
|
||||
from archivebox.config.common import STORAGE_CONFIG
|
||||
try:
|
||||
binary = self.load(fresh=fresh)
|
||||
if binary and binary.version:
|
||||
self.symlink_to_lib(binary=binary, bin_dir=CONSTANTS.LIB_BIN_DIR)
|
||||
self.symlink_to_lib(binary=binary, bin_dir=STORAGE_CONFIG.LIB_DIR / 'bin')
|
||||
return binary
|
||||
except Exception:
|
||||
pass
|
||||
|
|
|
@ -1,8 +1,13 @@
|
|||
__package__ = 'abx.archivebox'
|
||||
|
||||
import os
|
||||
import sys
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import Type, Tuple, Callable, ClassVar
|
||||
from typing import Type, Tuple, Callable, ClassVar, Dict, Any
|
||||
|
||||
import toml
|
||||
from rich import print
|
||||
|
||||
from benedict import benedict
|
||||
from pydantic import model_validator, TypeAdapter
|
||||
|
@ -18,6 +23,11 @@ from . import toml_util
|
|||
PACKAGE_DIR = Path(__file__).resolve().parent.parent
|
||||
DATA_DIR = Path(os.getcwd()).resolve()
|
||||
|
||||
ARCHIVEBOX_CONFIG_FILE = DATA_DIR / "ArchiveBox.conf"
|
||||
ARCHIVEBOX_CONFIG_FILE_BAK = ARCHIVEBOX_CONFIG_FILE.parent / ".ArchiveBox.conf.bak"
|
||||
|
||||
AUTOFIXES_HEADER = "[AUTOFIXES]"
|
||||
AUTOFIXES_SUBHEADER = "# The following config was added automatically to fix problems detected at startup:"
|
||||
|
||||
|
||||
class FlatTomlConfigSettingsSource(TomlConfigSettingsSource):
|
||||
|
@ -53,7 +63,7 @@ class FlatTomlConfigSettingsSource(TomlConfigSettingsSource):
|
|||
super(TomlConfigSettingsSource, self).__init__(settings_cls, self.toml_data)
|
||||
|
||||
|
||||
class ArchiveBoxBaseConfig(BaseSettings):
|
||||
class BaseConfigSet(BaseSettings):
|
||||
"""
|
||||
This is the base class for an ArchiveBox ConfigSet.
|
||||
It handles loading values from schema defaults, ArchiveBox.conf TOML config, and environment variables.
|
||||
|
@ -83,7 +93,7 @@ class ArchiveBoxBaseConfig(BaseSettings):
|
|||
loc_by_alias=False,
|
||||
validate_assignment=True,
|
||||
validate_return=True,
|
||||
revalidate_instances="always",
|
||||
revalidate_instances="subclass-instances",
|
||||
)
|
||||
|
||||
load_from_defaults: ClassVar[bool] = True
|
||||
|
@ -101,9 +111,6 @@ class ArchiveBoxBaseConfig(BaseSettings):
|
|||
) -> Tuple[PydanticBaseSettingsSource, ...]:
|
||||
"""Defines the config precedence order: Schema defaults -> ArchiveBox.conf (TOML) -> Environment variables"""
|
||||
|
||||
ARCHIVEBOX_CONFIG_FILE = DATA_DIR / "ArchiveBox.conf"
|
||||
ARCHIVEBOX_CONFIG_FILE_BAK = ARCHIVEBOX_CONFIG_FILE.parent / ".ArchiveBox.conf.bak"
|
||||
|
||||
# import ipdb; ipdb.set_trace()
|
||||
|
||||
precedence_order = {}
|
||||
|
@ -152,27 +159,36 @@ class ArchiveBoxBaseConfig(BaseSettings):
|
|||
def fill_defaults(self):
|
||||
"""Populate any unset values using function provided as their default"""
|
||||
|
||||
for key, field in self.model_fields.items():
|
||||
value = getattr(self, key)
|
||||
|
||||
if isinstance(value, Callable):
|
||||
# if value is a function, execute it to get the actual value, passing existing config as a dict arg if expected
|
||||
if func_takes_args_or_kwargs(value):
|
||||
# assemble dict of existing field values to pass to default factory functions
|
||||
config_so_far = benedict(self.model_dump(include=set(self.model_fields.keys()), warnings=False))
|
||||
computed_default = field.default(config_so_far)
|
||||
else:
|
||||
# otherwise it's a pure function with no args, just call it
|
||||
computed_default = field.default()
|
||||
|
||||
# coerce/check to make sure default factory return value matches type annotation
|
||||
TypeAdapter(field.annotation).validate_python(computed_default)
|
||||
|
||||
# set generated default value as final validated value
|
||||
setattr(self, key, computed_default)
|
||||
for key in self.model_fields.keys():
|
||||
if isinstance(getattr(self, key), Callable):
|
||||
if self.load_from_defaults:
|
||||
computed_default = self.get_default_value(key)
|
||||
# set generated default value as final validated value
|
||||
setattr(self, key, computed_default)
|
||||
return self
|
||||
|
||||
def update_in_place(self, warn=True, **kwargs):
|
||||
def get_default_value(self, key):
    """
    Get the default value for a given config key.

    If the attribute currently stored under `key` is not callable it is
    returned unchanged. Otherwise the field's declared default (a factory
    function) is invoked, receiving a benedict of the current field values
    when the callable accepts arguments, and the result is returned.

    Raises whatever `TypeAdapter.validate_python` raises when the computed
    value does not match the field's type annotation.
    """
    field = self.model_fields[key]
    current = getattr(self, key)

    if not isinstance(current, Callable):
        # already a concrete value, nothing to compute
        return current

    if func_takes_args_or_kwargs(current):
        # the factory expects the config gathered so far, passed as a dict-like arg
        snapshot = self.model_dump(include=set(self.model_fields.keys()), warnings=False)
        resolved = field.default(benedict(snapshot))
    else:
        # pure function with no args, just call it
        resolved = field.default()

    # check that the factory's return value matches the type annotation
    # (the validator's output is discarded; the raw computed value is returned)
    TypeAdapter(field.annotation).validate_python(resolved)
    return resolved
|
||||
|
||||
def update_in_place(self, warn=True, persist=False, hint='', **kwargs):
    """
    Update the config with new values. Use this sparingly! We should almost never be updating config at runtime.
    Sets them in the environment so they propagate to spawned subprocesses / across future re-__init__()s and reload from environment

    Example acceptable use case: user config says SEARCH_BACKEND_ENGINE=sonic but sonic_client pip library is not installed so we cannot use it.
    SEARCH_BACKEND_CONFIG.update_in_place(SEARCH_BACKEND_ENGINE='ripgrep') can be used to reset it back to ripgrep so we can continue.

    Args:
        warn: when True, print a warning header plus one `KEY=old -> new` line per change to stderr.
        persist: when True and ArchiveBox.conf exists, also write the new values into its
            [AUTOFIXES] section. Failures while persisting are reported but never raised.
        hint: extra text appended to the warning header.
        **kwargs: the KEY=value overrides to apply.

    Returns:
        self, after re-running __init__() so the new values are loaded.
    """
    from archivebox.misc.toml_util import CustomTOMLEncoder

    if warn:
        fix_scope = 'in ArchiveBox.conf' if persist else 'just for current run'
        print(f'[yellow]:warning: WARNING: Some config cannot be used as-is, fixing automatically {fix_scope}:[/yellow] {hint}', file=sys.stderr)

    # set the new values in the environment
    for key, value in kwargs.items():
        os.environ[key] = str(value)
        original_value = getattr(self, key)
        if warn:
            # keep all warning output on stderr so stdout stays clean for piped command output
            print(f' {key}={original_value} -> {value}', file=sys.stderr)

    # if persist=True, write config changes to data/ArchiveBox.conf [AUTOFIXES] section
    try:
        if persist and ARCHIVEBOX_CONFIG_FILE.is_file():
            autofixes_to_add = benedict(kwargs).to_toml(encoder=CustomTOMLEncoder())

            # split the file once: everything before the header is user config,
            # everything after it is the previously persisted autofixes
            existing_config, _, autofixes_section = ARCHIVEBOX_CONFIG_FILE.read_text().partition(AUTOFIXES_HEADER)
            autofixes_section = autofixes_section.replace(AUTOFIXES_SUBHEADER, '').replace(AUTOFIXES_HEADER, '')

            # keep earlier autofixes, except keys being re-fixed now
            # (a duplicate key in one TOML table would make the file unparseable)
            existing_autofixes = [
                line
                for line in autofixes_section.splitlines()
                if line.strip() and line.split('=', 1)[0].strip() not in kwargs
            ]

            new_config = '\n'.join(line for line in [
                existing_config.strip(),
                '\n' + AUTOFIXES_HEADER,
                AUTOFIXES_SUBHEADER,
                *existing_autofixes,
                autofixes_to_add,
            ] if line.strip()).strip() + '\n'
            ARCHIVEBOX_CONFIG_FILE.write_text(new_config)
    except Exception as err:
        # persisting is best-effort: the in-memory/environment fix above still applies,
        # but tell the user the file was not updated instead of failing silently
        print(f'[yellow]:warning: WARNING: Could not save automatic config fixes to {ARCHIVEBOX_CONFIG_FILE}:[/yellow] {err}', file=sys.stderr)

    # reload so the values just placed in the environment (and config file) take effect
    self.__init__()
    if warn:
        print(file=sys.stderr)

    return self
|
||||
|
||||
def as_legacy_config_schema(self):
|
||||
@property
def toml_section_header(self):
    """Derive the TOML section header from the class name, e.g. ShellConfig -> SHELL_CONFIG"""
    # prefix every run of capital letters with an underscore ...
    underscored = re.sub('([A-Z]+)', r'_\1', self.__class__.__name__)
    # ... then uppercase everything and drop the leading/trailing underscores
    return underscored.upper().strip('_')
|
||||
|
||||
|
||||
def from_defaults(self) -> Dict[str, Any]:
    """Return a {key: value} mapping of the config as loaded from schema defaults only"""

    # throwaway subclass that only enables the defaults source
    class OnlyDefaultsConfig(self.__class__):
        load_from_defaults = True
        load_from_configfile = False
        load_from_environment = False

    defaults_only = OnlyDefaultsConfig()
    computed_keys = set(self.model_computed_fields.keys())
    dumped = defaults_only.model_dump(
        exclude_unset=False,
        exclude_defaults=False,
        exclude=computed_keys,
    )
    return benedict(dumped)
|
||||
|
||||
def from_configfile(self) -> Dict[str, Any]:
    """Return a {key: value} mapping of the config as loaded from the ArchiveBox.conf config file only"""

    # throwaway subclass that only enables the config file source
    class OnlyConfigFileConfig(self.__class__):
        load_from_defaults = False
        load_from_configfile = True
        load_from_environment = False

    configfile_only = OnlyConfigFileConfig()
    computed_keys = set(self.model_computed_fields.keys())
    # exclude_unset=True: only report keys that were actually set by the source
    dumped = configfile_only.model_dump(
        exclude_unset=True,
        exclude_defaults=False,
        exclude=computed_keys,
    )
    return benedict(dumped)
|
||||
|
||||
def from_environment(self) -> Dict[str, Any]:
    """Return a {key: value} mapping of the config as loaded from environment variables only"""

    # throwaway subclass that only enables the environment source
    class OnlyEnvironmentConfig(self.__class__):
        load_from_defaults = False
        load_from_configfile = False
        load_from_environment = True

    environment_only = OnlyEnvironmentConfig()
    computed_keys = set(self.model_computed_fields.keys())
    # exclude_unset=True: only report keys that were actually set by the source
    dumped = environment_only.model_dump(
        exclude_unset=True,
        exclude_defaults=False,
        exclude=computed_keys,
    )
    return benedict(dumped)
|
||||
|
||||
def from_computed(self) -> Dict[str, Any]:
    """Return a {key: value} mapping containing only the computed fields of this config"""
    computed_keys = set(self.model_computed_fields.keys())
    computed_values = self.model_dump(include=computed_keys)
    return benedict(computed_values)
|
||||
|
||||
|
||||
def to_toml_dict(self, defaults=False) -> Dict[str, Any]:
    """
    Build a TOML-ready dict of the current config, nested under this class's section header.

    With defaults=False only the values that differ from get_default_value(key)
    are included; with defaults=True every value is included.
    """
    section = {
        key: value
        for key, value in benedict(self).items()
        if defaults or value != self.get_default_value(key)
    }
    return benedict({self.toml_section_header: section})
|
||||
|
||||
def to_toml_str(self, defaults=False) -> str:
    """Serialize the current config (see to_toml_dict) to a TOML string"""
    from archivebox.misc.toml_util import CustomTOMLEncoder

    header = self.toml_section_header
    payload = self.to_toml_dict(defaults=defaults)

    # if the section is empty, don't write it
    if not payload[header]:
        payload.pop(header)

    return toml.dumps(payload, encoder=CustomTOMLEncoder())
|
||||
|
||||
def as_legacy_config_schema(self) -> Dict[str, Any]:
    """
    Describe every model field as {'type': <annotation>, 'default': <current dumped value>}.

    Shim for backwards compatibility with the old config schema style.
    """
    dumped = self.model_dump()
    legacy_schema = {}
    for key, field in self.model_fields.items():
        legacy_schema[key] = {'type': field.annotation, 'default': dumped[key]}
    return benedict(legacy_schema)
|
||||
|
||||
|
||||
class BaseConfigSet(ArchiveBoxBaseConfig): # type: ignore[type-arg]
|
||||
|
||||
pass
|
||||
|
|
|
@ -18,13 +18,7 @@ def get_PLUGIN() -> Dict[str, Dict[str, Any]]:
|
|||
def get_CONFIG() -> Dict[str, BaseConfigSet]:
|
||||
return {}
|
||||
|
||||
@hookspec
|
||||
def get_BINARIES() -> Dict[str, BaseBinary]:
|
||||
return {}
|
||||
|
||||
@hookspec
|
||||
def get_BINPROVIDERS() -> Dict[str, BaseBinProvider]:
|
||||
return {}
|
||||
|
||||
@hookspec
|
||||
def get_EXTRACTORS() -> Dict[str, BaseExtractor]:
|
||||
|
@ -45,3 +39,14 @@ def get_SEARCHBACKENDS() -> Dict[str, BaseSearchBackend]:
|
|||
# @hookspec
|
||||
# def get_QUEUES():
|
||||
# return {}
|
||||
|
||||
|
||||
##############################################################
|
||||
# provided by abx.pydantic_pkgr.hookspec:
|
||||
# @hookspec
|
||||
# def get_BINARIES() -> Dict[str, BaseBinary]:
|
||||
# return {}
|
||||
|
||||
# @hookspec
|
||||
# def get_BINPROVIDERS() -> Dict[str, BaseBinProvider]:
|
||||
# return {}
|
||||
|
|
|
@ -131,9 +131,12 @@ def get_SEARCHBACKENDS() -> Dict[str, 'BaseSearchBackend']:
|
|||
|
||||
|
||||
|
||||
def get_scope_config(defaults=settings.CONFIG, persona=None, seed=None, crawl=None, snapshot=None, archiveresult=None, extra_config=None):
|
||||
def get_scope_config(defaults: benedict | None = None, persona=None, seed=None, crawl=None, snapshot=None, archiveresult=None, extra_config=None):
|
||||
"""Get all the relevant config for the given scope, in correct precedence order"""
|
||||
|
||||
from django.conf import settings
|
||||
default_config: benedict = defaults or settings.CONFIG
|
||||
|
||||
snapshot = snapshot or (archiveresult and archiveresult.snapshot)
|
||||
crawl = crawl or (snapshot and snapshot.crawl)
|
||||
seed = seed or (crawl and crawl.seed)
|
||||
|
@ -147,7 +150,7 @@ def get_scope_config(defaults=settings.CONFIG, persona=None, seed=None, crawl=No
|
|||
extra_config = extra_config or {}
|
||||
|
||||
return {
|
||||
**defaults, # defaults / config file / environment variables
|
||||
**default_config, # defaults / config file / environment variables
|
||||
**persona_config, # lowest precedence
|
||||
**seed_config,
|
||||
**crawl_config,
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue