From 60f0458c774574dc89c78b1ce4ebdf32fd39f13f Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Thu, 24 Oct 2024 15:40:24 -0700 Subject: [PATCH] rename configfile to collection --- archivebox/abx/archivebox/base_binary.py | 2 +- archivebox/abx/archivebox/base_configset.py | 22 +++++++++--------- archivebox/abx/archivebox/base_extractor.py | 25 +++++---------------- archivebox/abx/archivebox/reads.py | 2 +- archivebox/abx/archivebox/writes.py | 2 +- archivebox/config/configfile.py | 2 +- archivebox/config/constants.py | 19 ++++++++++++++-- archivebox/main.py | 2 +- archivebox/misc/util.py | 2 ++ 9 files changed, 41 insertions(+), 37 deletions(-) diff --git a/archivebox/abx/archivebox/base_binary.py b/archivebox/abx/archivebox/base_binary.py index afa4f192..ee7ab5e1 100644 --- a/archivebox/abx/archivebox/base_binary.py +++ b/archivebox/abx/archivebox/base_binary.py @@ -14,7 +14,6 @@ from pydantic_pkgr import ( EnvProvider, ) -from archivebox.config import CONSTANTS from archivebox.config.permissions import ARCHIVEBOX_USER import abx @@ -34,6 +33,7 @@ class BaseBinProvider(BinProvider): return [self] class BaseBinary(Binary): + # TODO: formalize state diagram, final states, transitions, side effects, etc. @staticmethod def symlink_to_lib(binary, bin_dir=None) -> None: diff --git a/archivebox/abx/archivebox/base_configset.py b/archivebox/abx/archivebox/base_configset.py index 3a6695a1..706b9df8 100644 --- a/archivebox/abx/archivebox/base_configset.py +++ b/archivebox/abx/archivebox/base_configset.py @@ -99,7 +99,7 @@ class BaseConfigSet(BaseSettings): ) load_from_defaults: ClassVar[bool] = True - load_from_configfile: ClassVar[bool] = True + load_from_collection: ClassVar[bool] = True load_from_environment: ClassVar[bool] = True @classmethod @@ -128,7 +128,8 @@ class BaseConfigSet(BaseSettings): try: precedence_order = precedence_order or { 'defaults': init_settings, - 'configfile': FlatTomlConfigSettingsSource(settings_cls, toml_file=ARCHIVEBOX_CONFIG_FILE), + # 'collection': FlatTomlConfigSettingsSource(settings_cls, toml_file=ARCHIVEBOX_CONFIG_FILE), + 'collection': FlatTomlConfigSettingsSource(settings_cls, toml_file=ARCHIVEBOX_CONFIG_FILE), 'environment': env_settings, } except Exception as err: @@ -144,14 +145,15 @@ class BaseConfigSet(BaseSettings): precedence_order = { 'defaults': init_settings, - 'configfile': FlatTomlConfigSettingsSource(settings_cls, toml_file=ARCHIVEBOX_CONFIG_FILE), + # 'collection': FlatTomlConfigSettingsSource(settings_cls, toml_file=ARCHIVEBOX_CONFIG_FILE), + 'collection': FlatTomlConfigSettingsSource(settings_cls, toml_file=ARCHIVEBOX_CONFIG_FILE), 'environment': env_settings, } if not cls.load_from_environment: precedence_order.pop('environment') - if not cls.load_from_configfile: - precedence_order.pop('configfile') + if not cls.load_from_collection: + precedence_order.pop('collection') if not cls.load_from_defaults: precedence_order.pop('defaults') @@ -278,15 +280,15 @@ class BaseConfigSet(BaseSettings): """Get the dictionary of {key: value} config loaded from the default values""" class OnlyDefaultsConfig(self.__class__): load_from_defaults = True - load_from_configfile = False + load_from_collection = False load_from_environment = False return benedict(OnlyDefaultsConfig().model_dump(exclude_unset=False, exclude_defaults=False, exclude=set(self.model_computed_fields.keys()))) - def from_configfile(self) -> Dict[str, Any]: - """Get the dictionary of {key: value} config loaded from the configfile ArchiveBox.conf""" + def from_collection(self) -> Dict[str, Any]: + """Get the dictionary of {key: value} config loaded from the collection ArchiveBox.conf""" class OnlyConfigFileConfig(self.__class__): load_from_defaults = False - load_from_configfile = True + load_from_collection = True load_from_environment = False return benedict(OnlyConfigFileConfig().model_dump(exclude_unset=True, exclude_defaults=False, exclude=set(self.model_computed_fields.keys()))) @@ -294,7 +296,7 @@ class BaseConfigSet(BaseSettings): """Get the dictionary of {key: value} config loaded from the environment variables""" class OnlyEnvironmentConfig(self.__class__): load_from_defaults = False - load_from_configfile = False + load_from_collection = False load_from_environment = True return benedict(OnlyEnvironmentConfig().model_dump(exclude_unset=True, exclude_defaults=False, exclude=set(self.model_computed_fields.keys()))) diff --git a/archivebox/abx/archivebox/base_extractor.py b/archivebox/abx/archivebox/base_extractor.py index f78921e0..51dcc8d2 100644 --- a/archivebox/abx/archivebox/base_extractor.py +++ b/archivebox/abx/archivebox/base_extractor.py @@ -4,10 +4,9 @@ import json import os from typing import Optional, List, Literal, Annotated, Dict, Any, Tuple -from typing_extensions import Self from pathlib import Path -from pydantic import model_validator, AfterValidator +from pydantic import AfterValidator from pydantic_pkgr import BinName from django.utils.functional import cached_property from django.utils import timezone @@ -17,36 +16,22 @@ import abx from .base_binary import BaseBinary -def no_empty_args(args: List[str]) -> List[str]: +def assert_no_empty_args(args: List[str]) -> List[str]: assert all(len(arg) for arg in args) return args -ExtractorName = Literal['wget', 'warc', 'media', 'singlefile'] | str +ExtractorName = Annotated[str, AfterValidator(lambda s: s.isidentifier())] HandlerFuncStr = Annotated[str, AfterValidator(lambda s: s.startswith('self.'))] -CmdArgsList = Annotated[List[str] | Tuple[str, ...], AfterValidator(no_empty_args)] +CmdArgsList = Annotated[List[str] | Tuple[str, ...], AfterValidator(assert_no_empty_args)] class BaseExtractor: - name: ExtractorName binary: BinName - output_path_func: HandlerFuncStr = 'self.get_output_path' - should_extract_func: HandlerFuncStr = 'self.should_extract' - extract_func: HandlerFuncStr = 'self.extract' - exec_func: HandlerFuncStr = 'self.exec' - default_args: CmdArgsList = [] extra_args: CmdArgsList = [] - args: Optional[CmdArgsList] = None - - @model_validator(mode='after') - def validate_model(self) -> Self: - if self.args is None: - self.args = [*self.default_args, *self.extra_args] - return self - def get_output_path(self, snapshot) -> Path: return Path(self.__class__.__name__.lower()) @@ -71,7 +56,7 @@ class BaseExtractor: snapshot = Snapshot.objects.get(id=snapshot_id) - if not self.should_extract(snapshot): + if not self.should_extract(snapshot.url): return {} status = 'failed' diff --git a/archivebox/abx/archivebox/reads.py b/archivebox/abx/archivebox/reads.py index d2877ac5..10ad6ecd 100644 --- a/archivebox/abx/archivebox/reads.py +++ b/archivebox/abx/archivebox/reads.py @@ -57,7 +57,7 @@ def get_HOOKS() -> Set[str]: for hook_name in get_PLUGIN(plugin_id).hooks } -def get_CONFIGS() -> Dict[str, 'BaseConfigSet']: +def get_CONFIGS() -> benedict: # Dict[str, 'BaseConfigSet'] return benedict({ config_id: configset for plugin_configs in pm.hook.get_CONFIG() diff --git a/archivebox/abx/archivebox/writes.py b/archivebox/abx/archivebox/writes.py index 0c4566b4..1ca1ac7e 100644 --- a/archivebox/abx/archivebox/writes.py +++ b/archivebox/abx/archivebox/writes.py @@ -88,7 +88,7 @@ def create_root_snapshot_from_seed(crawl): def create_archiveresults_pending_from_snapshot(snapshot, config): config = get_scope_config( # defaults=settings.CONFIG_FROM_DEFAULTS, - # configfile=settings.CONFIG_FROM_FILE, + # collection=settings.CONFIG_FROM_FILE, # environment=settings.CONFIG_FROM_ENVIRONMENT, persona=archiveresult.snapshot.crawl.persona, seed=archiveresult.snapshot.crawl.seed, diff --git a/archivebox/config/configfile.py b/archivebox/config/configfile.py index c489e114..911e1559 100644 --- a/archivebox/config/configfile.py +++ b/archivebox/config/configfile.py @@ -15,7 +15,7 @@ from archivebox.misc.logging import stderr def get_real_name(key: str) -> str: - """get the current canonical name for a given deprecated config key""" + """get the up-to-date canonical name for a given old alias or current key""" from django.conf import settings for section in settings.CONFIGS.values(): diff --git a/archivebox/config/constants.py b/archivebox/config/constants.py index b8019f99..5124384d 100644 --- a/archivebox/config/constants.py +++ b/archivebox/config/constants.py @@ -1,3 +1,15 @@ +""" +Constants are for things that never change at runtime. +(but they can change from run-to-run or machine-to-machine) + +DATA_DIR will never change at runtime, but you can run +archivebox from inside a different DATA_DIR on the same machine. + +This is loaded very early in the archivebox startup flow, so nothing in this file +or imported from this file should import anything from archivebox.config.common, +django, other INSTALLED_APPS, or anything else that is not in a standard library. +""" + __package__ = 'archivebox.config' import re @@ -197,10 +209,12 @@ class ConstantsDict(Mapping): @classmethod def __getitem__(cls, key: str): + # so it behaves like a dict[key] == dict.key or object attr return getattr(cls, key) @classmethod def __benedict__(cls): + # when casting to benedict, only include uppercase keys that don't start with an underscore return benedict({key: value for key, value in cls.__dict__.items() if key.isupper() and not key.startswith('_')}) @classmethod @@ -214,5 +228,6 @@ class ConstantsDict(Mapping): CONSTANTS = ConstantsDict() CONSTANTS_CONFIG = CONSTANTS.__benedict__() -# add all key: values to globals() for easier importing -globals().update(CONSTANTS) +# add all key: values to globals() for easier importing, e.g.: +# from archivebox.config.constants import IS_ROOT, PERSONAS_DIR, ... +# globals().update(CONSTANTS) diff --git a/archivebox/main.py b/archivebox/main.py index 7f196a3c..e05c696d 100755 --- a/archivebox/main.py +++ b/archivebox/main.py @@ -22,7 +22,7 @@ from archivebox.misc.logging import stderr, hint from archivebox.config import CONSTANTS, VERSION, DATA_DIR, ARCHIVE_DIR from archivebox.config.common import SHELL_CONFIG, SEARCH_BACKEND_CONFIG, STORAGE_CONFIG, SERVER_CONFIG, ARCHIVING_CONFIG from archivebox.config.permissions import SudoPermission, IN_DOCKER -from archivebox.config.configfile import ( +from archivebox.config.collection import ( write_config_file, load_all_config, get_real_name, diff --git a/archivebox/misc/util.py b/archivebox/misc/util.py index a856fe64..6f54ada6 100644 --- a/archivebox/misc/util.py +++ b/archivebox/misc/util.py @@ -126,6 +126,7 @@ def is_static_file(url: str): def enforce_types(func): """ Enforce function arg and kwarg types at runtime using its python3 type hints + Simpler version of pydantic @validate_call decorator """ # TODO: check return type as well @@ -283,6 +284,7 @@ def get_headers(url: str, timeout: int=None) -> str: def ansi_to_html(text: str) -> str: """ Based on: https://stackoverflow.com/questions/19212665/python-converting-ansi-color-codes-to-html + Simple way to render colored CLI stdout/stderr in HTML properly, Textual/rich is probably better though. """ TEMPLATE = '
'