From b6cfeb8d40d1161edc9b07bca342b87ca7d1fb36 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Sun, 22 Sep 2024 19:30:24 -0700 Subject: [PATCH] add new pydantic_settings based loader for ConfigSets --- archivebox/builtin_plugins/chrome/apps.py | 5 +- archivebox/builtin_plugins/pip/apps.py | 4 +- archivebox/builtin_plugins/singlefile/apps.py | 10 +- archivebox/builtin_plugins/ytdlp/apps.py | 4 +- archivebox/plugantic/base_configset.py | 166 +++++++++++++++++- pdm.lock | 30 +++- pyproject.toml | 1 + requirements.txt | 2 + 8 files changed, 201 insertions(+), 21 deletions(-) diff --git a/archivebox/builtin_plugins/chrome/apps.py b/archivebox/builtin_plugins/chrome/apps.py index f69967b5..56eb48de 100644 --- a/archivebox/builtin_plugins/chrome/apps.py +++ b/archivebox/builtin_plugins/chrome/apps.py @@ -1,7 +1,6 @@ import platform from pathlib import Path -from typing import List, Optional, Dict, Any -from typing_extensions import Self +from typing import List, Optional, Dict, ClassVar from django.conf import settings @@ -79,7 +78,7 @@ def create_macos_app_symlink(target: Path, shortcut: Path): class ChromeDependencyConfigs(BaseConfigSet): - section: ConfigSectionName = 'DEPENDENCY_CONFIG' + section: ClassVar[ConfigSectionName] = "DEPENDENCY_CONFIG" CHROME_BINARY: str = Field(default='chrome') CHROME_ARGS: Optional[List[str]] = Field(default=None) diff --git a/archivebox/builtin_plugins/pip/apps.py b/archivebox/builtin_plugins/pip/apps.py index b339f247..5ea84c3a 100644 --- a/archivebox/builtin_plugins/pip/apps.py +++ b/archivebox/builtin_plugins/pip/apps.py @@ -2,7 +2,7 @@ import os import sys import inspect from pathlib import Path -from typing import List, Dict, Optional +from typing import List, Dict, Optional, ClassVar from pydantic import InstanceOf, Field import django @@ -23,7 +23,7 @@ from plugantic.base_hook import BaseHook class PipDependencyConfigs(BaseConfigSet): - section: ConfigSectionName = 'DEPENDENCY_CONFIG' + section: ClassVar[ConfigSectionName] = "DEPENDENCY_CONFIG" USE_PIP: bool = True PIP_BINARY: str = Field(default='pip') diff --git a/archivebox/builtin_plugins/singlefile/apps.py b/archivebox/builtin_plugins/singlefile/apps.py index 70431c60..54e6cb2d 100644 --- a/archivebox/builtin_plugins/singlefile/apps.py +++ b/archivebox/builtin_plugins/singlefile/apps.py @@ -1,7 +1,7 @@ __package__ = 'archivebox.builtin_plugins.singlefile' from pathlib import Path -from typing import List, Dict, Optional +from typing import List, Dict, Optional, ClassVar from typing_extensions import Self from django.conf import settings @@ -25,13 +25,13 @@ from builtin_plugins.npm.apps import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER ###################### Config ########################## class SinglefileToggleConfigs(BaseConfigSet): - section: ConfigSectionName = 'ARCHIVE_METHOD_TOGGLES' + section: ClassVar[ConfigSectionName] = 'ARCHIVE_METHOD_TOGGLES' SAVE_SINGLEFILE: bool = True class SinglefileOptionsConfigs(BaseConfigSet): - section: ConfigSectionName = 'ARCHIVE_METHOD_OPTIONS' + section: ClassVar[ConfigSectionName] = 'ARCHIVE_METHOD_OPTIONS' # loaded from shared config SINGLEFILE_USER_AGENT: str = Field(default='', alias='USER_AGENT') @@ -42,7 +42,7 @@ class SinglefileOptionsConfigs(BaseConfigSet): class SinglefileDependencyConfigs(BaseConfigSet): - section: ConfigSectionName = 'DEPENDENCY_CONFIG' + section: ClassVar[ConfigSectionName] = 'DEPENDENCY_CONFIG' SINGLEFILE_BINARY: str = Field(default='wget') SINGLEFILE_ARGS: Optional[List[str]] = Field(default=None) @@ -50,7 +50,7 @@ class SinglefileDependencyConfigs(BaseConfigSet): SINGLEFILE_DEFAULT_ARGS: List[str] = ['--timeout={TIMEOUT-10}'] class SinglefileConfigs(SinglefileToggleConfigs, SinglefileOptionsConfigs, SinglefileDependencyConfigs): - # section: ConfigSectionName = 'ALL_CONFIGS' + # section: ClassVar[ConfigSectionName] = 'ALL_CONFIGS' pass DEFAULT_GLOBAL_CONFIG = { diff --git a/archivebox/builtin_plugins/ytdlp/apps.py b/archivebox/builtin_plugins/ytdlp/apps.py index 31985687..f88cf6d8 100644 --- a/archivebox/builtin_plugins/ytdlp/apps.py +++ b/archivebox/builtin_plugins/ytdlp/apps.py @@ -1,4 +1,4 @@ -from typing import List, Dict +from typing import List, Dict, ClassVar from subprocess import run, PIPE from pydantic import InstanceOf, Field @@ -16,7 +16,7 @@ from builtin_plugins.pip.apps import pip class YtdlpDependencyConfigs(BaseConfigSet): - section: ConfigSectionName = 'DEPENDENCY_CONFIG' + section: ClassVar[ConfigSectionName] = "DEPENDENCY_CONFIG" USE_YTDLP: bool = True diff --git a/archivebox/plugantic/base_configset.py b/archivebox/plugantic/base_configset.py index 0c44bdb0..3eab8cf5 100644 --- a/archivebox/plugantic/base_configset.py +++ b/archivebox/plugantic/base_configset.py @@ -1,36 +1,186 @@ __package__ = 'archivebox.plugantic' -from typing import List, Literal +from pathlib import Path +from typing import List, Literal, Type, Tuple, Callable, ClassVar + +from benedict import benedict +from pydantic import model_validator, TypeAdapter +from pydantic_settings import BaseSettings, SettingsConfigDict, PydanticBaseSettingsSource +from pydantic_settings.sources import TomlConfigSettingsSource + +from django.conf import settings from .base_hook import BaseHook, HookType -from ..config_stubs import AttrDict - +from . import ini_to_toml ConfigSectionName = Literal[ + 'SHELL_CONFIG', 'GENERAL_CONFIG', + 'SERVER_CONFIG', 'ARCHIVE_METHOD_TOGGLES', 'ARCHIVE_METHOD_OPTIONS', + 'SEARCH_BACKEND_CONFIG', 'DEPENDENCY_CONFIG', ] ConfigSectionNames: List[ConfigSectionName] = [ + 'SHELL_CONFIG', 'GENERAL_CONFIG', + 'SERVER_CONFIG', 'ARCHIVE_METHOD_TOGGLES', 'ARCHIVE_METHOD_OPTIONS', + 'SEARCH_BACKEND_CONFIG', 'DEPENDENCY_CONFIG', ] +class FlatTomlConfigSettingsSource(TomlConfigSettingsSource): + """ + A source class that loads variables from a TOML file + """ -class BaseConfigSet(BaseHook): - hook_type: HookType = 'CONFIG' + def __init__( + self, + settings_cls: type[BaseSettings], + toml_file: Path | None=None, + ): + self.toml_file_path = toml_file or settings_cls.model_config.get("toml_file") + + self.nested_toml_data = self._read_files(self.toml_file_path) + self.toml_data = {} + for section_name, section in self.nested_toml_data.items(): + if section_name in ConfigSectionNames and isinstance(section, dict): + # value is nested, flatten it + for key, value in section.items(): + self.toml_data[key] = value + else: + # value is already flat, just set it as-is + self.toml_data[section_name] = section + + # filter toml_data to only include keys that are defined on the settings_cls + self.toml_data = { + key: value + for key, value in self.toml_data.items() + if key in settings_cls.model_fields + } + + super(TomlConfigSettingsSource, self).__init__(settings_cls, self.toml_data) - section: ConfigSectionName = 'GENERAL_CONFIG' + +class ArchiveBoxBaseConfig(BaseSettings): + """ + This is the base class for an ArchiveBox ConfigSet. + It handles loading values from schema defaults, ArchiveBox.conf TOML config, and environment variables. + + class WgetConfig(ArchiveBoxBaseConfig): + WGET_BINARY: str = Field(default='wget', alias='WGET_BINARY_PATH') + + c = WgetConfig() + print(c.WGET_BINARY) # outputs: wget + + # you can mutate process environment variable and reload config using .__init__() + os.environ['WGET_BINARY_PATH'] = 'wget2' + c.__init__() + + print(c.WGET_BINARY) # outputs: wget2 + + """ + + # these pydantic config options are all VERY carefully chosen, make sure to test thoroughly before changing!!! + model_config = SettingsConfigDict( + validate_default=False, + case_sensitive=True, + extra="ignore", + arbitrary_types_allowed=False, + populate_by_name=True, + from_attributes=True, + loc_by_alias=False, + validate_assignment=True, + validate_return=True, + revalidate_instances="always", + ) + + @classmethod + def settings_customise_sources( + cls, + settings_cls: Type[BaseSettings], + init_settings: PydanticBaseSettingsSource, + env_settings: PydanticBaseSettingsSource, + dotenv_settings: PydanticBaseSettingsSource, + file_secret_settings: PydanticBaseSettingsSource, + ) -> Tuple[PydanticBaseSettingsSource, ...]: + """Defines the config precedence order: Schema defaults -> ArchiveBox.conf (TOML) -> Environment variables""" + + ARCHIVEBOX_CONFIG_FILE = settings.DATA_DIR / "ArchiveBox.conf" + ARCHIVEBOX_CONFIG_FILE_BAK = ARCHIVEBOX_CONFIG_FILE.parent / ".ArchiveBox.conf.bak" + + # import ipdb; ipdb.set_trace() + + # if ArchiveBox.conf does not exist yet, return defaults -> env order + if not ARCHIVEBOX_CONFIG_FILE.is_file(): + return ( + init_settings, + env_settings, + ) + + # if ArchiveBox.conf exists and is in TOML format, return default -> TOML -> env order + try: + return ( + init_settings, + FlatTomlConfigSettingsSource(settings_cls, toml_file=ARCHIVEBOX_CONFIG_FILE), + env_settings, + ) + except Exception as err: + if err.__class__.__name__ != "TOMLDecodeError": + raise + # if ArchiveBox.conf exists and is in INI format, convert it then return default -> TOML -> env order + + # Convert ArchiveBox.conf in INI format to TOML and save original to .ArchiveBox.bak + original_ini = ARCHIVEBOX_CONFIG_FILE.read_text() + ARCHIVEBOX_CONFIG_FILE_BAK.write_text(original_ini) + new_toml = ini_to_toml.convert(original_ini) + ARCHIVEBOX_CONFIG_FILE.write_text(new_toml) + + return ( + init_settings, + FlatTomlConfigSettingsSource(settings_cls, toml_file=ARCHIVEBOX_CONFIG_FILE), + env_settings, + ) + + @model_validator(mode="after") + def fill_defaults(self): + """Populate any unset values using function provided as their default""" + + for key, field in self.model_fields.items(): + config_so_far = self.model_dump() + value = getattr(self, key) + if isinstance(value, Callable): + # if value is a function, execute it to get the actual value, passing existing config as a dict arg + fallback_value = field.default(config_so_far) + + # check to make sure default factory return value matches type annotation + TypeAdapter(field.annotation).validate_python(fallback_value) + + # set generated default value as final validated value + setattr(self, key, fallback_value) + return self + +class BaseConfigSet(ArchiveBoxBaseConfig, BaseHook): # type: ignore[type-arg] + hook_type: ClassVar[HookType] = 'CONFIG' + + section: ClassVar[ConfigSectionName] = 'GENERAL_CONFIG' def register(self, settings, parent_plugin=None): # self._plugin = parent_plugin # for debugging only, never rely on this! - settings.CONFIGS = getattr(settings, "CONFIGS", None) or AttrDict({}) - settings.CONFIGS[self.id] = self + settings.FLAT_CONFIG = getattr(settings, "FLAT_CONFIG", None) or benedict({}) + settings.CONFIGS = getattr(settings, "CONFIGS", None) or benedict({}) + + # pass FLAT_CONFIG so far into our config model to load it + loaded_config = self.__class__(**settings.FLAT_CONFIG) + # then dump our parsed config back into FLAT_CONFIG for the next plugin to use + settings.FLAT_CONFIG.merge(loaded_config.model_dump()) + + settings.CONFIGS[self.id] = loaded_config super().register(settings, parent_plugin=parent_plugin) diff --git a/pdm.lock b/pdm.lock index d83cd5e5..ff81362e 100644 --- a/pdm.lock +++ b/pdm.lock @@ -5,7 +5,7 @@ groups = ["default", "all", "ldap", "sonic"] strategy = ["inherit_metadata"] lock_version = "4.5.0" -content_hash = "sha256:d7c9e7a40b0a794986eb3f6a3774d5003c9b39985411f63c1aa387dda9986ada" +content_hash = "sha256:6b062624538c5dfe6b1bd5be32546fef02b70ee73c4a1710a8eea9764bdd21d8" [[metadata.targets]] requires_python = "==3.11.*" @@ -1147,6 +1147,22 @@ files = [ {file = "pydantic_pkgr-0.3.5.tar.gz", hash = "sha256:36444778d53d5cbdc261086fda0d65fb519a072105f5d1c7d88e224bd197dd1d"}, ] +[[package]] +name = "pydantic-settings" +version = "2.5.2" +requires_python = ">=3.8" +summary = "Settings management using Pydantic" +groups = ["default"] +marker = "python_version == \"3.11\"" +dependencies = [ + "pydantic>=2.7.0", + "python-dotenv>=0.21.0", +] +files = [ + {file = "pydantic_settings-2.5.2-py3-none-any.whl", hash = "sha256:2c912e55fd5794a59bf8c832b9de832dcfdf4778d79ff79b708744eed499a907"}, + {file = "pydantic_settings-2.5.2.tar.gz", hash = "sha256:f90b139682bee4d2065273d5185d71d37ea46cfe57e1b5ae184fc6a0b2484ca0"}, +] + [[package]] name = "pygments" version = "2.18.0" @@ -1277,6 +1293,18 @@ files = [ {file = "python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427"}, ] +[[package]] +name = "python-dotenv" +version = "1.0.1" +requires_python = ">=3.8" +summary = "Read key-value pairs from a .env file and set them as environment variables" +groups = ["default"] +marker = "python_version == \"3.11\"" +files = [ + {file = "python-dotenv-1.0.1.tar.gz", hash = "sha256:e324ee90a023d808f1959c46bcbc04446a10ced277783dc6ee09987c37ec10ca"}, + {file = "python_dotenv-1.0.1-py3-none-any.whl", hash = "sha256:f7b63ef50f1b690dddf550d03497b66d609393b40b564ed0d674909a68ebf16a"}, +] + [[package]] name = "python-fsutil" version = "0.14.1" diff --git a/pyproject.toml b/pyproject.toml index 803f5cd6..49cd31cc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -84,6 +84,7 @@ dependencies = [ "base32-crockford==0.3.0", ############# Extractor Dependencies ############# "yt-dlp>=2024.8.6", # for: media + "pydantic-settings>=2.5.2", ] # pdm lock --group=':all' diff --git a/requirements.txt b/requirements.txt index a9b859e2..27cba2b9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -76,6 +76,7 @@ pycryptodomex==3.20.0; python_version == "3.11" pydantic==2.9.2; python_version == "3.11" pydantic-core==2.23.4; python_version == "3.11" pydantic-pkgr==0.3.5; python_version == "3.11" +pydantic-settings==2.5.2; python_version == "3.11" pygments==2.18.0; python_version == "3.11" pyopenssl==24.2.1; python_version == "3.11" python-benedict[html,toml,xls,xml,yaml]==0.33.2; python_version == "3.11" @@ -83,6 +84,7 @@ python-benedict[io,parse]==0.33.2; python_version == "3.11" python-benedict[xml]==0.33.2; python_version == "3.11" python-crontab==3.2.0; python_version == "3.11" python-dateutil==2.9.0.post0; python_version == "3.11" +python-dotenv==1.0.1; python_version == "3.11" python-fsutil==0.14.1; python_version == "3.11" python-ldap==3.4.4; python_version == "3.11" python-slugify==8.0.4; python_version == "3.11"