diff --git a/archivebox/abx/archivebox/base_hook.py b/archivebox/abx/archivebox/base_hook.py
index 6abf5b6e..c9845124 100644
--- a/archivebox/abx/archivebox/base_hook.py
+++ b/archivebox/abx/archivebox/base_hook.py
@@ -13,43 +13,6 @@ HookType = Literal['CONFIG', 'BINPROVIDER', 'BINARY', 'EXTRACTOR', 'REPLAYER', '
 hook_type_names: Tuple[HookType] = get_args(HookType)
 
 class BaseHook(BaseModel):
-    """
-    A Plugin consists of a list of Hooks, applied to django.conf.settings when AppConfig.read() -> Plugin.register() is called.
-    Plugin.register() then calls each Hook.register() on the provided settings.
-    each Hook.regsiter() function (ideally pure) takes a django.conf.settings as input and returns a new one back.
-    or
-    it modifies django.conf.settings in-place to add changes corresponding to its HookType.
-    e.g. for a HookType.CONFIG, the Hook.register() function places the hook in settings.CONFIG (and settings.HOOKS)
-    An example of an impure Hook would be a CHECK that modifies settings but also calls django.core.checks.register(check).
-    In practice any object that subclasses BaseHook and provides a .register() function can behave as a Hook.
-
-    setup_django() -> imports all settings.INSTALLED_APPS...
-    # django imports AppConfig, models, migrations, admins, etc. for all installed apps
-    # django then calls AppConfig.ready() on each installed app...
-
-    plugins_pkg.npm.NpmPlugin().AppConfig.ready()  # called by django
-        plugins_pkg.npm.NpmPlugin().register(settings) ->
-            plugins_pkg.npm.NpmConfigSet().register(settings)
-                abx.archivebox.base_configset.BaseConfigSet().register(settings)
-                    abx.archivebox.base_hook.BaseHook().register(settings, parent_plugin=plugins_pkg.npm.NpmPlugin())
-                    ...
-                    ...
-
-    Both core ArchiveBox code and plugin code depend on python >= 3.10 and django >= 5.0 w/ sqlite and a filesystem.
-    Core ArchiveBox code can depend only on python and the pip libraries it ships with, and can never depend on plugin code / node / other binaries.
-    Plugin code can depend on archivebox core, other django apps, other pip libraries, and other plugins.
-    Plugins can provide BinProviders + Binaries which can depend on arbitrary other binaries / package managers like curl / wget / yt-dlp / etc.
-
-    The execution interface between plugins is simply calling builtinplugins.npm.... functions directly, django handles
-    importing all plugin code. There is no need to manually register methods/classes, only register to call
-    impure setup functions or provide runtime state.
-    settings.CONFIGS / settings.BINPROVIDERS / settings.BINARIES /... etc. are reserved for dynamic runtime state only.
-    This state is exposed to the broader system in a flat namespace, e.g. CONFIG.IS_DOCKER=True, or BINARIES = [
-        ..., Binary('node', abspath='/usr/local/bin/node', version='22.2.0'), ...
-    ]
-    """
     model_config = ConfigDict(
         extra="allow",
         arbitrary_types_allowed=True,
diff --git a/archivebox/api/v1_cli.py b/archivebox/api/v1_cli.py
index 559a7dfb..9db7bcad 100644
--- a/archivebox/api/v1_cli.py
+++ b/archivebox/api/v1_cli.py
@@ -13,7 +13,7 @@ from ..main import (
     schedule,
 )
 from archivebox.misc.util import ansi_to_html
-from ..config.legacy import ONLY_NEW
+from archivebox.config import ARCHIVING_CONFIG
 
 from .auth import API_AUTH_METHODS
 
@@ -58,7 +58,7 @@ class AddCommandSchema(Schema):
     urls: List[str]
     tag: str = ""
     depth: int = 0
-    update: bool = not ONLY_NEW  # Default to the opposite of ONLY_NEW
+    update: bool = not ARCHIVING_CONFIG.ONLY_NEW  # Default to the opposite of ARCHIVING_CONFIG.ONLY_NEW
     update_all: bool = False
     index_only: bool = False
     overwrite: bool = False
@@ -68,7 +68,7 @@ class AddCommandSchema(Schema):
 
 class UpdateCommandSchema(Schema):
     resume: Optional[float] = 0
-    only_new: bool = ONLY_NEW
+    only_new: bool = ARCHIVING_CONFIG.ONLY_NEW
     index_only: bool = False
     overwrite: bool = False
     after: Optional[float] = 0
@@ -85,7 +85,7 @@ class ScheduleCommandSchema(Schema):
     tag: str = ''
     depth: int = 0
     overwrite: bool = False
-    update: bool = not ONLY_NEW
+    update: bool = not ARCHIVING_CONFIG.ONLY_NEW
     clear: bool = False
 
 class ListCommandSchema(Schema):
diff --git a/archivebox/cli/__init__.py b/archivebox/cli/__init__.py
index 8003ff22..6d5adafb 100644
--- a/archivebox/cli/__init__.py
+++ b/archivebox/cli/__init__.py
@@ -152,18 +152,15 @@ def run_subcommand(subcommand: str,
     subcommand_args = subcommand_args or []
 
     if subcommand not in meta_cmds:
-        from ..config.legacy import setup_django, CONFIG
+        from archivebox.config.legacy import setup_django
 
         cmd_requires_db = subcommand in archive_cmds
         init_pending = '--init' in subcommand_args or '--quick-init' in subcommand_args
 
-        if cmd_requires_db:
-            check_data_folder(CONFIG)
-
         setup_django(in_memory_db=subcommand in fake_db, check_db=cmd_requires_db and not init_pending)
 
         if cmd_requires_db:
-            check_migrations(CONFIG)
+            check_migrations()
 
     module = import_module('.archivebox_{}'.format(subcommand), __package__)
     module.main(args=subcommand_args, stdin=stdin, pwd=pwd)    # type: ignore
diff --git a/archivebox/config/__init__.py b/archivebox/config/__init__.py
index ce4a5ed1..7eb3d52c 100644
--- a/archivebox/config/__init__.py
+++ b/archivebox/config/__init__.py
@@ -1,6 +1,6 @@
 __package__ = 'archivebox.config'
 
-from .constants import CONSTANTS, PACKAGE_DIR, DATA_DIR, ARCHIVE_DIR, VERSION
+from .constants import CONSTANTS, CONSTANTS_CONFIG, PACKAGE_DIR, DATA_DIR, ARCHIVE_DIR, VERSION
 from .defaults import (
     SHELL_CONFIG,
     STORAGE_CONFIG,
@@ -23,4 +23,5 @@ __all__ = [
     'SERVER_CONFIG',
     'ARCHIVING_CONFIG',
     'SEARCH_BACKEND_CONFIG',
+    'CONSTANTS_CONFIG',
 ]
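Context note: the api/v1_cli.py schemas above now read defaults off the `ARCHIVING_CONFIG` singleton instead of a loose module-level constant. A minimal sketch of the pattern (the `ArchivingConfig` model here is a simplified stand-in for the real `BaseConfigSet` subclass, not its actual definition):

```python
from pydantic import BaseModel

class ArchivingConfig(BaseModel):      # simplified stand-in, not the real class
    ONLY_NEW: bool = True
    TIMEOUT: int = 60

ARCHIVING_CONFIG = ArchivingConfig()   # one validated singleton, imported everywhere

# call sites read typed attributes instead of untyped dict keys:
update_default = not ARCHIVING_CONFIG.ONLY_NEW
```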
diff --git a/archivebox/config/legacy.py b/archivebox/config/legacy.py
index c7b88b4a..e5273090 100644
--- a/archivebox/config/legacy.py
+++ b/archivebox/config/legacy.py
@@ -60,6 +60,7 @@ from .defaults import SHELL_CONFIG, GENERAL_CONFIG, ARCHIVING_CONFIG, SERVER_CON
 from archivebox.plugins_auth.ldap.apps import LDAP_CONFIG
 from archivebox.plugins_extractor.favicon.apps import FAVICON_CONFIG
 from archivebox.plugins_extractor.wget.apps import WGET_CONFIG
+from archivebox.plugins_extractor.curl.apps import CURL_CONFIG
 
 ANSI = SHELL_CONFIG.ANSI
 LDAP = LDAP_CONFIG.LDAP_ENABLED
@@ -81,9 +82,11 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
 
     'LDAP_CONFIG': LDAP_CONFIG.as_legacy_config_schema(),
 
-    'FAVICON_CONFIG': FAVICON_CONFIG.as_legacy_config_schema(),
+    # 'FAVICON_CONFIG': FAVICON_CONFIG.as_legacy_config_schema(),
 
-    'WGET_CONFIG': WGET_CONFIG.as_legacy_config_schema(),
+    # 'WGET_CONFIG': WGET_CONFIG.as_legacy_config_schema(),
+
+    # 'CURL_CONFIG': CURL_CONFIG.as_legacy_config_schema(),
 
 
     'ARCHIVE_METHOD_TOGGLES': {
@@ -109,7 +112,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
 
     'ARCHIVE_METHOD_OPTIONS': {
         'RESOLUTION': {'type': str, 'default': '1440,2000', 'aliases': ('SCREENSHOT_RESOLUTION','WINDOW_SIZE')},
-        'GIT_DOMAINS': {'type': str, 'default': 'github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht'},
+        # 'GIT_DOMAINS': {'type': str, 'default': 'github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht'},
         'CHECK_SSL_VALIDITY': {'type': bool, 'default': True},
         'MEDIA_MAX_SIZE': {'type': str, 'default': '750m'},
 
@@ -144,15 +147,6 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
         ]},
         'YOUTUBEDL_EXTRA_ARGS': {'type': list, 'default': None},
 
-
-        'CURL_ARGS': {'type': list, 'default': ['--silent',
-                                                '--location',
-                                                '--compressed'
-                                               ]},
-        'CURL_EXTRA_ARGS': {'type': list, 'default': None},
-        'GIT_ARGS': {'type': list, 'default': ['--recursive']},
-        'SINGLEFILE_ARGS': {'type': list, 'default': None},
-        'SINGLEFILE_EXTRA_ARGS': {'type': list, 'default': None},
     },
 
     'DEPENDENCY_CONFIG': {
@@ -164,9 +158,9 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
         'USE_YOUTUBEDL': {'type': bool, 'default': True},
         'USE_RIPGREP': {'type': bool, 'default': True},
 
-        'CURL_BINARY': {'type': str, 'default': 'curl'},
-        'GIT_BINARY': {'type': str, 'default': 'git'},
-        'NODE_BINARY': {'type': str, 'default': 'node'},
+        # 'GIT_BINARY': {'type': str, 'default': 'git'},
+        # 'CURL_BINARY': {'type': str, 'default': 'curl'},
+        # 'NODE_BINARY': {'type': str, 'default': 'node'},
         # 'YOUTUBEDL_BINARY': {'type': str, 'default': 'yt-dlp'},   # also can accept youtube-dl
         # 'SINGLEFILE_BINARY': {'type': str, 'default': lambda c: bin_path('single-file')},
         # 'READABILITY_BINARY': {'type': str, 'default': lambda c: bin_path('readability-extractor')},
@@ -209,21 +203,12 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
     'URL_DENYLIST_PTN':     {'default': lambda c: c['URL_DENYLIST'] and re.compile(c['URL_DENYLIST'] or '', CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS)},
     'URL_ALLOWLIST_PTN':    {'default': lambda c: c['URL_ALLOWLIST'] and re.compile(c['URL_ALLOWLIST'] or '', CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS)},
 
-    'USE_CURL':             {'default': lambda c: c['USE_CURL'] and (c['SAVE_FAVICON'] or c['SAVE_TITLE'] or c['SAVE_ARCHIVE_DOT_ORG'])},
-    'CURL_VERSION':         {'default': lambda c: bin_version(c['CURL_BINARY']) if c['USE_CURL'] else None},
-    # 'CURL_USER_AGENT':      {'default': lambda c: c['CURL_USER_AGENT'].format(**c)},
-    'CURL_ARGS':            {'default': lambda c: c['CURL_ARGS'] or []},
-    'CURL_EXTRA_ARGS':      {'default': lambda c: c['CURL_EXTRA_ARGS'] or []},
-    'SAVE_FAVICON':         {'default': lambda c: c['USE_CURL'] and c['SAVE_FAVICON']},
-    'SAVE_ARCHIVE_DOT_ORG': {'default': lambda c: c['USE_CURL'] and c['SAVE_ARCHIVE_DOT_ORG']},
-
-    'USE_GIT':              {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
-    'GIT_VERSION':          {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None},
-    'SAVE_GIT':             {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
+    # 'USE_GIT':              {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
+    # 'GIT_VERSION':          {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None},
+    # 'SAVE_GIT':             {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
 
-    'DEPENDENCIES':         {'default': lambda c: get_dependency_info(c)},
+    # 'DEPENDENCIES':         {'default': lambda c: get_dependency_info(c)},
     # 'CODE_LOCATIONS':       {'default': lambda c: get_code_locations(c)},
     # 'DATA_LOCATIONS':       {'default': lambda c: get_data_locations(c)},
 
@@ -613,13 +598,13 @@ def get_dependency_info(config: benedict) -> ConfigValue:
         #     'is_valid': True,
         # },
 
-        'CURL_BINARY': {
-            'path': bin_path(config['CURL_BINARY']),
-            'version': config['CURL_VERSION'],
-            'hash': bin_hash(config['CURL_BINARY']),
-            'enabled': config['USE_CURL'],
-            'is_valid': bool(config['CURL_VERSION']),
-        },
+        # 'CURL_BINARY': {
+        #     'path': bin_path(config['CURL_BINARY']),
+        #     'version': config['CURL_VERSION'],
+        #     'hash': bin_hash(config['CURL_BINARY']),
+        #     'enabled': config['USE_CURL'],
+        #     'is_valid': bool(config['CURL_VERSION']),
+        # },
         # 'WGET_BINARY': {
         #     'path': bin_path(config['WGET_BINARY']),
         #     'version': config['WGET_VERSION'],
@@ -641,13 +626,13 @@ def get_dependency_info(config: benedict) -> ConfigValue:
         #     'enabled': config['USE_MERCURY'],
         #     'is_valid': bool(config['MERCURY_VERSION']),
         # },
-        'GIT_BINARY': {
-            'path': bin_path(config['GIT_BINARY']),
-            'version': config['GIT_VERSION'],
-            'hash': bin_hash(config['GIT_BINARY']),
-            'enabled': config['USE_GIT'],
-            'is_valid': bool(config['GIT_VERSION']),
-        },
+        # 'GIT_BINARY': {
+        #     'path': bin_path(config['GIT_BINARY']),
+        #     'version': config['GIT_VERSION'],
+        #     'hash': bin_hash(config['GIT_BINARY']),
+        #     'enabled': config['USE_GIT'],
+        #     'is_valid': bool(config['GIT_VERSION']),
+        # },
         # 'SINGLEFILE_BINARY': {
         #     'path': bin_path(config['SINGLEFILE_BINARY']),
         #     'version': config['SINGLEFILE_VERSION'],
diff --git a/archivebox/config/views.py b/archivebox/config/views.py
index 33f97b87..f7828718 100644
--- a/archivebox/config/views.py
+++ b/archivebox/config/views.py
@@ -76,7 +76,7 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
 
     relevant_configs = {
         key: val
-        for key, val in settings.CONFIG.items()
+        for key, val in settings.FLAT_CONFIG.items()
         if '_BINARY' in key or '_VERSION' in key
     }
 
@@ -105,6 +105,7 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
                 f'{config_key}'
                 for config_key, config_value in relevant_configs.items()
                     if str(binary.name).lower().replace('-', '').replace('_', '').replace('ytdlp', 'youtubedl') in config_key.lower()
+                    or config_value.lower().endswith(binary.name.lower())
                 # or binary.name.lower().replace('-', '').replace('_', '') in str(config_value).lower()
             )))
             # if not binary.provider_overrides:
diff --git a/archivebox/core/admin.py b/archivebox/core/admin.py
index 113b97db..52e4fb5a 100644
--- a/archivebox/core/admin.py
+++ b/archivebox/core/admin.py
@@ -36,7 +36,7 @@ from main import remove
 from extractors import archive_links
 
 
-CONFIG = settings.CONFIG
+CONFIG = settings.FLAT_CONFIG
 
 GLOBAL_CONTEXT = {'VERSION': VERSION, 'VERSIONS_AVAILABLE': [], 'CAN_UPGRADE': False}
diff --git a/archivebox/core/auth.py b/archivebox/core/auth.py
index 536e0778..b3892322 100644
--- a/archivebox/core/auth.py
+++ b/archivebox/core/auth.py
@@ -1,13 +1,11 @@
 __package__ = 'archivebox.core'
 
-from ..config.legacy import (
-    LDAP
-)
+from archivebox.plugins_auth.ldap.apps import LDAP_CONFIG
 
 
 def register_signals():
 
-    if LDAP:
+    if LDAP_CONFIG.LDAP_ENABLED:
         import django_auth_ldap.backend
         from .auth_ldap import create_user
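Context note: the `DYNAMIC_CONFIG_SCHEMA` entries commented out in config/legacy.py above derived one key from others by calling a lambda against the merged config dict. For reference, the retired pattern reduces to this (lambdas copied from the deleted lines, sample config values invented for illustration):

```python
derived = {
    'USE_CURL': lambda c: c['USE_CURL'] and (c['SAVE_FAVICON'] or c['SAVE_TITLE'] or c['SAVE_ARCHIVE_DOT_ORG']),
    'SAVE_GIT': lambda c: c['USE_GIT'] and c['SAVE_GIT'],
}

config = {'USE_CURL': True, 'SAVE_FAVICON': False, 'SAVE_TITLE': True,
          'SAVE_ARCHIVE_DOT_ORG': False, 'USE_GIT': True, 'SAVE_GIT': False}

assert derived['USE_CURL'](config) is True    # curl stays enabled because SAVE_TITLE needs it
assert derived['SAVE_GIT'](config) is False   # git extractor stays off when SAVE_GIT is off
```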
diff --git a/archivebox/core/auth_ldap.py b/archivebox/core/auth_ldap.py
index 1d0e8658..7e94c316 100644
--- a/archivebox/core/auth_ldap.py
+++ b/archivebox/core/auth_ldap.py
@@ -1,9 +1,7 @@
-from ..config.legacy import (
-    LDAP_CREATE_SUPERUSER
-)
+from archivebox.plugins_auth.ldap.apps import LDAP_CONFIG
 
 def create_user(sender, user=None, ldap_user=None, **kwargs):
 
-    if not user.id and LDAP_CREATE_SUPERUSER:
+    if not user.id and LDAP_CONFIG.LDAP_CREATE_SUPERUSER:
         user.is_superuser = True
 
     user.is_staff = True
diff --git a/archivebox/core/middleware.py b/archivebox/core/middleware.py
index 4cd45e01..181d67f0 100644
--- a/archivebox/core/middleware.py
+++ b/archivebox/core/middleware.py
@@ -5,7 +5,7 @@ from django.utils import timezone
 from django.contrib.auth.middleware import RemoteUserMiddleware
 from django.core.exceptions import ImproperlyConfigured
 
-from ..config.legacy import PUBLIC_SNAPSHOTS, REVERSE_PROXY_USER_HEADER, REVERSE_PROXY_WHITELIST
+from archivebox.config import SERVER_CONFIG
 
 
 def detect_timezone(request, activate: bool=True):
@@ -32,7 +32,7 @@ def CacheControlMiddleware(get_response):
         response = get_response(request)
 
         if '/archive/' in request.path or '/static/' in request.path:
-            policy = 'public' if PUBLIC_SNAPSHOTS else 'private'
+            policy = 'public' if SERVER_CONFIG.PUBLIC_SNAPSHOTS else 'private'
             response['Cache-Control'] = f'{policy}, max-age=60, stale-while-revalidate=300'
             # print('Set Cache-Control header to', response['Cache-Control'])
         return response
@@ -40,15 +40,15 @@ def CacheControlMiddleware(get_response):
     return middleware
 
 class ReverseProxyAuthMiddleware(RemoteUserMiddleware):
-    header = 'HTTP_{normalized}'.format(normalized=REVERSE_PROXY_USER_HEADER.replace('-', '_').upper())
+    header = 'HTTP_{normalized}'.format(normalized=SERVER_CONFIG.REVERSE_PROXY_USER_HEADER.replace('-', '_').upper())
 
     def process_request(self, request):
-        if REVERSE_PROXY_WHITELIST == '':
+        if SERVER_CONFIG.REVERSE_PROXY_WHITELIST == '':
             return
 
         ip = request.META.get('REMOTE_ADDR')
 
-        for cidr in REVERSE_PROXY_WHITELIST.split(','):
+        for cidr in SERVER_CONFIG.REVERSE_PROXY_WHITELIST.split(','):
             try:
                 network = ipaddress.ip_network(cidr)
             except ValueError:
diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py
index c0e612c7..d97c8529 100644
--- a/archivebox/core/settings.py
+++ b/archivebox/core/settings.py
@@ -13,9 +13,7 @@ import abx.archivebox
 import abx.archivebox.use
 import abx.django.use
 
-from archivebox.config import VERSION, DATA_DIR, PACKAGE_DIR, ARCHIVE_DIR, CONSTANTS  # noqa
-
-from ..config.legacy import CONFIG
+from archivebox.config import VERSION, DATA_DIR, PACKAGE_DIR, ARCHIVE_DIR, CONSTANTS, SHELL_CONFIG, SERVER_CONFIG  # noqa
 
 IS_MIGRATING = 'makemigrations' in sys.argv[:3] or 'migrate' in sys.argv[:3]
 IS_TESTING = 'test' in sys.argv[:3] or 'PYTEST_CURRENT_TEST' in os.environ
@@ -80,7 +78,7 @@ LOGOUT_REDIRECT_URL = os.environ.get('LOGOUT_REDIRECT_URL', '/')
 PASSWORD_RESET_URL = '/accounts/password_reset/'
 APPEND_SLASH = True
 
-DEBUG = CONFIG.DEBUG or ('--debug' in sys.argv)
+DEBUG = SHELL_CONFIG.DEBUG or ('--debug' in sys.argv)
 
 
 INSTALLED_APPS = [
@@ -364,10 +362,10 @@ STORAGES = {
 ### Security Settings
 ################################################################################
 
-SECRET_KEY = CONFIG.SECRET_KEY or get_random_string(50, 'abcdefghijklmnopqrstuvwxyz0123456789_')
+SECRET_KEY = SERVER_CONFIG.SECRET_KEY or get_random_string(50, 'abcdefghijklmnopqrstuvwxyz0123456789_')
 
-ALLOWED_HOSTS = CONFIG.ALLOWED_HOSTS.split(',')
-CSRF_TRUSTED_ORIGINS = list(set(CONFIG.CSRF_TRUSTED_ORIGINS.split(',')))
+ALLOWED_HOSTS = SERVER_CONFIG.ALLOWED_HOSTS.split(',')
+CSRF_TRUSTED_ORIGINS = list(set(SERVER_CONFIG.CSRF_TRUSTED_ORIGINS.split(',')))
 
 # automatically fix case when user sets ALLOWED_HOSTS (e.g. to archivebox.example.com)
 # but forgets to add https://archivebox.example.com to CSRF_TRUSTED_ORIGINS
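Context note: `ReverseProxyAuthMiddleware` only trusts the remote-user header when the client IP falls inside one of the whitelisted CIDR ranges. The check reduces to standard-library `ipaddress` membership, roughly as in this self-contained sketch:

```python
import ipaddress

def ip_is_whitelisted(remote_addr: str, whitelist_csv: str) -> bool:
    # mirrors the middleware loop: any parseable CIDR that contains the IP wins
    for cidr in whitelist_csv.split(','):
        try:
            network = ipaddress.ip_network(cidr)
        except ValueError:
            continue  # skip malformed entries, same as the middleware does
        if ipaddress.ip_address(remote_addr) in network:
            return True
    return False

assert ip_is_whitelisted('10.0.0.7', '10.0.0.0/8,192.168.0.0/16')
assert not ip_is_whitelisted('8.8.8.8', '10.0.0.0/8')
```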
diff --git a/archivebox/core/urls.py b/archivebox/core/urls.py
index 971b8ea2..e9eb4bca 100644
--- a/archivebox/core/urls.py
+++ b/archivebox/core/urls.py
@@ -10,7 +10,7 @@ from .views import HomepageView, SnapshotView, PublicIndexView, AddView, HealthC
 from .serve_static import serve_static
 
 # GLOBAL_CONTEXT doesn't work as-is, disabled for now: https://github.com/ArchiveBox/ArchiveBox/discussions/1306
-# from .config.legacy import VERSION, VERSIONS_AVAILABLE, CAN_UPGRADE
+# from archivebox.config import VERSION, VERSIONS_AVAILABLE, CAN_UPGRADE
 # GLOBAL_CONTEXT = {'VERSION': VERSION, 'VERSIONS_AVAILABLE': VERSIONS_AVAILABLE, 'CAN_UPGRADE': CAN_UPGRADE}
diff --git a/archivebox/core/views.py b/archivebox/core/views.py
index 8e05e4b2..7dbbf110 100644
--- a/archivebox/core/views.py
+++ b/archivebox/core/views.py
@@ -1,7 +1,7 @@
 __package__ = 'archivebox.core'
 
-from typing import Callable
-from benedict import benedict
+import inspect
+from typing import Callable, get_type_hints
 
 from pathlib import Path
 
 from django.shortcuts import render, redirect
@@ -27,21 +27,13 @@ from core.admin import result_url
 
 from queues.tasks import bg_add
 
-from archivebox.config import CONSTANTS, DATA_DIR, VERSION, SHELL_CONFIG, SERVER_CONFIG
-from ..plugins_extractor.archivedotorg.apps import ARCHIVEDOTORG_CONFIG
-
-from ..config.legacy import (
-    CONFIG_SCHEMA,
-    DYNAMIC_CONFIG_SCHEMA,
-    USER_CONFIG,
-    CONFIG,
-)
-from ..logging_util import printable_filesize
+from archivebox.config import CONSTANTS_CONFIG, DATA_DIR, VERSION, SHELL_CONFIG, SERVER_CONFIG
 
 from archivebox.misc.util import base_url, htmlencode, ts_to_date_str
-from ..search import query_search_index
-from .serve_static import serve_static_with_byterange_support
 
-CONFIG = benedict({**CONSTANTS, **CONFIG, **settings.FLAT_CONFIG})
+from .serve_static import serve_static_with_byterange_support
+from ..plugins_extractor.archivedotorg.apps import ARCHIVEDOTORG_CONFIG
+from ..logging_util import printable_filesize
+from ..search import query_search_index
 
 
 class HomepageView(View):
@@ -502,27 +494,43 @@ class HealthCheckView(View):
 
 def find_config_section(key: str) -> str:
-    if key in CONSTANTS:
+    if key in CONSTANTS_CONFIG:
         return 'CONSTANT'
     matching_sections = [
-        name for name, opts in CONFIG_SCHEMA.items() if key in opts
+        section.id for section in settings.CONFIGS.values() if key in section.model_fields
    ]
     section = matching_sections[0] if matching_sections else 'DYNAMIC'
     return section
 
 def find_config_default(key: str) -> str:
-    default_val = USER_CONFIG.get(key, {}).get('default', lambda: None)
+    if key in CONSTANTS_CONFIG:
+        return str(CONSTANTS_CONFIG[key])
+
+    default_val = None
+
+    for config in settings.CONFIGS.values():
+        if key in config.model_fields:
+            default_val = config.model_fields[key].default
+            break
+
     if isinstance(default_val, Callable):
-        return None
+        default_val = inspect.getsource(default_val).split('lambda', 1)[-1].split(':', 1)[-1].replace('\n', ' ').strip()
+        if default_val.count(')') > default_val.count('('):
+            default_val = default_val[:-1]
     else:
-        default_val = repr(default_val)
+        default_val = str(default_val)
+
     return default_val
 
 def find_config_type(key: str) -> str:
-    if key in USER_CONFIG:
-        return str(USER_CONFIG[key]['type'])
-    elif key in DYNAMIC_CONFIG_SCHEMA:
-        return str(type(CONFIG[key]))
+    for config in settings.CONFIGS.values():
+        if hasattr(config, key):
+            type_hints = get_type_hints(config)
+            try:
+                return str(type_hints[key].__name__)
+            except AttributeError:
+                return str(type_hints[key])
     return 'str'
 
 def key_is_safe(key: str) -> bool:
@@ -543,40 +551,29 @@ def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext:
         "Value": [],
         "Default": [],
         # "Documentation": [],
-        "Aliases": [],
+        # "Aliases": [],
     }
 
-    for section in CONFIG_SCHEMA.keys():
-        for key in CONFIG_SCHEMA[section].keys():
-            rows['Section'].append(section)   # section.replace('_', ' ').title().replace(' Config', '')
+    for section in reversed(list(settings.CONFIGS.values())):
+        for key, field in section.model_fields.items():
+            rows['Section'].append(section.id)   # section.replace('_', ' ').title().replace(' Config', '')
             rows['Key'].append(ItemLink(key, key=key))
-            rows['Type'].append(mark_safe(f'{find_config_type(key)}'))
-            rows['Value'].append(mark_safe(f'{CONFIG[key]}') if key_is_safe(key) else '******** (redacted)')
-            rows['Default'].append(mark_safe(f'{find_config_default(key) or "See here..."}'))
+            rows['Type'].append(format_html('{}', find_config_type(key)))
+            rows['Value'].append(mark_safe(f'{getattr(section, key)}') if key_is_safe(key) else '******** (redacted)')
+            rows['Default'].append(mark_safe(f'{find_config_default(key) or "See here..."}'))
             # rows['Documentation'].append(mark_safe(f'Wiki: {key}'))
-            rows['Aliases'].append(', '.join(CONFIG_SCHEMA[section][key].get('aliases', [])))
-
-    section = 'DYNAMIC'
-    for key in DYNAMIC_CONFIG_SCHEMA.keys():
-        if key in CONSTANTS:
-            continue
-        rows['Section'].append(section)   # section.replace('_', ' ').title().replace(' Config', '')
-        rows['Key'].append(ItemLink(key, key=key))
-        rows['Type'].append(mark_safe(f'{find_config_type(key)}'))
-        rows['Value'].append(mark_safe(f'{CONFIG[key]}') if key_is_safe(key) else '******** (redacted)')
-        rows['Default'].append(mark_safe(f'{find_config_default(key) or "See here..."}'))
-        # rows['Documentation'].append(mark_safe(f'Wiki: {key}'))
-        rows['Aliases'].append(ItemLink(key, key=key) if key in USER_CONFIG else '')
+            # rows['Aliases'].append(', '.join(find_config_aliases(key)))
 
     section = 'CONSTANT'
-    for key in CONSTANTS.keys():
+    for key in CONSTANTS_CONFIG.keys():
         rows['Section'].append(section)   # section.replace('_', ' ').title().replace(' Config', '')
         rows['Key'].append(ItemLink(key, key=key))
-        rows['Type'].append(mark_safe(f'{find_config_type(key)}'))
-        rows['Value'].append(mark_safe(f'{CONFIG[key]}') if key_is_safe(key) else '******** (redacted)')
-        rows['Default'].append(mark_safe(f'{find_config_default(key) or "See here..."}'))
+        rows['Type'].append(format_html('{}', getattr(type(CONSTANTS_CONFIG[key]), '__name__', repr(CONSTANTS_CONFIG[key]))))
+        rows['Value'].append(format_html('{}', CONSTANTS_CONFIG[key]) if key_is_safe(key) else '******** (redacted)')
+        rows['Default'].append(mark_safe(f'{find_config_default(key) or "See here..."}'))
         # rows['Documentation'].append(mark_safe(f'Wiki: {key}'))
-        rows['Aliases'].append(ItemLink(key, key=key) if key in USER_CONFIG else '')
+        # rows['Aliases'].append('')
 
     return TableContext(
@@ -589,11 +586,12 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont
     assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
 
-    aliases = USER_CONFIG.get(key, {}).get("aliases", [])
+    # aliases = USER_CONFIG.get(key, {}).get("aliases", [])
+    aliases = []
 
-    if key in CONSTANTS:
+    if key in CONSTANTS_CONFIG:
         section_header = mark_safe(f'[CONSTANTS]   {key}   (read-only, hardcoded by ArchiveBox)')
-    elif key in USER_CONFIG:
+    elif key in settings.FLAT_CONFIG:
         section_header = mark_safe(f'data / ArchiveBox.conf   [{find_config_section(key)}]   {key}')
     else:
         section_header = mark_safe(f'[DYNAMIC CONFIG]   {key}   (read-only, calculated at runtime)')
@@ -609,7 +607,7 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont
         "fields": {
             'Key': key,
             'Type': find_config_type(key),
-            'Value': CONFIG[key] if key_is_safe(key) else '********',
+            'Value': settings.FLAT_CONFIG[key] if key_is_safe(key) else '********',
         },
         "help_texts": {
             'Key': mark_safe(f'''
@@ -619,25 +617,25 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont
             '''),
             'Type': mark_safe(f'''
-                See full definition in archivebox/config.py...
+                See full definition in archivebox/config...
             '''),
             'Value': mark_safe(f'''
                 {'Value is redacted for your security. (Passwords, secrets, API tokens, etc. cannot be viewed in the Web UI)' if not key_is_safe(key) else ''}
                 Default:
                 {find_config_default(key) or '↗️ See in ArchiveBox source code...'}
                 To change this value, edit data/ArchiveBox.conf or run:
                 archivebox config --set {key}="{
                     val.strip("'") if (val := find_config_default(key)) else
-                    (repr(CONFIG[key] if key_is_safe(key) else '********')).strip("'")
+                    (repr(settings.FLAT_CONFIG[key] if key_is_safe(key) else '********')).strip("'")
                 }"
             '''),
diff --git a/archivebox/extractors/archive_org.py b/archivebox/extractors/archive_org.py
index c9bbca2b..4c49f1f6 100644
--- a/archivebox/extractors/archive_org.py
+++ b/archivebox/extractors/archive_org.py
@@ -7,21 +7,10 @@ from collections import defaultdict
 
 from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
 from archivebox.misc.system import run, chmod_file
-from archivebox.misc.util import (
-    enforce_types,
-    is_static_file,
-    dedupe,
-)
-from ..config.legacy import (
-    TIMEOUT,
-    CURL_ARGS,
-    CURL_EXTRA_ARGS,
-    CHECK_SSL_VALIDITY,
-    SAVE_ARCHIVE_DOT_ORG,
-    CURL_BINARY,
-    CURL_VERSION,
-    CURL_USER_AGENT,
-)
+from archivebox.misc.util import enforce_types, is_static_file, dedupe
+from archivebox.plugins_extractor.archivedotorg.apps import ARCHIVEDOTORG_CONFIG
+from archivebox.plugins_extractor.curl.apps import CURL_CONFIG, CURL_BINARY
+
 from ..logging_util import TimedProgress
 
 
@@ -39,27 +28,30 @@ def should_save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, overwr
         #     if open(path, 'r', encoding='utf-8').read().strip() != 'None':
         return False
 
-    return SAVE_ARCHIVE_DOT_ORG
+    return ARCHIVEDOTORG_CONFIG.SAVE_ARCHIVE_DOT_ORG
 
 @enforce_types
-def save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+def save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, timeout: int=CURL_CONFIG.CURL_TIMEOUT) -> ArchiveResult:
     """submit site to archive.org for archiving via their service, save returned archive url"""
 
+    curl_binary = CURL_BINARY.load()
+    assert curl_binary.abspath and curl_binary.version
+
     out_dir = out_dir or Path(link.link_dir)
     output: ArchiveOutput = get_output_path()
     archive_org_url = None
     submit_url = 'https://web.archive.org/save/{}'.format(link.url)
     # later options take precedence
     options = [
-        *CURL_ARGS,
-        *CURL_EXTRA_ARGS,
+        *CURL_CONFIG.CURL_ARGS,
+        *CURL_CONFIG.CURL_EXTRA_ARGS,
         '--head',
         '--max-time', str(timeout),
-        *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
-        *([] if CHECK_SSL_VALIDITY else ['--insecure']),
+        *(['--user-agent', '{}'.format(CURL_CONFIG.CURL_USER_AGENT)] if CURL_CONFIG.CURL_USER_AGENT else []),
+        *([] if CURL_CONFIG.CURL_CHECK_SSL_VALIDITY else ['--insecure']),
     ]
     cmd = [
-        CURL_BINARY,
+        str(curl_binary.abspath),
         *dedupe(options),
         submit_url,
     ]
@@ -97,22 +89,22 @@ def save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, timeout: int=
     return ArchiveResult(
         cmd=cmd,
         pwd=str(out_dir),
-        cmd_version=CURL_VERSION,
+        cmd_version=str(curl_binary.version),
         output=output,
         status=status,
         **timer.stats,
     )
 
 
 @enforce_types
-def parse_archive_dot_org_response(response: bytes) -> Tuple[List[str], List[str]]:
+def parse_archive_dot_org_response(response: str) -> Tuple[List[str], List[str]]:
     # Parse archive.org response headers
     headers: Dict[str, List[str]] = defaultdict(list)
 
     # lowercase all the header names and store in dict
     for header in response.splitlines():
-        if b':' not in header or not header.strip():
+        if ':' not in header or not header.strip():
             continue
-        name, val = header.decode().split(':', 1)
+        name, val = header.split(':', 1)
         headers[name.lower().strip()].append(val.strip())
 
     # Get successful archive url in "content-location" header or any errors
diff --git a/archivebox/extractors/favicon.py b/archivebox/extractors/favicon.py
index 4121aa29..06bc1386 100644
--- a/archivebox/extractors/favicon.py
+++ b/archivebox/extractors/favicon.py
@@ -2,16 +2,11 @@ __package__ = 'archivebox.extractors'
 
 from pathlib import Path
 
-from typing import Optional
-
-from ..index.schema import Link, ArchiveResult, ArchiveOutput
 from archivebox.misc.system import chmod_file, run
-from archivebox.misc.util import (
-    enforce_types,
-    domain,
-    dedupe,
-)
-from ..config.legacy import CONFIG
+from archivebox.misc.util import enforce_types, domain, dedupe
+from archivebox.plugins_extractor.favicon.apps import FAVICON_CONFIG
+from archivebox.plugins_extractor.curl.apps import CURL_CONFIG, CURL_BINARY
+from ..index.schema import Link, ArchiveResult, ArchiveOutput
 from ..logging_util import TimedProgress
 
 
@@ -22,7 +17,7 @@ def should_save_favicon(link: Link, out_dir: str | Path | None=None, overwrite:
     if not overwrite and (out_dir / 'favicon.ico').exists():
         return False
 
-    return CONFIG.SAVE_FAVICON
+    return FAVICON_CONFIG.SAVE_FAVICON
 
 @enforce_types
 def get_output_path():
@@ -30,26 +25,29 @@ def get_output_path():
 
 
 @enforce_types
-def save_favicon(link: Link, out_dir: str | Path | None=None, timeout: int=CONFIG.TIMEOUT) -> ArchiveResult:
+def save_favicon(link: Link, out_dir: str | Path | None=None, timeout: int=CURL_CONFIG.CURL_TIMEOUT) -> ArchiveResult:
     """download site favicon from google's favicon api"""
 
+    curl_binary = CURL_BINARY.load()
+    assert curl_binary.abspath and curl_binary.version
+
     out_dir = Path(out_dir or link.link_dir)
     assert out_dir.exists()
 
     output: ArchiveOutput = 'favicon.ico'
     # later options take precedence
     options = [
-        *CONFIG.CURL_ARGS,
-        *CONFIG.CURL_EXTRA_ARGS,
+        *CURL_CONFIG.CURL_ARGS,
+        *CURL_CONFIG.CURL_EXTRA_ARGS,
         '--max-time', str(timeout),
         '--output', str(output),
-        *(['--user-agent', '{}'.format(CONFIG.CURL_USER_AGENT)] if CONFIG.CURL_USER_AGENT else []),
-        *([] if CONFIG.CHECK_SSL_VALIDITY else ['--insecure']),
+        *(['--user-agent', '{}'.format(CURL_CONFIG.CURL_USER_AGENT)] if CURL_CONFIG.CURL_USER_AGENT else []),
+        *([] if CURL_CONFIG.CURL_CHECK_SSL_VALIDITY else ['--insecure']),
     ]
     cmd = [
-        CONFIG.CURL_BINARY,
+        str(curl_binary.abspath),
         *dedupe(options),
-        CONFIG.FAVICON_PROVIDER.format(domain(link.url)),
+        FAVICON_CONFIG.FAVICON_PROVIDER.format(domain(link.url)),
     ]
     status = 'failed'
     timer = TimedProgress(timeout, prefix='      ')
@@ -65,7 +63,7 @@ def save_favicon(link: Link, out_dir: str | Path | None=None, timeout: int=CONFI
     return ArchiveResult(
         cmd=cmd,
         pwd=str(out_dir),
-        cmd_version=CONFIG.CURL_VERSION,
+        cmd_version=str(curl_binary.version),
         output=output,
         status=status,
         **timer.stats,
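Context note: both extractors now resolve curl through `CURL_BINARY.load()` and assert on `abspath`/`version` before building the command. A rough self-contained stand-in for that contract (the real resolution lives in `pydantic_pkgr`/`BaseBinary` with apt/brew/env binproviders; this class is hypothetical):

```python
import shutil
import subprocess
from dataclasses import dataclass

@dataclass
class BinaryStandIn:                      # hypothetical stand-in, not the real BaseBinary
    name: str
    abspath: str | None = None
    version: str | None = None

    def load(self) -> 'BinaryStandIn':
        self.abspath = shutil.which(self.name)   # env-provider-style $PATH lookup
        if self.abspath:
            out = subprocess.run([self.abspath, '--version'], capture_output=True, text=True)
            self.version = out.stdout.splitlines()[0] if out.stdout else None
        return self

curl = BinaryStandIn('curl').load()
assert curl.abspath and curl.version      # same precondition the extractors assert
```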
diff --git a/archivebox/extractors/git.py b/archivebox/extractors/git.py
index 90f82c00..2ae08064 100644
--- a/archivebox/extractors/git.py
+++ b/archivebox/extractors/git.py
@@ -4,7 +4,6 @@ __package__ = 'archivebox.extractors'
 from pathlib import Path
 from typing import Optional
 
-from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
 from archivebox.misc.system import run, chmod_file
 from archivebox.misc.util import (
     enforce_types,
@@ -14,8 +13,9 @@ from archivebox.misc.util import (
     without_query,
     without_fragment,
 )
-from ..config.legacy import CONFIG
+from archivebox.plugins_extractor.git.apps import GIT_CONFIG, GIT_BINARY
 from ..logging_util import TimedProgress
+from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
 
 
 def get_output_path():
@@ -42,28 +42,31 @@ def should_save_git(link: Link, out_dir: Optional[Path]=None, overwrite: Optiona
         return False
 
     is_clonable_url = (
-        (domain(link.url) in CONFIG.GIT_DOMAINS)
+        (domain(link.url) in GIT_CONFIG.GIT_DOMAINS)
         or (extension(link.url) == 'git')
     )
     if not is_clonable_url:
         return False
 
-    return CONFIG.SAVE_GIT
+    return GIT_CONFIG.SAVE_GIT
 
 
 @enforce_types
-def save_git(link: Link, out_dir: Optional[Path]=None, timeout: int=CONFIG.TIMEOUT) -> ArchiveResult:
+def save_git(link: Link, out_dir: Optional[Path]=None, timeout: int=GIT_CONFIG.GIT_TIMEOUT) -> ArchiveResult:
     """download full site using git"""
 
+    git_binary = GIT_BINARY.load()
+    assert git_binary.abspath and git_binary.version
+
     out_dir = out_dir or Path(link.link_dir)
     output: ArchiveOutput = get_output_path()
     output_path = out_dir / output
     output_path.mkdir(exist_ok=True)
     cmd = [
-        CONFIG.GIT_BINARY,
+        str(git_binary.abspath),
         'clone',
-        *CONFIG.GIT_ARGS,
-        *([] if CONFIG.CHECK_SSL_VALIDITY else ['-c', 'http.sslVerify=false']),
+        *GIT_CONFIG.GIT_ARGS,
+        *([] if GIT_CONFIG.GIT_CHECK_SSL_VALIDITY else ['-c', 'http.sslVerify=false']),
         without_query(without_fragment(link.url)),
     ]
     status = 'succeeded'
@@ -88,7 +91,7 @@ def save_git(link: Link, out_dir: Optional[Path]=None, timeout: int=CONFIG.TIMEO
     return ArchiveResult(
         cmd=cmd,
         pwd=str(out_dir),
-        cmd_version=CONFIG.GIT_VERSION,
+        cmd_version=str(git_binary.version),
         output=output,
         status=status,
         **timer.stats,
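Context note: `save_git()` strips the query string and fragment from the URL before cloning, via `without_query(without_fragment(link.url))`. Expressed with the standard library (an equivalent sketch, not the helpers' actual implementation):

```python
from urllib.parse import urlsplit, urlunsplit

def clone_url(url: str) -> str:
    scheme, netloc, path, _query, _fragment = urlsplit(url)
    return urlunsplit((scheme, netloc, path, '', ''))

assert clone_url('https://github.com/user/repo?tab=readme#install') == 'https://github.com/user/repo'
```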
diff --git a/archivebox/extractors/headers.py b/archivebox/extractors/headers.py
index 4c188587..85946619 100644
--- a/archivebox/extractors/headers.py
+++ b/archivebox/extractors/headers.py
@@ -4,23 +4,14 @@ from pathlib import Path
 
 from typing import Optional
 
-from ..index.schema import Link, ArchiveResult, ArchiveOutput
 from archivebox.misc.system import atomic_write
 from archivebox.misc.util import (
     enforce_types,
     get_headers,
     dedupe,
 )
-from ..config.legacy import (
-    TIMEOUT,
-    CURL_BINARY,
-    CURL_ARGS,
-    CURL_EXTRA_ARGS,
-    CURL_USER_AGENT,
-    CURL_VERSION,
-    CHECK_SSL_VALIDITY,
-    SAVE_HEADERS
-)
+from archivebox.plugins_extractor.curl.apps import CURL_CONFIG, CURL_BINARY
+from ..index.schema import Link, ArchiveResult, ArchiveOutput
 from ..logging_util import TimedProgress
 
 def get_output_path():
@@ -29,34 +20,38 @@ def get_output_path():
 
 @enforce_types
 def should_save_headers(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
-    out_dir = out_dir or Path(link.link_dir)
-    if not overwrite and (out_dir / get_output_path()).exists():
+    out_dir_path = Path(out_dir or link.link_dir)
+    assert out_dir_path
+    if not overwrite and (out_dir_path / get_output_path()).exists():
         return False
 
-    return SAVE_HEADERS
+    return CURL_CONFIG.SAVE_HEADERS
 
 
 @enforce_types
-def save_headers(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+def save_headers(link: Link, out_dir: Optional[str]=None, timeout: int=CURL_CONFIG.CURL_TIMEOUT) -> ArchiveResult:
     """Download site headers"""
 
-    out_dir = Path(out_dir or link.link_dir)
-    output_folder = out_dir.absolute()
+    curl_binary = CURL_BINARY.load()
+    assert curl_binary.abspath and curl_binary.version
+
+    out_dir_path = Path(out_dir or link.link_dir)
+    output_folder = out_dir_path.absolute()
     output: ArchiveOutput = get_output_path()
 
     status = 'succeeded'
-    timer = TimedProgress(timeout, prefix='      ')
+    timer = TimedProgress(timeout + 1, prefix='      ')
     # later options take precedence
     options = [
-        *CURL_ARGS,
-        *CURL_EXTRA_ARGS,
+        *CURL_CONFIG.CURL_ARGS,
+        *CURL_CONFIG.CURL_EXTRA_ARGS,
         '--head',
         '--max-time', str(timeout),
-        *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
-        *([] if CHECK_SSL_VALIDITY else ['--insecure']),
+        *(['--user-agent', '{}'.format(CURL_CONFIG.CURL_USER_AGENT)] if CURL_CONFIG.CURL_USER_AGENT else []),
+        *([] if CURL_CONFIG.CURL_CHECK_SSL_VALIDITY else ['--insecure']),
     ]
     cmd = [
-        CURL_BINARY,
+        str(curl_binary.abspath),
         *dedupe(options),
         link.url,
     ]
@@ -72,8 +67,8 @@ def save_headers(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT)
 
     return ArchiveResult(
         cmd=cmd,
-        pwd=str(out_dir),
-        cmd_version=CURL_VERSION,
+        pwd=str(out_dir_path),
+        cmd_version=str(curl_binary.version),
         output=output,
         status=status,
         **timer.stats,
diff --git a/archivebox/extractors/htmltotext.py b/archivebox/extractors/htmltotext.py
index 925b18a4..423f1601 100644
--- a/archivebox/extractors/htmltotext.py
+++ b/archivebox/extractors/htmltotext.py
@@ -5,18 +5,13 @@ import io
 from pathlib import Path
 from typing import Optional
 
-from archivebox.config import VERSION
-from ..config.legacy import (
-    SAVE_HTMLTOTEXT,
-    TIMEOUT,
-)
-from ..index.schema import Link, ArchiveResult, ArchiveError
-from ..logging_util import TimedProgress
+from archivebox.config import VERSION, ARCHIVING_CONFIG
+from archivebox.config.legacy import SAVE_HTMLTOTEXT
 from archivebox.misc.system import atomic_write
-from archivebox.misc.util import (
-    enforce_types,
-    is_static_file,
-)
+from archivebox.misc.util import enforce_types, is_static_file
+
+from ..logging_util import TimedProgress
+from ..index.schema import Link, ArchiveResult, ArchiveError
 from .title import get_html
 
 
@@ -122,7 +117,7 @@ def should_save_htmltotext(link: Link, out_dir: Optional[Path]=None, overwrite:
 
 
 @enforce_types
-def save_htmltotext(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+def save_htmltotext(link: Link, out_dir: Optional[Path]=None, timeout: int=ARCHIVING_CONFIG.TIMEOUT) -> ArchiveResult:
     """extract search-indexing-friendly text from an HTML document"""
 
     out_dir = Path(out_dir or link.link_dir)
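Context note: with the default `CURL_ARGS` plus `save_headers()`'s own flags, the assembled command comes out roughly as below; the "later options take precedence" comment suggests `dedupe()` resolves repeated flags in favor of the later occurrence (the abspath and URL here are placeholders, since the real abspath comes from `curl_binary.load()`):

```python
timeout = 60
options = [
    '--silent', '--location', '--compressed',   # CURL_ARGS defaults from CurlConfig
    '--head',                                   # fetch headers only, no body
    '--max-time', str(timeout),
]
cmd = ['/usr/bin/curl', *options, 'https://example.com']   # placeholder abspath + URL
```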
diff --git a/archivebox/extractors/title.py b/archivebox/extractors/title.py
index 7eb058be..fa528a97 100644
--- a/archivebox/extractors/title.py
+++ b/archivebox/extractors/title.py
@@ -5,23 +5,14 @@ from html.parser import HTMLParser
 from pathlib import Path
 from typing import Optional
 
-from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
 from archivebox.misc.util import (
     enforce_types,
     download_url,
     htmldecode,
     dedupe,
 )
-from ..config.legacy import (
-    TIMEOUT,
-    CHECK_SSL_VALIDITY,
-    SAVE_TITLE,
-    CURL_BINARY,
-    CURL_ARGS,
-    CURL_EXTRA_ARGS,
-    CURL_VERSION,
-    CURL_USER_AGENT,
-)
+from archivebox.plugins_extractor.curl.apps import CURL_CONFIG, CURL_BINARY
+from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
 from ..logging_util import TimedProgress
 
 
@@ -62,7 +53,7 @@ class TitleParser(HTMLParser):
 
 
 @enforce_types
-def get_html(link: Link, path: Path, timeout: int=TIMEOUT) -> str:
+def get_html(link: Link, path: Path, timeout: int=CURL_CONFIG.CURL_TIMEOUT) -> str:
     """
     Try to find wget, singlefile and then dom files.
     If none is found, download the url again.
     """
@@ -98,7 +89,7 @@ def should_save_title(link: Link, out_dir: Optional[str]=None, overwrite: Option
     if not overwrite and link.title and not link.title.lower().startswith('http'):
         return False
 
-    return SAVE_TITLE
+    return CURL_CONFIG.SAVE_TITLE
 
 def extract_title_with_regex(html):
     match = re.search(HTML_TITLE_REGEX, html)
@@ -106,22 +97,25 @@ def extract_title_with_regex(html):
     return output
 
 @enforce_types
-def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=CURL_CONFIG.CURL_TIMEOUT) -> ArchiveResult:
     """try to guess the page's title from its content"""
 
     from core.models import Snapshot
 
+    curl_binary = CURL_BINARY.load()
+    assert curl_binary.abspath and curl_binary.version
+
     output: ArchiveOutput = None
     # later options take precedence
     options = [
-        *CURL_ARGS,
-        *CURL_EXTRA_ARGS,
+        *CURL_CONFIG.CURL_ARGS,
+        *CURL_CONFIG.CURL_EXTRA_ARGS,
         '--max-time', str(timeout),
-        *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
-        *([] if CHECK_SSL_VALIDITY else ['--insecure']),
+        *(['--user-agent', '{}'.format(CURL_CONFIG.CURL_USER_AGENT)] if CURL_CONFIG.CURL_USER_AGENT else []),
+        *([] if CURL_CONFIG.CURL_CHECK_SSL_VALIDITY else ['--insecure']),
     ]
     cmd = [
-        CURL_BINARY,
+        str(curl_binary.abspath),
         *dedupe(options),
         link.url,
     ]
@@ -161,7 +155,7 @@ def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -
     return ArchiveResult(
         cmd=cmd,
         pwd=str(out_dir),
-        cmd_version=CURL_VERSION,
+        cmd_version=str(curl_binary.version),
         output=output,
         status=status,
         **timer.stats,
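Context note: title.py keeps a `TitleParser` (an `HTMLParser` subclass) alongside the `HTML_TITLE_REGEX` fallback. A cut-down analogue of the parser approach, runnable on its own (not the real class):

```python
from html.parser import HTMLParser

class MiniTitleParser(HTMLParser):
    """Cut-down analogue of TitleParser: collect text inside the first <title> tag."""
    def __init__(self):
        super().__init__()
        self.title = ''
        self._in_title = False

    def handle_starttag(self, tag, attrs):
        if tag == 'title' and not self.title:
            self._in_title = True

    def handle_data(self, data):
        if self._in_title:
            self.title += data

    def handle_endtag(self, tag):
        if tag == 'title':
            self._in_title = False

parser = MiniTitleParser()
parser.feed('<html><head><title>Example Domain</title></head><body></body></html>')
assert parser.title == 'Example Domain'
```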
diff --git a/archivebox/main.py b/archivebox/main.py
index 8e3a7200..1380cc8b 100755
--- a/archivebox/main.py
+++ b/archivebox/main.py
@@ -430,7 +430,7 @@ def init(force: bool=False, quick: bool=False, install: bool=False, out_dir: Pat
 def status(out_dir: Path=DATA_DIR) -> None:
     """Print out some info and statistics about the archive collection"""
 
-    check_data_folder(CONFIG)
+    check_data_folder()
 
     from core.models import Snapshot
     from django.contrib.auth import get_user_model
@@ -573,7 +573,7 @@ def add(urls: Union[str, List[str]],
         run_subcommand('init', stdin=None, pwd=out_dir)
 
     # Load list of links from the existing index
-    check_data_folder(CONFIG)
+    check_data_folder()
 
     # worker = start_cli_workers()
 
@@ -673,7 +673,7 @@ def remove(filter_str: Optional[str]=None,
            out_dir: Path=DATA_DIR) -> List[Link]:
     """Remove the specified URLs from the archive"""
 
-    check_data_folder(CONFIG)
+    check_data_folder()
 
     if snapshots is None:
         if filter_str and filter_patterns:
@@ -762,7 +762,7 @@ def update(resume: Optional[float]=None,
 
     # from .queues.supervisor_util import start_cli_workers
 
-    check_data_folder(CONFIG)
+    check_data_folder()
     # start_cli_workers()
 
     new_links: List[Link] = [] # TODO: Remove input argument: only_new
@@ -833,7 +833,7 @@ def list_all(filter_patterns_str: Optional[str]=None,
              out_dir: Path=DATA_DIR) -> Iterable[Link]:
     """List, filter, and export information about archive entries"""
 
-    check_data_folder(CONFIG)
+    check_data_folder()
 
     if filter_patterns and filter_patterns_str:
         stderr(
@@ -881,7 +881,7 @@ def list_links(snapshots: Optional[QuerySet]=None,
                before: Optional[float]=None,
                out_dir: Path=DATA_DIR) -> Iterable[Link]:
 
-    check_data_folder(CONFIG)
+    check_data_folder()
 
     if snapshots:
         all_snapshots = snapshots
@@ -905,7 +905,7 @@ def list_folders(links: List[Link],
                  status: str,
                  out_dir: Path=DATA_DIR) -> Dict[str, Optional[Link]]:
 
-    check_data_folder(CONFIG)
+    check_data_folder()
 
     STATUS_FUNCTIONS = {
         "indexed": get_indexed_folders,
@@ -926,7 +926,7 @@ def list_folders(links: List[Link],
         raise ValueError('Status not recognized.')
 
 @enforce_types
-def setup(out_dir: Path=DATA_DIR) -> None:
+def install(out_dir: Path=DATA_DIR) -> None:
     """Automatically install all ArchiveBox dependencies and extras"""
 
     from rich import print
@@ -937,40 +937,20 @@ def setup(out_dir: Path=DATA_DIR) -> None:
 
     stderr('\n[+] Installing ArchiveBox dependencies automatically...', color='green')
 
-    for binary in settings.BINARIES.values():
+    for binary in reversed(list(settings.BINARIES.values())):
        try:
             print(binary.load_or_install().model_dump(exclude={'binproviders_supported', 'loaded_binprovider', 'provider_overrides', 'loaded_abspaths', 'bin_dir', 'loaded_respath'}))
         except Exception as e:
             print(f'[X] Failed to install {binary.name}: {e}')
 
-    # from plugins_extractor.curl.apps import CURL_BINARY
-    # print(CURL_BINARY.load_or_install().model_dump(exclude={'binproviders_supported', 'loaded_binprovider', 'provider_overrides', 'loaded_abspaths', 'bin_dir', 'loaded_respath'}))
-
-    # from plugins_extractor.wget.apps import WGET_BINARY
-    # print(WGET_BINARY.load_or_install().model_dump(exclude={'binproviders_supported', 'loaded_binprovider', 'provider_overrides', 'loaded_abspaths', 'bin_dir', 'loaded_respath'}))
-
-    # from plugins_extractor.ytdlp.apps import YTDLP_BINARY
-    # print(YTDLP_BINARY.load_or_install().model_dump(exclude={'binproviders_supported', 'loaded_binprovider', 'provider_overrides', 'loaded_abspaths', 'bin_dir', 'loaded_respath'}))
-
-    # from plugins_extractor.chrome.apps import CHROME_BINARY
-    # print(CHROME_BINARY.load_or_install().model_dump(exclude={'binproviders_supported', 'loaded_binprovider', 'provider_overrides', 'loaded_abspaths', 'bin_dir', 'loaded_respath'}))
-
-    # from plugins_extractor.singlefile.apps import SINGLEFILE_BINARY
-    # print(SINGLEFILE_BINARY.load_or_install().model_dump(exclude={'binproviders_supported', 'loaded_binprovider', 'provider_overrides', 'loaded_abspaths', 'bin_dir', 'loaded_respath'}))
-
-    # from plugins_extractor.readability.apps import READABILITY_BINARY
-    # print(READABILITY_BINARY.load_or_install().model_dump(exclude={'binproviders_supported', 'loaded_binprovider', 'provider_overrides', 'loaded_abspaths', 'bin_dir', 'loaded_respath'}))
-
-    # from plugins_extractor.mercury.apps import MERCURY_BINARY
-    # print(MERCURY_BINARY.load_or_install().model_dump(exclude={'binproviders_supported', 'loaded_binprovider', 'provider_overrides', 'loaded_abspaths', 'bin_dir', 'loaded_respath'}))
-
     from django.contrib.auth import get_user_model
     User = get_user_model()
 
     if not User.objects.filter(is_superuser=True).exists():
-        stderr('\n[+] Creating new admin user for the Web UI...', color='green')
-        run_subcommand('manage', subcommand_args=['createsuperuser'], pwd=out_dir)
+        stderr('\n[+] Don\'t forget to create a new admin user for the Web UI...', color='green')
+        stderr('    archivebox manage createsuperuser')
+        # run_subcommand('manage', subcommand_args=['createsuperuser'], pwd=out_dir)
 
     stderr('\n[√] Set up ArchiveBox and its dependencies successfully.', color='green')
 
@@ -978,6 +958,10 @@ def setup(out_dir: Path=DATA_DIR) -> None:
     run_shell([ARCHIVEBOX_BINARY.load().abspath, '--version'], capture_output=False, cwd=out_dir)
 
 
+# backwards-compatibility:
+setup = install
+
+
 @enforce_types
 def config(config_options_str: Optional[str]=None,
            config_options: Optional[List[str]]=None,
@@ -989,7 +973,7 @@ def config(config_options_str: Optional[str]=None,
 
     from rich import print
 
-    check_data_folder(CONFIG)
+    check_data_folder()
     if config_options and config_options_str:
         stderr(
             '[X] You should either pass config values as an arguments '
@@ -1090,8 +1074,8 @@ def schedule(add: bool=False,
              out_dir: Path=DATA_DIR):
     """Set ArchiveBox to regularly import URLs at specific times using cron"""
 
-    check_data_folder(CONFIG)
-    from plugins_pkg.pip.apps import ARCHIVEBOX_BINARY
+    check_data_folder()
+    from archivebox.plugins_pkg.pip.apps import ARCHIVEBOX_BINARY
 
     Path(CONSTANTS.LOGS_DIR).mkdir(exist_ok=True)
 
@@ -1228,7 +1212,7 @@ def server(runserver_args: Optional[List[str]]=None,
 
     print()
 
-    check_data_folder(CONFIG)
+    check_data_folder()
 
     from django.core.management import call_command
     from django.contrib.auth.models import User
@@ -1280,7 +1264,7 @@ def server(runserver_args: Optional[List[str]]=None,
 def manage(args: Optional[List[str]]=None, out_dir: Path=DATA_DIR) -> None:
     """Run an ArchiveBox Django management command"""
 
-    check_data_folder(CONFIG)
+    check_data_folder()
     from django.core.management import execute_from_command_line
 
     if (args and "createsuperuser" in args) and (SHELL_CONFIG.IN_DOCKER and not SHELL_CONFIG.IS_TTY):
@@ -1297,7 +1281,7 @@ def manage(args: Optional[List[str]]=None, out_dir: Path=DATA_DIR) -> None:
 def shell(out_dir: Path=DATA_DIR) -> None:
     """Enter an interactive ArchiveBox Django shell"""
 
-    check_data_folder(CONFIG)
+    check_data_folder()
 
     from django.core.management import call_command
     call_command("shell_plus")
diff --git a/archivebox/misc/checks.py b/archivebox/misc/checks.py
index 69e0c52c..4dcf1f0a 100644
--- a/archivebox/misc/checks.py
+++ b/archivebox/misc/checks.py
@@ -1,13 +1,11 @@
 __package__ = 'archivebox.misc'
 
-from benedict import benedict
-
 from archivebox.config import DATA_DIR, ARCHIVE_DIR, CONSTANTS, SHELL_CONFIG
 
 from .logging import stderr
 
 
-def check_data_folder(config: benedict) -> None:
+def check_data_folder() -> None:
 
     archive_dir_exists = ARCHIVE_DIR.exists()
     if not archive_dir_exists:
@@ -23,7 +21,7 @@ def check_data_folder(config: benedict) -> None:
         raise SystemExit(2)
 
 
-def check_migrations(config: benedict):
+def check_migrations():
     from ..index.sql import list_migrations
 
     pending_migrations = [name for status, name in list_migrations() if not status]
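Context note: the `setup()` → `install()` rename in main.py keeps the old entrypoint alive with a plain module-level alias, so existing callers and the old subcommand name keep dispatching to the same function. The mechanism in isolation:

```python
def install(out_dir=None):
    """New canonical name for the dependency-install command."""
    ...

# backwards-compatibility: the old name stays bound to the same function object
setup = install
assert setup is install
```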
diff --git a/archivebox/plugins_extractor/curl/apps.py b/archivebox/plugins_extractor/curl/apps.py
index 4e4bfdea..cab683b5 100644
--- a/archivebox/plugins_extractor/curl/apps.py
+++ b/archivebox/plugins_extractor/curl/apps.py
@@ -1,10 +1,10 @@
 __package__ = 'plugins_extractor.curl'
 
-from typing import List, Optional, Dict
+from typing import List, Optional
 from pathlib import Path
 
 from pydantic import InstanceOf, Field
-from pydantic_pkgr import BinProvider, BinName, bin_abspath, BinProviderName, ProviderLookupDict
+from pydantic_pkgr import BinProvider, BinName
 
 from abx.archivebox.base_plugin import BasePlugin, BaseHook
 from abx.archivebox.base_configset import BaseConfigSet
@@ -12,15 +12,26 @@ from abx.archivebox.base_binary import BaseBinary, env, apt, brew
 # from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
 
 from archivebox.config import ARCHIVING_CONFIG
-
+from archivebox.plugins_extractor.favicon.apps import FAVICON_CONFIG
+from archivebox.plugins_extractor.archivedotorg.apps import ARCHIVEDOTORG_CONFIG
 
 class CurlConfig(BaseConfigSet):
-
-    SAVE_CURL: bool = True
-    # USE_CURL: bool = Field(default=lambda c: c.SAVE_HEADERS or c.SAVE_FAVICON)
+    SAVE_TITLE: bool = Field(default=True)
+    SAVE_HEADERS: bool = Field(default=True)
+    USE_CURL: bool = Field(default=lambda c:
+        ARCHIVEDOTORG_CONFIG.SAVE_ARCHIVE_DOT_ORG
+        or FAVICON_CONFIG.SAVE_FAVICON
+        or c.SAVE_HEADERS
+        or c.SAVE_TITLE
+    )
 
     CURL_BINARY: str = Field(default='curl')
+    CURL_ARGS: List[str] = [
+        '--silent',
+        '--location',
+        '--compressed',
+    ]
     CURL_EXTRA_ARGS: List[str] = []
 
     CURL_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
@@ -35,12 +46,6 @@ CURL_CONFIG = CurlConfig()
 class CurlBinary(BaseBinary):
     name: BinName = CURL_CONFIG.CURL_BINARY
     binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
-
-    provider_overrides: Dict[BinProviderName, ProviderLookupDict] = {
-        brew.name: {
-            'abspath': lambda: bin_abspath(CURL_CONFIG.CURL_BINARY, PATH=f'/opt/homebrew/opt/curl/bin:{brew.PATH}'),
-        },
-    }
 
 CURL_BINARY = CurlBinary()
diff --git a/archivebox/plugins_extractor/wget/apps.py b/archivebox/plugins_extractor/wget/apps.py
index e272df06..7cda7059 100644
--- a/archivebox/plugins_extractor/wget/apps.py
+++ b/archivebox/plugins_extractor/wget/apps.py
@@ -1,13 +1,13 @@
 __package__ = 'plugins_extractor.wget'
 
 import sys
-from typing import List, Optional, Dict
+from typing import List, Optional
 from pathlib import Path
 from subprocess import run, DEVNULL
 
 from rich import print
 from pydantic import InstanceOf, Field, model_validator
-from pydantic_pkgr import BinProvider, BinName, bin_abspath, BinProviderName, ProviderLookupDict
+from pydantic_pkgr import BinProvider, BinName
 
 from abx.archivebox.base_plugin import BasePlugin, BaseHook
 from abx.archivebox.base_configset import BaseConfigSet
@@ -80,12 +80,6 @@ WGET_CONFIG = WgetConfig()
 class WgetBinary(BaseBinary):
     name: BinName = WGET_CONFIG.WGET_BINARY
     binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
-
-    provider_overrides: Dict[BinProviderName, ProviderLookupDict] = {
-        brew.name: {
-            'abspath': lambda: bin_abspath(WGET_CONFIG.WGET_BINARY, PATH=f'/opt/homebrew/opt/wget/bin:{brew.PATH}'),
-        },
-    }
 
 WGET_BINARY = WgetBinary()
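Context note: `USE_CURL` above declares `Field(default=lambda c: ...)` where the lambda receives the config instance; that is `BaseConfigSet` behavior, not vanilla pydantic. A hypothetical sketch of how such instance-aware defaults can be resolved (names invented for illustration):

```python
from typing import Any, Callable

def resolve_callable_defaults(instance: Any, defaults: dict[str, Any]) -> None:
    # hypothetical resolver: vanilla pydantic does not pass the instance to defaults
    for name, default in defaults.items():
        value = default(instance) if isinstance(default, Callable) else default
        setattr(instance, name, value)

class FakeCurlConfig:          # invented stand-in, not the real CurlConfig
    SAVE_TITLE = True
    SAVE_HEADERS = False

cfg = FakeCurlConfig()
resolve_callable_defaults(cfg, {'USE_CURL': lambda c: c.SAVE_HEADERS or c.SAVE_TITLE})
assert cfg.USE_CURL is True    # curl stays enabled because SAVE_TITLE still needs it
```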
diff --git a/archivebox/search/__init__.py b/archivebox/search/__init__.py
index 37175512..0bb5d0c7 100644
--- a/archivebox/search/__init__.py
+++ b/archivebox/search/__init__.py
@@ -11,7 +11,7 @@ from archivebox.misc.util import enforce_types
 from archivebox.misc.logging import stderr
 from archivebox.config.legacy import ANSI
 
-# from archivebox.archivebox.config import settings.CONFIGS.SearchBackendConfig
+from archivebox.config import SEARCH_BACKEND_CONFIG
 
 
 def log_index_started(url):
@@ -58,13 +58,13 @@ def get_indexable_content(results: QuerySet):
 
 def import_backend():
     for backend in settings.SEARCH_BACKENDS.values():
-        if backend.name == settings.CONFIGS.SearchBackendConfig.SEARCH_BACKEND_ENGINE:
+        if backend.name == SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE:
             return backend
-    raise Exception(f'Could not load {settings.CONFIGS.SearchBackendConfig.SEARCH_BACKEND_ENGINE} as search backend')
+    raise Exception(f'Could not load {SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE} as search backend')
 
 @enforce_types
 def write_search_index(link: Link, texts: Union[List[str], None]=None, out_dir: Path=settings.DATA_DIR, skip_text_index: bool=False) -> None:
-    if not settings.CONFIGS.SearchBackendConfig.USE_INDEXING_BACKEND:
+    if not SEARCH_BACKEND_CONFIG.USE_INDEXING_BACKEND:
         return
 
     if not skip_text_index and texts:
@@ -86,7 +86,7 @@ def write_search_index(link: Link, texts: Union[List[str], None]=None, out_dir:
 def query_search_index(query: str, out_dir: Path=settings.DATA_DIR) -> QuerySet:
     from core.models import Snapshot
 
-    if settings.CONFIGS.SearchBackendConfig.USE_SEARCHING_BACKEND:
+    if SEARCH_BACKEND_CONFIG.USE_SEARCHING_BACKEND:
         backend = import_backend()
         try:
             snapshot_pks = backend.search(query)
@@ -106,7 +106,7 @@ def query_search_index(query: str, out_dir: Path=settings.DATA_DIR) -> QuerySet:
 
 @enforce_types
 def flush_search_index(snapshots: QuerySet):
-    if not settings.CONFIGS.SearchBackendConfig.USE_INDEXING_BACKEND or not snapshots:
+    if not SEARCH_BACKEND_CONFIG.USE_INDEXING_BACKEND or not snapshots:
         return
     backend = import_backend()
     snapshot_pks = (str(pk) for pk in snapshots.values_list('pk', flat=True))
diff --git a/archivebox/vendor/pydantic-pkgr b/archivebox/vendor/pydantic-pkgr
index 4f9486ab..4f31b355 160000
--- a/archivebox/vendor/pydantic-pkgr
+++ b/archivebox/vendor/pydantic-pkgr
@@ -1 +1 @@
-Subproject commit 4f9486ab86a65f83ad1bfd94320795b8e09871aa
+Subproject commit 4f31b355fbf319a54b38953795b17b1b04db4348
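Context note: the backend selection performed by `import_backend()` above is a plain name match over the registered backends, now keyed off `SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE`. Reduced to its essentials (registry contents here are illustrative, not the real backend objects):

```python
SEARCH_BACKENDS = {'ripgrep': 'ripgrep-backend', 'sqlite': 'sqlite-fts-backend'}  # illustrative

def import_backend(engine: str):
    for name, backend in SEARCH_BACKENDS.items():
        if name == engine:
            return backend
    raise Exception(f'Could not load {engine} as search backend')

assert import_backend('ripgrep') == 'ripgrep-backend'
```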