finish migrating almost all config to new system

Nick Sweeting 2024-09-30 23:21:34 -07:00
parent 4b6a2a3e50
commit d21bc86075
No known key found for this signature in database
25 changed files with 246 additions and 349 deletions

View file

@@ -13,43 +13,6 @@ HookType = Literal['CONFIG', 'BINPROVIDER', 'BINARY', 'EXTRACTOR', 'REPLAYER', '
 hook_type_names: Tuple[HookType] = get_args(HookType)
 class BaseHook(BaseModel):
-    """
-    A Plugin consists of a list of Hooks, applied to django.conf.settings when AppConfig.ready() -> Plugin.register() is called.
-    Plugin.register() then calls each Hook.register() on the provided settings.
-    Each Hook.register() function (ideally pure) takes a django.conf.settings as input and returns a new one back,
-    or it modifies django.conf.settings in-place to add changes corresponding to its HookType.
-    e.g. for a HookType.CONFIG, the Hook.register() function places the hook in settings.CONFIG (and settings.HOOKS).
-    An example of an impure Hook would be a CHECK that modifies settings but also calls django.core.checks.register(check).
-    In practice, any object that subclasses BaseHook and provides a .register() function can behave as a Hook.
-
-    setup_django() -> imports all settings.INSTALLED_APPS...
-        # django imports AppConfig, models, migrations, admins, etc. for all installed apps
-        # django then calls AppConfig.ready() on each installed app...
-        plugins_pkg.npm.NpmPlugin().AppConfig.ready()  # called by django
-            plugins_pkg.npm.NpmPlugin().register(settings) ->
-                plugins_pkg.npm.NpmConfigSet().register(settings)
-                    abx.archivebox.base_configset.BaseConfigSet().register(settings)
-                        abx.archivebox.base_hook.BaseHook().register(settings, parent_plugin=plugins_pkg.npm.NpmPlugin())
-                            ...
-        ...
-
-    Both core ArchiveBox code and plugin code depend on python >= 3.10 and django >= 5.0 w/ sqlite and a filesystem.
-    Core ArchiveBox code can depend only on python and the pip libraries it ships with, and can never depend on plugin code / node / other binaries.
-    Plugin code can depend on archivebox core, other django apps, other pip libraries, and other plugins.
-    Plugins can provide BinProviders + Binaries, which can depend on arbitrary other binaries / package managers like curl / wget / yt-dlp / etc.
-
-    The execution interface between plugins is simply calling builtinplugins.npm.... functions directly; django handles
-    importing all plugin code. There is no need to manually register methods/classes; registration is only needed to call
-    impure setup functions or to provide runtime state.
-
-    settings.CONFIGS / settings.BINPROVIDERS / settings.BINARIES / ... etc. are reserved for dynamic runtime state only.
-    This state is exposed to the broader system in a flat namespace, e.g. CONFIG.IS_DOCKER=True, or BINARIES = [
-        ..., Binary('node', abspath='/usr/local/bin/node', version='22.2.0'), ...
-    ]
-    """
     model_config = ConfigDict(
         extra="allow",
         arbitrary_types_allowed=True,
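The registration contract described in the docstring above boils down to roughly the following (a minimal sketch; ExampleConfigHook and its fields are hypothetical, only the general BaseHook shape is taken from this file):

    from typing import Literal
    from pydantic import BaseModel, ConfigDict

    class ExampleConfigHook(BaseModel):
        """Toy CONFIG-type hook that registers itself into the flat runtime namespaces."""
        model_config = ConfigDict(extra='allow', arbitrary_types_allowed=True)
        hook_type: Literal['CONFIG'] = 'CONFIG'
        IS_DOCKER: bool = False

        def register(self, settings, parent_plugin=None):
            # ideally pure-ish: only add the entries corresponding to this HookType
            settings.HOOKS = getattr(settings, 'HOOKS', {})
            settings.CONFIGS = getattr(settings, 'CONFIGS', {})
            settings.HOOKS[type(self).__name__] = self
            settings.CONFIGS[type(self).__name__] = self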

View file

@@ -13,7 +13,7 @@ from ..main import (
     schedule,
 )
 from archivebox.misc.util import ansi_to_html
-from ..config.legacy import ONLY_NEW
+from archivebox.config import ARCHIVING_CONFIG
 from .auth import API_AUTH_METHODS
@@ -58,7 +58,7 @@ class AddCommandSchema(Schema):
     urls: List[str]
     tag: str = ""
     depth: int = 0
-    update: bool = not ONLY_NEW  # Default to the opposite of ONLY_NEW
+    update: bool = not ARCHIVING_CONFIG.ONLY_NEW  # Default to the opposite of ARCHIVING_CONFIG.ONLY_NEW
     update_all: bool = False
     index_only: bool = False
     overwrite: bool = False
@@ -68,7 +68,7 @@ class AddCommandSchema(Schema):
 class UpdateCommandSchema(Schema):
     resume: Optional[float] = 0
-    only_new: bool = ONLY_NEW
+    only_new: bool = ARCHIVING_CONFIG.ONLY_NEW
     index_only: bool = False
     overwrite: bool = False
     after: Optional[float] = 0
@@ -85,7 +85,7 @@ class ScheduleCommandSchema(Schema):
     tag: str = ''
     depth: int = 0
     overwrite: bool = False
-    update: bool = not ONLY_NEW
+    update: bool = not ARCHIVING_CONFIG.ONLY_NEW
     clear: bool = False
 class ListCommandSchema(Schema):
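One subtlety with the new style: these Schema defaults are evaluated once, when the module is imported, so the API captures whatever ARCHIVING_CONFIG.ONLY_NEW is at import time. A sketch of the distinction (assuming django-ninja's Schema, as used in this file):

    from ninja import Schema
    from archivebox.config import ARCHIVING_CONFIG

    class AddCommandSchema(Schema):
        # captured once at class-definition time:
        update: bool = not ARCHIVING_CONFIG.ONLY_NEW

    def current_update_default() -> bool:
        # re-reads the live config value on every call instead:
        return not ARCHIVING_CONFIG.ONLY_NEW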

View file

@@ -152,18 +152,15 @@ def run_subcommand(subcommand: str,
     subcommand_args = subcommand_args or []
     if subcommand not in meta_cmds:
-        from ..config.legacy import setup_django, CONFIG
+        from archivebox.config.legacy import setup_django
         cmd_requires_db = subcommand in archive_cmds
         init_pending = '--init' in subcommand_args or '--quick-init' in subcommand_args
-        if cmd_requires_db:
-            check_data_folder(CONFIG)
         setup_django(in_memory_db=subcommand in fake_db, check_db=cmd_requires_db and not init_pending)
         if cmd_requires_db:
-            check_migrations(CONFIG)
+            check_migrations()
     module = import_module('.archivebox_{}'.format(subcommand), __package__)
     module.main(args=subcommand_args, stdin=stdin, pwd=pwd)  # type: ignore

View file

@@ -1,6 +1,6 @@
 __package__ = 'archivebox.config'
-from .constants import CONSTANTS, PACKAGE_DIR, DATA_DIR, ARCHIVE_DIR, VERSION
+from .constants import CONSTANTS, CONSTANTS_CONFIG, PACKAGE_DIR, DATA_DIR, ARCHIVE_DIR, VERSION
 from .defaults import (
     SHELL_CONFIG,
     STORAGE_CONFIG,
@@ -23,4 +23,5 @@ __all__ = [
     'SERVER_CONFIG',
     'ARCHIVING_CONFIG',
     'SEARCH_BACKEND_CONFIG',
+    'CONSTANTS_CONFIG',
 ]
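CONSTANTS_CONFIG itself isn't shown in this diff; from how views.py uses it below (key in CONSTANTS_CONFIG, CONSTANTS_CONFIG[key]) it is presumably a flat, dict-like view over CONSTANTS, roughly:

    # hypothetical sketch, inferred from usage elsewhere in this commit:
    CONSTANTS_CONFIG = {
        key: value
        for key, value in CONSTANTS.items()
        if key.isupper()
    }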

View file

@@ -60,6 +60,7 @@ from .defaults import SHELL_CONFIG, GENERAL_CONFIG, ARCHIVING_CONFIG, SERVER_CON
 from archivebox.plugins_auth.ldap.apps import LDAP_CONFIG
 from archivebox.plugins_extractor.favicon.apps import FAVICON_CONFIG
 from archivebox.plugins_extractor.wget.apps import WGET_CONFIG
+from archivebox.plugins_extractor.curl.apps import CURL_CONFIG
 ANSI = SHELL_CONFIG.ANSI
 LDAP = LDAP_CONFIG.LDAP_ENABLED
@@ -81,9 +82,11 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
     'LDAP_CONFIG': LDAP_CONFIG.as_legacy_config_schema(),
-    'FAVICON_CONFIG': FAVICON_CONFIG.as_legacy_config_schema(),
-    'WGET_CONFIG': WGET_CONFIG.as_legacy_config_schema(),
+    # 'FAVICON_CONFIG': FAVICON_CONFIG.as_legacy_config_schema(),
+    # 'WGET_CONFIG': WGET_CONFIG.as_legacy_config_schema(),
+    # 'CURL_CONFIG': CURL_CONFIG.as_legacy_config_schema(),
     'ARCHIVE_METHOD_TOGGLES': {
@@ -109,7 +112,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
     'ARCHIVE_METHOD_OPTIONS': {
         'RESOLUTION': {'type': str, 'default': '1440,2000', 'aliases': ('SCREENSHOT_RESOLUTION','WINDOW_SIZE')},
-        'GIT_DOMAINS': {'type': str, 'default': 'github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht'},
+        # 'GIT_DOMAINS': {'type': str, 'default': 'github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht'},
         'CHECK_SSL_VALIDITY': {'type': bool, 'default': True},
         'MEDIA_MAX_SIZE': {'type': str, 'default': '750m'},
@@ -144,15 +147,6 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
         ]},
         'YOUTUBEDL_EXTRA_ARGS': {'type': list, 'default': None},
-        'CURL_ARGS': {'type': list, 'default': ['--silent',
-                                                '--location',
-                                                '--compressed'
-                                               ]},
-        'CURL_EXTRA_ARGS': {'type': list, 'default': None},
-        'GIT_ARGS': {'type': list, 'default': ['--recursive']},
-        'SINGLEFILE_ARGS': {'type': list, 'default': None},
-        'SINGLEFILE_EXTRA_ARGS': {'type': list, 'default': None},
     },
     'DEPENDENCY_CONFIG': {
@@ -164,9 +158,9 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
         'USE_YOUTUBEDL': {'type': bool, 'default': True},
         'USE_RIPGREP': {'type': bool, 'default': True},
-        'CURL_BINARY': {'type': str, 'default': 'curl'},
-        'GIT_BINARY': {'type': str, 'default': 'git'},
-        'NODE_BINARY': {'type': str, 'default': 'node'},
+        # 'GIT_BINARY': {'type': str, 'default': 'git'},
+        # 'CURL_BINARY': {'type': str, 'default': 'curl'},
+        # 'NODE_BINARY': {'type': str, 'default': 'node'},
         # 'YOUTUBEDL_BINARY': {'type': str, 'default': 'yt-dlp'},  # also can accept youtube-dl
         # 'SINGLEFILE_BINARY': {'type': str, 'default': lambda c: bin_path('single-file')},
         # 'READABILITY_BINARY': {'type': str, 'default': lambda c: bin_path('readability-extractor')},
@@ -209,21 +203,12 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
     'URL_DENYLIST_PTN': {'default': lambda c: c['URL_DENYLIST'] and re.compile(c['URL_DENYLIST'] or '', CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS)},
     'URL_ALLOWLIST_PTN': {'default': lambda c: c['URL_ALLOWLIST'] and re.compile(c['URL_ALLOWLIST'] or '', CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS)},
-    'USE_CURL': {'default': lambda c: c['USE_CURL'] and (c['SAVE_FAVICON'] or c['SAVE_TITLE'] or c['SAVE_ARCHIVE_DOT_ORG'])},
-    'CURL_VERSION': {'default': lambda c: bin_version(c['CURL_BINARY']) if c['USE_CURL'] else None},
-    # 'CURL_USER_AGENT': {'default': lambda c: c['CURL_USER_AGENT'].format(**c)},
-    'CURL_ARGS': {'default': lambda c: c['CURL_ARGS'] or []},
-    'CURL_EXTRA_ARGS': {'default': lambda c: c['CURL_EXTRA_ARGS'] or []},
-    'SAVE_FAVICON': {'default': lambda c: c['USE_CURL'] and c['SAVE_FAVICON']},
-    'SAVE_ARCHIVE_DOT_ORG': {'default': lambda c: c['USE_CURL'] and c['SAVE_ARCHIVE_DOT_ORG']},
-    'USE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
-    'GIT_VERSION': {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None},
-    'SAVE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
-    'DEPENDENCIES': {'default': lambda c: get_dependency_info(c)},
+    # 'USE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
+    # 'GIT_VERSION': {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None},
+    # 'SAVE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
+    # 'DEPENDENCIES': {'default': lambda c: get_dependency_info(c)},
     # 'CODE_LOCATIONS': {'default': lambda c: get_code_locations(c)},
     # 'DATA_LOCATIONS': {'default': lambda c: get_data_locations(c)},
@@ -613,13 +598,13 @@ def get_dependency_info(config: benedict) -> ConfigValue:
     #     'is_valid': True,
     # },
-    'CURL_BINARY': {
-        'path': bin_path(config['CURL_BINARY']),
-        'version': config['CURL_VERSION'],
-        'hash': bin_hash(config['CURL_BINARY']),
-        'enabled': config['USE_CURL'],
-        'is_valid': bool(config['CURL_VERSION']),
-    },
+    # 'CURL_BINARY': {
+    #     'path': bin_path(config['CURL_BINARY']),
+    #     'version': config['CURL_VERSION'],
+    #     'hash': bin_hash(config['CURL_BINARY']),
+    #     'enabled': config['USE_CURL'],
+    #     'is_valid': bool(config['CURL_VERSION']),
+    # },
     # 'WGET_BINARY': {
     #     'path': bin_path(config['WGET_BINARY']),
     #     'version': config['WGET_VERSION'],
@@ -641,13 +626,13 @@ def get_dependency_info(config: benedict) -> ConfigValue:
     #     'enabled': config['USE_MERCURY'],
     #     'is_valid': bool(config['MERCURY_VERSION']),
     # },
-    'GIT_BINARY': {
-        'path': bin_path(config['GIT_BINARY']),
-        'version': config['GIT_VERSION'],
-        'hash': bin_hash(config['GIT_BINARY']),
-        'enabled': config['USE_GIT'],
-        'is_valid': bool(config['GIT_VERSION']),
-    },
+    # 'GIT_BINARY': {
+    #     'path': bin_path(config['GIT_BINARY']),
+    #     'version': config['GIT_VERSION'],
+    #     'hash': bin_hash(config['GIT_BINARY']),
+    #     'enabled': config['USE_GIT'],
+    #     'is_valid': bool(config['GIT_VERSION']),
+    # },
     # 'SINGLEFILE_BINARY': {
     #     'path': bin_path(config['SINGLEFILE_BINARY']),
     #     'version': config['SINGLEFILE_VERSION'],
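The legacy CURL_* keys deleted above now live on the curl plugin's ConfigSet. Its definition is not part of this diff, but based on the CURL_CONFIG fields referenced by the extractors below, it presumably looks roughly like this (a sketch, not the real plugins_extractor/curl/apps.py; the base class and default values are assumptions):

    from typing import List, Optional
    from pydantic import BaseModel

    class CurlConfig(BaseModel):
        SAVE_HEADERS: bool = True
        SAVE_TITLE: bool = True
        CURL_ARGS: List[str] = ['--silent', '--location', '--compressed']
        CURL_EXTRA_ARGS: List[str] = []
        CURL_TIMEOUT: int = 60
        CURL_CHECK_SSL_VALIDITY: bool = True
        CURL_USER_AGENT: Optional[str] = None

    CURL_CONFIG = CurlConfig()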

View file

@@ -76,7 +76,7 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
     relevant_configs = {
         key: val
-        for key, val in settings.CONFIG.items()
+        for key, val in settings.FLAT_CONFIG.items()
         if '_BINARY' in key or '_VERSION' in key
     }
@@ -105,6 +105,7 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
         f'<a href="/admin/environment/config/{config_key}/">{config_key}</a>'
         for config_key, config_value in relevant_configs.items()
         if str(binary.name).lower().replace('-', '').replace('_', '').replace('ytdlp', 'youtubedl') in config_key.lower()
+            or config_value.lower().endswith(binary.name.lower())
         # or binary.name.lower().replace('-', '').replace('_', '') in str(config_value).lower()
     )))
     # if not binary.provider_overrides:

View file

@@ -36,7 +36,7 @@ from main import remove
 from extractors import archive_links
-CONFIG = settings.CONFIG
+CONFIG = settings.FLAT_CONFIG
 GLOBAL_CONTEXT = {'VERSION': VERSION, 'VERSIONS_AVAILABLE': [], 'CAN_UPGRADE': False}

View file

@@ -1,13 +1,11 @@
 __package__ = 'archivebox.core'
-from ..config.legacy import (
-    LDAP
-)
+from archivebox.plugins_auth.ldap.apps import LDAP_CONFIG
 def register_signals():
-    if LDAP:
+    if LDAP_CONFIG.LDAP_ENABLED:
         import django_auth_ldap.backend
         from .auth_ldap import create_user

View file

@@ -1,9 +1,7 @@
-from ..config.legacy import (
-    LDAP_CREATE_SUPERUSER
-)
+from archivebox.plugins_auth.ldap.apps import LDAP_CONFIG
 def create_user(sender, user=None, ldap_user=None, **kwargs):
-    if not user.id and LDAP_CREATE_SUPERUSER:
+    if not user.id and LDAP_CONFIG.LDAP_CREATE_SUPERUSER:
         user.is_superuser = True
         user.is_staff = True

View file

@@ -5,7 +5,7 @@ from django.utils import timezone
 from django.contrib.auth.middleware import RemoteUserMiddleware
 from django.core.exceptions import ImproperlyConfigured
-from ..config.legacy import PUBLIC_SNAPSHOTS, REVERSE_PROXY_USER_HEADER, REVERSE_PROXY_WHITELIST
+from archivebox.config import SERVER_CONFIG
 def detect_timezone(request, activate: bool=True):
@@ -32,7 +32,7 @@ def CacheControlMiddleware(get_response):
         response = get_response(request)
         if '/archive/' in request.path or '/static/' in request.path:
-            policy = 'public' if PUBLIC_SNAPSHOTS else 'private'
+            policy = 'public' if SERVER_CONFIG.PUBLIC_SNAPSHOTS else 'private'
             response['Cache-Control'] = f'{policy}, max-age=60, stale-while-revalidate=300'
             # print('Set Cache-Control header to', response['Cache-Control'])
         return response
@@ -40,15 +40,15 @@ def CacheControlMiddleware(get_response):
     return middleware
 class ReverseProxyAuthMiddleware(RemoteUserMiddleware):
-    header = 'HTTP_{normalized}'.format(normalized=REVERSE_PROXY_USER_HEADER.replace('-', '_').upper())
+    header = 'HTTP_{normalized}'.format(normalized=SERVER_CONFIG.REVERSE_PROXY_USER_HEADER.replace('-', '_').upper())
     def process_request(self, request):
-        if REVERSE_PROXY_WHITELIST == '':
+        if SERVER_CONFIG.REVERSE_PROXY_WHITELIST == '':
             return
         ip = request.META.get('REMOTE_ADDR')
-        for cidr in REVERSE_PROXY_WHITELIST.split(','):
+        for cidr in SERVER_CONFIG.REVERSE_PROXY_WHITELIST.split(','):
             try:
                 network = ipaddress.ip_network(cidr)
             except ValueError:
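The trust check this middleware performs is essentially the following (an illustrative sketch of the logic shown above, with a hypothetical helper name):

    import ipaddress

    def ip_in_allowlist(remote_addr: str, allowlist_csv: str) -> bool:
        # only honor the reverse-proxy auth header for IPs inside an allowlisted CIDR
        for cidr in allowlist_csv.split(','):
            try:
                network = ipaddress.ip_network(cidr)
            except ValueError:
                continue  # skip malformed entries
            if ipaddress.ip_address(remote_addr) in network:
                return True
        return False

    assert ip_in_allowlist('10.0.0.5', '10.0.0.0/8,192.168.0.0/16')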

View file

@@ -13,9 +13,7 @@ import abx.archivebox
 import abx.archivebox.use
 import abx.django.use
-from archivebox.config import VERSION, DATA_DIR, PACKAGE_DIR, ARCHIVE_DIR, CONSTANTS  # noqa
-from ..config.legacy import CONFIG
+from archivebox.config import VERSION, DATA_DIR, PACKAGE_DIR, ARCHIVE_DIR, CONSTANTS, SHELL_CONFIG, SERVER_CONFIG  # noqa
 IS_MIGRATING = 'makemigrations' in sys.argv[:3] or 'migrate' in sys.argv[:3]
 IS_TESTING = 'test' in sys.argv[:3] or 'PYTEST_CURRENT_TEST' in os.environ
@@ -80,7 +78,7 @@ LOGOUT_REDIRECT_URL = os.environ.get('LOGOUT_REDIRECT_URL', '/')
 PASSWORD_RESET_URL = '/accounts/password_reset/'
 APPEND_SLASH = True
-DEBUG = CONFIG.DEBUG or ('--debug' in sys.argv)
+DEBUG = SHELL_CONFIG.DEBUG or ('--debug' in sys.argv)
 INSTALLED_APPS = [
@@ -364,10 +362,10 @@ STORAGES = {
 ### Security Settings
 ################################################################################
-SECRET_KEY = CONFIG.SECRET_KEY or get_random_string(50, 'abcdefghijklmnopqrstuvwxyz0123456789_')
-ALLOWED_HOSTS = CONFIG.ALLOWED_HOSTS.split(',')
-CSRF_TRUSTED_ORIGINS = list(set(CONFIG.CSRF_TRUSTED_ORIGINS.split(',')))
+SECRET_KEY = SERVER_CONFIG.SECRET_KEY or get_random_string(50, 'abcdefghijklmnopqrstuvwxyz0123456789_')
+ALLOWED_HOSTS = SERVER_CONFIG.ALLOWED_HOSTS.split(',')
+CSRF_TRUSTED_ORIGINS = list(set(SERVER_CONFIG.CSRF_TRUSTED_ORIGINS.split(',')))
 # automatically fix case when user sets ALLOWED_HOSTS (e.g. to archivebox.example.com)
 # but forgets to add https://archivebox.example.com to CSRF_TRUSTED_ORIGINS

View file

@@ -10,7 +10,7 @@ from .views import HomepageView, SnapshotView, PublicIndexView, AddView, HealthC
 from .serve_static import serve_static
 # GLOBAL_CONTEXT doesn't work as-is, disabled for now: https://github.com/ArchiveBox/ArchiveBox/discussions/1306
-# from .config.legacy import VERSION, VERSIONS_AVAILABLE, CAN_UPGRADE
+# from archivebox.config import VERSION, VERSIONS_AVAILABLE, CAN_UPGRADE
 # GLOBAL_CONTEXT = {'VERSION': VERSION, 'VERSIONS_AVAILABLE': VERSIONS_AVAILABLE, 'CAN_UPGRADE': CAN_UPGRADE}

View file

@@ -1,7 +1,7 @@
 __package__ = 'archivebox.core'
-from typing import Callable
-from benedict import benedict
+import inspect
+from typing import Callable, get_type_hints
 from pathlib import Path
 from django.shortcuts import render, redirect
@@ -27,21 +27,13 @@ from core.admin import result_url
 from queues.tasks import bg_add
-from archivebox.config import CONSTANTS, DATA_DIR, VERSION, SHELL_CONFIG, SERVER_CONFIG
-from ..plugins_extractor.archivedotorg.apps import ARCHIVEDOTORG_CONFIG
-from ..config.legacy import (
-    CONFIG_SCHEMA,
-    DYNAMIC_CONFIG_SCHEMA,
-    USER_CONFIG,
-    CONFIG,
-)
-from ..logging_util import printable_filesize
+from archivebox.config import CONSTANTS_CONFIG, DATA_DIR, VERSION, SHELL_CONFIG, SERVER_CONFIG
 from archivebox.misc.util import base_url, htmlencode, ts_to_date_str
-from ..search import query_search_index
-from .serve_static import serve_static_with_byterange_support
-
-CONFIG = benedict({**CONSTANTS, **CONFIG, **settings.FLAT_CONFIG})
+from .serve_static import serve_static_with_byterange_support
+from ..plugins_extractor.archivedotorg.apps import ARCHIVEDOTORG_CONFIG
+from ..logging_util import printable_filesize
+from ..search import query_search_index
 class HomepageView(View):
@@ -502,27 +494,43 @@ class HealthCheckView(View):
 def find_config_section(key: str) -> str:
-    if key in CONSTANTS:
+    if key in CONSTANTS_CONFIG:
         return 'CONSTANT'
     matching_sections = [
-        name for name, opts in CONFIG_SCHEMA.items() if key in opts
+        section.id for section in settings.CONFIGS.values() if key in section.model_fields
     ]
     section = matching_sections[0] if matching_sections else 'DYNAMIC'
     return section
 def find_config_default(key: str) -> str:
-    default_val = USER_CONFIG.get(key, {}).get('default', lambda: None)
+    if key in CONSTANTS_CONFIG:
+        return str(CONSTANTS_CONFIG[key])
+    default_val = None
+    for config in settings.CONFIGS.values():
+        if key in config.model_fields:
+            default_val = config.model_fields[key].default
+            break
     if isinstance(default_val, Callable):
-        return None
+        default_val = inspect.getsource(default_val).split('lambda', 1)[-1].split(':', 1)[-1].replace('\n', ' ').strip()
+        if default_val.count(')') > default_val.count('('):
+            default_val = default_val[:-1]
     else:
-        default_val = repr(default_val)
+        default_val = str(default_val)
     return default_val
 def find_config_type(key: str) -> str:
-    if key in USER_CONFIG:
-        return str(USER_CONFIG[key]['type'])
-    elif key in DYNAMIC_CONFIG_SCHEMA:
-        return str(type(CONFIG[key]))
+    for config in settings.CONFIGS.values():
+        if hasattr(config, key):
+            type_hints = get_type_hints(config)
+            try:
+                return str(type_hints[key].__name__)
+            except AttributeError:
+                return str(type_hints[key])
     return 'str'
 def key_is_safe(key: str) -> bool:
@@ -543,40 +551,29 @@ def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext:
         "Value": [],
         "Default": [],
         # "Documentation": [],
-        "Aliases": [],
+        # "Aliases": [],
     }
-    for section in CONFIG_SCHEMA.keys():
-        for key in CONFIG_SCHEMA[section].keys():
-            rows['Section'].append(section)   # section.replace('_', ' ').title().replace(' Config', '')
-            rows['Key'].append(ItemLink(key, key=key))
-            rows['Type'].append(mark_safe(f'<code>{find_config_type(key)}</code>'))
-            rows['Value'].append(mark_safe(f'<code>{CONFIG[key]}</code>') if key_is_safe(key) else '******** (redacted)')
-            rows['Default'].append(mark_safe(f'<a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig.py+{key}&type=code"><code style="text-decoration: underline">{find_config_default(key) or "See here..."}</code></a>'))
-            # rows['Documentation'].append(mark_safe(f'Wiki: <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#{key.lower()}">{key}</a>'))
-            rows['Aliases'].append(', '.join(CONFIG_SCHEMA[section][key].get('aliases', [])))
-
-    section = 'DYNAMIC'
-    for key in DYNAMIC_CONFIG_SCHEMA.keys():
-        if key in CONSTANTS:
-            continue
-        rows['Section'].append(section)   # section.replace('_', ' ').title().replace(' Config', '')
-        rows['Key'].append(ItemLink(key, key=key))
-        rows['Type'].append(mark_safe(f'<code>{find_config_type(key)}</code>'))
-        rows['Value'].append(mark_safe(f'<code>{CONFIG[key]}</code>') if key_is_safe(key) else '******** (redacted)')
-        rows['Default'].append(mark_safe(f'<a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig.py+{key}&type=code"><code style="text-decoration: underline">{find_config_default(key) or "See here..."}</code></a>'))
-        # rows['Documentation'].append(mark_safe(f'Wiki: <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#{key.lower()}">{key}</a>'))
-        rows['Aliases'].append(ItemLink(key, key=key) if key in USER_CONFIG else '')
+    for section in reversed(list(settings.CONFIGS.values())):
+        for key, field in section.model_fields.items():
+            rows['Section'].append(section.id)   # section.replace('_', ' ').title().replace(' Config', '')
+            rows['Key'].append(ItemLink(key, key=key))
+            rows['Type'].append(format_html('<code>{}</code>', find_config_type(key)))
+            rows['Value'].append(mark_safe(f'<code>{getattr(section, key)}</code>') if key_is_safe(key) else '******** (redacted)')
+            rows['Default'].append(mark_safe(f'<a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig+{key}&type=code"><code style="text-decoration: underline">{find_config_default(key) or "See here..."}</code></a>'))
+            # rows['Documentation'].append(mark_safe(f'Wiki: <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#{key.lower()}">{key}</a>'))
+            # rows['Aliases'].append(', '.join(find_config_aliases(key)))
     section = 'CONSTANT'
-    for key in CONSTANTS.keys():
+    for key in CONSTANTS_CONFIG.keys():
         rows['Section'].append(section)   # section.replace('_', ' ').title().replace(' Config', '')
         rows['Key'].append(ItemLink(key, key=key))
-        rows['Type'].append(mark_safe(f'<code>{find_config_type(key)}</code>'))
-        rows['Value'].append(mark_safe(f'<code>{CONFIG[key]}</code>') if key_is_safe(key) else '******** (redacted)')
-        rows['Default'].append(mark_safe(f'<a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig.py+{key}&type=code"><code style="text-decoration: underline">{find_config_default(key) or "See here..."}</code></a>'))
+        rows['Type'].append(format_html('<code>{}</code>', getattr(type(CONSTANTS_CONFIG[key]), '__name__', repr(CONSTANTS_CONFIG[key]))))
+        rows['Value'].append(format_html('<code>{}</code>', CONSTANTS_CONFIG[key]) if key_is_safe(key) else '******** (redacted)')
+        rows['Default'].append(mark_safe(f'<a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig+{key}&type=code"><code style="text-decoration: underline">{find_config_default(key) or "See here..."}</code></a>'))
         # rows['Documentation'].append(mark_safe(f'Wiki: <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#{key.lower()}">{key}</a>'))
-        rows['Aliases'].append(ItemLink(key, key=key) if key in USER_CONFIG else '')
+        # rows['Aliases'].append('')
     return TableContext(
@@ -589,11 +586,12 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont
     assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
-    aliases = USER_CONFIG.get(key, {}).get("aliases", [])
+    # aliases = USER_CONFIG.get(key, {}).get("aliases", [])
+    aliases = []
-    if key in CONSTANTS:
+    if key in CONSTANTS_CONFIG:
         section_header = mark_safe(f'[CONSTANTS] &nbsp; <b><code style="color: lightgray">{key}</code></b> &nbsp; <small>(read-only, hardcoded by ArchiveBox)</small>')
-    elif key in USER_CONFIG:
+    elif key in settings.FLAT_CONFIG:
         section_header = mark_safe(f'data / ArchiveBox.conf &nbsp; [{find_config_section(key)}] &nbsp; <b><code style="color: lightgray">{key}</code></b>')
     else:
         section_header = mark_safe(f'[DYNAMIC CONFIG] &nbsp; <b><code style="color: lightgray">{key}</code></b> &nbsp; <small>(read-only, calculated at runtime)</small>')
@@ -609,7 +607,7 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont
         "fields": {
             'Key': key,
             'Type': find_config_type(key),
-            'Value': CONFIG[key] if key_is_safe(key) else '********',
+            'Value': settings.FLAT_CONFIG[key] if key_is_safe(key) else '********',
         },
         "help_texts": {
             'Key': mark_safe(f'''
@@ -619,25 +617,25 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont
                 </span>
             '''),
             'Type': mark_safe(f'''
-                <a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig.py+{key}&type=code">
-                    See full definition in <code>archivebox/config.py</code>...
+                <a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig+{key}&type=code">
+                    See full definition in <code>archivebox/config</code>...
                 </a>
             '''),
             'Value': mark_safe(f'''
                 {'<b style="color: red">Value is redacted for your security. (Passwords, secrets, API tokens, etc. cannot be viewed in the Web UI)</b><br/><br/>' if not key_is_safe(key) else ''}
                 <br/><hr/><br/>
                 Default: &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;
-                <a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig.py+{key}&type=code">
+                <a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig+{key}&type=code">
                     <code>{find_config_default(key) or '↗️ See in ArchiveBox source code...'}</code>
                 </a>
                 <br/><br/>
-                <p style="display: {"block" if key in USER_CONFIG else "none"}">
+                <p style="display: {"block" if key in settings.FLAT_CONFIG else "none"}">
                     <i>To change this value, edit <code>data/ArchiveBox.conf</code> or run:</i>
                     <br/><br/>
                     <code>archivebox config --set {key}="{
                         val.strip("'")
                         if (val := find_config_default(key)) else
-                        (repr(CONFIG[key] if key_is_safe(key) else '********')).strip("'")
+                        (repr(settings.FLAT_CONFIG[key] if key_is_safe(key) else '********')).strip("'")
                     }"</code>
                 </p>
             '''),
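All three helpers above now introspect the live pydantic config sections instead of the legacy schema dicts. The underlying pattern looks like this (a self-contained sketch with a stand-in section, not ArchiveBox's real ConfigSet class):

    from typing import get_type_hints
    from pydantic import BaseModel

    class ExampleServerConfig(BaseModel):      # stand-in for one settings.CONFIGS section
        PUBLIC_SNAPSHOTS: bool = True
        REVERSE_PROXY_WHITELIST: str = ''

    section = ExampleServerConfig()
    print(ExampleServerConfig.model_fields['PUBLIC_SNAPSHOTS'].default)      # -> True
    print(get_type_hints(ExampleServerConfig)['PUBLIC_SNAPSHOTS'].__name__)  # -> 'bool'
    print(getattr(section, 'PUBLIC_SNAPSHOTS'))                              # -> True (live value)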

View file

@@ -7,21 +7,10 @@ from collections import defaultdict
 from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
 from archivebox.misc.system import run, chmod_file
-from archivebox.misc.util import (
-    enforce_types,
-    is_static_file,
-    dedupe,
-)
-from ..config.legacy import (
-    TIMEOUT,
-    CURL_ARGS,
-    CURL_EXTRA_ARGS,
-    CHECK_SSL_VALIDITY,
-    SAVE_ARCHIVE_DOT_ORG,
-    CURL_BINARY,
-    CURL_VERSION,
-    CURL_USER_AGENT,
-)
+from archivebox.misc.util import enforce_types, is_static_file, dedupe
+from archivebox.plugins_extractor.archivedotorg.apps import ARCHIVEDOTORG_CONFIG
+from archivebox.plugins_extractor.curl.apps import CURL_CONFIG, CURL_BINARY
 from ..logging_util import TimedProgress
@@ -39,27 +28,30 @@ def should_save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, overwr
     # if open(path, 'r', encoding='utf-8').read().strip() != 'None':
         return False
-    return SAVE_ARCHIVE_DOT_ORG
+    return ARCHIVEDOTORG_CONFIG.SAVE_ARCHIVE_DOT_ORG
 @enforce_types
-def save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+def save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, timeout: int=CURL_CONFIG.CURL_TIMEOUT) -> ArchiveResult:
     """submit site to archive.org for archiving via their service, save returned archive url"""
+    curl_binary = CURL_BINARY.load()
+    assert curl_binary.abspath and curl_binary.version
     out_dir = out_dir or Path(link.link_dir)
     output: ArchiveOutput = get_output_path()
     archive_org_url = None
     submit_url = 'https://web.archive.org/save/{}'.format(link.url)
     # later options take precedence
     options = [
-        *CURL_ARGS,
-        *CURL_EXTRA_ARGS,
+        *CURL_CONFIG.CURL_ARGS,
+        *CURL_CONFIG.CURL_EXTRA_ARGS,
         '--head',
         '--max-time', str(timeout),
-        *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
-        *([] if CHECK_SSL_VALIDITY else ['--insecure']),
+        *(['--user-agent', '{}'.format(CURL_CONFIG.CURL_USER_AGENT)] if CURL_CONFIG.CURL_USER_AGENT else []),
+        *([] if CURL_CONFIG.CURL_CHECK_SSL_VALIDITY else ['--insecure']),
     ]
     cmd = [
-        CURL_BINARY,
+        str(curl_binary.abspath),
         *dedupe(options),
         submit_url,
     ]
@@ -97,22 +89,22 @@ def save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, timeout: int=
     return ArchiveResult(
         cmd=cmd,
         pwd=str(out_dir),
-        cmd_version=CURL_VERSION,
+        cmd_version=str(curl_binary.version),
         output=output,
         status=status,
         **timer.stats,
     )
 @enforce_types
-def parse_archive_dot_org_response(response: bytes) -> Tuple[List[str], List[str]]:
+def parse_archive_dot_org_response(response: str) -> Tuple[List[str], List[str]]:
     # Parse archive.org response headers
     headers: Dict[str, List[str]] = defaultdict(list)
     # lowercase all the header names and store in dict
     for header in response.splitlines():
-        if b':' not in header or not header.strip():
+        if ':' not in header or not header.strip():
             continue
-        name, val = header.decode().split(':', 1)
+        name, val = header.split(':', 1)
         headers[name.lower().strip()].append(val.strip())
     # Get successful archive url in "content-location" header or any errors
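Since parse_archive_dot_org_response() now takes a str instead of bytes, callers must decode curl's output before handing it over. A small illustrative call (hypothetical header values; the return shape follows the Tuple[List[str], List[str]] signature above):

    raw = b'HTTP/2 200\r\ncontent-location: /web/20240930000000/https://example.com\r\n'
    content_location, errors = parse_archive_dot_org_response(raw.decode())
    # content_location is roughly ['/web/20240930000000/https://example.com'], errors is []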

View file

@@ -2,16 +2,11 @@ __package__ = 'archivebox.extractors'
 from pathlib import Path
-from typing import Optional
-from ..index.schema import Link, ArchiveResult, ArchiveOutput
 from archivebox.misc.system import chmod_file, run
-from archivebox.misc.util import (
-    enforce_types,
-    domain,
-    dedupe,
-)
-from ..config.legacy import CONFIG
+from archivebox.misc.util import enforce_types, domain, dedupe
+from archivebox.plugins_extractor.favicon.apps import FAVICON_CONFIG
+from archivebox.plugins_extractor.curl.apps import CURL_CONFIG, CURL_BINARY
+from ..index.schema import Link, ArchiveResult, ArchiveOutput
 from ..logging_util import TimedProgress
@@ -22,7 +17,7 @@ def should_save_favicon(link: Link, out_dir: str | Path | None=None, overwrite:
     if not overwrite and (out_dir / 'favicon.ico').exists():
         return False
-    return CONFIG.SAVE_FAVICON
+    return FAVICON_CONFIG.SAVE_FAVICON
 @enforce_types
 def get_output_path():
@@ -30,26 +25,29 @@ def get_output_path():
 @enforce_types
-def save_favicon(link: Link, out_dir: str | Path | None=None, timeout: int=CONFIG.TIMEOUT) -> ArchiveResult:
+def save_favicon(link: Link, out_dir: str | Path | None=None, timeout: int=CURL_CONFIG.CURL_TIMEOUT) -> ArchiveResult:
     """download site favicon from google's favicon api"""
+    curl_binary = CURL_BINARY.load()
+    assert curl_binary.abspath and curl_binary.version
     out_dir = Path(out_dir or link.link_dir)
     assert out_dir.exists()
     output: ArchiveOutput = 'favicon.ico'
     # later options take precedence
     options = [
-        *CONFIG.CURL_ARGS,
-        *CONFIG.CURL_EXTRA_ARGS,
+        *CURL_CONFIG.CURL_ARGS,
+        *CURL_CONFIG.CURL_EXTRA_ARGS,
         '--max-time', str(timeout),
         '--output', str(output),
-        *(['--user-agent', '{}'.format(CONFIG.CURL_USER_AGENT)] if CONFIG.CURL_USER_AGENT else []),
-        *([] if CONFIG.CHECK_SSL_VALIDITY else ['--insecure']),
+        *(['--user-agent', '{}'.format(CURL_CONFIG.CURL_USER_AGENT)] if CURL_CONFIG.CURL_USER_AGENT else []),
+        *([] if CURL_CONFIG.CURL_CHECK_SSL_VALIDITY else ['--insecure']),
     ]
     cmd = [
-        CONFIG.CURL_BINARY,
+        str(curl_binary.abspath),
         *dedupe(options),
-        CONFIG.FAVICON_PROVIDER.format(domain(link.url)),
+        FAVICON_CONFIG.FAVICON_PROVIDER.format(domain(link.url)),
     ]
     status = 'failed'
     timer = TimedProgress(timeout, prefix='      ')
@@ -65,7 +63,7 @@ def save_favicon(link: Link, out_dir: str | Path | None=None, timeout: int=CONFI
     return ArchiveResult(
         cmd=cmd,
         pwd=str(out_dir),
-        cmd_version=CONFIG.CURL_VERSION,
+        cmd_version=str(curl_binary.version),
         output=output,
         status=status,
         **timer.stats,

View file

@@ -4,7 +4,6 @@ __package__ = 'archivebox.extractors'
 from pathlib import Path
 from typing import Optional
-from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
 from archivebox.misc.system import run, chmod_file
 from archivebox.misc.util import (
     enforce_types,
@@ -14,8 +13,9 @@ from archivebox.misc.util import (
     without_query,
     without_fragment,
 )
-from ..config.legacy import CONFIG
+from archivebox.plugins_extractor.git.apps import GIT_CONFIG, GIT_BINARY
 from ..logging_util import TimedProgress
+from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
 def get_output_path():
@@ -42,28 +42,31 @@ def should_save_git(link: Link, out_dir: Optional[Path]=None, overwrite: Optiona
         return False
     is_clonable_url = (
-        (domain(link.url) in CONFIG.GIT_DOMAINS)
+        (domain(link.url) in GIT_CONFIG.GIT_DOMAINS)
         or (extension(link.url) == 'git')
     )
     if not is_clonable_url:
         return False
-    return CONFIG.SAVE_GIT
+    return GIT_CONFIG.SAVE_GIT
 @enforce_types
-def save_git(link: Link, out_dir: Optional[Path]=None, timeout: int=CONFIG.TIMEOUT) -> ArchiveResult:
+def save_git(link: Link, out_dir: Optional[Path]=None, timeout: int=GIT_CONFIG.GIT_TIMEOUT) -> ArchiveResult:
     """download full site using git"""
+    git_binary = GIT_BINARY.load()
+    assert git_binary.abspath and git_binary.version
     out_dir = out_dir or Path(link.link_dir)
     output: ArchiveOutput = get_output_path()
     output_path = out_dir / output
     output_path.mkdir(exist_ok=True)
     cmd = [
-        CONFIG.GIT_BINARY,
+        str(git_binary.abspath),
         'clone',
-        *CONFIG.GIT_ARGS,
-        *([] if CONFIG.CHECK_SSL_VALIDITY else ['-c', 'http.sslVerify=false']),
+        *GIT_CONFIG.GIT_ARGS,
+        *([] if GIT_CONFIG.GIT_CHECK_SSL_VALIDITY else ['-c', 'http.sslVerify=false']),
         without_query(without_fragment(link.url)),
     ]
     status = 'succeeded'
@@ -88,7 +91,7 @@ def save_git(link: Link, out_dir: Optional[Path]=None, timeout: int=CONFIG.TIMEO
     return ArchiveResult(
         cmd=cmd,
         pwd=str(out_dir),
-        cmd_version=CONFIG.GIT_VERSION,
+        cmd_version=str(git_binary.version),
         output=output,
         status=status,
         **timer.stats,
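Every migrated extractor now follows the same shared pattern: resolve the binary once per call, assert it is usable, then exec it by absolute path. Condensed (the load() vs. load_or_install() distinction is inferred from how each is used in this commit):

    url = 'https://github.com/example/repo.git'   # hypothetical input

    git_binary = GIT_BINARY.load()   # resolve an already-installed binary via its BinProviders
    assert git_binary.abspath and git_binary.version   # fail fast if missing or broken

    cmd = [str(git_binary.abspath), 'clone', *GIT_CONFIG.GIT_ARGS, url]

    # by contrast, `archivebox install` (further down) calls binary.load_or_install(),
    # which may fetch the binary through a package manager first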

View file

@@ -4,23 +4,14 @@ from pathlib import Path
 from typing import Optional
-from ..index.schema import Link, ArchiveResult, ArchiveOutput
 from archivebox.misc.system import atomic_write
 from archivebox.misc.util import (
     enforce_types,
     get_headers,
     dedupe,
 )
-from ..config.legacy import (
-    TIMEOUT,
-    CURL_BINARY,
-    CURL_ARGS,
-    CURL_EXTRA_ARGS,
-    CURL_USER_AGENT,
-    CURL_VERSION,
-    CHECK_SSL_VALIDITY,
-    SAVE_HEADERS
-)
+from archivebox.plugins_extractor.curl.apps import CURL_CONFIG, CURL_BINARY
+from ..index.schema import Link, ArchiveResult, ArchiveOutput
 from ..logging_util import TimedProgress
 def get_output_path():
@@ -29,34 +20,38 @@ def get_output_path():
 @enforce_types
 def should_save_headers(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
-    out_dir = out_dir or Path(link.link_dir)
-    if not overwrite and (out_dir / get_output_path()).exists():
+    out_dir_path = Path(out_dir or link.link_dir)
+    assert out_dir_path
+    if not overwrite and (out_dir_path / get_output_path()).exists():
         return False
-    return SAVE_HEADERS
+    return CURL_CONFIG.SAVE_HEADERS
 @enforce_types
-def save_headers(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+def save_headers(link: Link, out_dir: Optional[str]=None, timeout: int=CURL_CONFIG.CURL_TIMEOUT) -> ArchiveResult:
     """Download site headers"""
-    out_dir = Path(out_dir or link.link_dir)
-    output_folder = out_dir.absolute()
+    curl_binary = CURL_BINARY.load()
+    assert curl_binary.abspath and curl_binary.version
+    out_dir_path = Path(out_dir or link.link_dir)
+    output_folder = out_dir_path.absolute()
     output: ArchiveOutput = get_output_path()
     status = 'succeeded'
-    timer = TimedProgress(timeout, prefix=' ')
+    timer = TimedProgress(timeout + 1, prefix=' ')
     # later options take precedence
     options = [
-        *CURL_ARGS,
-        *CURL_EXTRA_ARGS,
+        *CURL_CONFIG.CURL_ARGS,
+        *CURL_CONFIG.CURL_EXTRA_ARGS,
         '--head',
         '--max-time', str(timeout),
-        *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
-        *([] if CHECK_SSL_VALIDITY else ['--insecure']),
+        *(['--user-agent', '{}'.format(CURL_CONFIG.CURL_USER_AGENT)] if CURL_CONFIG.CURL_USER_AGENT else []),
+        *([] if CURL_CONFIG.CURL_CHECK_SSL_VALIDITY else ['--insecure']),
     ]
     cmd = [
-        CURL_BINARY,
+        str(curl_binary.abspath),
         *dedupe(options),
         link.url,
     ]
@@ -72,8 +67,8 @@ def save_headers(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT)
     return ArchiveResult(
         cmd=cmd,
-        pwd=str(out_dir),
-        cmd_version=CURL_VERSION,
+        pwd=str(out_dir_path),
+        cmd_version=str(curl_binary.version),
         output=output,
         status=status,
         **timer.stats,

View file

@@ -5,18 +5,13 @@ import io
 from pathlib import Path
 from typing import Optional
-from archivebox.config import VERSION
-from ..config.legacy import (
-    SAVE_HTMLTOTEXT,
-    TIMEOUT,
-)
-from ..index.schema import Link, ArchiveResult, ArchiveError
-from ..logging_util import TimedProgress
+from archivebox.config import VERSION, ARCHIVING_CONFIG
+from archivebox.config.legacy import SAVE_HTMLTOTEXT
 from archivebox.misc.system import atomic_write
-from archivebox.misc.util import (
-    enforce_types,
-    is_static_file,
-)
+from archivebox.misc.util import enforce_types, is_static_file
+from ..logging_util import TimedProgress
+from ..index.schema import Link, ArchiveResult, ArchiveError
 from .title import get_html
@@ -122,7 +117,7 @@ def should_save_htmltotext(link: Link, out_dir: Optional[Path]=None, overwrite:
 @enforce_types
-def save_htmltotext(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+def save_htmltotext(link: Link, out_dir: Optional[Path]=None, timeout: int=ARCHIVING_CONFIG.TIMEOUT) -> ArchiveResult:
     """extract search-indexing-friendly text from an HTML document"""
     out_dir = Path(out_dir or link.link_dir)

View file

@@ -5,23 +5,14 @@ from html.parser import HTMLParser
 from pathlib import Path
 from typing import Optional
-from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
 from archivebox.misc.util import (
     enforce_types,
     download_url,
     htmldecode,
     dedupe,
 )
-from ..config.legacy import (
-    TIMEOUT,
-    CHECK_SSL_VALIDITY,
-    SAVE_TITLE,
-    CURL_BINARY,
-    CURL_ARGS,
-    CURL_EXTRA_ARGS,
-    CURL_VERSION,
-    CURL_USER_AGENT,
-)
+from archivebox.plugins_extractor.curl.apps import CURL_CONFIG, CURL_BINARY
+from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
 from ..logging_util import TimedProgress
@@ -62,7 +53,7 @@ class TitleParser(HTMLParser):
 @enforce_types
-def get_html(link: Link, path: Path, timeout: int=TIMEOUT) -> str:
+def get_html(link: Link, path: Path, timeout: int=CURL_CONFIG.CURL_TIMEOUT) -> str:
     """
     Try to find wget, singlefile and then dom files.
     If none is found, download the url again.
@@ -98,7 +89,7 @@ def should_save_title(link: Link, out_dir: Optional[str]=None, overwrite: Option
     if not overwrite and link.title and not link.title.lower().startswith('http'):
         return False
-    return SAVE_TITLE
+    return CURL_CONFIG.SAVE_TITLE
 def extract_title_with_regex(html):
     match = re.search(HTML_TITLE_REGEX, html)
@@ -106,22 +97,25 @@ def extract_title_with_regex(html):
     return output
 @enforce_types
-def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=CURL_CONFIG.CURL_TIMEOUT) -> ArchiveResult:
     """try to guess the page's title from its content"""
     from core.models import Snapshot
+    curl_binary = CURL_BINARY.load()
+    assert curl_binary.abspath and curl_binary.version
     output: ArchiveOutput = None
     # later options take precedence
     options = [
-        *CURL_ARGS,
-        *CURL_EXTRA_ARGS,
+        *CURL_CONFIG.CURL_ARGS,
+        *CURL_CONFIG.CURL_EXTRA_ARGS,
         '--max-time', str(timeout),
-        *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
-        *([] if CHECK_SSL_VALIDITY else ['--insecure']),
+        *(['--user-agent', '{}'.format(CURL_CONFIG.CURL_USER_AGENT)] if CURL_CONFIG.CURL_USER_AGENT else []),
+        *([] if CURL_CONFIG.CURL_CHECK_SSL_VALIDITY else ['--insecure']),
     ]
     cmd = [
-        CURL_BINARY,
+        str(curl_binary.abspath),
         *dedupe(options),
         link.url,
    ]
@@ -161,7 +155,7 @@ def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -
     return ArchiveResult(
         cmd=cmd,
         pwd=str(out_dir),
-        cmd_version=CURL_VERSION,
+        cmd_version=str(curl_binary.version),
         output=output,
         status=status,
         **timer.stats,

View file

@@ -430,7 +430,7 @@ def init(force: bool=False, quick: bool=False, install: bool=False, out_dir: Pat
 def status(out_dir: Path=DATA_DIR) -> None:
     """Print out some info and statistics about the archive collection"""
-    check_data_folder(CONFIG)
+    check_data_folder()
     from core.models import Snapshot
     from django.contrib.auth import get_user_model
@@ -573,7 +573,7 @@ def add(urls: Union[str, List[str]],
         run_subcommand('init', stdin=None, pwd=out_dir)
     # Load list of links from the existing index
-    check_data_folder(CONFIG)
+    check_data_folder()
     # worker = start_cli_workers()
@@ -673,7 +673,7 @@ def remove(filter_str: Optional[str]=None,
            out_dir: Path=DATA_DIR) -> List[Link]:
     """Remove the specified URLs from the archive"""
-    check_data_folder(CONFIG)
+    check_data_folder()
     if snapshots is None:
         if filter_str and filter_patterns:
@@ -762,7 +762,7 @@ def update(resume: Optional[float]=None,
     # from .queues.supervisor_util import start_cli_workers
-    check_data_folder(CONFIG)
+    check_data_folder()
     # start_cli_workers()
     new_links: List[Link] = []  # TODO: Remove input argument: only_new
@@ -833,7 +833,7 @@ def list_all(filter_patterns_str: Optional[str]=None,
              out_dir: Path=DATA_DIR) -> Iterable[Link]:
     """List, filter, and export information about archive entries"""
-    check_data_folder(CONFIG)
+    check_data_folder()
     if filter_patterns and filter_patterns_str:
         stderr(
@@ -881,7 +881,7 @@ def list_links(snapshots: Optional[QuerySet]=None,
               before: Optional[float]=None,
               out_dir: Path=DATA_DIR) -> Iterable[Link]:
-    check_data_folder(CONFIG)
+    check_data_folder()
     if snapshots:
         all_snapshots = snapshots
@@ -905,7 +905,7 @@ def list_folders(links: List[Link],
                 status: str,
                 out_dir: Path=DATA_DIR) -> Dict[str, Optional[Link]]:
-    check_data_folder(CONFIG)
+    check_data_folder()
     STATUS_FUNCTIONS = {
         "indexed": get_indexed_folders,
         raise ValueError('Status not recognized.')
 @enforce_types
-def setup(out_dir: Path=DATA_DIR) -> None:
+def install(out_dir: Path=DATA_DIR) -> None:
     """Automatically install all ArchiveBox dependencies and extras"""
     from rich import print
@@ -937,40 +937,20 @@ def setup(out_dir: Path=DATA_DIR) -> None:

     stderr('\n[+] Installing ArchiveBox dependencies automatically...', color='green')

-    for binary in settings.BINARIES.values():
+    for binary in reversed(list(settings.BINARIES.values())):
         try:
             print(binary.load_or_install().model_dump(exclude={'binproviders_supported', 'loaded_binprovider', 'provider_overrides', 'loaded_abspaths', 'bin_dir', 'loaded_respath'}))
         except Exception as e:
             print(f'[X] Failed to install {binary.name}: {e}')

-    # from plugins_extractor.curl.apps import CURL_BINARY
-    # print(CURL_BINARY.load_or_install().model_dump(exclude={'binproviders_supported', 'loaded_binprovider', 'provider_overrides', 'loaded_abspaths', 'bin_dir', 'loaded_respath'}))
-    # from plugins_extractor.wget.apps import WGET_BINARY
-    # print(WGET_BINARY.load_or_install().model_dump(exclude={'binproviders_supported', 'loaded_binprovider', 'provider_overrides', 'loaded_abspaths', 'bin_dir', 'loaded_respath'}))
-    # from plugins_extractor.ytdlp.apps import YTDLP_BINARY
-    # print(YTDLP_BINARY.load_or_install().model_dump(exclude={'binproviders_supported', 'loaded_binprovider', 'provider_overrides', 'loaded_abspaths', 'bin_dir', 'loaded_respath'}))
-    # from plugins_extractor.chrome.apps import CHROME_BINARY
-    # print(CHROME_BINARY.load_or_install().model_dump(exclude={'binproviders_supported', 'loaded_binprovider', 'provider_overrides', 'loaded_abspaths', 'bin_dir', 'loaded_respath'}))
-    # from plugins_extractor.singlefile.apps import SINGLEFILE_BINARY
-    # print(SINGLEFILE_BINARY.load_or_install().model_dump(exclude={'binproviders_supported', 'loaded_binprovider', 'provider_overrides', 'loaded_abspaths', 'bin_dir', 'loaded_respath'}))
-    # from plugins_extractor.readability.apps import READABILITY_BINARY
-    # print(READABILITY_BINARY.load_or_install().model_dump(exclude={'binproviders_supported', 'loaded_binprovider', 'provider_overrides', 'loaded_abspaths', 'bin_dir', 'loaded_respath'}))
-    # from plugins_extractor.mercury.apps import MERCURY_BINARY
-    # print(MERCURY_BINARY.load_or_install().model_dump(exclude={'binproviders_supported', 'loaded_binprovider', 'provider_overrides', 'loaded_abspaths', 'bin_dir', 'loaded_respath'}))
-
     from django.contrib.auth import get_user_model
     User = get_user_model()

     if not User.objects.filter(is_superuser=True).exists():
-        stderr('\n[+] Creating new admin user for the Web UI...', color='green')
-        run_subcommand('manage', subcommand_args=['createsuperuser'], pwd=out_dir)
+        stderr('\n[+] Don\'t forget to create a new admin user for the Web UI...', color='green')
+        stderr('        archivebox manage createsuperuser')
+        # run_subcommand('manage', subcommand_args=['createsuperuser'], pwd=out_dir)

     stderr('\n[√] Set up ArchiveBox and its dependencies successfully.', color='green')
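For reference, a minimal sketch of what the rewritten install loop does at runtime, assuming settings.BINARIES is the plugin-populated registry of pydantic_pkgr Binary objects described in the hook docs (the exclude set is shortened here for readability):

    from django.conf import settings

    # list(...) makes the registry reversible even if BINARIES is a custom
    # mapping type; iterating in reverse registration order is assumed here
    # to be intentional so later-registered binaries get installed first
    for binary in reversed(list(settings.BINARIES.values())):
        try:
            # load_or_install() resolves an existing abspath, or installs via
            # the first supporting binprovider (apt/brew/env), then returns self
            loaded = binary.load_or_install()
            print(loaded.model_dump(exclude={'binproviders_supported', 'loaded_abspaths'}))
        except Exception as e:
            # one broken binary should not abort the rest of the install
            print(f'[X] Failed to install {binary.name}: {e}')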
@@ -978,6 +958,10 @@ def setup(out_dir: Path=DATA_DIR) -> None:
     run_shell([ARCHIVEBOX_BINARY.load().abspath, '--version'], capture_output=False, cwd=out_dir)


+# backwards-compatibility:
+setup = install
+
+
 @enforce_types
 def config(config_options_str: Optional[str]=None,
            config_options: Optional[List[str]]=None,
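The @978 hunk pins down the setup-to-install rename with a module-level alias, so old imports keep working. A minimal sketch of the pattern (body elided):

    def install(out_dir: str = '.') -> None:
        """Automatically install all ArchiveBox dependencies and extras"""
        ...

    # backwards-compatibility: callers doing `from archivebox.main import setup`
    # still work, because both names are bound to the same function object
    setup = install

    assert setup is install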
@@ -989,7 +973,7 @@ def config(config_options_str: Optional[str]=None,

     from rich import print

-    check_data_folder(CONFIG)
+    check_data_folder()

     if config_options and config_options_str:
         stderr(
             '[X] You should either pass config values as an arguments '
@@ -1090,8 +1074,8 @@ def schedule(add: bool=False,
              out_dir: Path=DATA_DIR):
     """Set ArchiveBox to regularly import URLs at specific times using cron"""

-    check_data_folder(CONFIG)
+    check_data_folder()
-    from plugins_pkg.pip.apps import ARCHIVEBOX_BINARY
+    from archivebox.plugins_pkg.pip.apps import ARCHIVEBOX_BINARY

     Path(CONSTANTS.LOGS_DIR).mkdir(exist_ok=True)
@@ -1228,7 +1212,7 @@ def server(runserver_args: Optional[List[str]]=None,
         print()

-    check_data_folder(CONFIG)
+    check_data_folder()

     from django.core.management import call_command
     from django.contrib.auth.models import User
@@ -1280,7 +1264,7 @@ def server(runserver_args: Optional[List[str]]=None,
 def manage(args: Optional[List[str]]=None, out_dir: Path=DATA_DIR) -> None:
     """Run an ArchiveBox Django management command"""

-    check_data_folder(CONFIG)
+    check_data_folder()

     from django.core.management import execute_from_command_line

     if (args and "createsuperuser" in args) and (SHELL_CONFIG.IN_DOCKER and not SHELL_CONFIG.IS_TTY):
@@ -1297,7 +1281,7 @@ def manage(args: Optional[List[str]]=None, out_dir: Path=DATA_DIR) -> None:
 def shell(out_dir: Path=DATA_DIR) -> None:
     """Enter an interactive ArchiveBox Django shell"""

-    check_data_folder(CONFIG)
+    check_data_folder()

     from django.core.management import call_command
     call_command("shell_plus")

View file

@@ -1,13 +1,11 @@
 __package__ = 'archivebox.misc'

-from benedict import benedict
-
 from archivebox.config import DATA_DIR, ARCHIVE_DIR, CONSTANTS, SHELL_CONFIG

 from .logging import stderr


-def check_data_folder(config: benedict) -> None:
+def check_data_folder() -> None:

     archive_dir_exists = ARCHIVE_DIR.exists()
     if not archive_dir_exists:
@@ -23,7 +21,7 @@ def check_data_folder(config: benedict) -> None:
         raise SystemExit(2)


-def check_migrations(config: benedict):
+def check_migrations():
     from ..index.sql import list_migrations

     pending_migrations = [name for status, name in list_migrations() if not status]
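With the benedict parameter gone, both checks read module-level config/constants instead of a passed-in dict, so every call site shrinks to the zero-argument form (a sketch, assuming the import path shown in the hunk above):

    from archivebox.misc.checks import check_data_folder, check_migrations

    check_data_folder()   # raises SystemExit(2) when ARCHIVE_DIR is missing
    check_migrations()    # inspects list_migrations() for pending migrations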

View file

@@ -1,10 +1,10 @@
 __package__ = 'plugins_extractor.curl'

-from typing import List, Optional, Dict
+from typing import List, Optional
 from pathlib import Path

 from pydantic import InstanceOf, Field
-from pydantic_pkgr import BinProvider, BinName, bin_abspath, BinProviderName, ProviderLookupDict
+from pydantic_pkgr import BinProvider, BinName

 from abx.archivebox.base_plugin import BasePlugin, BaseHook
 from abx.archivebox.base_configset import BaseConfigSet
@@ -12,15 +12,26 @@ from abx.archivebox.base_binary import BaseBinary, env, apt, brew
 # from abx.archivebox.base_extractor import BaseExtractor, ExtractorName

 from archivebox.config import ARCHIVING_CONFIG
+from archivebox.plugins_extractor.favicon.apps import FAVICON_CONFIG
+from archivebox.plugins_extractor.archivedotorg.apps import ARCHIVEDOTORG_CONFIG


 class CurlConfig(BaseConfigSet):

-    SAVE_CURL: bool = True
+    SAVE_TITLE: bool = Field(default=True)
+    SAVE_HEADERS: bool = Field(default=True)

-    # USE_CURL: bool = Field(default=lambda c: c.SAVE_HEADERS or c.SAVE_FAVICON)
+    USE_CURL: bool = Field(default=lambda c:
+        ARCHIVEDOTORG_CONFIG.SAVE_ARCHIVE_DOT_ORG
+        or FAVICON_CONFIG.SAVE_FAVICON
+        or c.SAVE_HEADERS
+        or c.SAVE_TITLE
+    )

     CURL_BINARY: str = Field(default='curl')
+    CURL_ARGS: List[str] = [
+        '--silent',
+        '--location',
+        '--compressed',
+    ]
     CURL_EXTRA_ARGS: List[str] = []
     CURL_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
@@ -36,12 +47,6 @@ class CurlBinary(BaseBinary):
     name: BinName = CURL_CONFIG.CURL_BINARY
     binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]

-    provider_overrides: Dict[BinProviderName, ProviderLookupDict] = {
-        brew.name: {
-            'abspath': lambda: bin_abspath(CURL_CONFIG.CURL_BINARY, PATH=f'/opt/homebrew/opt/curl/bin:{brew.PATH}'),
-        },
-    }

 CURL_BINARY = CurlBinary()
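USE_CURL now defaults to a lambda over the config instance plus two other plugins' configsets, so curl stays enabled whenever any feature that needs it is on. A standalone sketch of how the static CURL_ARGS and the config fields above might combine into an invocation (build_curl_cmd is hypothetical; the actual extractor code is not part of this diff):

    def build_curl_cmd(url: str,
                       binary: str = 'curl',
                       extra_args: tuple = (),
                       timeout: int = 60) -> list:
        # CURL_ARGS defaults, then user-supplied CURL_EXTRA_ARGS, then CURL_TIMEOUT
        base_args = ['--silent', '--location', '--compressed']
        return [binary, *base_args, *extra_args, '--max-time', str(timeout), url]

    print(build_curl_cmd('https://example.com', extra_args=('--head',)))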

View file

@@ -1,13 +1,13 @@
 __package__ = 'plugins_extractor.wget'

 import sys
-from typing import List, Optional, Dict
+from typing import List, Optional
 from pathlib import Path
 from subprocess import run, DEVNULL

 from rich import print
 from pydantic import InstanceOf, Field, model_validator
-from pydantic_pkgr import BinProvider, BinName, bin_abspath, BinProviderName, ProviderLookupDict
+from pydantic_pkgr import BinProvider, BinName

 from abx.archivebox.base_plugin import BasePlugin, BaseHook
 from abx.archivebox.base_configset import BaseConfigSet
@@ -81,12 +81,6 @@ class WgetBinary(BaseBinary):
     name: BinName = WGET_CONFIG.WGET_BINARY
     binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]

-    provider_overrides: Dict[BinProviderName, ProviderLookupDict] = {
-        brew.name: {
-            'abspath': lambda: bin_abspath(WGET_CONFIG.WGET_BINARY, PATH=f'/opt/homebrew/opt/wget/bin:{brew.PATH}'),
-        },
-    }

 WGET_BINARY = WgetBinary()
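Both this deletion and the matching one in the curl plugin drop the homebrew abspath override, so binary lookup now falls through the default apt/brew/env provider chain. For reference, a sketch of the lookup the removed override used to perform, using the bin_abspath helper the old imports pulled in (brew.PATH is approximated here with the process PATH):

    import os
    from pydantic_pkgr import bin_abspath

    # prepend brew's keg-only wget path before resolving, as the deleted
    # provider_overrides lambda did; expected to yield the abspath if found
    PATH = f"/opt/homebrew/opt/wget/bin:{os.environ.get('PATH', '')}"
    print(bin_abspath('wget', PATH=PATH))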

View file

@@ -11,7 +11,7 @@ from archivebox.misc.util import enforce_types
 from archivebox.misc.logging import stderr
 from archivebox.config.legacy import ANSI

-# from archivebox.archivebox.config import settings.CONFIGS.SearchBackendConfig
+from archivebox.config import SEARCH_BACKEND_CONFIG


 def log_index_started(url):
@@ -58,13 +58,13 @@ def get_indexable_content(results: QuerySet):

 def import_backend():
     for backend in settings.SEARCH_BACKENDS.values():
-        if backend.name == settings.CONFIGS.SearchBackendConfig.SEARCH_BACKEND_ENGINE:
+        if backend.name == SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE:
             return backend
-    raise Exception(f'Could not load {settings.CONFIGS.SearchBackendConfig.SEARCH_BACKEND_ENGINE} as search backend')
+    raise Exception(f'Could not load {SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE} as search backend')

 @enforce_types
 def write_search_index(link: Link, texts: Union[List[str], None]=None, out_dir: Path=settings.DATA_DIR, skip_text_index: bool=False) -> None:
-    if not settings.CONFIGS.SearchBackendConfig.USE_INDEXING_BACKEND:
+    if not SEARCH_BACKEND_CONFIG.USE_INDEXING_BACKEND:
         return

     if not skip_text_index and texts:
@@ -86,7 +86,7 @@ def write_search_index(link: Link, texts: Union[List[str], None]=None, out_dir:
 def query_search_index(query: str, out_dir: Path=settings.DATA_DIR) -> QuerySet:
     from core.models import Snapshot

-    if settings.CONFIGS.SearchBackendConfig.USE_SEARCHING_BACKEND:
+    if SEARCH_BACKEND_CONFIG.USE_SEARCHING_BACKEND:
         backend = import_backend()
         try:
             snapshot_pks = backend.search(query)
@@ -106,7 +106,7 @@ def query_search_index(query: str, out_dir: Path=settings.DATA_DIR) -> QuerySet:
 @enforce_types
 def flush_search_index(snapshots: QuerySet):
-    if not settings.CONFIGS.SearchBackendConfig.USE_INDEXING_BACKEND or not snapshots:
+    if not SEARCH_BACKEND_CONFIG.USE_INDEXING_BACKEND or not snapshots:
         return

     backend = import_backend()
     snapshot_pks = (str(pk) for pk in snapshots.values_list('pk', flat=True))
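All five call sites switch from the nested settings.CONFIGS.SearchBackendConfig lookup to the flat, directly-importable configset, matching the flat-namespace convention the new config system uses elsewhere. A sketch of the new access pattern from caller code (attribute names are the ones used in the hunks above):

    from archivebox.config import SEARCH_BACKEND_CONFIG

    if SEARCH_BACKEND_CONFIG.USE_SEARCHING_BACKEND:
        # e.g. 'ripgrep', 'sqlite', or 'sonic', whichever the user configured
        print(f'querying via {SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE}')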

@@ -1 +1 @@
-Subproject commit 4f9486ab86a65f83ad1bfd94320795b8e09871aa
+Subproject commit 4f31b355fbf319a54b38953795b17b1b04db4348