diff --git a/archivebox/__init__.py b/archivebox/__init__.py
index 0924fd32..4d53f3d5 100755
--- a/archivebox/__init__.py
+++ b/archivebox/__init__.py
@@ -1,4 +1,30 @@
 __package__ = 'archivebox'
-
 from .monkey_patches import *
+
+import os
+import importlib.metadata
+from pathlib import Path
+
+
+PACKAGE_DIR = Path(__file__).resolve().parent    # archivebox source code dir
+DATA_DIR = Path(os.curdir).resolve()             # archivebox user data dir
+
+
+def _detect_installed_version():
+    try:
+        return importlib.metadata.version(__package__ or 'archivebox')
+    except importlib.metadata.PackageNotFoundError:
+        try:
+            pyproject_config = (PACKAGE_DIR / 'pyproject.toml').read_text()
+            for line in pyproject_config.splitlines():
+                if line.startswith('version = '):
+                    return line.split(' = ', 1)[-1].strip('"')
+        except FileNotFoundError:
+            # building docs, pyproject.toml is not available
+            return 'dev'
+
+    raise Exception('Failed to detect installed archivebox version!')
+
+
+__version__ = _detect_installed_version()
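Reviewer sketch (not part of the diff): how the new module-level attributes are expected to behave, assuming archivebox is importable; the pyproject.toml fallback only applies to source checkouts.

import archivebox

print(archivebox.PACKAGE_DIR)    # .../site-packages/archivebox (source code dir)
print(archivebox.DATA_DIR)       # current working dir, treated as the user data dir
print(archivebox.__version__)    # e.g. '0.8.0' from package metadata (example value),
                                 # or 'dev' when neither metadata nor pyproject.toml is available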
diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py
index bf3463c1..c394494a 100644
--- a/archivebox/core/settings.py
+++ b/archivebox/core/settings.py
@@ -6,6 +6,7 @@ import re
 import logging
 import inspect
 import tempfile
+import archivebox

 from typing import Dict
 from pathlib import Path
@@ -22,14 +23,16 @@ IS_TESTING = 'test' in sys.argv[:3] or 'PYTEST_CURRENT_TEST' in os.environ
 IS_SHELL = 'shell' in sys.argv[:3] or 'shell_plus' in sys.argv[:3]

-PACKAGE_DIR = Path(__file__).resolve().parent.parent
+PACKAGE_DIR = archivebox.PACKAGE_DIR
 assert PACKAGE_DIR == CONFIG.PACKAGE_DIR
-DATA_DIR = Path(os.curdir).resolve()
+DATA_DIR = archivebox.DATA_DIR
 assert DATA_DIR == CONFIG.OUTPUT_DIR

 ARCHIVE_DIR = DATA_DIR / 'archive'
 assert ARCHIVE_DIR == CONFIG.ARCHIVE_DIR

+VERSION = archivebox.__version__
+
 ################################################################################
 ### ArchiveBox Plugin Settings
 ################################################################################
@@ -164,11 +167,19 @@ STATIC_URL = '/static/'

 STATICFILES_DIRS = [
     *([str(CONFIG.CUSTOM_TEMPLATES_DIR / 'static')] if CONFIG.CUSTOM_TEMPLATES_DIR else []),
+    *[
+        str(plugin_dir / 'static')
+        for plugin_dir in PLUGIN_DIRS.values()
+    ],
     str(PACKAGE_DIR / CONFIG.TEMPLATES_DIR_NAME / 'static'),
 ]

 TEMPLATE_DIRS = [
     *([str(CONFIG.CUSTOM_TEMPLATES_DIR)] if CONFIG.CUSTOM_TEMPLATES_DIR else []),
+    *[
+        str(plugin_dir / 'templates')
+        for plugin_dir in PLUGIN_DIRS.values()
+    ],
     str(PACKAGE_DIR / CONFIG.TEMPLATES_DIR_NAME / 'core'),
     str(PACKAGE_DIR / CONFIG.TEMPLATES_DIR_NAME / 'admin'),
     str(PACKAGE_DIR / CONFIG.TEMPLATES_DIR_NAME),
@@ -394,7 +405,7 @@ SHELL_PLUS_PRINT_SQL = False
 IPYTHON_ARGUMENTS = ['--no-confirm-exit', '--no-banner']
 IPYTHON_KERNEL_DISPLAY_NAME = 'ArchiveBox Django Shell'
 if IS_SHELL:
-    os.environ['PYTHONSTARTUP'] = str(PACKAGE_DIR / 'core' / 'welcome_message.py')
+    os.environ['PYTHONSTARTUP'] = str(PACKAGE_DIR / 'core' / 'shell_welcome_message.py')


 ################################################################################
@@ -411,7 +422,7 @@ TIME_ZONE = CONFIG.TIMEZONE    # django convention is TIME_ZONE, archivebox

 from django.conf.locale.en import formats as en_formats    # type: ignore

-en_formats.DATETIME_FORMAT = DATETIME_FORMAT
+en_formats.DATETIME_FORMAT = DATETIME_FORMAT    # monkey patch en_format default with our preferred format
 en_formats.SHORT_DATETIME_FORMAT = SHORT_DATETIME_FORMAT

@@ -419,193 +430,10 @@ en_formats.SHORT_DATETIME_FORMAT = SHORT_DATETIME_FORMAT

 ################################################################################
 ### Logging Settings
 ################################################################################

-IGNORABLE_URL_PATTERNS = [
-    re.compile(r"/.*/?apple-touch-icon.*\.png"),
-    re.compile(r"/.*/?favicon\.ico"),
-    re.compile(r"/.*/?robots\.txt"),
-    re.compile(r"/.*/?.*\.(css|js)\.map"),
-    re.compile(r"/.*/?.*\.(css|js)\.map"),
-    re.compile(r"/static/.*"),
-    re.compile(r"/admin/jsi18n/"),
-]
-
-class NoisyRequestsFilter(logging.Filter):
-    def filter(self, record) -> bool:
-        logline = record.getMessage()
-        # '"GET /api/v1/docs HTTP/1.1" 200 1023'
-        # '"GET /static/admin/js/SelectFilter2.js HTTP/1.1" 200 15502'
-        # '"GET /static/admin/js/SelectBox.js HTTP/1.1" 304 0'
-        # '"GET /admin/jsi18n/ HTTP/1.1" 200 3352'
-        # '"GET /admin/api/apitoken/0191bbf8-fd5e-0b8c-83a8-0f32f048a0af/change/ HTTP/1.1" 200 28778'
+from .settings_logging import SETTINGS_LOGGING, LOGS_DIR, ERROR_LOG

-        # ignore harmless 404s for the patterns in IGNORABLE_URL_PATTERNS
-        for pattern in IGNORABLE_URL_PATTERNS:
-            ignorable_GET_request = re.compile(f'"GET {pattern.pattern} HTTP/.*" (2..|30.|404) .+$', re.I | re.M)
-            if ignorable_GET_request.match(logline):
-                return False
-
-            ignorable_404_pattern = re.compile(f'Not Found: {pattern.pattern}', re.I | re.M)
-            if ignorable_404_pattern.match(logline):
-                return False
-
-        return True
-
-
-class CustomOutboundWebhookLogFormatter(logging.Formatter):
-    def format(self, record):
-        result = super().format(record)
-        return result.replace('HTTP Request: ', 'OutboundWebhook: ')
-
-
-ERROR_LOG = tempfile.NamedTemporaryFile().name
-
-if CONFIG.LOGS_DIR.exists():
-    ERROR_LOG = (CONFIG.LOGS_DIR / 'errors.log')
-else:
-    # historically too many edge cases here around creating log dir w/ correct permissions early on
-    # if there's an issue on startup, we trash the log and let user figure it out via stdout/stderr
-    print(f'[!] WARNING: data/logs dir does not exist. Logging to temp file: {ERROR_LOG}')
-
-
-LOG_LEVEL_DATABASE = 'DEBUG' if DEBUG else 'WARNING'
-LOG_LEVEL_REQUEST = 'DEBUG' if DEBUG else 'WARNING'
-
-
-import pydantic
-import django.template
-
-LOGGING = {
-    "version": 1,
-    "disable_existing_loggers": False,
-    "formatters": {
-        "rich": {
-            "datefmt": "[%Y-%m-%d %H:%M:%S]",
-            # "format": "{asctime} {levelname} {module} {name} {message} {username}",
-            "format": "%(name)s %(message)s",
-        },
-        "outbound_webhooks": {
-            "()": CustomOutboundWebhookLogFormatter,
-            "datefmt": "[%Y-%m-%d %H:%M:%S]",
-        },
-    },
-    "filters": {
-        "noisyrequestsfilter": {
-            "()": NoisyRequestsFilter,
-        },
-        "require_debug_false": {
-            "()": "django.utils.log.RequireDebugFalse",
-        },
-        "require_debug_true": {
-            "()": "django.utils.log.RequireDebugTrue",
-        },
-    },
-    "handlers": {
-        # "console": {
-        #     "level": "DEBUG",
-        #     'formatter': 'simple',
-        #     "class": "logging.StreamHandler",
-        #     'filters': ['noisyrequestsfilter', 'add_extra_logging_attrs'],
-        # },
-        "default": {
-            "class": "rich.logging.RichHandler",
-            "formatter": "rich",
-            "level": "DEBUG",
-            "markup": False,
-            "rich_tracebacks": True,
-            "filters": ["noisyrequestsfilter"],
-            "tracebacks_suppress": [
-                django,
-                pydantic,
-            ],
-        },
-        "logfile": {
-            "level": "INFO",
-            "class": "logging.handlers.RotatingFileHandler",
-            "filename": ERROR_LOG,
-            "maxBytes": 1024 * 1024 * 25,  # 25 MB
-            "backupCount": 10,
-            "formatter": "rich",
-            "filters": ["noisyrequestsfilter"],
-        },
-        "outbound_webhooks": {
-            "class": "rich.logging.RichHandler",
-            "markup": False,
-            "rich_tracebacks": True,
-            "formatter": "outbound_webhooks",
-        },
-        # "mail_admins": {
-        #     "level": "ERROR",
-        #     "filters": ["require_debug_false"],
-        #     "class": "django.utils.log.AdminEmailHandler",
-        # },
-        "null": {
-            "class": "logging.NullHandler",
-        },
-    },
-    "root": {
-        "handlers": ["default", "logfile"],
-        "level": "INFO",
-        "formatter": "rich",
-    },
-    "loggers": {
-        "api": {
-            "handlers": ["default", "logfile"],
-            "level": "DEBUG",
-        },
-        "checks": {
-            "handlers": ["default", "logfile"],
-            "level": "DEBUG",
-        },
-        "core": {
-            "handlers": ["default", "logfile"],
-            "level": "DEBUG",
-        },
-        "plugins_extractor": {
-            "handlers": ["default", "logfile"],
-            "level": "DEBUG",
-        },
-        "httpx": {
-            "handlers": ["outbound_webhooks"],
-            "level": "INFO",
-            "formatter": "outbound_webhooks",
-            "propagate": False,
-        },
-        "django": {
-            "handlers": ["default", "logfile"],
-            "level": "INFO",
-            "filters": ["noisyrequestsfilter"],
-        },
-        "django.utils.autoreload": {
-            "propagate": False,
-            "handlers": [],
-            "level": "ERROR",
-        },
-        "django.channels.server": {
-            # see archivebox.monkey_patches.ModifiedAccessLogGenerator for dedicated daphne server logging settings
-            "propagate": False,
-            "handlers": ["default", "logfile"],
-            "level": "INFO",
-            "filters": ["noisyrequestsfilter"],
-        },
-        "django.server": {  # logs all requests (2xx, 3xx, 4xx)
-            "propagate": False,
-            "handlers": ["default", "logfile"],
-            "level": "INFO",
-            "filters": ["noisyrequestsfilter"],
-        },
-        "django.request": {  # only logs 4xx and 5xx errors
-            "propagate": False,
-            "handlers": ["default", "logfile"],
-            "level": "ERROR",
-            "filters": ["noisyrequestsfilter"],
-        },
-        "django.db.backends": {
-            "propagate": False,
-            "handlers": ["default"],
-            "level": LOG_LEVEL_DATABASE,
-        },
-    },
-}
+LOGGING = SETTINGS_LOGGING


 ################################################################################
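Moving the dict out to settings_logging.py is behavior-neutral: Django consumes whatever LOGGING resolves to via logging.config.dictConfig at startup. A minimal sketch of the equivalent manual wiring (direct import shown for illustration only):

import logging.config
from core.settings_logging import SETTINGS_LOGGING

logging.config.dictConfig(SETTINGS_LOGGING)
logging.getLogger('core').info('routed through the rich console + rotating logfile handlers')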
diff --git a/archivebox/core/settings_logging.py b/archivebox/core/settings_logging.py
new file mode 100644
index 00000000..3a012a9d
--- /dev/null
+++ b/archivebox/core/settings_logging.py
@@ -0,0 +1,198 @@
+import re
+import tempfile
+import logging
+
+import pydantic
+import django.template
+
+import archivebox
+
+
+IGNORABLE_URL_PATTERNS = [
+    re.compile(r"/.*/?apple-touch-icon.*\.png"),
+    re.compile(r"/.*/?favicon\.ico"),
+    re.compile(r"/.*/?robots\.txt"),
+    re.compile(r"/.*/?.*\.(css|js)\.map"),
+    re.compile(r"/.*/?.*\.(css|js)\.map"),
+    re.compile(r"/static/.*"),
+    re.compile(r"/admin/jsi18n/"),
+]
+
+class NoisyRequestsFilter(logging.Filter):
+    def filter(self, record) -> bool:
+        logline = record.getMessage()
+        # '"GET /api/v1/docs HTTP/1.1" 200 1023'
+        # '"GET /static/admin/js/SelectFilter2.js HTTP/1.1" 200 15502'
+        # '"GET /static/admin/js/SelectBox.js HTTP/1.1" 304 0'
+        # '"GET /admin/jsi18n/ HTTP/1.1" 200 3352'
+        # '"GET /admin/api/apitoken/0191bbf8-fd5e-0b8c-83a8-0f32f048a0af/change/ HTTP/1.1" 200 28778'
+
+        # ignore harmless 404s for the patterns in IGNORABLE_URL_PATTERNS
+        for pattern in IGNORABLE_URL_PATTERNS:
+            ignorable_GET_request = re.compile(f'"GET {pattern.pattern} HTTP/.*" (2..|30.|404) .+$', re.I | re.M)
+            if ignorable_GET_request.match(logline):
+                return False
+
+            ignorable_404_pattern = re.compile(f'Not Found: {pattern.pattern}', re.I | re.M)
+            if ignorable_404_pattern.match(logline):
+                return False
+
+        return True
+
+
+class CustomOutboundWebhookLogFormatter(logging.Formatter):
+    def format(self, record):
+        result = super().format(record)
+        return result.replace('HTTP Request: ', 'OutboundWebhook: ')
+
+
+ERROR_LOG = tempfile.NamedTemporaryFile().name
+
+LOGS_DIR = archivebox.DATA_DIR / 'logs'
+
+if LOGS_DIR.is_dir():
+    ERROR_LOG = (LOGS_DIR / 'errors.log')
+else:
+    # historically too many edge cases here around creating log dir w/ correct permissions early on
+    # if there's an issue on startup, we trash the log and let user figure it out via stdout/stderr
+    # print(f'[!] WARNING: data/logs dir does not exist. Logging to temp file: {ERROR_LOG}')
+    pass
+
+
+LOG_LEVEL_DATABASE = 'WARNING'   # if DEBUG else 'WARNING'
+LOG_LEVEL_REQUEST = 'WARNING'    # if DEBUG else 'WARNING'
+
+
+
+SETTINGS_LOGGING = {
+    "version": 1,
+    "disable_existing_loggers": False,
+    "formatters": {
+        "rich": {
+            "datefmt": "[%Y-%m-%d %H:%M:%S]",
+            # "format": "{asctime} {levelname} {module} {name} {message} {username}",
+            "format": "%(name)s %(message)s",
+        },
+        "outbound_webhooks": {
+            "()": CustomOutboundWebhookLogFormatter,
+            "datefmt": "[%Y-%m-%d %H:%M:%S]",
+        },
+    },
+    "filters": {
+        "noisyrequestsfilter": {
+            "()": NoisyRequestsFilter,
+        },
+        "require_debug_false": {
+            "()": "django.utils.log.RequireDebugFalse",
+        },
+        "require_debug_true": {
+            "()": "django.utils.log.RequireDebugTrue",
+        },
+    },
+    "handlers": {
+        # "console": {
+        #     "level": "DEBUG",
+        #     'formatter': 'simple',
+        #     "class": "logging.StreamHandler",
+        #     'filters': ['noisyrequestsfilter', 'add_extra_logging_attrs'],
+        # },
+        "default": {
+            "class": "rich.logging.RichHandler",
+            "formatter": "rich",
+            "level": "DEBUG",
+            "markup": False,
+            "rich_tracebacks": True,
+            "filters": ["noisyrequestsfilter"],
+            "tracebacks_suppress": [
+                django,
+                pydantic,
+            ],
+        },
+        "logfile": {
+            "level": "INFO",
+            "class": "logging.handlers.RotatingFileHandler",
+            "filename": ERROR_LOG,
+            "maxBytes": 1024 * 1024 * 25,  # 25 MB
+            "backupCount": 10,
+            "formatter": "rich",
+            "filters": ["noisyrequestsfilter"],
+        },
+        "outbound_webhooks": {
+            "class": "rich.logging.RichHandler",
+            "markup": False,
+            "rich_tracebacks": True,
+            "formatter": "outbound_webhooks",
+        },
+        # "mail_admins": {
+        #     "level": "ERROR",
+        #     "filters": ["require_debug_false"],
+        #     "class": "django.utils.log.AdminEmailHandler",
+        # },
+        "null": {
+            "class": "logging.NullHandler",
+        },
+    },
+    "root": {
+        "handlers": ["default", "logfile"],
+        "level": "INFO",
+        "formatter": "rich",
+    },
+    "loggers": {
+        "api": {
+            "handlers": ["default", "logfile"],
+            "level": "DEBUG",
+        },
+        "checks": {
+            "handlers": ["default", "logfile"],
+            "level": "DEBUG",
+        },
+        "core": {
+            "handlers": ["default", "logfile"],
+            "level": "DEBUG",
+        },
+        "plugins_extractor": {
+            "handlers": ["default", "logfile"],
+            "level": "DEBUG",
+        },
+        "httpx": {
+            "handlers": ["outbound_webhooks"],
+            "level": "INFO",
+            "formatter": "outbound_webhooks",
+            "propagate": False,
+        },
+        "django": {
+            "handlers": ["default", "logfile"],
+            "level": "INFO",
+            "filters": ["noisyrequestsfilter"],
+        },
+        "django.utils.autoreload": {
+            "propagate": False,
+            "handlers": [],
+            "level": "ERROR",
+        },
+        "django.channels.server": {
+            # see archivebox.monkey_patches.ModifiedAccessLogGenerator for dedicated daphne server logging settings
+            "propagate": False,
+            "handlers": ["default", "logfile"],
+            "level": "INFO",
+            "filters": ["noisyrequestsfilter"],
+        },
+        "django.server": {  # logs all requests (2xx, 3xx, 4xx)
+            "propagate": False,
+            "handlers": ["default", "logfile"],
+            "level": "INFO",
+            "filters": ["noisyrequestsfilter"],
+        },
+        "django.request": {  # only logs 4xx and 5xx errors
+            "propagate": False,
+            "handlers": ["default", "logfile"],
+            "level": "ERROR",
+            "filters": ["noisyrequestsfilter"],
+        },
+        "django.db.backends": {
+            "propagate": False,
+            "handlers": ["default"],
+            "level": LOG_LEVEL_DATABASE,
+        },
+    },
+}
diff --git a/archivebox/core/welcome_message.py b/archivebox/core/shell_welcome_message.py
similarity index 100%
rename from archivebox/core/welcome_message.py
rename to archivebox/core/shell_welcome_message.py
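A quick check of the filter logic as written, using hand-built records (sketch, assuming the module imports cleanly outside of Django setup):

import logging
from core.settings_logging import NoisyRequestsFilter

def record(msg: str) -> logging.LogRecord:
    # build a bare LogRecord carrying just the access-log message
    return logging.LogRecord('django.server', logging.INFO, __file__, 0, msg, None, None)

noisy = NoisyRequestsFilter()
assert noisy.filter(record('"GET /static/admin/js/SelectBox.js HTTP/1.1" 304 0')) is False  # dropped
assert noisy.filter(record('"GET /admin/login/ HTTP/1.1" 200 4103')) is True                # kept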
diff --git a/archivebox/plugantic/base_configset.py b/archivebox/plugantic/base_configset.py
index d104afd5..01f9d12d 100644
--- a/archivebox/plugantic/base_configset.py
+++ b/archivebox/plugantic/base_configset.py
@@ -176,21 +176,42 @@ class ArchiveBoxBaseConfig(BaseSettings):
         """Populate any unset values using function provided as their default"""

         for key, field in self.model_fields.items():
-            config_so_far = benedict(self.model_dump(include=set(self.model_fields.keys()), warnings=False))
             value = getattr(self, key)
+
             if isinstance(value, Callable):
-                # if value is a function, execute it to get the actual value, passing existing config as a dict arg
+                # if value is a function, execute it to get the actual value, passing existing config as a dict arg if expected
                 if func_takes_args_or_kwargs(value):
+                    # assemble dict of existing field values to pass to default factory functions
+                    config_so_far = benedict(self.model_dump(include=set(self.model_fields.keys()), warnings=False))
                    computed_default = field.default(config_so_far)
                 else:
+                    # otherwise it's a pure function with no args, just call it
                     computed_default = field.default()

-                # check to make sure default factory return value matches type annotation
+                # coerce/check to make sure default factory return value matches type annotation
                 TypeAdapter(field.annotation).validate_python(computed_default)

                 # set generated default value as final validated value
                 setattr(self, key, computed_default)
         return self
+
+    def update_in_place(self, warn=True, **kwargs):
+        """
+        Update the config with new values. Use this sparingly! We should almost never be updating config at runtime.
+        Sets the values in the environment so they propagate to spawned subprocesses, and across future re-__init__()s that reload from the environment.
+
+        Example acceptable use case: user config says SEARCH_BACKEND_ENGINE=sonic but the sonic_client pip library is not installed, so we cannot use it.
+        SEARCH_BACKEND_CONFIG.update_in_place(SEARCH_BACKEND_ENGINE='ripgrep') can be used to reset it back to ripgrep so we can continue.
+        """
+        if warn:
+            print('[!] WARNING: Some of the provided user config values cannot be used, temporarily ignoring them:')
+        for key, value in kwargs.items():
+            os.environ[key] = str(value)
+            original_value = getattr(self, key)
+            if warn:
+                print(f'    {key}={original_value} -> {value}')
+        self.__init__()
+        return self


 class BaseConfigSet(ArchiveBoxBaseConfig, BaseHook):    # type: ignore[type-arg]
     hook_type: ClassVar[HookType] = 'CONFIG'
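The docstring's own example, spelled out (assumes `os` is already imported in base_configset.py, and that SEARCH_BACKEND_CONFIG is any BaseConfigSet instance):

from plugins_sys.config.apps import SEARCH_BACKEND_CONFIG

# prints the warning, exports SEARCH_BACKEND_ENGINE=ripgrep to os.environ so child
# processes inherit it, then re-runs __init__() so the instance reloads from the env
SEARCH_BACKEND_CONFIG.update_in_place(SEARCH_BACKEND_ENGINE='ripgrep')
assert SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE == 'ripgrep'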
diff --git a/archivebox/plugins_auth/ldap/settings.py b/archivebox/plugins_auth/ldap/settings.py
index f9eb7a3c..a4aa0b40 100644
--- a/archivebox/plugins_auth/ldap/settings.py
+++ b/archivebox/plugins_auth/ldap/settings.py
@@ -20,8 +20,9 @@ except ImportError:

 class LdapConfig(BaseConfigSet):
     """
-    LDAP Config gets imported by core/settings.py very early during startup, so it needs to be in a separate file from apps.py
-    so that it can be imported during settings.py initialization before the apps are loaded.
+    LDAP Config gets imported by core/settings.py very early during startup.
+    It needs to be in a separate file from apps.py so that it can be imported
+    during settings.py initialization before the apps are loaded.
     """
     section: ClassVar[ConfigSectionName] = 'LDAP_CONFIG'

@@ -41,20 +42,29 @@ class LdapConfig(BaseConfigSet):

     @model_validator(mode='after')
     def validate_ldap_config(self):
+        # Check that LDAP libraries are installed
         if self.LDAP_ENABLED and LDAP_LIB is None:
-            sys.stderr.write('[X] Error: Found LDAP=True config but LDAP packages not installed. You may need to run: pip install archivebox[ldap]\n\n')
+            sys.stderr.write('[X] Error: LDAP Authentication is enabled but LDAP libraries are not installed. You may need to run: pip install archivebox[ldap]\n')
             # dont hard exit here. in case the user is just running "archivebox version" or "archivebox help", we still want those to work despite broken ldap
             # sys.exit(1)
-            self.LDAP_ENABLED = False
+            self.update_in_place(LDAP_ENABLED=False)

-        if self.LDAP_ENABLED:
-            assert (
-                self.LDAP_SERVER_URI
-                and self.LDAP_BIND_DN
-                and self.LDAP_BIND_PASSWORD
-                and self.LDAP_USER_BASE
-                and self.LDAP_USER_FILTER
-            ), 'LDAP_* config options must all be set if LDAP_ENABLED=True'
+        # Check that all required LDAP config options are set
+        all_config_is_set = (
+            self.LDAP_SERVER_URI
+            and self.LDAP_BIND_DN
+            and self.LDAP_BIND_PASSWORD
+            and self.LDAP_USER_BASE
+            and self.LDAP_USER_FILTER
+        )
+        if self.LDAP_ENABLED and not all_config_is_set:
+            missing_config_options = [
+                key for key, value in self.model_dump().items()
+                if value is None and key != 'LDAP_ENABLED'
+            ]
+            sys.stderr.write('[X] Error: LDAP_* config options must all be set if LDAP_ENABLED=True\n')
+            sys.stderr.write(f'    Missing: {", ".join(missing_config_options)}\n')
+            self.update_in_place(LDAP_ENABLED=False)
         return self

     @property
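Expected degraded behavior of the rewritten validator, with hypothetical values (only LDAP_SERVER_URI set):

from plugins_auth.ldap.settings import LdapConfig

config = LdapConfig(LDAP_ENABLED=True, LDAP_SERVER_URI='ldap://ldap.example.com')
# stderr: [X] Error: LDAP_* config options must all be set if LDAP_ENABLED=True
#             Missing: LDAP_BIND_DN, LDAP_BIND_PASSWORD, LDAP_USER_BASE, LDAP_USER_FILTER
assert config.LDAP_ENABLED is False   # downgraded instead of raising an AssertionError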
diff --git a/archivebox/plugins_search/ripgrep/apps.py b/archivebox/plugins_search/ripgrep/apps.py
index 1326c010..006a049a 100644
--- a/archivebox/plugins_search/ripgrep/apps.py
+++ b/archivebox/plugins_search/ripgrep/apps.py
@@ -27,9 +27,22 @@ class RipgrepConfig(BaseConfigSet):
     section: ClassVar[ConfigSectionName] = 'DEPENDENCY_CONFIG'

     RIPGREP_BINARY: str = Field(default='rg')
+
+    RIPGREP_IGNORE_EXTENSIONS: str = Field(default='css,js,orig,svg')
+    RIPGREP_ARGS_DEFAULT: List[str] = Field(default=lambda c: [
+        # https://github.com/BurntSushi/ripgrep/blob/master/GUIDE.md
+        f'--type-add=ignore:*.{{{c.RIPGREP_IGNORE_EXTENSIONS}}}',
+        '--type-not=ignore',
+        '--ignore-case',
+        '--files-with-matches',
+        '--regexp',
+    ])
+    RIPGREP_SEARCH_DIR: str = Field(default=lambda: str(settings.ARCHIVE_DIR))

 RIPGREP_CONFIG = RipgrepConfig()
+
+
 class RipgrepBinary(BaseBinary):
     name: BinName = RIPGREP_CONFIG.RIPGREP_BINARY
     binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
@@ -41,17 +54,8 @@ class RipgrepBinary(BaseBinary):

 RIPGREP_BINARY = RipgrepBinary()

-
-RG_IGNORE_EXTENSIONS = ('css','js','orig','svg')
-
-RG_ADD_TYPE = '--type-add'
-RG_IGNORE_ARGUMENTS = f"ignore:*.{{{','.join(RG_IGNORE_EXTENSIONS)}}}"
-RG_DEFAULT_ARGUMENTS = "-ilTignore"  # Case insensitive(i), matching files results(l)
-RG_REGEX_ARGUMENT = '-e'
-
-TIMESTAMP_REGEX = r'\/([\d]+\.[\d]+)\/'
-ts_regex = re.compile(TIMESTAMP_REGEX)
-
+# regex to match archive/<timestamp>/... snapshot dir names
+TIMESTAMP_REGEX = re.compile(r'\/([\d]+\.[\d]+)\/')

 class RipgrepSearchBackend(BaseSearchBackend):
     name: str = 'ripgrep'
@@ -67,30 +71,29 @@ class RipgrepSearchBackend(BaseSearchBackend):

     @staticmethod
     def search(text: str) -> List[str]:
-        rg_bin = RIPGREP_BINARY.load()
-        if not rg_bin.version:
+        from core.models import Snapshot
+
+        ripgrep_binary = RIPGREP_BINARY.load()
+        if not ripgrep_binary.version:
             raise Exception("ripgrep binary not found, install ripgrep to use this search backend")

-        rg_cmd = [
-            rg_bin.abspath,
-            RG_ADD_TYPE,
-            RG_IGNORE_ARGUMENTS,
-            RG_DEFAULT_ARGUMENTS,
-            RG_REGEX_ARGUMENT,
-            text,
-            str(settings.ARCHIVE_DIR)
+        cmd = [
+            ripgrep_binary.abspath,
+            *RIPGREP_CONFIG.RIPGREP_ARGS_DEFAULT,
+            text,
+            RIPGREP_CONFIG.RIPGREP_SEARCH_DIR,
         ]
-        rg = run(rg_cmd, timeout=SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_TIMEOUT, capture_output=True, text=True)
+        proc = run(cmd, timeout=SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_TIMEOUT, capture_output=True, text=True)
         timestamps = set()
-        for path in rg.stdout.splitlines():
-            ts = ts_regex.findall(path)
+        for path in proc.stdout.splitlines():
+            ts = TIMESTAMP_REGEX.findall(path)
             if ts:
                 timestamps.add(ts[0])

         snap_ids = [str(id) for id in Snapshot.objects.filter(timestamp__in=timestamps).values_list('pk', flat=True)]

         return snap_ids
-
+

 RIPGREP_SEARCH_BACKEND = RipgrepSearchBackend()
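For reference, the argv that search() now assembles from config (binary and search-dir paths are illustrative; the flags are standard ripgrep options):

cmd = [
    '/usr/bin/rg',                            # ripgrep_binary.abspath (example path)
    '--type-add=ignore:*.{css,js,orig,svg}',  # RIPGREP_ARGS_DEFAULT with defaults filled in
    '--type-not=ignore',
    '--ignore-case',
    '--files-with-matches',
    '--regexp',
    'some search text',                       # the user's query
    '/data/archive',                          # RIPGREP_SEARCH_DIR (example path)
]
# --files-with-matches prints one matching path per line, e.g.
# /data/archive/1718214962.123/index.html -> TIMESTAMP_REGEX captures '1718214962.123'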
diff --git a/archivebox/plugins_search/sonic/apps.py b/archivebox/plugins_search/sonic/apps.py
index f6d7a6eb..1c8077ab 100644
--- a/archivebox/plugins_search/sonic/apps.py
+++ b/archivebox/plugins_search/sonic/apps.py
@@ -1,5 +1,6 @@
 __package__ = 'archivebox.plugins_search.sonic'

+import os
 import sys
 from typing import List, Dict, ClassVar, Generator, cast

@@ -38,16 +39,24 @@ class SonicConfig(BaseConfigSet):
     SONIC_PASSWORD: str = Field(default='SecretPassword', alias='SEARCH_BACKEND_PASSWORD')
     SONIC_COLLECTION: str = Field(default='archivebox')
     SONIC_BUCKET: str = Field(default='archivebox')
+
+    SONIC_MAX_CHUNK_LENGTH: int = Field(default=2000)
+    SONIC_MAX_TEXT_LENGTH: int = Field(default=100000000)
+    SONIC_MAX_RETRIES: int = Field(default=5)

     @model_validator(mode='after')
     def validate_sonic_port(self):
-        if SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE == 'sonic':
-            if SONIC_LIB is None:
-                sys.stderr.write('[!] Sonic search backend is enabled but not installed. Install Sonic to use the Sonic search backend.\n')
+        if SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE == 'sonic' and SONIC_LIB is None:
+            sys.stderr.write('[X] Error: Sonic search backend is enabled but sonic-client lib is not installed. You may need to run: pip install archivebox[sonic]\n')
+            # dont hard exit here. in case the user is just running "archivebox version" or "archivebox help", we still want those to work despite broken sonic
+            # sys.exit(1)
+            SEARCH_BACKEND_CONFIG.update_in_place(SEARCH_BACKEND_ENGINE='ripgrep')
         return self

 SONIC_CONFIG = SonicConfig()
+
+
 class SonicBinary(BaseBinary):
     name: BinName = SONIC_CONFIG.SONIC_BINARY
     binproviders_supported: List[InstanceOf[BinProvider]] = [brew, env]    # TODO: add cargo
@@ -57,6 +66,7 @@ class SonicBinary(BaseBinary):
         # cargo.name: {'packages': lambda: ['sonic-server']},              # TODO: add cargo
     }

+    # TODO: add version checking over protocol? for when sonic backend is on remote server and binary is not installed locally
     # def on_get_version(self):
     #     with sonic.IngestClient(SONIC_CONFIG.SONIC_HOST, str(SONIC_CONFIG.SONIC_PORT), SONIC_CONFIG.SONIC_PASSWORD) as ingestcl:
     #         return SemVer.parse(str(ingestcl.protocol))
@@ -64,11 +74,6 @@ class SonicBinary(BaseBinary):

 SONIC_BINARY = SonicBinary()

-MAX_SONIC_TEXT_TOTAL_LENGTH = 100000000  # dont index more than 100 million characters per text
-MAX_SONIC_TEXT_CHUNK_LENGTH = 2000       # dont index more than 2000 characters per chunk
-MAX_SONIC_ERRORS_BEFORE_ABORT = 5
-
-
 class SonicSearchBackend(BaseSearchBackend):
     name: str = 'sonic'

@@ -80,11 +85,11 @@ class SonicSearchBackend(BaseSearchBackend):
         with sonic.IngestClient(SONIC_CONFIG.SONIC_HOST, str(SONIC_CONFIG.SONIC_PORT), SONIC_CONFIG.SONIC_PASSWORD) as ingestcl:
             for text in texts:
                 chunks = (
-                    text[i:i+MAX_SONIC_TEXT_CHUNK_LENGTH]
+                    text[i:i+SONIC_CONFIG.SONIC_MAX_CHUNK_LENGTH]
                     for i in range(
                         0,
-                        min(len(text), MAX_SONIC_TEXT_TOTAL_LENGTH),
-                        MAX_SONIC_TEXT_CHUNK_LENGTH,
+                        min(len(text), SONIC_CONFIG.SONIC_MAX_TEXT_LENGTH),
+                        SONIC_CONFIG.SONIC_MAX_CHUNK_LENGTH,
                     )
                 )
                 try:
@@ -93,7 +98,7 @@ class SonicSearchBackend(BaseSearchBackend):
             except Exception as err:
                 print(f'[!] Sonic search backend threw an error while indexing: {err.__class__.__name__} {err}')
                 error_count += 1
-                if error_count > MAX_SONIC_ERRORS_BEFORE_ABORT:
+                if error_count > SONIC_CONFIG.SONIC_MAX_RETRIES:
                     raise

     @staticmethod
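The chunking arithmetic, isolated with the new config defaults (sketch):

SONIC_MAX_CHUNK_LENGTH = 2000        # max characters per indexed chunk
SONIC_MAX_TEXT_LENGTH = 100_000_000  # cap on characters indexed per document

text = 'x' * 5000
chunks = [
    text[i:i + SONIC_MAX_CHUNK_LENGTH]
    for i in range(0, min(len(text), SONIC_MAX_TEXT_LENGTH), SONIC_MAX_CHUNK_LENGTH)
]
assert [len(chunk) for chunk in chunks] == [2000, 2000, 1000]   # 5000 chars -> 3 chunks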
diff --git a/archivebox/plugins_search/sqlite/apps.py b/archivebox/plugins_search/sqlite/apps.py
index a3f9da10..c773843d 100644
--- a/archivebox/plugins_search/sqlite/apps.py
+++ b/archivebox/plugins_search/sqlite/apps.py
@@ -1,8 +1,9 @@
 __package__ = 'archivebox.plugins_search.sqlite'

+import sys
 import sqlite3
 import codecs
-from typing import List, ClassVar, Generator, Callable
+from typing import List, ClassVar, Iterable, Callable

 from django.conf import settings
 from django.db import connection as database
@@ -17,7 +18,7 @@ from plugantic.base_hook import BaseHook
 from plugantic.base_searchbackend import BaseSearchBackend

 # Depends on Other Plugins:
-# from plugins_sys.config.apps import SEARCH_BACKEND_CONFIG
+from plugins_sys.config.apps import SEARCH_BACKEND_CONFIG


@@ -26,19 +27,21 @@ from plugantic.base_searchbackend import BaseSearchBackend
 class SqliteftsConfig(BaseConfigSet):
     section: ClassVar[ConfigSectionName] = 'DEPENDENCY_CONFIG'

-    SQLITEFTS_SEPARATE_DATABASE: bool = Field(default=True, alias='FTS_SEPARATE_DATABASE')
-    SQLITEFTS_TOKENIZERS: str = Field(default='porter unicode61 remove_diacritics 2', alias='FTS_TOKENIZERS')
-    SQLITEFTS_MAX_LENGTH: int = Field(default=int(1e9), alias='FTS_SQLITE_MAX_LENGTH')
+    SQLITEFTS_SEPARATE_DATABASE: bool  = Field(default=True, alias='FTS_SEPARATE_DATABASE')
+    SQLITEFTS_TOKENIZERS: str          = Field(default='porter unicode61 remove_diacritics 2', alias='FTS_TOKENIZERS')
+    SQLITEFTS_MAX_LENGTH: int          = Field(default=int(1e9), alias='FTS_SQLITE_MAX_LENGTH')

-    SQLITEFTS_DB: str = Field(default='search.sqlite3')
-    SQLITEFTS_TABLE: str = Field(default='snapshot_fts')
-    SQLITEFTS_ID_TABLE: str = Field(default='snapshot_id_fts')
-    SQLITEFTS_COLUMN: str = Field(default='texts')
+    # Not really meant to be user-modified, just here as constants
+    SQLITEFTS_DB: str                  = Field(default='search.sqlite3')
+    SQLITEFTS_TABLE: str               = Field(default='snapshot_fts')
+    SQLITEFTS_ID_TABLE: str            = Field(default='snapshot_id_fts')
+    SQLITEFTS_COLUMN: str              = Field(default='texts')

     @model_validator(mode='after')
     def validate_fts_separate_database(self):
-        if self.SQLITEFTS_SEPARATE_DATABASE:
-            assert self.SQLITEFTS_DB, "SQLITEFTS_DB must be set if SQLITEFTS_SEPARATE_DATABASE is True"
+        if SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE == 'sqlite' and self.SQLITEFTS_SEPARATE_DATABASE and not self.SQLITEFTS_DB:
+            sys.stderr.write('[X] Error: SQLITEFTS_DB must be set if SQLITEFTS_SEPARATE_DATABASE is True\n')
+            SEARCH_BACKEND_CONFIG.update_in_place(SEARCH_BACKEND_ENGINE='ripgrep')
         return self

     @property
@@ -84,8 +87,7 @@ def _escape_sqlite3(value: str, *, quote: str, errors='strict') -> str:

     nul_index = encodable.find("\x00")
     if nul_index >= 0:
-        error = UnicodeEncodeError("NUL-terminated utf-8", encodable,
-                                   nul_index, nul_index + 1, "NUL not allowed")
+        error = UnicodeEncodeError("NUL-terminated utf-8", encodable, nul_index, nul_index + 1, "NUL not allowed")
         error_handler = codecs.lookup_error(errors)
         replacement, _ = error_handler(error)
         assert isinstance(replacement, str), "handling a UnicodeEncodeError should return a str replacement"
@@ -224,7 +226,7 @@ class SqliteftsSearchBackend(BaseSearchBackend):
         return snap_ids

     @staticmethod
-    def flush(snapshot_ids: Generator[str, None, None]):
+    def flush(snapshot_ids: Iterable[str]):
         snapshot_ids = list(snapshot_ids)  # type: ignore[assignment]

         id_table = _escape_sqlite3_identifier(SQLITEFTS_CONFIG.SQLITEFTS_ID_TABLE)
@@ -243,7 +245,7 @@ SQLITEFTS_SEARCH_BACKEND = SqliteftsSearchBackend()

 class SqliteftsSearchPlugin(BasePlugin):
     app_label: str ='sqlitefts'
-    verbose_name: str = 'Sqlitefts'
+    verbose_name: str = 'SQLite FTS5 Search'

     hooks: List[InstanceOf[BaseHook]] = [
         SQLITEFTS_CONFIG,
diff --git a/archivebox/plugins_sys/config/apps.py b/archivebox/plugins_sys/config/apps.py
index 15ca23e2..ecd905f7 100644
--- a/archivebox/plugins_sys/config/apps.py
+++ b/archivebox/plugins_sys/config/apps.py
@@ -115,9 +115,6 @@ class SearchBackendConfig(BaseConfigSet):
     USE_SEARCHING_BACKEND: bool = Field(default=True)

     SEARCH_BACKEND_ENGINE: str = Field(default='ripgrep')
-    SEARCH_BACKEND_HOST_NAME: str = Field(default='localhost')
-    SEARCH_BACKEND_PORT: int = Field(default=1491)
-    SEARCH_BACKEND_PASSWORD: str = Field(default='SecretPassword')
     SEARCH_PROCESS_HTML: bool = Field(default=True)
     SEARCH_BACKEND_TIMEOUT: int = Field(default=10)
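Note on the removed SearchBackendConfig fields: the generic env vars should keep working because SonicConfig aliases them (e.g. SONIC_PASSWORD has alias='SEARCH_BACKEND_PASSWORD' in the sonic hunk above). A hedged sketch:

import os
os.environ['SEARCH_BACKEND_PASSWORD'] = 'hunter2'   # hypothetical value, set before import

from plugins_search.sonic.apps import SONIC_CONFIG   # pydantic-settings reads the env on init
assert SONIC_CONFIG.SONIC_PASSWORD == 'hunter2'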