more settings loading tweaks and improvements

This commit is contained in:
Nick Sweeting 2024-09-24 15:13:54 -07:00
parent fbfd16e195
commit 97695bda5e
No known key found for this signature in database
10 changed files with 350 additions and 260 deletions

View file

@ -1,4 +1,30 @@
__package__ = 'archivebox' __package__ = 'archivebox'
from .monkey_patches import * from .monkey_patches import *
import os
import importlib.metadata   # must import the submodule explicitly, `import importlib` alone is not enough
from pathlib import Path

PACKAGE_DIR = Path(__file__).resolve().parent    # archivebox source code dir
DATA_DIR = Path(os.curdir).resolve()             # archivebox user data dir


def _detect_installed_version():
    """Return the installed archivebox version string.

    Resolution order:
    1. installed package metadata (normal pip / site-packages install)
    2. the ``version = "..."`` line in pyproject.toml (editable / source checkout)
    3. the literal 'dev' when pyproject.toml is unavailable (e.g. building docs)

    Raises:
        Exception: if pyproject.toml exists but contains no version line.
    """
    try:
        return importlib.metadata.version(__package__ or 'archivebox')
    except importlib.metadata.PackageNotFoundError:
        try:
            pyproject_config = (PACKAGE_DIR / 'pyproject.toml').read_text()
            # BUGFIX: iterate over the *lines* of the file, not its characters
            for line in pyproject_config.splitlines():
                if line.startswith('version = '):
                    return line.split(' = ', 1)[-1].strip('"')
        except FileNotFoundError:
            # building docs, pyproject.toml is not available
            return 'dev'

    raise Exception('Failed to detect installed archivebox version!')


__version__ = _detect_installed_version()

View file

@ -6,6 +6,7 @@ import re
import logging import logging
import inspect import inspect
import tempfile import tempfile
import archivebox
from typing import Dict from typing import Dict
from pathlib import Path from pathlib import Path
@ -22,14 +23,16 @@ IS_TESTING = 'test' in sys.argv[:3] or 'PYTEST_CURRENT_TEST' in os.environ
IS_SHELL = 'shell' in sys.argv[:3] or 'shell_plus' in sys.argv[:3] IS_SHELL = 'shell' in sys.argv[:3] or 'shell_plus' in sys.argv[:3]
PACKAGE_DIR = Path(__file__).resolve().parent.parent PACKAGE_DIR = archivebox.PACKAGE_DIR
assert PACKAGE_DIR == CONFIG.PACKAGE_DIR assert PACKAGE_DIR == CONFIG.PACKAGE_DIR
DATA_DIR = Path(os.curdir).resolve() DATA_DIR = archivebox.DATA_DIR
assert DATA_DIR == CONFIG.OUTPUT_DIR assert DATA_DIR == CONFIG.OUTPUT_DIR
ARCHIVE_DIR = DATA_DIR / 'archive' ARCHIVE_DIR = DATA_DIR / 'archive'
assert ARCHIVE_DIR == CONFIG.ARCHIVE_DIR assert ARCHIVE_DIR == CONFIG.ARCHIVE_DIR
VERSION = archivebox.__version__
################################################################################ ################################################################################
### ArchiveBox Plugin Settings ### ArchiveBox Plugin Settings
################################################################################ ################################################################################
@ -164,11 +167,19 @@ STATIC_URL = '/static/'
STATICFILES_DIRS = [ STATICFILES_DIRS = [
*([str(CONFIG.CUSTOM_TEMPLATES_DIR / 'static')] if CONFIG.CUSTOM_TEMPLATES_DIR else []), *([str(CONFIG.CUSTOM_TEMPLATES_DIR / 'static')] if CONFIG.CUSTOM_TEMPLATES_DIR else []),
*[
str(plugin_dir / 'static')
for plugin_dir in PLUGIN_DIRS.values()
],
str(PACKAGE_DIR / CONFIG.TEMPLATES_DIR_NAME / 'static'), str(PACKAGE_DIR / CONFIG.TEMPLATES_DIR_NAME / 'static'),
] ]
TEMPLATE_DIRS = [ TEMPLATE_DIRS = [
*([str(CONFIG.CUSTOM_TEMPLATES_DIR)] if CONFIG.CUSTOM_TEMPLATES_DIR else []), *([str(CONFIG.CUSTOM_TEMPLATES_DIR)] if CONFIG.CUSTOM_TEMPLATES_DIR else []),
*[
str(plugin_dir / 'templates')
for plugin_dir in PLUGIN_DIRS.values()
],
str(PACKAGE_DIR / CONFIG.TEMPLATES_DIR_NAME / 'core'), str(PACKAGE_DIR / CONFIG.TEMPLATES_DIR_NAME / 'core'),
str(PACKAGE_DIR / CONFIG.TEMPLATES_DIR_NAME / 'admin'), str(PACKAGE_DIR / CONFIG.TEMPLATES_DIR_NAME / 'admin'),
str(PACKAGE_DIR / CONFIG.TEMPLATES_DIR_NAME), str(PACKAGE_DIR / CONFIG.TEMPLATES_DIR_NAME),
@ -394,7 +405,7 @@ SHELL_PLUS_PRINT_SQL = False
IPYTHON_ARGUMENTS = ['--no-confirm-exit', '--no-banner'] IPYTHON_ARGUMENTS = ['--no-confirm-exit', '--no-banner']
IPYTHON_KERNEL_DISPLAY_NAME = 'ArchiveBox Django Shell' IPYTHON_KERNEL_DISPLAY_NAME = 'ArchiveBox Django Shell'
if IS_SHELL: if IS_SHELL:
os.environ['PYTHONSTARTUP'] = str(PACKAGE_DIR / 'core' / 'welcome_message.py') os.environ['PYTHONSTARTUP'] = str(PACKAGE_DIR / 'core' / 'shell_welcome_message.py')
################################################################################ ################################################################################
@ -411,7 +422,7 @@ TIME_ZONE = CONFIG.TIMEZONE # django convention is TIME_ZONE, archivebox
from django.conf.locale.en import formats as en_formats # type: ignore from django.conf.locale.en import formats as en_formats # type: ignore
en_formats.DATETIME_FORMAT = DATETIME_FORMAT en_formats.DATETIME_FORMAT = DATETIME_FORMAT # monkey patch en_format default with our preferred format
en_formats.SHORT_DATETIME_FORMAT = SHORT_DATETIME_FORMAT en_formats.SHORT_DATETIME_FORMAT = SHORT_DATETIME_FORMAT
@ -419,193 +430,10 @@ en_formats.SHORT_DATETIME_FORMAT = SHORT_DATETIME_FORMAT
### Logging Settings ### Logging Settings
################################################################################ ################################################################################
IGNORABLE_URL_PATTERNS = [
re.compile(r"/.*/?apple-touch-icon.*\.png"),
re.compile(r"/.*/?favicon\.ico"),
re.compile(r"/.*/?robots\.txt"),
re.compile(r"/.*/?.*\.(css|js)\.map"),
re.compile(r"/.*/?.*\.(css|js)\.map"),
re.compile(r"/static/.*"),
re.compile(r"/admin/jsi18n/"),
]
class NoisyRequestsFilter(logging.Filter): from .settings_logging import SETTINGS_LOGGING, LOGS_DIR, ERROR_LOG
def filter(self, record) -> bool:
logline = record.getMessage()
# '"GET /api/v1/docs HTTP/1.1" 200 1023'
# '"GET /static/admin/js/SelectFilter2.js HTTP/1.1" 200 15502'
# '"GET /static/admin/js/SelectBox.js HTTP/1.1" 304 0'
# '"GET /admin/jsi18n/ HTTP/1.1" 200 3352'
# '"GET /admin/api/apitoken/0191bbf8-fd5e-0b8c-83a8-0f32f048a0af/change/ HTTP/1.1" 200 28778'
# ignore harmless 404s for the patterns in IGNORABLE_URL_PATTERNS LOGGING = SETTINGS_LOGGING
for pattern in IGNORABLE_URL_PATTERNS:
ignorable_GET_request = re.compile(f'"GET {pattern.pattern} HTTP/.*" (2..|30.|404) .+$', re.I | re.M)
if ignorable_GET_request.match(logline):
return False
ignorable_404_pattern = re.compile(f'Not Found: {pattern.pattern}', re.I | re.M)
if ignorable_404_pattern.match(logline):
return False
return True
class CustomOutboundWebhookLogFormatter(logging.Formatter):
def format(self, record):
result = super().format(record)
return result.replace('HTTP Request: ', 'OutboundWebhook: ')
ERROR_LOG = tempfile.NamedTemporaryFile().name
if CONFIG.LOGS_DIR.exists():
ERROR_LOG = (CONFIG.LOGS_DIR / 'errors.log')
else:
# historically too many edge cases here around creating log dir w/ correct permissions early on
# if there's an issue on startup, we trash the log and let user figure it out via stdout/stderr
print(f'[!] WARNING: data/logs dir does not exist. Logging to temp file: {ERROR_LOG}')
LOG_LEVEL_DATABASE = 'DEBUG' if DEBUG else 'WARNING'
LOG_LEVEL_REQUEST = 'DEBUG' if DEBUG else 'WARNING'
import pydantic
import django.template
LOGGING = {
"version": 1,
"disable_existing_loggers": False,
"formatters": {
"rich": {
"datefmt": "[%Y-%m-%d %H:%M:%S]",
# "format": "{asctime} {levelname} {module} {name} {message} {username}",
"format": "%(name)s %(message)s",
},
"outbound_webhooks": {
"()": CustomOutboundWebhookLogFormatter,
"datefmt": "[%Y-%m-%d %H:%M:%S]",
},
},
"filters": {
"noisyrequestsfilter": {
"()": NoisyRequestsFilter,
},
"require_debug_false": {
"()": "django.utils.log.RequireDebugFalse",
},
"require_debug_true": {
"()": "django.utils.log.RequireDebugTrue",
},
},
"handlers": {
# "console": {
# "level": "DEBUG",
# 'formatter': 'simple',
# "class": "logging.StreamHandler",
# 'filters': ['noisyrequestsfilter', 'add_extra_logging_attrs'],
# },
"default": {
"class": "rich.logging.RichHandler",
"formatter": "rich",
"level": "DEBUG",
"markup": False,
"rich_tracebacks": True,
"filters": ["noisyrequestsfilter"],
"tracebacks_suppress": [
django,
pydantic,
],
},
"logfile": {
"level": "INFO",
"class": "logging.handlers.RotatingFileHandler",
"filename": ERROR_LOG,
"maxBytes": 1024 * 1024 * 25, # 25 MB
"backupCount": 10,
"formatter": "rich",
"filters": ["noisyrequestsfilter"],
},
"outbound_webhooks": {
"class": "rich.logging.RichHandler",
"markup": False,
"rich_tracebacks": True,
"formatter": "outbound_webhooks",
},
# "mail_admins": {
# "level": "ERROR",
# "filters": ["require_debug_false"],
# "class": "django.utils.log.AdminEmailHandler",
# },
"null": {
"class": "logging.NullHandler",
},
},
"root": {
"handlers": ["default", "logfile"],
"level": "INFO",
"formatter": "rich",
},
"loggers": {
"api": {
"handlers": ["default", "logfile"],
"level": "DEBUG",
},
"checks": {
"handlers": ["default", "logfile"],
"level": "DEBUG",
},
"core": {
"handlers": ["default", "logfile"],
"level": "DEBUG",
},
"plugins_extractor": {
"handlers": ["default", "logfile"],
"level": "DEBUG",
},
"httpx": {
"handlers": ["outbound_webhooks"],
"level": "INFO",
"formatter": "outbound_webhooks",
"propagate": False,
},
"django": {
"handlers": ["default", "logfile"],
"level": "INFO",
"filters": ["noisyrequestsfilter"],
},
"django.utils.autoreload": {
"propagate": False,
"handlers": [],
"level": "ERROR",
},
"django.channels.server": {
# see archivebox.monkey_patches.ModifiedAccessLogGenerator for dedicated daphne server logging settings
"propagate": False,
"handlers": ["default", "logfile"],
"level": "INFO",
"filters": ["noisyrequestsfilter"],
},
"django.server": { # logs all requests (2xx, 3xx, 4xx)
"propagate": False,
"handlers": ["default", "logfile"],
"level": "INFO",
"filters": ["noisyrequestsfilter"],
},
"django.request": { # only logs 4xx and 5xx errors
"propagate": False,
"handlers": ["default", "logfile"],
"level": "ERROR",
"filters": ["noisyrequestsfilter"],
},
"django.db.backends": {
"propagate": False,
"handlers": ["default"],
"level": LOG_LEVEL_DATABASE,
},
},
}
################################################################################ ################################################################################

View file

@ -0,0 +1,198 @@
import re
import tempfile
import logging
import pydantic
import django.template
import archivebox
# URL path patterns whose requests/404s are too noisy to be worth logging
# (BUGFIX: removed a duplicated copy of the (css|js).map pattern)
IGNORABLE_URL_PATTERNS = [
    re.compile(r"/.*/?apple-touch-icon.*\.png"),
    re.compile(r"/.*/?favicon\.ico"),
    re.compile(r"/.*/?robots\.txt"),
    re.compile(r"/.*/?.*\.(css|js)\.map"),
    re.compile(r"/static/.*"),
    re.compile(r"/admin/jsi18n/"),
]


class NoisyRequestsFilter(logging.Filter):
    """Drop harmless, high-volume request log lines (static assets, favicons, robots.txt, etc)."""

    # Precompile the derived patterns ONCE at class-definition time instead of
    # calling re.compile() for every pattern on every single log record.
    IGNORABLE_GET_REQUESTS = [
        re.compile(f'"GET {pattern.pattern} HTTP/.*" (2..|30.|404) .+$', re.I | re.M)
        for pattern in IGNORABLE_URL_PATTERNS
    ]
    IGNORABLE_404S = [
        re.compile(f'Not Found: {pattern.pattern}', re.I | re.M)
        for pattern in IGNORABLE_URL_PATTERNS
    ]

    def filter(self, record) -> bool:
        logline = record.getMessage()
        # Example loglines this sees:
        # '"GET /api/v1/docs HTTP/1.1" 200 1023'
        # '"GET /static/admin/js/SelectFilter2.js HTTP/1.1" 200 15502'
        # '"GET /static/admin/js/SelectBox.js HTTP/1.1" 304 0'
        # '"GET /admin/jsi18n/ HTTP/1.1" 200 3352'
        # '"GET /admin/api/apitoken/0191bbf8-fd5e-0b8c-83a8-0f32f048a0af/change/ HTTP/1.1" 200 28778'

        # ignore harmless GETs/404s for the patterns in IGNORABLE_URL_PATTERNS
        for pattern in self.IGNORABLE_GET_REQUESTS:
            if pattern.match(logline):
                return False

        for pattern in self.IGNORABLE_404S:
            if pattern.match(logline):
                return False

        return True
class CustomOutboundWebhookLogFormatter(logging.Formatter):
    """Formatter that relabels httpx's 'HTTP Request:' log prefix as 'OutboundWebhook:'."""

    def format(self, record):
        formatted = super().format(record)
        return formatted.replace('HTTP Request: ', 'OutboundWebhook: ')
# Write errors.log into data/logs/ when it exists, otherwise fall back to a temp file.
# historically too many edge cases here around creating log dir w/ correct permissions early on
# if there's an issue on startup, we trash the log and let user figure it out via stdout/stderr
LOGS_DIR = archivebox.DATA_DIR / 'logs'
if LOGS_DIR.is_dir():
    ERROR_LOG = (LOGS_DIR / 'errors.log')
else:
    # BUGFIX: only create the temp file when it's actually needed, and use delete=False
    # so the path still exists when the RotatingFileHandler opens it later
    # (default delete=True removes the file as soon as the handle is garbage-collected)
    ERROR_LOG = tempfile.NamedTemporaryFile(delete=False).name

LOG_LEVEL_DATABASE = 'WARNING'   # set to 'DEBUG' to log all SQL queries
LOG_LEVEL_REQUEST = 'WARNING'    # set to 'DEBUG' to log all requests
# Full Django logging config, passed to logging.config.dictConfig via settings.LOGGING.
# References module-level objects defined above: CustomOutboundWebhookLogFormatter,
# NoisyRequestsFilter, ERROR_LOG, LOG_LEVEL_DATABASE.
SETTINGS_LOGGING = {
    "version": 1,
    "disable_existing_loggers": False,
    "formatters": {
        # pretty console output rendered by rich.logging.RichHandler
        "rich": {
            "datefmt": "[%Y-%m-%d %H:%M:%S]",
            # "format": "{asctime} {levelname} {module} {name} {message} {username}",
            "format": "%(name)s %(message)s",
        },
        # rewrites httpx 'HTTP Request:' lines as 'OutboundWebhook:' lines
        "outbound_webhooks": {
            "()": CustomOutboundWebhookLogFormatter,
            "datefmt": "[%Y-%m-%d %H:%M:%S]",
        },
    },
    "filters": {
        # drops noisy static-asset/favicon/jsi18n request lines
        "noisyrequestsfilter": {
            "()": NoisyRequestsFilter,
        },
        "require_debug_false": {
            "()": "django.utils.log.RequireDebugFalse",
        },
        "require_debug_true": {
            "()": "django.utils.log.RequireDebugTrue",
        },
    },
    "handlers": {
        # "console": {
        #     "level": "DEBUG",
        #     'formatter': 'simple',
        #     "class": "logging.StreamHandler",
        #     'filters': ['noisyrequestsfilter', 'add_extra_logging_attrs'],
        # },
        # rich console handler: colorized output + pretty tracebacks
        "default": {
            "class": "rich.logging.RichHandler",
            "formatter": "rich",
            "level": "DEBUG",
            "markup": False,
            "rich_tracebacks": True,
            "filters": ["noisyrequestsfilter"],
            # hide django/pydantic internal frames from rendered tracebacks
            "tracebacks_suppress": [
                django,
                pydantic,
            ],
        },
        # rotating file handler writing to data/logs/errors.log (or the temp fallback)
        "logfile": {
            "level": "INFO",
            "class": "logging.handlers.RotatingFileHandler",
            "filename": ERROR_LOG,
            "maxBytes": 1024 * 1024 * 25,  # 25 MB
            "backupCount": 10,
            "formatter": "rich",
            "filters": ["noisyrequestsfilter"],
        },
        "outbound_webhooks": {
            "class": "rich.logging.RichHandler",
            "markup": False,
            "rich_tracebacks": True,
            "formatter": "outbound_webhooks",
        },
        # "mail_admins": {
        #     "level": "ERROR",
        #     "filters": ["require_debug_false"],
        #     "class": "django.utils.log.AdminEmailHandler",
        # },
        "null": {
            "class": "logging.NullHandler",
        },
    },
    "root": {
        "handlers": ["default", "logfile"],
        "level": "INFO",
        "formatter": "rich",
    },
    "loggers": {
        # archivebox app loggers: verbose to both console and logfile
        "api": {
            "handlers": ["default", "logfile"],
            "level": "DEBUG",
        },
        "checks": {
            "handlers": ["default", "logfile"],
            "level": "DEBUG",
        },
        "core": {
            "handlers": ["default", "logfile"],
            "level": "DEBUG",
        },
        "plugins_extractor": {
            "handlers": ["default", "logfile"],
            "level": "DEBUG",
        },
        # httpx is used for outbound webhook deliveries; relabel its request lines
        "httpx": {
            "handlers": ["outbound_webhooks"],
            "level": "INFO",
            "formatter": "outbound_webhooks",
            "propagate": False,
        },
        "django": {
            "handlers": ["default", "logfile"],
            "level": "INFO",
            "filters": ["noisyrequestsfilter"],
        },
        # silence the dev-server autoreloader chatter
        "django.utils.autoreload": {
            "propagate": False,
            "handlers": [],
            "level": "ERROR",
        },
        "django.channels.server": {
            # see archivebox.monkey_patches.ModifiedAccessLogGenerator for dedicated daphne server logging settings
            "propagate": False,
            "handlers": ["default", "logfile"],
            "level": "INFO",
            "filters": ["noisyrequestsfilter"],
        },
        "django.server": {  # logs all requests (2xx, 3xx, 4xx)
            "propagate": False,
            "handlers": ["default", "logfile"],
            "level": "INFO",
            "filters": ["noisyrequestsfilter"],
        },
        "django.request": {  # only logs 4xx and 5xx errors
            "propagate": False,
            "handlers": ["default", "logfile"],
            "level": "ERROR",
            "filters": ["noisyrequestsfilter"],
        },
        # SQL query logging, gated by LOG_LEVEL_DATABASE above
        "django.db.backends": {
            "propagate": False,
            "handlers": ["default"],
            "level": LOG_LEVEL_DATABASE,
        },
    },
}

View file

@ -176,22 +176,43 @@ class ArchiveBoxBaseConfig(BaseSettings):
"""Populate any unset values using function provided as their default""" """Populate any unset values using function provided as their default"""
for key, field in self.model_fields.items(): for key, field in self.model_fields.items():
config_so_far = benedict(self.model_dump(include=set(self.model_fields.keys()), warnings=False))
value = getattr(self, key) value = getattr(self, key)
if isinstance(value, Callable): if isinstance(value, Callable):
# if value is a function, execute it to get the actual value, passing existing config as a dict arg # if value is a function, execute it to get the actual value, passing existing config as a dict arg if expected
if func_takes_args_or_kwargs(value): if func_takes_args_or_kwargs(value):
# assemble dict of existing field values to pass to default factory functions
config_so_far = benedict(self.model_dump(include=set(self.model_fields.keys()), warnings=False))
computed_default = field.default(config_so_far) computed_default = field.default(config_so_far)
else: else:
# otherwise it's a pure function with no args, just call it
computed_default = field.default() computed_default = field.default()
# check to make sure default factory return value matches type annotation # coerce/check to make sure default factory return value matches type annotation
TypeAdapter(field.annotation).validate_python(computed_default) TypeAdapter(field.annotation).validate_python(computed_default)
# set generated default value as final validated value # set generated default value as final validated value
setattr(self, key, computed_default) setattr(self, key, computed_default)
return self return self
def update_in_place(self, warn: bool = True, **kwargs):
    """
    Update the config with new values. Use this sparingly! We should almost never be updating config at runtime.
    Sets them in the environment so they propagate to spawned subprocesses / across future re-__init__()s and reload from environment

    Example acceptable use case: user config says SEARCH_BACKEND_ENGINE=sonic but sonic_client pip library is not installed so we cannot use it.
    SEARCH_BACKEND_CONFIG.update_in_place(SEARCH_BACKEND_ENGINE='ripgrep') can be used to reset it back to ripgrep so we can continue.
    """
    if warn:
        print('[!] WARNING: Some of the provided user config values cannot be used, temporarily ignoring them:')
    for key, value in kwargs.items():
        # write to the environment so the override survives re-initialization and subprocesses
        os.environ[key] = str(value)
        # still the OLD value here: the reload only happens in __init__() below
        original_value = getattr(self, key)
        if warn:
            print(f' {key}={original_value} -> {value}')
    # re-run pydantic init/validation so self reflects the updated environment
    self.__init__()
    return self
class BaseConfigSet(ArchiveBoxBaseConfig, BaseHook): # type: ignore[type-arg] class BaseConfigSet(ArchiveBoxBaseConfig, BaseHook): # type: ignore[type-arg]
hook_type: ClassVar[HookType] = 'CONFIG' hook_type: ClassVar[HookType] = 'CONFIG'

View file

@ -20,8 +20,9 @@ except ImportError:
class LdapConfig(BaseConfigSet): class LdapConfig(BaseConfigSet):
""" """
LDAP Config gets imported by core/settings.py very early during startup, so it needs to be in a separate file from apps.py LDAP Config gets imported by core/settings.py very early during startup.
so that it can be imported during settings.py initialization before the apps are loaded. It needs to be in a separate file from apps.py so that it can be imported
during settings.py initialization before the apps are loaded.
""" """
section: ClassVar[ConfigSectionName] = 'LDAP_CONFIG' section: ClassVar[ConfigSectionName] = 'LDAP_CONFIG'
@ -41,20 +42,29 @@ class LdapConfig(BaseConfigSet):
@model_validator(mode='after') @model_validator(mode='after')
def validate_ldap_config(self): def validate_ldap_config(self):
# Check that LDAP libraries are installed
if self.LDAP_ENABLED and LDAP_LIB is None: if self.LDAP_ENABLED and LDAP_LIB is None:
sys.stderr.write('[X] Error: Found LDAP=True config but LDAP packages not installed. You may need to run: pip install archivebox[ldap]\n\n') sys.stderr.write('[X] Error: LDAP Authentication is enabled but LDAP libraries are not installed. You may need to run: pip install archivebox[ldap]\n')
# dont hard exit here. in case the user is just running "archivebox version" or "archivebox help", we still want those to work despite broken ldap # dont hard exit here. in case the user is just running "archivebox version" or "archivebox help", we still want those to work despite broken ldap
# sys.exit(1) # sys.exit(1)
self.LDAP_ENABLED = False self.update(LDAP_ENABLED=False)
if self.LDAP_ENABLED: # Check that all required LDAP config options are set
assert ( all_config_is_set = (
self.LDAP_SERVER_URI self.LDAP_SERVER_URI
and self.LDAP_BIND_DN and self.LDAP_BIND_DN
and self.LDAP_BIND_PASSWORD and self.LDAP_BIND_PASSWORD
and self.LDAP_USER_BASE and self.LDAP_USER_BASE
and self.LDAP_USER_FILTER and self.LDAP_USER_FILTER
), 'LDAP_* config options must all be set if LDAP_ENABLED=True' )
if self.LDAP_ENABLED and not all_config_is_set:
missing_config_options = [
key for key, value in self.model_dump().items()
if value is None and key != 'LDAP_ENABLED'
]
sys.stderr.write('[X] Error: LDAP_* config options must all be set if LDAP_ENABLED=True\n')
sys.stderr.write(f' Missing: {", ".join(missing_config_options)}\n')
self.update(LDAP_ENABLED=False)
return self return self
@property @property

View file

@ -28,8 +28,21 @@ class RipgrepConfig(BaseConfigSet):
RIPGREP_BINARY: str = Field(default='rg') RIPGREP_BINARY: str = Field(default='rg')
RIPGREP_IGNORE_EXTENSIONS: str = Field(default='css,js,orig,svg')
RIPGREP_ARGS_DEFAULT: List[str] = Field(default=lambda c: [
# https://github.com/BurntSushi/ripgrep/blob/master/GUIDE.md
f'--type-add=ignore:*.{{{c.RIPGREP_IGNORE_EXTENSIONS}}}',
'--type-not=ignore',
'--ignore-case',
'--files-with-matches',
'--regexp',
])
RIPGREP_SEARCH_DIR: str = Field(default=lambda: str(settings.ARCHIVE_DIR))
RIPGREP_CONFIG = RipgrepConfig() RIPGREP_CONFIG = RipgrepConfig()
class RipgrepBinary(BaseBinary): class RipgrepBinary(BaseBinary):
name: BinName = RIPGREP_CONFIG.RIPGREP_BINARY name: BinName = RIPGREP_CONFIG.RIPGREP_BINARY
binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env] binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
@ -41,17 +54,8 @@ class RipgrepBinary(BaseBinary):
RIPGREP_BINARY = RipgrepBinary() RIPGREP_BINARY = RipgrepBinary()
# regex to match archive/<ts>/... snapshot dir names
RG_IGNORE_EXTENSIONS = ('css','js','orig','svg') TIMESTAMP_REGEX = re.compile(r'\/([\d]+\.[\d]+)\/')
RG_ADD_TYPE = '--type-add'
RG_IGNORE_ARGUMENTS = f"ignore:*.{{{','.join(RG_IGNORE_EXTENSIONS)}}}"
RG_DEFAULT_ARGUMENTS = "-ilTignore" # Case insensitive(i), matching files results(l)
RG_REGEX_ARGUMENT = '-e'
TIMESTAMP_REGEX = r'\/([\d]+\.[\d]+)\/'
ts_regex = re.compile(TIMESTAMP_REGEX)
class RipgrepSearchBackend(BaseSearchBackend): class RipgrepSearchBackend(BaseSearchBackend):
name: str = 'ripgrep' name: str = 'ripgrep'
@ -67,23 +71,22 @@ class RipgrepSearchBackend(BaseSearchBackend):
@staticmethod @staticmethod
def search(text: str) -> List[str]: def search(text: str) -> List[str]:
rg_bin = RIPGREP_BINARY.load() from core.models import Snapshot
if not rg_bin.version:
ripgrep_binary = RIPGREP_BINARY.load()
if not ripgrep_binary.version:
raise Exception("ripgrep binary not found, install ripgrep to use this search backend") raise Exception("ripgrep binary not found, install ripgrep to use this search backend")
rg_cmd = [ cmd = [
rg_bin.abspath, ripgrep_binary.abspath,
RG_ADD_TYPE, *RIPGREP_CONFIG.RIPGREP_ARGS_DEFAULT,
RG_IGNORE_ARGUMENTS,
RG_DEFAULT_ARGUMENTS,
RG_REGEX_ARGUMENT,
text, text,
str(settings.ARCHIVE_DIR) RIPGREP_CONFIG.RIPGREP_SEARCH_DIR,
] ]
rg = run(rg_cmd, timeout=SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_TIMEOUT, capture_output=True, text=True) proc = run(cmd, timeout=SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_TIMEOUT, capture_output=True, text=True)
timestamps = set() timestamps = set()
for path in rg.stdout.splitlines(): for path in proc.stdout.splitlines():
ts = ts_regex.findall(path) ts = TIMESTAMP_REGEX.findall(path)
if ts: if ts:
timestamps.add(ts[0]) timestamps.add(ts[0])

View file

@ -1,5 +1,6 @@
__package__ = 'archivebox.plugins_search.sonic' __package__ = 'archivebox.plugins_search.sonic'
import os
import sys import sys
from typing import List, Dict, ClassVar, Generator, cast from typing import List, Dict, ClassVar, Generator, cast
@ -39,15 +40,23 @@ class SonicConfig(BaseConfigSet):
SONIC_COLLECTION: str = Field(default='archivebox') SONIC_COLLECTION: str = Field(default='archivebox')
SONIC_BUCKET: str = Field(default='archivebox') SONIC_BUCKET: str = Field(default='archivebox')
SONIC_MAX_CHUNK_LENGTH: int = Field(default=2000)
SONIC_MAX_TEXT_LENGTH: int = Field(default=100000000)
SONIC_MAX_RETRIES: int = Field(default=5)
@model_validator(mode='after') @model_validator(mode='after')
def validate_sonic_port(self): def validate_sonic_port(self):
if SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE == 'sonic': if SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE == 'sonic' and SONIC_LIB is None:
if SONIC_LIB is None: sys.stderr.write('[X] Error: Sonic search backend is enabled but sonic-client lib is not installed. You may need to run: pip install archivebox[sonic]\n')
sys.stderr.write('[!] Sonic search backend is enabled but not installed. Install Sonic to use the Sonic search backend.\n') # dont hard exit here. in case the user is just running "archivebox version" or "archivebox help", we still want those to work despite broken sonic
# sys.exit(1)
SEARCH_BACKEND_CONFIG.update_in_place(SEARCH_BACKEND_ENGINE='ripgrep')
return self return self
SONIC_CONFIG = SonicConfig() SONIC_CONFIG = SonicConfig()
class SonicBinary(BaseBinary): class SonicBinary(BaseBinary):
name: BinName = SONIC_CONFIG.SONIC_BINARY name: BinName = SONIC_CONFIG.SONIC_BINARY
binproviders_supported: List[InstanceOf[BinProvider]] = [brew, env] # TODO: add cargo binproviders_supported: List[InstanceOf[BinProvider]] = [brew, env] # TODO: add cargo
@ -57,6 +66,7 @@ class SonicBinary(BaseBinary):
# cargo.name: {'packages': lambda: ['sonic-server']}, # TODO: add cargo # cargo.name: {'packages': lambda: ['sonic-server']}, # TODO: add cargo
} }
# TODO: add version checking over protocol? for when sonic backend is on remote server and binary is not installed locally
# def on_get_version(self): # def on_get_version(self):
# with sonic.IngestClient(SONIC_CONFIG.SONIC_HOST, str(SONIC_CONFIG.SONIC_PORT), SONIC_CONFIG.SONIC_PASSWORD) as ingestcl: # with sonic.IngestClient(SONIC_CONFIG.SONIC_HOST, str(SONIC_CONFIG.SONIC_PORT), SONIC_CONFIG.SONIC_PASSWORD) as ingestcl:
# return SemVer.parse(str(ingestcl.protocol)) # return SemVer.parse(str(ingestcl.protocol))
@ -64,11 +74,6 @@ class SonicBinary(BaseBinary):
SONIC_BINARY = SonicBinary() SONIC_BINARY = SonicBinary()
MAX_SONIC_TEXT_TOTAL_LENGTH = 100000000 # dont index more than 100 million characters per text
MAX_SONIC_TEXT_CHUNK_LENGTH = 2000 # dont index more than 2000 characters per chunk
MAX_SONIC_ERRORS_BEFORE_ABORT = 5
class SonicSearchBackend(BaseSearchBackend): class SonicSearchBackend(BaseSearchBackend):
name: str = 'sonic' name: str = 'sonic'
@ -80,11 +85,11 @@ class SonicSearchBackend(BaseSearchBackend):
with sonic.IngestClient(SONIC_CONFIG.SONIC_HOST, str(SONIC_CONFIG.SONIC_PORT), SONIC_CONFIG.SONIC_PASSWORD) as ingestcl: with sonic.IngestClient(SONIC_CONFIG.SONIC_HOST, str(SONIC_CONFIG.SONIC_PORT), SONIC_CONFIG.SONIC_PASSWORD) as ingestcl:
for text in texts: for text in texts:
chunks = ( chunks = (
text[i:i+MAX_SONIC_TEXT_CHUNK_LENGTH] text[i:i+SONIC_CONFIG.SONIC_MAX_CHUNK_LENGTH]
for i in range( for i in range(
0, 0,
min(len(text), MAX_SONIC_TEXT_TOTAL_LENGTH), min(len(text), SONIC_CONFIG.SONIC_MAX_TEXT_LENGTH),
MAX_SONIC_TEXT_CHUNK_LENGTH, SONIC_CONFIG.SONIC_MAX_CHUNK_LENGTH,
) )
) )
try: try:
@ -93,7 +98,7 @@ class SonicSearchBackend(BaseSearchBackend):
except Exception as err: except Exception as err:
print(f'[!] Sonic search backend threw an error while indexing: {err.__class__.__name__} {err}') print(f'[!] Sonic search backend threw an error while indexing: {err.__class__.__name__} {err}')
error_count += 1 error_count += 1
if error_count > MAX_SONIC_ERRORS_BEFORE_ABORT: if error_count > SONIC_CONFIG.SONIC_MAX_RETRIES:
raise raise
@staticmethod @staticmethod

View file

@ -1,8 +1,9 @@
__package__ = 'archivebox.plugins_search.sqlite' __package__ = 'archivebox.plugins_search.sqlite'
import sys
import sqlite3 import sqlite3
import codecs import codecs
from typing import List, ClassVar, Generator, Callable from typing import List, ClassVar, Iterable, Callable
from django.conf import settings from django.conf import settings
from django.db import connection as database from django.db import connection as database
@ -17,7 +18,7 @@ from plugantic.base_hook import BaseHook
from plugantic.base_searchbackend import BaseSearchBackend from plugantic.base_searchbackend import BaseSearchBackend
# Depends on Other Plugins: # Depends on Other Plugins:
# from plugins_sys.config.apps import SEARCH_BACKEND_CONFIG from plugins_sys.config.apps import SEARCH_BACKEND_CONFIG
@ -30,6 +31,7 @@ class SqliteftsConfig(BaseConfigSet):
SQLITEFTS_TOKENIZERS: str = Field(default='porter unicode61 remove_diacritics 2', alias='FTS_TOKENIZERS') SQLITEFTS_TOKENIZERS: str = Field(default='porter unicode61 remove_diacritics 2', alias='FTS_TOKENIZERS')
SQLITEFTS_MAX_LENGTH: int = Field(default=int(1e9), alias='FTS_SQLITE_MAX_LENGTH') SQLITEFTS_MAX_LENGTH: int = Field(default=int(1e9), alias='FTS_SQLITE_MAX_LENGTH')
# Not really meant to be user-modified, just here as constants
SQLITEFTS_DB: str = Field(default='search.sqlite3') SQLITEFTS_DB: str = Field(default='search.sqlite3')
SQLITEFTS_TABLE: str = Field(default='snapshot_fts') SQLITEFTS_TABLE: str = Field(default='snapshot_fts')
SQLITEFTS_ID_TABLE: str = Field(default='snapshot_id_fts') SQLITEFTS_ID_TABLE: str = Field(default='snapshot_id_fts')
@ -37,8 +39,9 @@ class SqliteftsConfig(BaseConfigSet):
@model_validator(mode='after') @model_validator(mode='after')
def validate_fts_separate_database(self): def validate_fts_separate_database(self):
if self.SQLITEFTS_SEPARATE_DATABASE: if SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE == 'sqlite' and self.SQLITEFTS_SEPARATE_DATABASE and not self.SQLITEFTS_DB:
assert self.SQLITEFTS_DB, "SQLITEFTS_DB must be set if SQLITEFTS_SEPARATE_DATABASE is True" sys.stderr.write('[X] Error: SQLITEFTS_DB must be set if SQLITEFTS_SEPARATE_DATABASE is True\n')
SEARCH_BACKEND_CONFIG.update_in_place(SEARCH_BACKEND_ENGINE='ripgrep')
return self return self
@property @property
@ -84,8 +87,7 @@ def _escape_sqlite3(value: str, *, quote: str, errors='strict') -> str:
nul_index = encodable.find("\x00") nul_index = encodable.find("\x00")
if nul_index >= 0: if nul_index >= 0:
error = UnicodeEncodeError("NUL-terminated utf-8", encodable, error = UnicodeEncodeError("NUL-terminated utf-8", encodable, nul_index, nul_index + 1, "NUL not allowed")
nul_index, nul_index + 1, "NUL not allowed")
error_handler = codecs.lookup_error(errors) error_handler = codecs.lookup_error(errors)
replacement, _ = error_handler(error) replacement, _ = error_handler(error)
assert isinstance(replacement, str), "handling a UnicodeEncodeError should return a str replacement" assert isinstance(replacement, str), "handling a UnicodeEncodeError should return a str replacement"
@ -224,7 +226,7 @@ class SqliteftsSearchBackend(BaseSearchBackend):
return snap_ids return snap_ids
@staticmethod @staticmethod
def flush(snapshot_ids: Generator[str, None, None]): def flush(snapshot_ids: Iterable[str]):
snapshot_ids = list(snapshot_ids) # type: ignore[assignment] snapshot_ids = list(snapshot_ids) # type: ignore[assignment]
id_table = _escape_sqlite3_identifier(SQLITEFTS_CONFIG.SQLITEFTS_ID_TABLE) id_table = _escape_sqlite3_identifier(SQLITEFTS_CONFIG.SQLITEFTS_ID_TABLE)
@ -243,7 +245,7 @@ SQLITEFTS_SEARCH_BACKEND = SqliteftsSearchBackend()
class SqliteftsSearchPlugin(BasePlugin): class SqliteftsSearchPlugin(BasePlugin):
app_label: str ='sqlitefts' app_label: str ='sqlitefts'
verbose_name: str = 'Sqlitefts' verbose_name: str = 'SQLite FTS5 Search'
hooks: List[InstanceOf[BaseHook]] = [ hooks: List[InstanceOf[BaseHook]] = [
SQLITEFTS_CONFIG, SQLITEFTS_CONFIG,

View file

@ -115,9 +115,6 @@ class SearchBackendConfig(BaseConfigSet):
USE_SEARCHING_BACKEND: bool = Field(default=True) USE_SEARCHING_BACKEND: bool = Field(default=True)
SEARCH_BACKEND_ENGINE: str = Field(default='ripgrep') SEARCH_BACKEND_ENGINE: str = Field(default='ripgrep')
SEARCH_BACKEND_HOST_NAME: str = Field(default='localhost')
SEARCH_BACKEND_PORT: int = Field(default=1491)
SEARCH_BACKEND_PASSWORD: str = Field(default='SecretPassword')
SEARCH_PROCESS_HTML: bool = Field(default=True) SEARCH_PROCESS_HTML: bool = Field(default=True)
SEARCH_BACKEND_TIMEOUT: int = Field(default=10) SEARCH_BACKEND_TIMEOUT: int = Field(default=10)