more settings loading tweaks and improvements

commit 97695bda5e (parent fbfd16e195)
10 changed files with 350 additions and 260 deletions
@@ -1,4 +1,30 @@
__package__ = 'archivebox'

from .monkey_patches import *

import os
import importlib.metadata   # import the submodule explicitly; `import importlib` alone does not load it
from pathlib import Path


PACKAGE_DIR = Path(__file__).resolve().parent    # archivebox source code dir
DATA_DIR = Path(os.curdir).resolve()             # archivebox user data dir


def _detect_installed_version():
    try:
        return importlib.metadata.version(__package__ or 'archivebox')
    except importlib.metadata.PackageNotFoundError:
        try:
            pyproject_config = (PACKAGE_DIR / 'pyproject.toml').read_text()
            for line in pyproject_config.splitlines():   # iterate over lines, not characters
                if line.startswith('version = '):
                    return line.split(' = ', 1)[-1].strip('"')
        except FileNotFoundError:
            # building docs, pyproject.toml is not available
            return 'dev'

    raise Exception('Failed to detect installed archivebox version!')


__version__ = _detect_installed_version()
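For illustration, a minimal sketch of the pyproject.toml fallback parse in isolation (the file contents here are hypothetical, not ArchiveBox's real pyproject.toml):

    toml_text = 'name = "archivebox"\nversion = "0.8.0"\n'   # hypothetical pyproject.toml contents
    for line in toml_text.splitlines():
        if line.startswith('version = '):
            print(line.split(' = ', 1)[-1].strip('"'))       # -> 0.8.0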
@@ -6,6 +6,7 @@ import re
import logging
import inspect
import tempfile
import archivebox

from typing import Dict
from pathlib import Path
@@ -22,14 +23,16 @@ IS_TESTING = 'test' in sys.argv[:3] or 'PYTEST_CURRENT_TEST' in os.environ
IS_SHELL = 'shell' in sys.argv[:3] or 'shell_plus' in sys.argv[:3]


PACKAGE_DIR = Path(__file__).resolve().parent.parent
PACKAGE_DIR = archivebox.PACKAGE_DIR
assert PACKAGE_DIR == CONFIG.PACKAGE_DIR

DATA_DIR = Path(os.curdir).resolve()
DATA_DIR = archivebox.DATA_DIR
assert DATA_DIR == CONFIG.OUTPUT_DIR
ARCHIVE_DIR = DATA_DIR / 'archive'
assert ARCHIVE_DIR == CONFIG.ARCHIVE_DIR

VERSION = archivebox.__version__

################################################################################
### ArchiveBox Plugin Settings
################################################################################
@@ -164,11 +167,19 @@ STATIC_URL = '/static/'

STATICFILES_DIRS = [
    *([str(CONFIG.CUSTOM_TEMPLATES_DIR / 'static')] if CONFIG.CUSTOM_TEMPLATES_DIR else []),
    *[
        str(plugin_dir / 'static')
        for plugin_dir in PLUGIN_DIRS.values()
    ],
    str(PACKAGE_DIR / CONFIG.TEMPLATES_DIR_NAME / 'static'),
]

TEMPLATE_DIRS = [
    *([str(CONFIG.CUSTOM_TEMPLATES_DIR)] if CONFIG.CUSTOM_TEMPLATES_DIR else []),
    *[
        str(plugin_dir / 'templates')
        for plugin_dir in PLUGIN_DIRS.values()
    ],
    str(PACKAGE_DIR / CONFIG.TEMPLATES_DIR_NAME / 'core'),
    str(PACKAGE_DIR / CONFIG.TEMPLATES_DIR_NAME / 'admin'),
    str(PACKAGE_DIR / CONFIG.TEMPLATES_DIR_NAME),
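Order matters in both lists: Django's staticfiles finders and the filesystem template loader search front to back, so a file under CUSTOM_TEMPLATES_DIR shadows the plugin and core copies. A hedged sketch (the asset name is hypothetical):

    from django.contrib.staticfiles import finders

    # returns the absolute path of the first match in STATICFILES_DIRS order,
    # so a user copy under CUSTOM_TEMPLATES_DIR/static/ wins over the bundled one
    hit = finders.find('admin/custom_logo.svg')   # hypothetical filename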
@@ -394,7 +405,7 @@ SHELL_PLUS_PRINT_SQL = False
IPYTHON_ARGUMENTS = ['--no-confirm-exit', '--no-banner']
IPYTHON_KERNEL_DISPLAY_NAME = 'ArchiveBox Django Shell'
if IS_SHELL:
    os.environ['PYTHONSTARTUP'] = str(PACKAGE_DIR / 'core' / 'welcome_message.py')
    os.environ['PYTHONSTARTUP'] = str(PACKAGE_DIR / 'core' / 'shell_welcome_message.py')


################################################################################
@@ -411,7 +422,7 @@ TIME_ZONE = CONFIG.TIMEZONE    # django convention is TIME_ZONE, archivebox

from django.conf.locale.en import formats as en_formats    # type: ignore

en_formats.DATETIME_FORMAT = DATETIME_FORMAT
en_formats.DATETIME_FORMAT = DATETIME_FORMAT    # monkey patch en_format default with our preferred format
en_formats.SHORT_DATETIME_FORMAT = SHORT_DATETIME_FORMAT
@@ -419,193 +430,10 @@ en_formats.SHORT_DATETIME_FORMAT = SHORT_DATETIME_FORMAT
### Logging Settings
################################################################################

from .settings_logging import SETTINGS_LOGGING, LOGS_DIR, ERROR_LOG

IGNORABLE_URL_PATTERNS = [
    re.compile(r"/.*/?apple-touch-icon.*\.png"),
    re.compile(r"/.*/?favicon\.ico"),
    re.compile(r"/.*/?robots\.txt"),
    re.compile(r"/.*/?.*\.(css|js)\.map"),
    re.compile(r"/static/.*"),
    re.compile(r"/admin/jsi18n/"),
]

class NoisyRequestsFilter(logging.Filter):
    def filter(self, record) -> bool:
        logline = record.getMessage()
        # '"GET /api/v1/docs HTTP/1.1" 200 1023'
        # '"GET /static/admin/js/SelectFilter2.js HTTP/1.1" 200 15502'
        # '"GET /static/admin/js/SelectBox.js HTTP/1.1" 304 0'
        # '"GET /admin/jsi18n/ HTTP/1.1" 200 3352'
        # '"GET /admin/api/apitoken/0191bbf8-fd5e-0b8c-83a8-0f32f048a0af/change/ HTTP/1.1" 200 28778'

        # ignore harmless 404s for the patterns in IGNORABLE_URL_PATTERNS
        for pattern in IGNORABLE_URL_PATTERNS:
            ignorable_GET_request = re.compile(f'"GET {pattern.pattern} HTTP/.*" (2..|30.|404) .+$', re.I | re.M)
            if ignorable_GET_request.match(logline):
                return False

            ignorable_404_pattern = re.compile(f'Not Found: {pattern.pattern}', re.I | re.M)
            if ignorable_404_pattern.match(logline):
                return False

        return True


class CustomOutboundWebhookLogFormatter(logging.Formatter):
    def format(self, record):
        result = super().format(record)
        return result.replace('HTTP Request: ', 'OutboundWebhook: ')


ERROR_LOG = tempfile.NamedTemporaryFile().name

if CONFIG.LOGS_DIR.exists():
    ERROR_LOG = (CONFIG.LOGS_DIR / 'errors.log')
else:
    # historically too many edge cases here around creating log dir w/ correct permissions early on
    # if there's an issue on startup, we trash the log and let user figure it out via stdout/stderr
    print(f'[!] WARNING: data/logs dir does not exist. Logging to temp file: {ERROR_LOG}')


LOG_LEVEL_DATABASE = 'DEBUG' if DEBUG else 'WARNING'
LOG_LEVEL_REQUEST = 'DEBUG' if DEBUG else 'WARNING'


import pydantic
import django.template

LOGGING = {
    # ... body identical, line for line, to the SETTINGS_LOGGING dict in the new
    # archivebox/core/settings_logging.py shown in full below ...
}
LOGGING = SETTINGS_LOGGING


################################################################################
archivebox/core/settings_logging.py (new file, 198 lines)
@@ -0,0 +1,198 @@
import re
import tempfile
import logging

import pydantic
import django.template

import archivebox


IGNORABLE_URL_PATTERNS = [
    re.compile(r"/.*/?apple-touch-icon.*\.png"),
    re.compile(r"/.*/?favicon\.ico"),
    re.compile(r"/.*/?robots\.txt"),
    re.compile(r"/.*/?.*\.(css|js)\.map"),
    re.compile(r"/static/.*"),
    re.compile(r"/admin/jsi18n/"),
]

class NoisyRequestsFilter(logging.Filter):
    def filter(self, record) -> bool:
        logline = record.getMessage()
        # '"GET /api/v1/docs HTTP/1.1" 200 1023'
        # '"GET /static/admin/js/SelectFilter2.js HTTP/1.1" 200 15502'
        # '"GET /static/admin/js/SelectBox.js HTTP/1.1" 304 0'
        # '"GET /admin/jsi18n/ HTTP/1.1" 200 3352'
        # '"GET /admin/api/apitoken/0191bbf8-fd5e-0b8c-83a8-0f32f048a0af/change/ HTTP/1.1" 200 28778'

        # ignore harmless 404s for the patterns in IGNORABLE_URL_PATTERNS
        for pattern in IGNORABLE_URL_PATTERNS:
            ignorable_GET_request = re.compile(f'"GET {pattern.pattern} HTTP/.*" (2..|30.|404) .+$', re.I | re.M)
            if ignorable_GET_request.match(logline):
                return False

            ignorable_404_pattern = re.compile(f'Not Found: {pattern.pattern}', re.I | re.M)
            if ignorable_404_pattern.match(logline):
                return False

        return True
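A quick sketch of what the filter drops versus keeps; the log lines are illustrative, shaped like the commented examples above:

    import logging

    record = logging.LogRecord(
        name='django.server', level=logging.INFO, pathname='', lineno=0,
        msg='"GET /favicon.ico HTTP/1.1" 404 1234', args=(), exc_info=None,
    )
    print(NoisyRequestsFilter().filter(record))   # -> False (suppressed as noise)

    record.msg = '"GET /api/v1/docs HTTP/1.1" 200 1023'
    print(NoisyRequestsFilter().filter(record))   # -> True (kept)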
class CustomOutboundWebhookLogFormatter(logging.Formatter):
    def format(self, record):
        result = super().format(record)
        return result.replace('HTTP Request: ', 'OutboundWebhook: ')


ERROR_LOG = tempfile.NamedTemporaryFile().name

LOGS_DIR = archivebox.DATA_DIR / 'logs'

if LOGS_DIR.is_dir():
    ERROR_LOG = (LOGS_DIR / 'errors.log')
else:
    # historically too many edge cases here around creating log dir w/ correct permissions early on
    # if there's an issue on startup, we trash the log and let user figure it out via stdout/stderr
    # print(f'[!] WARNING: data/logs dir does not exist. Logging to temp file: {ERROR_LOG}')
    pass


LOG_LEVEL_DATABASE = 'WARNING'   # if DEBUG else 'WARNING'
LOG_LEVEL_REQUEST = 'WARNING'    # if DEBUG else 'WARNING'

SETTINGS_LOGGING = {
    "version": 1,
    "disable_existing_loggers": False,
    "formatters": {
        "rich": {
            "datefmt": "[%Y-%m-%d %H:%M:%S]",
            # "format": "{asctime} {levelname} {module} {name} {message} {username}",
            "format": "%(name)s %(message)s",
        },
        "outbound_webhooks": {
            "()": CustomOutboundWebhookLogFormatter,
            "datefmt": "[%Y-%m-%d %H:%M:%S]",
        },
    },
    "filters": {
        "noisyrequestsfilter": {
            "()": NoisyRequestsFilter,
        },
        "require_debug_false": {
            "()": "django.utils.log.RequireDebugFalse",
        },
        "require_debug_true": {
            "()": "django.utils.log.RequireDebugTrue",
        },
    },
    "handlers": {
        # "console": {
        #     "level": "DEBUG",
        #     'formatter': 'simple',
        #     "class": "logging.StreamHandler",
        #     'filters': ['noisyrequestsfilter', 'add_extra_logging_attrs'],
        # },
        "default": {
            "class": "rich.logging.RichHandler",
            "formatter": "rich",
            "level": "DEBUG",
            "markup": False,
            "rich_tracebacks": True,
            "filters": ["noisyrequestsfilter"],
            "tracebacks_suppress": [
                django,
                pydantic,
            ],
        },
        "logfile": {
            "level": "INFO",
            "class": "logging.handlers.RotatingFileHandler",
            "filename": ERROR_LOG,
            "maxBytes": 1024 * 1024 * 25,   # 25 MB
            "backupCount": 10,
            "formatter": "rich",
            "filters": ["noisyrequestsfilter"],
        },
        "outbound_webhooks": {
            "class": "rich.logging.RichHandler",
            "markup": False,
            "rich_tracebacks": True,
            "formatter": "outbound_webhooks",
        },
        # "mail_admins": {
        #     "level": "ERROR",
        #     "filters": ["require_debug_false"],
        #     "class": "django.utils.log.AdminEmailHandler",
        # },
        "null": {
            "class": "logging.NullHandler",
        },
    },
    "root": {
        "handlers": ["default", "logfile"],
        "level": "INFO",
        "formatter": "rich",
    },
    "loggers": {
        "api": {
            "handlers": ["default", "logfile"],
            "level": "DEBUG",
        },
        "checks": {
            "handlers": ["default", "logfile"],
            "level": "DEBUG",
        },
        "core": {
            "handlers": ["default", "logfile"],
            "level": "DEBUG",
        },
        "plugins_extractor": {
            "handlers": ["default", "logfile"],
            "level": "DEBUG",
        },
        "httpx": {
            "handlers": ["outbound_webhooks"],
            "level": "INFO",
            "formatter": "outbound_webhooks",
            "propagate": False,
        },
        "django": {
            "handlers": ["default", "logfile"],
            "level": "INFO",
            "filters": ["noisyrequestsfilter"],
        },
        "django.utils.autoreload": {
            "propagate": False,
            "handlers": [],
            "level": "ERROR",
        },
        "django.channels.server": {
            # see archivebox.monkey_patches.ModifiedAccessLogGenerator for dedicated daphne server logging settings
            "propagate": False,
            "handlers": ["default", "logfile"],
            "level": "INFO",
            "filters": ["noisyrequestsfilter"],
        },
        "django.server": {   # logs all requests (2xx, 3xx, 4xx)
            "propagate": False,
            "handlers": ["default", "logfile"],
            "level": "INFO",
            "filters": ["noisyrequestsfilter"],
        },
        "django.request": {   # only logs 4xx and 5xx errors
            "propagate": False,
            "handlers": ["default", "logfile"],
            "level": "ERROR",
            "filters": ["noisyrequestsfilter"],
        },
        "django.db.backends": {
            "propagate": False,
            "handlers": ["default"],
            "level": LOG_LEVEL_DATABASE,
        },
    },
}
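Because the config is a plain dict, it can be smoke-tested with the stdlib directly; Django applies settings.LOGGING through the same machinery at startup. A hedged sketch (assumes rich is installed and the module is importable under this path):

    import logging.config
    from core.settings_logging import SETTINGS_LOGGING

    logging.config.dictConfig(SETTINGS_LOGGING)
    logging.getLogger('core').info('logging config loaded')   # rendered by RichHandler, mirrored to errors.log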
@@ -176,22 +176,43 @@ class ArchiveBoxBaseConfig(BaseSettings):
        """Populate any unset values using function provided as their default"""

        for key, field in self.model_fields.items():
            config_so_far = benedict(self.model_dump(include=set(self.model_fields.keys()), warnings=False))
            value = getattr(self, key)

            if isinstance(value, Callable):
                # if value is a function, execute it to get the actual value, passing existing config as a dict arg
                # if value is a function, execute it to get the actual value, passing existing config as a dict arg if expected
                if func_takes_args_or_kwargs(value):
                    # assemble dict of existing field values to pass to default factory functions
                    config_so_far = benedict(self.model_dump(include=set(self.model_fields.keys()), warnings=False))
                    computed_default = field.default(config_so_far)
                else:
                    # otherwise it's a pure function with no args, just call it
                    computed_default = field.default()

                # check to make sure default factory return value matches type annotation
                # coerce/check to make sure default factory return value matches type annotation
                TypeAdapter(field.annotation).validate_python(computed_default)

                # set generated default value as final validated value
                setattr(self, key, computed_default)
        return self

    def update_in_place(self, warn=True, **kwargs):
        """
        Update the config with new values. Use this sparingly! We should almost never be updating config at runtime.
        Sets them in the environment so they propagate to spawned subprocesses / across future re-__init__()s and reloads from the environment.

        Example acceptable use case: user config says SEARCH_BACKEND_ENGINE=sonic but the sonic_client pip library is not installed, so we cannot use it.
        SEARCH_BACKEND_CONFIG.update_in_place(SEARCH_BACKEND_ENGINE='ripgrep') can be used to reset it back to ripgrep so we can continue.
        """
        if warn:
            print('[!] WARNING: Some of the provided user config values cannot be used, temporarily ignoring them:')
        for key, value in kwargs.items():
            os.environ[key] = str(value)
            original_value = getattr(self, key)
            if warn:
                print(f'    {key}={original_value} -> {value}')
        self.__init__()
        return self


class BaseConfigSet(ArchiveBoxBaseConfig, BaseHook):    # type: ignore[type-arg]
    hook_type: ClassVar[HookType] = 'CONFIG'
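A short usage sketch of update_in_place, mirroring the docstring's example (output shown as comments, values illustrative):

    # sonic was configured but its client lib is missing, so fall back to ripgrep;
    # this pins the env var and re-runs __init__() so the change propagates:
    SEARCH_BACKEND_CONFIG.update_in_place(SEARCH_BACKEND_ENGINE='ripgrep')
    # [!] WARNING: Some of the provided user config values cannot be used, temporarily ignoring them:
    #     SEARCH_BACKEND_ENGINE=sonic -> ripgrep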
@@ -20,8 +20,9 @@ except ImportError:

class LdapConfig(BaseConfigSet):
    """
    LDAP Config gets imported by core/settings.py very early during startup, so it needs to be in a separate file from apps.py
    so that it can be imported during settings.py initialization before the apps are loaded.
    LDAP Config gets imported by core/settings.py very early during startup.
    It needs to be in a separate file from apps.py so that it can be imported
    during settings.py initialization before the apps are loaded.
    """
    section: ClassVar[ConfigSectionName] = 'LDAP_CONFIG'
@@ -41,20 +42,29 @@ class LdapConfig(BaseConfigSet):

    @model_validator(mode='after')
    def validate_ldap_config(self):
        # Check that LDAP libraries are installed
        if self.LDAP_ENABLED and LDAP_LIB is None:
            sys.stderr.write('[X] Error: Found LDAP=True config but LDAP packages not installed. You may need to run: pip install archivebox[ldap]\n\n')
            sys.stderr.write('[X] Error: LDAP Authentication is enabled but LDAP libraries are not installed. You may need to run: pip install archivebox[ldap]\n')
            # don't hard exit here: in case the user is just running "archivebox version" or "archivebox help", we still want those to work despite broken ldap
            # sys.exit(1)
            self.LDAP_ENABLED = False
            self.update(LDAP_ENABLED=False)

        if self.LDAP_ENABLED:
            assert (
                self.LDAP_SERVER_URI
                and self.LDAP_BIND_DN
                and self.LDAP_BIND_PASSWORD
                and self.LDAP_USER_BASE
                and self.LDAP_USER_FILTER
            ), 'LDAP_* config options must all be set if LDAP_ENABLED=True'
        # Check that all required LDAP config options are set
        all_config_is_set = (
            self.LDAP_SERVER_URI
            and self.LDAP_BIND_DN
            and self.LDAP_BIND_PASSWORD
            and self.LDAP_USER_BASE
            and self.LDAP_USER_FILTER
        )
        if self.LDAP_ENABLED and not all_config_is_set:
            missing_config_options = [
                key for key, value in self.model_dump().items()
                if value is None and key != 'LDAP_ENABLED'
            ]
            sys.stderr.write('[X] Error: LDAP_* config options must all be set if LDAP_ENABLED=True\n')
            sys.stderr.write(f'    Missing: {", ".join(missing_config_options)}\n')
            self.update(LDAP_ENABLED=False)
        return self

    @property
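The net effect is graceful degradation instead of a hard crash; a hedged sketch of constructing a deliberately incomplete config (field values hypothetical):

    config = LdapConfig(LDAP_ENABLED=True, LDAP_SERVER_URI='ldap://auth.example.com')
    # the model_validator fires on construction and prints to stderr:
    # [X] Error: LDAP_* config options must all be set if LDAP_ENABLED=True
    #     Missing: LDAP_BIND_DN, LDAP_BIND_PASSWORD, LDAP_USER_BASE, LDAP_USER_FILTER
    print(config.LDAP_ENABLED)   # -> False, so "archivebox version" etc. keep working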
@@ -28,8 +28,21 @@ class RipgrepConfig(BaseConfigSet):

    RIPGREP_BINARY: str = Field(default='rg')

    RIPGREP_IGNORE_EXTENSIONS: str = Field(default='css,js,orig,svg')
    RIPGREP_ARGS_DEFAULT: List[str] = Field(default=lambda c: [
        # https://github.com/BurntSushi/ripgrep/blob/master/GUIDE.md
        f'--type-add=ignore:*.{{{c.RIPGREP_IGNORE_EXTENSIONS}}}',
        '--type-not=ignore',
        '--ignore-case',
        '--files-with-matches',
        '--regexp',
    ])
    RIPGREP_SEARCH_DIR: str = Field(default=lambda: str(settings.ARCHIVE_DIR))

RIPGREP_CONFIG = RipgrepConfig()


class RipgrepBinary(BaseBinary):
    name: BinName = RIPGREP_CONFIG.RIPGREP_BINARY
    binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
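With the defaults above, the factory yields an argv equivalent to the following (the binary path, search term, and data dir are illustrative):

    cmd = [
        '/usr/bin/rg',                             # ripgrep_binary.abspath
        '--type-add=ignore:*.{css,js,orig,svg}',   # built from RIPGREP_IGNORE_EXTENSIONS
        '--type-not=ignore',
        '--ignore-case',
        '--files-with-matches',
        '--regexp', 'example.com',                 # the search text
        '/path/to/data/archive',                   # RIPGREP_SEARCH_DIR
    ]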
@@ -41,17 +54,8 @@ class RipgrepBinary(BaseBinary):

RIPGREP_BINARY = RipgrepBinary()


RG_IGNORE_EXTENSIONS = ('css','js','orig','svg')

RG_ADD_TYPE = '--type-add'
RG_IGNORE_ARGUMENTS = f"ignore:*.{{{','.join(RG_IGNORE_EXTENSIONS)}}}"
RG_DEFAULT_ARGUMENTS = "-ilTignore"   # case insensitive (i), list files with matches (l), exclude the ignore type (Tignore)
RG_REGEX_ARGUMENT = '-e'

TIMESTAMP_REGEX = r'\/([\d]+\.[\d]+)\/'
ts_regex = re.compile(TIMESTAMP_REGEX)

# regex to match archive/<ts>/... snapshot dir names
TIMESTAMP_REGEX = re.compile(r'\/([\d]+\.[\d]+)\/')

class RipgrepSearchBackend(BaseSearchBackend):
    name: str = 'ripgrep'
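For reference, the regex pulls the snapshot timestamp out of ripgrep result paths like so (the path is illustrative):

    TIMESTAMP_REGEX.findall('/data/archive/1718228400.123456/readability/content.html')
    # -> ['1718228400.123456']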
@@ -67,23 +71,22 @@ class RipgrepSearchBackend(BaseSearchBackend):

    @staticmethod
    def search(text: str) -> List[str]:
        from core.models import Snapshot

        ripgrep_binary = RIPGREP_BINARY.load()
        if not ripgrep_binary.version:
            raise Exception("ripgrep binary not found, install ripgrep to use this search backend")

        cmd = [
            ripgrep_binary.abspath,
            *RIPGREP_CONFIG.RIPGREP_ARGS_DEFAULT,
            text,
            RIPGREP_CONFIG.RIPGREP_SEARCH_DIR,
        ]
        proc = run(cmd, timeout=SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_TIMEOUT, capture_output=True, text=True)
        timestamps = set()
        for path in proc.stdout.splitlines():
            ts = TIMESTAMP_REGEX.findall(path)
            if ts:
                timestamps.add(ts[0])
@@ -1,5 +1,6 @@
__package__ = 'archivebox.plugins_search.sonic'

import os
import sys
from typing import List, Dict, ClassVar, Generator, cast
@@ -39,15 +40,23 @@ class SonicConfig(BaseConfigSet):
    SONIC_COLLECTION: str = Field(default='archivebox')
    SONIC_BUCKET: str = Field(default='archivebox')

    SONIC_MAX_CHUNK_LENGTH: int = Field(default=2000)
    SONIC_MAX_TEXT_LENGTH: int = Field(default=100000000)
    SONIC_MAX_RETRIES: int = Field(default=5)

    @model_validator(mode='after')
    def validate_sonic_port(self):
        if SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE == 'sonic':
            if SONIC_LIB is None:
                sys.stderr.write('[!] Sonic search backend is enabled but not installed. Install Sonic to use the Sonic search backend.\n')
        if SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE == 'sonic' and SONIC_LIB is None:
            sys.stderr.write('[X] Error: Sonic search backend is enabled but sonic-client lib is not installed. You may need to run: pip install archivebox[sonic]\n')
            # don't hard exit here: in case the user is just running "archivebox version" or "archivebox help", we still want those to work despite broken sonic
            # sys.exit(1)
            SEARCH_BACKEND_CONFIG.update_in_place(SEARCH_BACKEND_ENGINE='ripgrep')
        return self

SONIC_CONFIG = SonicConfig()


class SonicBinary(BaseBinary):
    name: BinName = SONIC_CONFIG.SONIC_BINARY
    binproviders_supported: List[InstanceOf[BinProvider]] = [brew, env]   # TODO: add cargo
@@ -57,6 +66,7 @@ class SonicBinary(BaseBinary):
        # cargo.name: {'packages': lambda: ['sonic-server']},   # TODO: add cargo
    }

    # TODO: add version checking over protocol, for when the sonic backend is on a remote server and the binary is not installed locally
    # def on_get_version(self):
    #     with sonic.IngestClient(SONIC_CONFIG.SONIC_HOST, str(SONIC_CONFIG.SONIC_PORT), SONIC_CONFIG.SONIC_PASSWORD) as ingestcl:
    #         return SemVer.parse(str(ingestcl.protocol))
@@ -64,11 +74,6 @@ class SonicBinary(BaseBinary):

SONIC_BINARY = SonicBinary()


MAX_SONIC_TEXT_TOTAL_LENGTH = 100000000   # dont index more than 100 million characters per text
MAX_SONIC_TEXT_CHUNK_LENGTH = 2000        # dont index more than 2000 characters per chunk
MAX_SONIC_ERRORS_BEFORE_ABORT = 5


class SonicSearchBackend(BaseSearchBackend):
    name: str = 'sonic'
@@ -80,11 +85,11 @@ class SonicSearchBackend(BaseSearchBackend):
        with sonic.IngestClient(SONIC_CONFIG.SONIC_HOST, str(SONIC_CONFIG.SONIC_PORT), SONIC_CONFIG.SONIC_PASSWORD) as ingestcl:
            for text in texts:
                chunks = (
                    text[i:i+SONIC_CONFIG.SONIC_MAX_CHUNK_LENGTH]
                    for i in range(
                        0,
                        min(len(text), SONIC_CONFIG.SONIC_MAX_TEXT_LENGTH),
                        SONIC_CONFIG.SONIC_MAX_CHUNK_LENGTH,
                    )
                )
                try:
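The chunking arithmetic in isolation: the text is sliced into fixed-size windows, and no chunk is started past the total-length cap. A toy illustration (the real defaults are 2000-char chunks and a 100000000-char cap):

    text = 'x' * 4500
    CHUNK, TOTAL_CAP = 2000, 100_000_000
    chunks = [text[i:i+CHUNK] for i in range(0, min(len(text), TOTAL_CAP), CHUNK)]
    print([len(c) for c in chunks])   # -> [2000, 2000, 500]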
@@ -93,7 +98,7 @@ class SonicSearchBackend(BaseSearchBackend):
                except Exception as err:
                    print(f'[!] Sonic search backend threw an error while indexing: {err.__class__.__name__} {err}')
                    error_count += 1
                    if error_count > MAX_SONIC_ERRORS_BEFORE_ABORT:
                    if error_count > SONIC_CONFIG.SONIC_MAX_RETRIES:
                        raise

    @staticmethod
@@ -1,8 +1,9 @@
__package__ = 'archivebox.plugins_search.sqlite'

import sys
import sqlite3
import codecs
from typing import List, ClassVar, Generator, Callable
from typing import List, ClassVar, Iterable, Callable

from django.conf import settings
from django.db import connection as database
@@ -17,7 +18,7 @@ from plugantic.base_hook import BaseHook
from plugantic.base_searchbackend import BaseSearchBackend

# Depends on Other Plugins:
# from plugins_sys.config.apps import SEARCH_BACKEND_CONFIG
from plugins_sys.config.apps import SEARCH_BACKEND_CONFIG
@@ -26,19 +27,21 @@ from plugantic.base_searchbackend import BaseSearchBackend
class SqliteftsConfig(BaseConfigSet):
    section: ClassVar[ConfigSectionName] = 'DEPENDENCY_CONFIG'

    SQLITEFTS_SEPARATE_DATABASE: bool = Field(default=True, alias='FTS_SEPARATE_DATABASE')
    SQLITEFTS_TOKENIZERS: str       = Field(default='porter unicode61 remove_diacritics 2', alias='FTS_TOKENIZERS')
    SQLITEFTS_MAX_LENGTH: int       = Field(default=int(1e9), alias='FTS_SQLITE_MAX_LENGTH')

    # Not really meant to be user-modified, just here as constants
    SQLITEFTS_DB: str       = Field(default='search.sqlite3')
    SQLITEFTS_TABLE: str    = Field(default='snapshot_fts')
    SQLITEFTS_ID_TABLE: str = Field(default='snapshot_id_fts')
    SQLITEFTS_COLUMN: str   = Field(default='texts')

    @model_validator(mode='after')
    def validate_fts_separate_database(self):
        if self.SQLITEFTS_SEPARATE_DATABASE:
            assert self.SQLITEFTS_DB, "SQLITEFTS_DB must be set if SQLITEFTS_SEPARATE_DATABASE is True"
        if SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE == 'sqlite' and self.SQLITEFTS_SEPARATE_DATABASE and not self.SQLITEFTS_DB:
            sys.stderr.write('[X] Error: SQLITEFTS_DB must be set if SQLITEFTS_SEPARATE_DATABASE is True\n')
            SEARCH_BACKEND_CONFIG.update_in_place(SEARCH_BACKEND_ENGINE='ripgrep')
        return self

    @property
@@ -84,8 +87,7 @@ def _escape_sqlite3(value: str, *, quote: str, errors='strict') -> str:

    nul_index = encodable.find("\x00")
    if nul_index >= 0:
        error = UnicodeEncodeError("NUL-terminated utf-8", encodable, nul_index, nul_index + 1, "NUL not allowed")
        error_handler = codecs.lookup_error(errors)
        replacement, _ = error_handler(error)
        assert isinstance(replacement, str), "handling a UnicodeEncodeError should return a str replacement"
@@ -224,7 +226,7 @@ class SqliteftsSearchBackend(BaseSearchBackend):
        return snap_ids

    @staticmethod
    def flush(snapshot_ids: Generator[str, None, None]):
    def flush(snapshot_ids: Iterable[str]):
        snapshot_ids = list(snapshot_ids)   # type: ignore[assignment]

        id_table = _escape_sqlite3_identifier(SQLITEFTS_CONFIG.SQLITEFTS_ID_TABLE)
@@ -243,7 +245,7 @@ SQLITEFTS_SEARCH_BACKEND = SqliteftsSearchBackend()

class SqliteftsSearchPlugin(BasePlugin):
    app_label: str = 'sqlitefts'
    verbose_name: str = 'Sqlitefts'
    verbose_name: str = 'SQLite FTS5 Search'

    hooks: List[InstanceOf[BaseHook]] = [
        SQLITEFTS_CONFIG,
@@ -115,9 +115,6 @@ class SearchBackendConfig(BaseConfigSet):
    USE_SEARCHING_BACKEND: bool = Field(default=True)

    SEARCH_BACKEND_ENGINE: str = Field(default='ripgrep')
    SEARCH_BACKEND_HOST_NAME: str = Field(default='localhost')
    SEARCH_BACKEND_PORT: int = Field(default=1491)
    SEARCH_BACKEND_PASSWORD: str = Field(default='SecretPassword')
    SEARCH_PROCESS_HTML: bool = Field(default=True)
    SEARCH_BACKEND_TIMEOUT: int = Field(default=10)