more settings loading tweaks and improvements

This commit is contained in:
Nick Sweeting 2024-09-24 15:13:54 -07:00
parent fbfd16e195
commit 97695bda5e
No known key found for this signature in database
10 changed files with 350 additions and 260 deletions

View file

@@ -1,4 +1,30 @@
__package__ = 'archivebox'
from .monkey_patches import *
import os
import importlib.metadata
from pathlib import Path
PACKAGE_DIR = Path(__file__).resolve().parent # archivebox source code dir
DATA_DIR = Path(os.curdir).resolve() # archivebox user data dir
def _detect_installed_version():
try:
return importlib.metadata.version(__package__ or 'archivebox')
except importlib.metadata.PackageNotFoundError:
try:
pyproject_config = (PACKAGE_DIR / 'pyproject.toml').read_text()
for line in pyproject_config.splitlines():
if line.startswith('version = '):
return line.split(' = ', 1)[-1].strip('"')
except FileNotFoundError:
# building docs, pyproject.toml is not available
return 'dev'
raise Exception('Failed to detect installed archivebox version!')
__version__ = _detect_installed_version()
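A quick illustration (not part of the commit) of the pyproject.toml fallback parsing above, using a hypothetical version line:

line = 'version = "0.8.0"'  # hypothetical pyproject.toml line
assert line.split(' = ', 1)[-1].strip('"') == '0.8.0'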

View file

@@ -6,6 +6,7 @@ import re
import logging
import inspect
import tempfile
import archivebox
from typing import Dict
from pathlib import Path
@@ -22,14 +23,16 @@ IS_TESTING = 'test' in sys.argv[:3] or 'PYTEST_CURRENT_TEST' in os.environ
IS_SHELL = 'shell' in sys.argv[:3] or 'shell_plus' in sys.argv[:3]
PACKAGE_DIR = Path(__file__).resolve().parent.parent
PACKAGE_DIR = archivebox.PACKAGE_DIR
assert PACKAGE_DIR == CONFIG.PACKAGE_DIR
DATA_DIR = Path(os.curdir).resolve()
DATA_DIR = archivebox.DATA_DIR
assert DATA_DIR == CONFIG.OUTPUT_DIR
ARCHIVE_DIR = DATA_DIR / 'archive'
assert ARCHIVE_DIR == CONFIG.ARCHIVE_DIR
VERSION = archivebox.__version__
################################################################################
### ArchiveBox Plugin Settings
################################################################################
@@ -164,11 +167,19 @@ STATIC_URL = '/static/'
STATICFILES_DIRS = [
*([str(CONFIG.CUSTOM_TEMPLATES_DIR / 'static')] if CONFIG.CUSTOM_TEMPLATES_DIR else []),
*[
str(plugin_dir / 'static')
for plugin_dir in PLUGIN_DIRS.values()
],
str(PACKAGE_DIR / CONFIG.TEMPLATES_DIR_NAME / 'static'),
]
TEMPLATE_DIRS = [
*([str(CONFIG.CUSTOM_TEMPLATES_DIR)] if CONFIG.CUSTOM_TEMPLATES_DIR else []),
*[
str(plugin_dir / 'templates')
for plugin_dir in PLUGIN_DIRS.values()
],
str(PACKAGE_DIR / CONFIG.TEMPLATES_DIR_NAME / 'core'),
str(PACKAGE_DIR / CONFIG.TEMPLATES_DIR_NAME / 'admin'),
str(PACKAGE_DIR / CONFIG.TEMPLATES_DIR_NAME),
@@ -394,7 +405,7 @@ SHELL_PLUS_PRINT_SQL = False
IPYTHON_ARGUMENTS = ['--no-confirm-exit', '--no-banner']
IPYTHON_KERNEL_DISPLAY_NAME = 'ArchiveBox Django Shell'
if IS_SHELL:
os.environ['PYTHONSTARTUP'] = str(PACKAGE_DIR / 'core' / 'welcome_message.py')
os.environ['PYTHONSTARTUP'] = str(PACKAGE_DIR / 'core' / 'shell_welcome_message.py')
################################################################################
@@ -411,7 +422,7 @@ TIME_ZONE = CONFIG.TIMEZONE # django convention is TIME_ZONE, archivebox
from django.conf.locale.en import formats as en_formats # type: ignore
en_formats.DATETIME_FORMAT = DATETIME_FORMAT
en_formats.DATETIME_FORMAT = DATETIME_FORMAT # monkey patch en_format default with our preferred format
en_formats.SHORT_DATETIME_FORMAT = SHORT_DATETIME_FORMAT
@@ -419,193 +430,10 @@ en_formats.SHORT_DATETIME_FORMAT = SHORT_DATETIME_FORMAT
### Logging Settings
################################################################################
IGNORABLE_URL_PATTERNS = [
re.compile(r"/.*/?apple-touch-icon.*\.png"),
re.compile(r"/.*/?favicon\.ico"),
re.compile(r"/.*/?robots\.txt"),
re.compile(r"/.*/?.*\.(css|js)\.map"),
re.compile(r"/.*/?.*\.(css|js)\.map"),
re.compile(r"/static/.*"),
re.compile(r"/admin/jsi18n/"),
]
class NoisyRequestsFilter(logging.Filter):
def filter(self, record) -> bool:
logline = record.getMessage()
# '"GET /api/v1/docs HTTP/1.1" 200 1023'
# '"GET /static/admin/js/SelectFilter2.js HTTP/1.1" 200 15502'
# '"GET /static/admin/js/SelectBox.js HTTP/1.1" 304 0'
# '"GET /admin/jsi18n/ HTTP/1.1" 200 3352'
# '"GET /admin/api/apitoken/0191bbf8-fd5e-0b8c-83a8-0f32f048a0af/change/ HTTP/1.1" 200 28778'
from .settings_logging import SETTINGS_LOGGING, LOGS_DIR, ERROR_LOG
# ignore harmless 404s for the patterns in IGNORABLE_URL_PATTERNS
for pattern in IGNORABLE_URL_PATTERNS:
ignorable_GET_request = re.compile(f'"GET {pattern.pattern} HTTP/.*" (2..|30.|404) .+$', re.I | re.M)
if ignorable_GET_request.match(logline):
return False
ignorable_404_pattern = re.compile(f'Not Found: {pattern.pattern}', re.I | re.M)
if ignorable_404_pattern.match(logline):
return False
return True
class CustomOutboundWebhookLogFormatter(logging.Formatter):
def format(self, record):
result = super().format(record)
return result.replace('HTTP Request: ', 'OutboundWebhook: ')
ERROR_LOG = tempfile.NamedTemporaryFile().name
if CONFIG.LOGS_DIR.exists():
ERROR_LOG = (CONFIG.LOGS_DIR / 'errors.log')
else:
# historically too many edge cases here around creating log dir w/ correct permissions early on
# if there's an issue on startup, we trash the log and let user figure it out via stdout/stderr
print(f'[!] WARNING: data/logs dir does not exist. Logging to temp file: {ERROR_LOG}')
LOG_LEVEL_DATABASE = 'DEBUG' if DEBUG else 'WARNING'
LOG_LEVEL_REQUEST = 'DEBUG' if DEBUG else 'WARNING'
import pydantic
import django.template
LOGGING = {
"version": 1,
"disable_existing_loggers": False,
"formatters": {
"rich": {
"datefmt": "[%Y-%m-%d %H:%M:%S]",
# "format": "{asctime} {levelname} {module} {name} {message} {username}",
"format": "%(name)s %(message)s",
},
"outbound_webhooks": {
"()": CustomOutboundWebhookLogFormatter,
"datefmt": "[%Y-%m-%d %H:%M:%S]",
},
},
"filters": {
"noisyrequestsfilter": {
"()": NoisyRequestsFilter,
},
"require_debug_false": {
"()": "django.utils.log.RequireDebugFalse",
},
"require_debug_true": {
"()": "django.utils.log.RequireDebugTrue",
},
},
"handlers": {
# "console": {
# "level": "DEBUG",
# 'formatter': 'simple',
# "class": "logging.StreamHandler",
# 'filters': ['noisyrequestsfilter', 'add_extra_logging_attrs'],
# },
"default": {
"class": "rich.logging.RichHandler",
"formatter": "rich",
"level": "DEBUG",
"markup": False,
"rich_tracebacks": True,
"filters": ["noisyrequestsfilter"],
"tracebacks_suppress": [
django,
pydantic,
],
},
"logfile": {
"level": "INFO",
"class": "logging.handlers.RotatingFileHandler",
"filename": ERROR_LOG,
"maxBytes": 1024 * 1024 * 25, # 25 MB
"backupCount": 10,
"formatter": "rich",
"filters": ["noisyrequestsfilter"],
},
"outbound_webhooks": {
"class": "rich.logging.RichHandler",
"markup": False,
"rich_tracebacks": True,
"formatter": "outbound_webhooks",
},
# "mail_admins": {
# "level": "ERROR",
# "filters": ["require_debug_false"],
# "class": "django.utils.log.AdminEmailHandler",
# },
"null": {
"class": "logging.NullHandler",
},
},
"root": {
"handlers": ["default", "logfile"],
"level": "INFO",
"formatter": "rich",
},
"loggers": {
"api": {
"handlers": ["default", "logfile"],
"level": "DEBUG",
},
"checks": {
"handlers": ["default", "logfile"],
"level": "DEBUG",
},
"core": {
"handlers": ["default", "logfile"],
"level": "DEBUG",
},
"plugins_extractor": {
"handlers": ["default", "logfile"],
"level": "DEBUG",
},
"httpx": {
"handlers": ["outbound_webhooks"],
"level": "INFO",
"formatter": "outbound_webhooks",
"propagate": False,
},
"django": {
"handlers": ["default", "logfile"],
"level": "INFO",
"filters": ["noisyrequestsfilter"],
},
"django.utils.autoreload": {
"propagate": False,
"handlers": [],
"level": "ERROR",
},
"django.channels.server": {
# see archivebox.monkey_patches.ModifiedAccessLogGenerator for dedicated daphne server logging settings
"propagate": False,
"handlers": ["default", "logfile"],
"level": "INFO",
"filters": ["noisyrequestsfilter"],
},
"django.server": { # logs all requests (2xx, 3xx, 4xx)
"propagate": False,
"handlers": ["default", "logfile"],
"level": "INFO",
"filters": ["noisyrequestsfilter"],
},
"django.request": { # only logs 4xx and 5xx errors
"propagate": False,
"handlers": ["default", "logfile"],
"level": "ERROR",
"filters": ["noisyrequestsfilter"],
},
"django.db.backends": {
"propagate": False,
"handlers": ["default"],
"level": LOG_LEVEL_DATABASE,
},
},
}
LOGGING = SETTINGS_LOGGING
################################################################################

View file

@@ -0,0 +1,198 @@
import re
import tempfile
import logging
import pydantic
import django.template
import archivebox
IGNORABLE_URL_PATTERNS = [
re.compile(r"/.*/?apple-touch-icon.*\.png"),
re.compile(r"/.*/?favicon\.ico"),
re.compile(r"/.*/?robots\.txt"),
re.compile(r"/.*/?.*\.(css|js)\.map"),
re.compile(r"/.*/?.*\.(css|js)\.map"),
re.compile(r"/static/.*"),
re.compile(r"/admin/jsi18n/"),
]
class NoisyRequestsFilter(logging.Filter):
def filter(self, record) -> bool:
logline = record.getMessage()
# '"GET /api/v1/docs HTTP/1.1" 200 1023'
# '"GET /static/admin/js/SelectFilter2.js HTTP/1.1" 200 15502'
# '"GET /static/admin/js/SelectBox.js HTTP/1.1" 304 0'
# '"GET /admin/jsi18n/ HTTP/1.1" 200 3352'
# '"GET /admin/api/apitoken/0191bbf8-fd5e-0b8c-83a8-0f32f048a0af/change/ HTTP/1.1" 200 28778'
# ignore harmless 404s for the patterns in IGNORABLE_URL_PATTERNS
for pattern in IGNORABLE_URL_PATTERNS:
ignorable_GET_request = re.compile(f'"GET {pattern.pattern} HTTP/.*" (2..|30.|404) .+$', re.I | re.M)
if ignorable_GET_request.match(logline):
return False
ignorable_404_pattern = re.compile(f'Not Found: {pattern.pattern}', re.I | re.M)
if ignorable_404_pattern.match(logline):
return False
return True
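A minimal sanity check of the filter (illustrative, not part of the commit; assumes this module's imports):

import logging
record = logging.LogRecord('django.server', logging.INFO, __file__, 0,
                           '"GET /static/admin/js/SelectBox.js HTTP/1.1" 304 0', None, None)
assert NoisyRequestsFilter().filter(record) is False  # noisy static-asset line is dropped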
class CustomOutboundWebhookLogFormatter(logging.Formatter):
def format(self, record):
result = super().format(record)
return result.replace('HTTP Request: ', 'OutboundWebhook: ')
ERROR_LOG = tempfile.NamedTemporaryFile().name
LOGS_DIR = archivebox.DATA_DIR / 'logs'
if LOGS_DIR.is_dir():
ERROR_LOG = (LOGS_DIR / 'errors.log')
else:
# historically too many edge cases here around creating log dir w/ correct permissions early on
# if there's an issue on startup, we trash the log and let user figure it out via stdout/stderr
# print(f'[!] WARNING: data/logs dir does not exist. Logging to temp file: {ERROR_LOG}')
pass
LOG_LEVEL_DATABASE = 'WARNING' # if DEBUG else 'WARNING'
LOG_LEVEL_REQUEST = 'WARNING' # if DEBUG else 'WARNING'
SETTINGS_LOGGING = {
"version": 1,
"disable_existing_loggers": False,
"formatters": {
"rich": {
"datefmt": "[%Y-%m-%d %H:%M:%S]",
# "format": "{asctime} {levelname} {module} {name} {message} {username}",
"format": "%(name)s %(message)s",
},
"outbound_webhooks": {
"()": CustomOutboundWebhookLogFormatter,
"datefmt": "[%Y-%m-%d %H:%M:%S]",
},
},
"filters": {
"noisyrequestsfilter": {
"()": NoisyRequestsFilter,
},
"require_debug_false": {
"()": "django.utils.log.RequireDebugFalse",
},
"require_debug_true": {
"()": "django.utils.log.RequireDebugTrue",
},
},
"handlers": {
# "console": {
# "level": "DEBUG",
# 'formatter': 'simple',
# "class": "logging.StreamHandler",
# 'filters': ['noisyrequestsfilter', 'add_extra_logging_attrs'],
# },
"default": {
"class": "rich.logging.RichHandler",
"formatter": "rich",
"level": "DEBUG",
"markup": False,
"rich_tracebacks": True,
"filters": ["noisyrequestsfilter"],
"tracebacks_suppress": [
django,
pydantic,
],
},
"logfile": {
"level": "INFO",
"class": "logging.handlers.RotatingFileHandler",
"filename": ERROR_LOG,
"maxBytes": 1024 * 1024 * 25, # 25 MB
"backupCount": 10,
"formatter": "rich",
"filters": ["noisyrequestsfilter"],
},
"outbound_webhooks": {
"class": "rich.logging.RichHandler",
"markup": False,
"rich_tracebacks": True,
"formatter": "outbound_webhooks",
},
# "mail_admins": {
# "level": "ERROR",
# "filters": ["require_debug_false"],
# "class": "django.utils.log.AdminEmailHandler",
# },
"null": {
"class": "logging.NullHandler",
},
},
"root": {
"handlers": ["default", "logfile"],
"level": "INFO",
"formatter": "rich",
},
"loggers": {
"api": {
"handlers": ["default", "logfile"],
"level": "DEBUG",
},
"checks": {
"handlers": ["default", "logfile"],
"level": "DEBUG",
},
"core": {
"handlers": ["default", "logfile"],
"level": "DEBUG",
},
"plugins_extractor": {
"handlers": ["default", "logfile"],
"level": "DEBUG",
},
"httpx": {
"handlers": ["outbound_webhooks"],
"level": "INFO",
"formatter": "outbound_webhooks",
"propagate": False,
},
"django": {
"handlers": ["default", "logfile"],
"level": "INFO",
"filters": ["noisyrequestsfilter"],
},
"django.utils.autoreload": {
"propagate": False,
"handlers": [],
"level": "ERROR",
},
"django.channels.server": {
# see archivebox.monkey_patches.ModifiedAccessLogGenerator for dedicated daphne server logging settings
"propagate": False,
"handlers": ["default", "logfile"],
"level": "INFO",
"filters": ["noisyrequestsfilter"],
},
"django.server": { # logs all requests (2xx, 3xx, 4xx)
"propagate": False,
"handlers": ["default", "logfile"],
"level": "INFO",
"filters": ["noisyrequestsfilter"],
},
"django.request": { # only logs 4xx and 5xx errors
"propagate": False,
"handlers": ["default", "logfile"],
"level": "ERROR",
"filters": ["noisyrequestsfilter"],
},
"django.db.backends": {
"propagate": False,
"handlers": ["default"],
"level": LOG_LEVEL_DATABASE,
},
},
}

View file

@@ -176,22 +176,43 @@ class ArchiveBoxBaseConfig(BaseSettings):
"""Populate any unset values using function provided as their default"""
for key, field in self.model_fields.items():
config_so_far = benedict(self.model_dump(include=set(self.model_fields.keys()), warnings=False))
value = getattr(self, key)
if isinstance(value, Callable):
# if value is a function, execute it to get the actual value, passing existing config as a dict arg
# if value is a function, execute it to get the actual value, passing existing config as a dict arg if expected
if func_takes_args_or_kwargs(value):
# assemble dict of existing field values to pass to default factory functions
config_so_far = benedict(self.model_dump(include=set(self.model_fields.keys()), warnings=False))
computed_default = field.default(config_so_far)
else:
# otherwise it's a pure function with no args, just call it
computed_default = field.default()
# check to make sure default factory return value matches type annotation
# coerce/check to make sure default factory return value matches type annotation
TypeAdapter(field.annotation).validate_python(computed_default)
# set generated default value as final validated value
setattr(self, key, computed_default)
return self
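As an illustration of what the resolver above enables (hypothetical class and field names, not part of the commit), a field default can be a plain value or a callable that receives the config assembled so far:

class ExampleConfig(ArchiveBoxBaseConfig):
    DATA_DIR: str = Field(default='./data')
    # callable default: receives the config-so-far as a dict-like arg and derives its value
    LOGS_DIR: str = Field(default=lambda c: str(Path(c.DATA_DIR) / 'logs'))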
def update_in_place(self, warn=True, **kwargs):
"""
Update the config with new values. Use this sparingly! We should almost never be updating config at runtime.
Sets the new values in the environment so they propagate to spawned subprocesses and across future re-__init__()s, which reload config from the environment.
Example of an acceptable use case: user config says SEARCH_BACKEND_ENGINE=sonic but the sonic_client pip library is not installed, so we cannot use it.
SEARCH_BACKEND_CONFIG.update_in_place(SEARCH_BACKEND_ENGINE='ripgrep') can be used to reset it back to ripgrep so we can continue.
"""
if warn:
print('[!] WARNING: Some of the provided user config values cannot be used, temporarily ignoring them:')
for key, value in kwargs.items():
os.environ[key] = str(value)
original_value = getattr(self, key)
if warn:
print(f' {key}={original_value} -> {value}')
self.__init__()
return self
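Illustrative usage (mirroring the docstring's example): after the call, the value is also set in os.environ, so future re-__init__()s and spawned subprocesses see it:

SEARCH_BACKEND_CONFIG.update_in_place(SEARCH_BACKEND_ENGINE='ripgrep')
assert os.environ['SEARCH_BACKEND_ENGINE'] == 'ripgrep'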
class BaseConfigSet(ArchiveBoxBaseConfig, BaseHook): # type: ignore[type-arg]
hook_type: ClassVar[HookType] = 'CONFIG'

View file

@@ -20,8 +20,9 @@ except ImportError:
class LdapConfig(BaseConfigSet):
"""
LDAP Config gets imported by core/settings.py very early during startup, so it needs to be in a separate file from apps.py
so that it can be imported during settings.py initialization before the apps are loaded.
LDAP Config gets imported by core/settings.py very early during startup.
It needs to be in a separate file from apps.py so that it can be imported
during settings.py initialization before the apps are loaded.
"""
section: ClassVar[ConfigSectionName] = 'LDAP_CONFIG'
@@ -41,20 +42,29 @@ class LdapConfig(BaseConfigSet):
@model_validator(mode='after')
def validate_ldap_config(self):
# Check that LDAP libraries are installed
if self.LDAP_ENABLED and LDAP_LIB is None:
sys.stderr.write('[X] Error: Found LDAP=True config but LDAP packages not installed. You may need to run: pip install archivebox[ldap]\n\n')
sys.stderr.write('[X] Error: LDAP Authentication is enabled but LDAP libraries are not installed. You may need to run: pip install archivebox[ldap]\n')
# don't hard exit here: in case the user is just running "archivebox version" or "archivebox help", we still want those to work despite broken ldap
# sys.exit(1)
self.LDAP_ENABLED = False
self.update(LDAP_ENABLED=False)
if self.LDAP_ENABLED:
assert (
# Check that all required LDAP config options are set
all_config_is_set = (
self.LDAP_SERVER_URI
and self.LDAP_BIND_DN
and self.LDAP_BIND_PASSWORD
and self.LDAP_USER_BASE
and self.LDAP_USER_FILTER
), 'LDAP_* config options must all be set if LDAP_ENABLED=True'
)
if self.LDAP_ENABLED and not all_config_is_set:
missing_config_options = [
key for key, value in self.model_dump().items()
if value is None and key != 'LDAP_ENABLED'
]
sys.stderr.write('[X] Error: LDAP_* config options must all be set if LDAP_ENABLED=True\n')
sys.stderr.write(f' Missing: {", ".join(missing_config_options)}\n')
self.update(LDAP_ENABLED=False)
return self
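For example (illustrative output), if only LDAP_SERVER_URI and LDAP_BIND_DN were set, the validator would print something like:

[X] Error: LDAP_* config options must all be set if LDAP_ENABLED=True
    Missing: LDAP_BIND_PASSWORD, LDAP_USER_BASE, LDAP_USER_FILTER

and then disable LDAP via self.update(LDAP_ENABLED=False) instead of crashing.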
@property

View file

@@ -28,8 +28,21 @@ class RipgrepConfig(BaseConfigSet):
RIPGREP_BINARY: str = Field(default='rg')
RIPGREP_IGNORE_EXTENSIONS: str = Field(default='css,js,orig,svg')
RIPGREP_ARGS_DEFAULT: List[str] = Field(default=lambda c: [
# https://github.com/BurntSushi/ripgrep/blob/master/GUIDE.md
f'--type-add=ignore:*.{{{c.RIPGREP_IGNORE_EXTENSIONS}}}',
'--type-not=ignore',
'--ignore-case',
'--files-with-matches',
'--regexp',
])
RIPGREP_SEARCH_DIR: str = Field(default=lambda: str(settings.ARCHIVE_DIR))
RIPGREP_CONFIG = RipgrepConfig()
class RipgrepBinary(BaseBinary):
name: BinName = RIPGREP_CONFIG.RIPGREP_BINARY
binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
@@ -41,17 +54,8 @@ class RipgrepBinary(BaseBinary):
RIPGREP_BINARY = RipgrepBinary()
RG_IGNORE_EXTENSIONS = ('css','js','orig','svg')
RG_ADD_TYPE = '--type-add'
RG_IGNORE_ARGUMENTS = f"ignore:*.{{{','.join(RG_IGNORE_EXTENSIONS)}}}"
RG_DEFAULT_ARGUMENTS = "-ilTignore" # case-insensitive (-i), list matching files (-l), exclude the 'ignore' type (-Tignore)
RG_REGEX_ARGUMENT = '-e'
TIMESTAMP_REGEX = r'\/([\d]+\.[\d]+)\/'
ts_regex = re.compile(TIMESTAMP_REGEX)
# regex to match archive/<ts>/... snapshot dir names
TIMESTAMP_REGEX = re.compile(r'\/([\d]+\.[\d]+)\/')
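A quick example of what it matches (illustrative path, not part of the commit):

assert TIMESTAMP_REGEX.findall('/data/archive/1726261234.123/index.html') == ['1726261234.123']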
class RipgrepSearchBackend(BaseSearchBackend):
name: str = 'ripgrep'
@@ -67,23 +71,22 @@ class RipgrepSearchBackend(BaseSearchBackend):
@staticmethod
def search(text: str) -> List[str]:
rg_bin = RIPGREP_BINARY.load()
if not rg_bin.version:
from core.models import Snapshot
ripgrep_binary = RIPGREP_BINARY.load()
if not ripgrep_binary.version:
raise Exception("ripgrep binary not found, install ripgrep to use this search backend")
rg_cmd = [
rg_bin.abspath,
RG_ADD_TYPE,
RG_IGNORE_ARGUMENTS,
RG_DEFAULT_ARGUMENTS,
RG_REGEX_ARGUMENT,
cmd = [
ripgrep_binary.abspath,
*RIPGREP_CONFIG.RIPGREP_ARGS_DEFAULT,
text,
str(settings.ARCHIVE_DIR)
RIPGREP_CONFIG.RIPGREP_SEARCH_DIR,
]
rg = run(rg_cmd, timeout=SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_TIMEOUT, capture_output=True, text=True)
proc = run(cmd, timeout=SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_TIMEOUT, capture_output=True, text=True)
timestamps = set()
for path in rg.stdout.splitlines():
ts = ts_regex.findall(path)
for path in proc.stdout.splitlines():
ts = TIMESTAMP_REGEX.findall(path)
if ts:
timestamps.add(ts[0])
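With the default config above, the assembled ripgrep command looks roughly like this (illustrative; <text> stands in for the search string, and the final path is whatever RIPGREP_SEARCH_DIR / ARCHIVE_DIR resolves to):

rg --type-add='ignore:*.{css,js,orig,svg}' --type-not=ignore --ignore-case --files-with-matches --regexp '<text>' /path/to/data/archive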

View file

@@ -1,5 +1,6 @@
__package__ = 'archivebox.plugins_search.sonic'
import os
import sys
from typing import List, Dict, ClassVar, Generator, cast
@@ -39,15 +40,23 @@ class SonicConfig(BaseConfigSet):
SONIC_COLLECTION: str = Field(default='archivebox')
SONIC_BUCKET: str = Field(default='archivebox')
SONIC_MAX_CHUNK_LENGTH: int = Field(default=2000)
SONIC_MAX_TEXT_LENGTH: int = Field(default=100000000)
SONIC_MAX_RETRIES: int = Field(default=5)
@model_validator(mode='after')
def validate_sonic_port(self):
if SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE == 'sonic':
if SONIC_LIB is None:
sys.stderr.write('[!] Sonic search backend is enabled but not installed. Install Sonic to use the Sonic search backend.\n')
if SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE == 'sonic' and SONIC_LIB is None:
sys.stderr.write('[X] Error: Sonic search backend is enabled but sonic-client lib is not installed. You may need to run: pip install archivebox[sonic]\n')
# don't hard exit here: in case the user is just running "archivebox version" or "archivebox help", we still want those to work despite broken sonic
# sys.exit(1)
SEARCH_BACKEND_CONFIG.update_in_place(SEARCH_BACKEND_ENGINE='ripgrep')
return self
SONIC_CONFIG = SonicConfig()
class SonicBinary(BaseBinary):
name: BinName = SONIC_CONFIG.SONIC_BINARY
binproviders_supported: List[InstanceOf[BinProvider]] = [brew, env] # TODO: add cargo
@@ -57,6 +66,7 @@ class SonicBinary(BaseBinary):
# cargo.name: {'packages': lambda: ['sonic-server']}, # TODO: add cargo
}
# TODO: add version checking over the protocol, for when the sonic backend is on a remote server and the binary is not installed locally
# def on_get_version(self):
# with sonic.IngestClient(SONIC_CONFIG.SONIC_HOST, str(SONIC_CONFIG.SONIC_PORT), SONIC_CONFIG.SONIC_PASSWORD) as ingestcl:
# return SemVer.parse(str(ingestcl.protocol))
@@ -64,11 +74,6 @@ class SonicBinary(BaseBinary):
SONIC_BINARY = SonicBinary()
MAX_SONIC_TEXT_TOTAL_LENGTH = 100000000 # don't index more than 100 million characters per text
MAX_SONIC_TEXT_CHUNK_LENGTH = 2000 # don't index more than 2,000 characters per chunk
MAX_SONIC_ERRORS_BEFORE_ABORT = 5
class SonicSearchBackend(BaseSearchBackend):
name: str = 'sonic'
@@ -80,11 +85,11 @@ class SonicSearchBackend(BaseSearchBackend):
with sonic.IngestClient(SONIC_CONFIG.SONIC_HOST, str(SONIC_CONFIG.SONIC_PORT), SONIC_CONFIG.SONIC_PASSWORD) as ingestcl:
for text in texts:
chunks = (
text[i:i+MAX_SONIC_TEXT_CHUNK_LENGTH]
text[i:i+SONIC_CONFIG.SONIC_MAX_CHUNK_LENGTH]
for i in range(
0,
min(len(text), MAX_SONIC_TEXT_TOTAL_LENGTH),
MAX_SONIC_TEXT_CHUNK_LENGTH,
min(len(text), SONIC_CONFIG.SONIC_MAX_TEXT_LENGTH),
SONIC_CONFIG.SONIC_MAX_CHUNK_LENGTH,
)
)
try:
@@ -93,7 +98,7 @@ class SonicSearchBackend(BaseSearchBackend):
except Exception as err:
print(f'[!] Sonic search backend threw an error while indexing: {err.__class__.__name__} {err}')
error_count += 1
if error_count > MAX_SONIC_ERRORS_BEFORE_ABORT:
if error_count > SONIC_CONFIG.SONIC_MAX_RETRIES:
raise
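A standalone sketch of the chunking logic above (illustrative values, not part of the commit): text is sliced into SONIC_MAX_CHUNK_LENGTH-sized pieces, capped at SONIC_MAX_TEXT_LENGTH total:

text = 'x' * 5000
MAX_TEXT_LENGTH, MAX_CHUNK_LENGTH = 100_000_000, 2000  # mirror the config defaults
chunks = (text[i:i+MAX_CHUNK_LENGTH]
          for i in range(0, min(len(text), MAX_TEXT_LENGTH), MAX_CHUNK_LENGTH))
assert [len(chunk) for chunk in chunks] == [2000, 2000, 1000]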
@staticmethod

View file

@@ -1,8 +1,9 @@
__package__ = 'archivebox.plugins_search.sqlite'
import sys
import sqlite3
import codecs
from typing import List, ClassVar, Generator, Callable
from typing import List, ClassVar, Iterable, Callable
from django.conf import settings
from django.db import connection as database
@@ -17,7 +18,7 @@ from plugantic.base_hook import BaseHook
from plugantic.base_searchbackend import BaseSearchBackend
# Depends on Other Plugins:
# from plugins_sys.config.apps import SEARCH_BACKEND_CONFIG
from plugins_sys.config.apps import SEARCH_BACKEND_CONFIG
@@ -30,6 +31,7 @@ class SqliteftsConfig(BaseConfigSet):
SQLITEFTS_TOKENIZERS: str = Field(default='porter unicode61 remove_diacritics 2', alias='FTS_TOKENIZERS')
SQLITEFTS_MAX_LENGTH: int = Field(default=int(1e9), alias='FTS_SQLITE_MAX_LENGTH')
# Not really meant to be user-modified, just here as constants
SQLITEFTS_DB: str = Field(default='search.sqlite3')
SQLITEFTS_TABLE: str = Field(default='snapshot_fts')
SQLITEFTS_ID_TABLE: str = Field(default='snapshot_id_fts')
@@ -37,8 +39,9 @@ class SqliteftsConfig(BaseConfigSet):
@model_validator(mode='after')
def validate_fts_separate_database(self):
if self.SQLITEFTS_SEPARATE_DATABASE:
assert self.SQLITEFTS_DB, "SQLITEFTS_DB must be set if SQLITEFTS_SEPARATE_DATABASE is True"
if SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE == 'sqlite' and self.SQLITEFTS_SEPARATE_DATABASE and not self.SQLITEFTS_DB:
sys.stderr.write('[X] Error: SQLITEFTS_DB must be set if SQLITEFTS_SEPARATE_DATABASE is True\n')
SEARCH_BACKEND_CONFIG.update_in_place(SEARCH_BACKEND_ENGINE='ripgrep')
return self
@property
@@ -84,8 +87,7 @@ def _escape_sqlite3(value: str, *, quote: str, errors='strict') -> str:
nul_index = encodable.find("\x00")
if nul_index >= 0:
error = UnicodeEncodeError("NUL-terminated utf-8", encodable,
nul_index, nul_index + 1, "NUL not allowed")
error = UnicodeEncodeError("NUL-terminated utf-8", encodable, nul_index, nul_index + 1, "NUL not allowed")
error_handler = codecs.lookup_error(errors)
replacement, _ = error_handler(error)
assert isinstance(replacement, str), "handling a UnicodeEncodeError should return a str replacement"
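Illustratively, when called with errors='replace', the stdlib error handler returns a '?' replacement plus the position to resume encoding at:

# codecs.lookup_error('replace')(UnicodeEncodeError('utf-8', 'a\x00b', 1, 2, 'NUL not allowed'))
#   -> ('?', 2)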
@@ -224,7 +226,7 @@ class SqliteftsSearchBackend(BaseSearchBackend):
return snap_ids
@staticmethod
def flush(snapshot_ids: Generator[str, None, None]):
def flush(snapshot_ids: Iterable[str]):
snapshot_ids = list(snapshot_ids) # type: ignore[assignment]
id_table = _escape_sqlite3_identifier(SQLITEFTS_CONFIG.SQLITEFTS_ID_TABLE)
@@ -243,7 +245,7 @@ SQLITEFTS_SEARCH_BACKEND = SqliteftsSearchBackend()
class SqliteftsSearchPlugin(BasePlugin):
app_label: str = 'sqlitefts'
verbose_name: str = 'Sqlitefts'
verbose_name: str = 'SQLite FTS5 Search'
hooks: List[InstanceOf[BaseHook]] = [
SQLITEFTS_CONFIG,

View file

@@ -115,9 +115,6 @@ class SearchBackendConfig(BaseConfigSet):
USE_SEARCHING_BACKEND: bool = Field(default=True)
SEARCH_BACKEND_ENGINE: str = Field(default='ripgrep')
SEARCH_BACKEND_HOST_NAME: str = Field(default='localhost')
SEARCH_BACKEND_PORT: int = Field(default=1491)
SEARCH_BACKEND_PASSWORD: str = Field(default='SecretPassword')
SEARCH_PROCESS_HTML: bool = Field(default=True)
SEARCH_BACKEND_TIMEOUT: int = Field(default=10)