begin migrating search backends to new plugin system

Nick Sweeting 2024-09-24 02:13:01 -07:00
parent 2d19317e3f
commit c9c163efed
11 changed files with 83 additions and 21 deletions

View file

@@ -46,6 +46,7 @@ PLUGIN_DIRS = {
     'plugins_sys': PACKAGE_DIR / 'plugins_sys',
     'plugins_pkg': PACKAGE_DIR / 'plugins_pkg',
     'plugins_auth': PACKAGE_DIR / 'plugins_auth',
+    'plugins_search': PACKAGE_DIR / 'plugins_search',
     'plugins_extractor': PACKAGE_DIR / 'plugins_extractor',
     'user_plugins': DATA_DIR / 'user_plugins',
 }
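
The new 'plugins_search' namespace sits alongside the existing plugin directories. For context, a minimal sketch of how a loader could walk these directories to discover installable plugin apps; the find_plugins helper and the use of apps.py as the discovery marker are assumptions for illustration, not code from this commit:

from pathlib import Path
from typing import Dict

# Hypothetical helper (not part of this commit): walk the PLUGIN_DIRS
# namespaces and collect any subdirectory that ships an apps.py entrypoint,
# returning dotted module paths suitable for INSTALLED_APPS.
def find_plugins(plugin_dirs: Dict[str, Path]) -> Dict[str, str]:
    plugins = {}
    for namespace, dir_path in plugin_dirs.items():
        if not dir_path.is_dir():
            continue
        for plugin_dir in sorted(dir_path.iterdir()):
            if (plugin_dir / 'apps.py').is_file():
                plugins[plugin_dir.name] = f'{namespace}.{plugin_dir.name}.apps'
    return plugins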

View file

@@ -31,7 +31,6 @@ from ..logging_util import (
     log_archive_method_started,
     log_archive_method_finished,
 )
-from ..search import write_search_index

 from .title import should_save_title, save_title
 from .favicon import should_save_favicon, save_favicon
@@ -110,6 +109,8 @@ def ignore_methods(to_ignore: List[str]) -> Iterable[str]:

 def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None, created_by_id: int | None=None) -> Link:
     """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""
+    from ..search import write_search_index
+
     # TODO: Remove when the input is changed to be a snapshot. Suboptimal approach.
     from core.models import Snapshot, ArchiveResult
     try:
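
Note the pattern in this hunk: the write_search_index import moves from module scope (removed above) into the body of archive_link(), so the search module, which now reads its config from Django settings, is not imported until the app registry is ready. The same function-level import trick recurs in the files below. A minimal illustration with placeholder module names, not ArchiveBox code:

# extractors_example.py (placeholder module)
def archive_link_example(url: str) -> None:
    # importing at call time instead of module scope avoids a circular
    # import / premature settings access during Django startup
    from search_example import write_search_index_example
    write_search_index_example(url)

# search_example.py (placeholder module)
def write_search_index_example(url: str) -> None:
    print(f'indexed {url}')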

View file

@@ -51,7 +51,6 @@ from .sql import (
     write_sql_link_details,
 )
-from ..search import search_backend_enabled, query_search_index

 ### Link filtering and checking
@@ -379,7 +378,10 @@ def q_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type: str='
     return snapshots.filter(q_filter)

 def search_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type: str='search') -> QuerySet:
-    if not search_backend_enabled():
+    from plugins_sys.config.apps import SEARCH_BACKEND_CONFIG
+    from ..search import query_search_index
+
+    if not SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENABLED:
         stderr()
         stderr(
             '[X] The search backend is not enabled, set config.USE_SEARCHING_BACKEND = True',
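
search_filter() now checks a typed config object directly instead of the removed search_backend_enabled() helper. A sketch of what the SearchBackendConfig config set backing SEARCH_BACKEND_CONFIG plausibly looks like, modeled on the RipgrepConfig added below; the section name and defaults are assumptions:

from typing import ClassVar

from pydantic import Field

# BaseConfigSet/ConfigSectionName are the plugantic bases used throughout this migration
from plugantic.base_configset import BaseConfigSet, ConfigSectionName

class SearchBackendConfig(BaseConfigSet):
    section: ClassVar[ConfigSectionName] = 'SEARCH_BACKEND_CONFIG'  # assumed section name

    # flags referenced elsewhere in this commit; defaults are illustrative
    USE_INDEXING_BACKEND: bool = Field(default=True)
    USE_SEARCHING_BACKEND: bool = Field(default=True)
    SEARCH_BACKEND_ENABLED: bool = Field(default=True)
    SEARCH_BACKEND_ENGINE: str = Field(default='ripgrep')

SEARCH_BACKEND_CONFIG = SearchBackendConfig()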

View file

@@ -141,8 +141,6 @@ from .logging_util import (
     printable_dependency_version,
 )
-from .search import flush_search_index, index_links
-

 @enforce_types
 def help(out_dir: Path=OUTPUT_DIR) -> None:
@@ -767,6 +765,8 @@ def remove(filter_str: Optional[str]=None,
     to_remove = snapshots.count()

+    from .search import flush_search_index
+
     flush_search_index(snapshots=snapshots)
     remove_from_sql_main_index(snapshots=snapshots, out_dir=out_dir)
     all_snapshots = load_main_index(out_dir=out_dir)
@@ -790,6 +790,7 @@ def update(resume: Optional[float]=None,
     """Import any new links from subscriptions and retry any previously failed/skipped links"""

     from core.models import ArchiveResult
+    from .search import index_links

     check_data_folder(out_dir=out_dir)
     check_dependencies()

View file

@@ -0,0 +1,62 @@
+__package__ = 'archivebox.plugins_search.ripgrep'
+
+from typing import List, Dict, ClassVar
+# from typing_extensions import Self
+
+from django.conf import settings
+
+# Depends on other PyPI/vendor packages:
+from pydantic import InstanceOf, Field
+from pydantic_pkgr import BinProvider, BinProviderName, ProviderLookupDict, BinName
+
+# Depends on other Django apps:
+from plugantic.base_plugin import BasePlugin
+from plugantic.base_configset import BaseConfigSet, ConfigSectionName
+from plugantic.base_binary import BaseBinary, env, apt, brew
+from plugantic.base_hook import BaseHook
+# from plugantic.base_search import BaseSearchBackend
+
+# Depends on Other Plugins:
+# from plugins_sys.config.apps import SEARCH_BACKEND_CONFIG
+
+
+###################### Config ##########################
+
+class RipgrepConfig(BaseConfigSet):
+    section: ClassVar[ConfigSectionName] = 'DEPENDENCY_CONFIG'
+
+    RIPGREP_BINARY: str = Field(default='rg')
+
+RIPGREP_CONFIG = RipgrepConfig()
+
+
+class RipgrepBinary(BaseBinary):
+    name: BinName = RIPGREP_CONFIG.RIPGREP_BINARY
+    binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
+
+    provider_overrides: Dict[BinProviderName, ProviderLookupDict] = {
+        apt.name: {'packages': lambda: ['ripgrep']},
+        brew.name: {'packages': lambda: ['ripgrep']},
+    }
+
+RIPGREP_BINARY = RipgrepBinary()
+
+
+# TODO:
+# class RipgrepSearchBackend(BaseSearchBackend):
+#     name: str = 'ripgrep'
+# RIPGREP_SEARCH_BACKEND = RipgrepSearchBackend()
+
+
+class RipgrepSearchPlugin(BasePlugin):
+    app_label: str = 'ripgrep'
+    verbose_name: str = 'Ripgrep'
+
+    hooks: List[InstanceOf[BaseHook]] = [
+        RIPGREP_CONFIG,
+        RIPGREP_BINARY,
+    ]
+
+PLUGIN = RipgrepSearchPlugin()
+PLUGIN.register(settings)
+DJANGO_APP = PLUGIN.AppConfig
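
The search backend class itself is still a TODO in this commit. A hypothetical sketch of the search half of a RipgrepSearchBackend once BaseSearchBackend lands: shell out to rg for files containing the query, then map each hit back to the snapshot directory it lives under. The ripgrep_search function and the archive_dir/<timestamp>/ layout are assumptions, not code from this commit:

import subprocess
from pathlib import Path
from typing import List

# Hypothetical search() implementation for the TODO'd RipgrepSearchBackend:
# ask rg for the files matching the query, then reduce each hit to the
# snapshot timestamp directory it lives under (archive_dir/<timestamp>/...).
def ripgrep_search(query: str, archive_dir: Path) -> List[str]:
    proc = subprocess.run(
        ['rg', '--files-with-matches', '--ignore-case', '--fixed-strings',
         query, str(archive_dir)],
        capture_output=True, text=True,
    )
    timestamps = set()
    for line in proc.stdout.splitlines():
        rel = Path(line).relative_to(archive_dir)
        if rel.parts:
            timestamps.add(rel.parts[0])
    return sorted(timestamps)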

View file

@@ -3,24 +3,19 @@ from pathlib import Path
 from importlib import import_module

 from django.db.models import QuerySet
+from django.conf import settings

 from archivebox.index.schema import Link
 from archivebox.util import enforce_types
-from archivebox.config import stderr, OUTPUT_DIR, USE_INDEXING_BACKEND, USE_SEARCHING_BACKEND, SEARCH_BACKEND_ENGINE
+from archivebox.config import stderr
+# from archivebox.plugins_sys.config.apps import settings.CONFIGS.SearchBackendConfig

 from .utils import get_indexable_content, log_index_started

-def indexing_enabled():
-    return USE_INDEXING_BACKEND
-
-def search_backend_enabled():
-    return USE_SEARCHING_BACKEND
-
-def get_backend():
-    return f'search.backends.{SEARCH_BACKEND_ENGINE}'

 def import_backend():
-    backend_string = get_backend()
+    backend_string = f'plugins_search.{settings.CONFIGS.SearchBackendConfig.SEARCH_BACKEND_ENGINE}.{settings.CONFIGS.SearchBackendConfig.SEARCH_BACKEND_ENGINE}'
     try:
         backend = import_module(backend_string)
     except Exception as err:
@@ -28,8 +23,8 @@ def import_backend():
     return backend

 @enforce_types
-def write_search_index(link: Link, texts: Union[List[str], None]=None, out_dir: Path=OUTPUT_DIR, skip_text_index: bool=False) -> None:
-    if not indexing_enabled():
+def write_search_index(link: Link, texts: Union[List[str], None]=None, out_dir: Path=settings.DATA_DIR, skip_text_index: bool=False) -> None:
+    if not settings.CONFIGS.SearchBackendConfig.USE_INDEXING_BACKEND:
         return

     if not skip_text_index and texts:
@@ -48,10 +43,10 @@ def write_search_index(link: Link, texts: Union[List[str], None]=None, out_dir:
     )

 @enforce_types
-def query_search_index(query: str, out_dir: Path=OUTPUT_DIR) -> QuerySet:
+def query_search_index(query: str, out_dir: Path=settings.DATA_DIR) -> QuerySet:
     from core.models import Snapshot

-    if search_backend_enabled():
+    if settings.CONFIGS.SearchBackendConfig.USE_SEARCHING_BACKEND:
         backend = import_backend()
         try:
             snapshot_pks = backend.search(query)
@@ -71,7 +66,7 @@ def query_search_index(query: str, out_dir: Path=OUTPUT_DIR) -> QuerySet:

 @enforce_types
 def flush_search_index(snapshots: QuerySet):
-    if not indexing_enabled() or not snapshots:
+    if not settings.CONFIGS.SearchBackendConfig.USE_INDEXING_BACKEND or not snapshots:
         return
     backend = import_backend()
     snapshot_pks = (str(pk) for pk in snapshots.values_list('pk', flat=True))
@@ -85,7 +80,7 @@ def flush_search_index(snapshots: QuerySet):
     )

 @enforce_types
-def index_links(links: Union[List[Link],None], out_dir: Path=OUTPUT_DIR):
+def index_links(links: Union[List[Link],None], out_dir: Path=settings.DATA_DIR):
     if not links:
         return
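
With get_backend() removed, the dotted module path is derived from settings at call time, so SEARCH_BACKEND_ENGINE='ripgrep' resolves to plugins_search.ripgrep.ripgrep. A standalone sketch of the same importlib pattern with a guard for the expected interface; the search/index/flush method names are an assumption about the backend contract, only backend.search() is visible in this commit:

from importlib import import_module
from types import ModuleType

def import_search_backend(engine: str) -> ModuleType:
    # e.g. engine='ripgrep' -> module 'plugins_search.ripgrep.ripgrep'
    backend = import_module(f'plugins_search.{engine}.{engine}')
    # fail loudly if the module doesn't expose the assumed backend interface
    for required in ('search', 'index', 'flush'):
        if not hasattr(backend, required):
            raise ImportError(f'search backend {engine!r} is missing {required}()')
    return backend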