From c9c163efedc24af82703fb6962d53341076f983b Mon Sep 17 00:00:00 2001 From: Nick Sweeting <github@sweeting.me> Date: Tue, 24 Sep 2024 02:13:01 -0700 Subject: [PATCH] begin migrating search backends to new plugin system --- archivebox/core/settings.py | 1 + archivebox/extractors/__init__.py | 3 +- archivebox/index/__init__.py | 6 +- archivebox/main.py | 5 +- .../backends => plugins_search}/__init__.py | 0 archivebox/plugins_search/ripgrep/__init__.py | 0 archivebox/plugins_search/ripgrep/apps.py | 62 +++++++++++++++++++ .../ripgrep}/ripgrep.py | 0 .../sonic}/sonic.py | 0 .../sqlite}/sqlite.py | 0 archivebox/search/__init__.py | 27 ++++---- 11 files changed, 83 insertions(+), 21 deletions(-) rename archivebox/{search/backends => plugins_search}/__init__.py (100%) create mode 100644 archivebox/plugins_search/ripgrep/__init__.py create mode 100644 archivebox/plugins_search/ripgrep/apps.py rename archivebox/{search/backends => plugins_search/ripgrep}/ripgrep.py (100%) rename archivebox/{search/backends => plugins_search/sonic}/sonic.py (100%) rename archivebox/{search/backends => plugins_search/sqlite}/sqlite.py (100%) diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py index dff6baa7..bf3463c1 100644 --- a/archivebox/core/settings.py +++ b/archivebox/core/settings.py @@ -46,6 +46,7 @@ PLUGIN_DIRS = { 'plugins_sys': PACKAGE_DIR / 'plugins_sys', 'plugins_pkg': PACKAGE_DIR / 'plugins_pkg', 'plugins_auth': PACKAGE_DIR / 'plugins_auth', + 'plugins_search': PACKAGE_DIR / 'plugins_search', 'plugins_extractor': PACKAGE_DIR / 'plugins_extractor', 'user_plugins': DATA_DIR / 'user_plugins', } diff --git a/archivebox/extractors/__init__.py b/archivebox/extractors/__init__.py index c373dbdf..e517dad6 100644 --- a/archivebox/extractors/__init__.py +++ b/archivebox/extractors/__init__.py @@ -31,7 +31,6 @@ from ..logging_util import ( log_archive_method_started, log_archive_method_finished, ) -from ..search import write_search_index from .title import should_save_title, save_title from .favicon import should_save_favicon, save_favicon @@ -110,6 +109,8 @@ def ignore_methods(to_ignore: List[str]) -> Iterable[str]: def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None, created_by_id: int | None=None) -> Link: """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp""" + from ..search import write_search_index + # TODO: Remove when the input is changed to be a snapshot. Suboptimal approach. from core.models import Snapshot, ArchiveResult try: diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py index 1edd3caf..aca651ea 100644 --- a/archivebox/index/__init__.py +++ b/archivebox/index/__init__.py @@ -51,7 +51,6 @@ from .sql import ( write_sql_link_details, ) -from ..search import search_backend_enabled, query_search_index ### Link filtering and checking @@ -379,7 +378,10 @@ def q_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type: str=' return snapshots.filter(q_filter) def search_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type: str='search') -> QuerySet: - if not search_backend_enabled(): + from plugins_sys.config.apps import SEARCH_BACKEND_CONFIG + from ..search import query_search_index + + if not SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENABLED: stderr() stderr( '[X] The search backend is not enabled, set config.USE_SEARCHING_BACKEND = True', diff --git a/archivebox/main.py b/archivebox/main.py index c231d597..ab2b0c9e 100755 --- a/archivebox/main.py +++ b/archivebox/main.py @@ -141,8 +141,6 @@ from .logging_util import ( printable_dependency_version, ) -from .search import flush_search_index, index_links - @enforce_types def help(out_dir: Path=OUTPUT_DIR) -> None: @@ -767,6 +765,8 @@ def remove(filter_str: Optional[str]=None, to_remove = snapshots.count() + from .search import flush_search_index + flush_search_index(snapshots=snapshots) remove_from_sql_main_index(snapshots=snapshots, out_dir=out_dir) all_snapshots = load_main_index(out_dir=out_dir) @@ -790,6 +790,7 @@ def update(resume: Optional[float]=None, """Import any new links from subscriptions and retry any previously failed/skipped links""" from core.models import ArchiveResult + from .search import index_links check_data_folder(out_dir=out_dir) check_dependencies() diff --git a/archivebox/search/backends/__init__.py b/archivebox/plugins_search/__init__.py similarity index 100% rename from archivebox/search/backends/__init__.py rename to archivebox/plugins_search/__init__.py diff --git a/archivebox/plugins_search/ripgrep/__init__.py b/archivebox/plugins_search/ripgrep/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/archivebox/plugins_search/ripgrep/apps.py b/archivebox/plugins_search/ripgrep/apps.py new file mode 100644 index 00000000..4f9b72f8 --- /dev/null +++ b/archivebox/plugins_search/ripgrep/apps.py @@ -0,0 +1,62 @@ +__package__ = 'archivebox.plugins_search.ripgrep' + +from typing import List, Dict, ClassVar +# from typing_extensions import Self + +from django.conf import settings + +# Depends on other PyPI/vendor packages: +from pydantic import InstanceOf, Field +from pydantic_pkgr import BinProvider, BinProviderName, ProviderLookupDict, BinName + +# Depends on other Django apps: +from plugantic.base_plugin import BasePlugin +from plugantic.base_configset import BaseConfigSet, ConfigSectionName +from plugantic.base_binary import BaseBinary, env, apt, brew +from plugantic.base_hook import BaseHook +# from plugantic.base_search import BaseSearchBackend + +# Depends on Other Plugins: +# from plugins_sys.config.apps import SEARCH_BACKEND_CONFIG + +###################### Config ########################## + +class RipgrepConfig(BaseConfigSet): + section: ClassVar[ConfigSectionName] = 'DEPENDENCY_CONFIG' + + RIPGREP_BINARY: str = Field(default='rg') + +RIPGREP_CONFIG = RipgrepConfig() + +class RipgrepBinary(BaseBinary): + name: BinName = RIPGREP_CONFIG.RIPGREP_BINARY + binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env] + + provider_overrides: Dict[BinProviderName, ProviderLookupDict] = { + apt.name: {'packages': lambda: ['ripgrep']}, + brew.name: {'packages': lambda: ['ripgrep']}, + } + +RIPGREP_BINARY = RipgrepBinary() + +# TODO: +# class RipgrepSearchBackend(BaseSearchBackend): +# name: str = 'ripgrep' + +# RIPGREP_SEARCH_BACKEND = RipgrepSearchBackend() + + +class RipgrepSearchPlugin(BasePlugin): + app_label: str ='ripgrep' + verbose_name: str = 'Ripgrep' + + hooks: List[InstanceOf[BaseHook]] = [ + RIPGREP_CONFIG, + RIPGREP_BINARY, + ] + + + +PLUGIN = RipgrepSearchPlugin() +PLUGIN.register(settings) +DJANGO_APP = PLUGIN.AppConfig diff --git a/archivebox/search/backends/ripgrep.py b/archivebox/plugins_search/ripgrep/ripgrep.py similarity index 100% rename from archivebox/search/backends/ripgrep.py rename to archivebox/plugins_search/ripgrep/ripgrep.py diff --git a/archivebox/search/backends/sonic.py b/archivebox/plugins_search/sonic/sonic.py similarity index 100% rename from archivebox/search/backends/sonic.py rename to archivebox/plugins_search/sonic/sonic.py diff --git a/archivebox/search/backends/sqlite.py b/archivebox/plugins_search/sqlite/sqlite.py similarity index 100% rename from archivebox/search/backends/sqlite.py rename to archivebox/plugins_search/sqlite/sqlite.py diff --git a/archivebox/search/__init__.py b/archivebox/search/__init__.py index c5a9b13c..eab57141 100644 --- a/archivebox/search/__init__.py +++ b/archivebox/search/__init__.py @@ -3,24 +3,19 @@ from pathlib import Path from importlib import import_module from django.db.models import QuerySet +from django.conf import settings from archivebox.index.schema import Link from archivebox.util import enforce_types -from archivebox.config import stderr, OUTPUT_DIR, USE_INDEXING_BACKEND, USE_SEARCHING_BACKEND, SEARCH_BACKEND_ENGINE +from archivebox.config import stderr + +# from archivebox.plugins_sys.config.apps import settings.CONFIGS.SearchBackendConfig from .utils import get_indexable_content, log_index_started -def indexing_enabled(): - return USE_INDEXING_BACKEND - -def search_backend_enabled(): - return USE_SEARCHING_BACKEND - -def get_backend(): - return f'search.backends.{SEARCH_BACKEND_ENGINE}' def import_backend(): - backend_string = get_backend() + backend_string = f'plugins_search.{settings.CONFIGS.SearchBackendConfig.SEARCH_BACKEND_ENGINE}.{settings.CONFIGS.SearchBackendConfig.SEARCH_BACKEND_ENGINE}' try: backend = import_module(backend_string) except Exception as err: @@ -28,8 +23,8 @@ def import_backend(): return backend @enforce_types -def write_search_index(link: Link, texts: Union[List[str], None]=None, out_dir: Path=OUTPUT_DIR, skip_text_index: bool=False) -> None: - if not indexing_enabled(): +def write_search_index(link: Link, texts: Union[List[str], None]=None, out_dir: Path=settings.DATA_DIR, skip_text_index: bool=False) -> None: + if not settings.CONFIGS.SearchBackendConfig.USE_INDEXING_BACKEND: return if not skip_text_index and texts: @@ -48,10 +43,10 @@ def write_search_index(link: Link, texts: Union[List[str], None]=None, out_dir: ) @enforce_types -def query_search_index(query: str, out_dir: Path=OUTPUT_DIR) -> QuerySet: +def query_search_index(query: str, out_dir: Path=settings.DATA_DIR) -> QuerySet: from core.models import Snapshot - if search_backend_enabled(): + if settings.CONFIGS.SearchBackendConfig.USE_SEARCHING_BACKEND: backend = import_backend() try: snapshot_pks = backend.search(query) @@ -71,7 +66,7 @@ def query_search_index(query: str, out_dir: Path=OUTPUT_DIR) -> QuerySet: @enforce_types def flush_search_index(snapshots: QuerySet): - if not indexing_enabled() or not snapshots: + if not settings.CONFIGS.SearchBackendConfig.USE_INDEXING_BACKEND or not snapshots: return backend = import_backend() snapshot_pks = (str(pk) for pk in snapshots.values_list('pk', flat=True)) @@ -85,7 +80,7 @@ def flush_search_index(snapshots: QuerySet): ) @enforce_types -def index_links(links: Union[List[Link],None], out_dir: Path=OUTPUT_DIR): +def index_links(links: Union[List[Link],None], out_dir: Path=settings.DATA_DIR): if not links: return