From c9c163efedc24af82703fb6962d53341076f983b Mon Sep 17 00:00:00 2001
From: Nick Sweeting <github@sweeting.me>
Date: Tue, 24 Sep 2024 02:13:01 -0700
Subject: [PATCH] begin migrating search backends to new plugin system

---
 archivebox/core/settings.py                   |  1 +
 archivebox/extractors/__init__.py             |  3 +-
 archivebox/index/__init__.py                  |  6 +-
 archivebox/main.py                            |  5 +-
 .../backends => plugins_search}/__init__.py   |  0
 archivebox/plugins_search/ripgrep/__init__.py |  0
 archivebox/plugins_search/ripgrep/apps.py     | 62 +++++++++++++++++++
 .../ripgrep}/ripgrep.py                       |  0
 .../sonic}/sonic.py                           |  0
 .../sqlite}/sqlite.py                         |  0
 archivebox/search/__init__.py                 | 27 ++++----
 11 files changed, 83 insertions(+), 21 deletions(-)
 rename archivebox/{search/backends => plugins_search}/__init__.py (100%)
 create mode 100644 archivebox/plugins_search/ripgrep/__init__.py
 create mode 100644 archivebox/plugins_search/ripgrep/apps.py
 rename archivebox/{search/backends => plugins_search/ripgrep}/ripgrep.py (100%)
 rename archivebox/{search/backends => plugins_search/sonic}/sonic.py (100%)
 rename archivebox/{search/backends => plugins_search/sqlite}/sqlite.py (100%)

diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py
index dff6baa7..bf3463c1 100644
--- a/archivebox/core/settings.py
+++ b/archivebox/core/settings.py
@@ -46,6 +46,7 @@ PLUGIN_DIRS = {
     'plugins_sys':          PACKAGE_DIR / 'plugins_sys',
     'plugins_pkg':          PACKAGE_DIR / 'plugins_pkg',
     'plugins_auth':         PACKAGE_DIR / 'plugins_auth',
+    'plugins_search':       PACKAGE_DIR / 'plugins_search',
     'plugins_extractor':    PACKAGE_DIR / 'plugins_extractor',
     'user_plugins':         DATA_DIR / 'user_plugins',
 }
diff --git a/archivebox/extractors/__init__.py b/archivebox/extractors/__init__.py
index c373dbdf..e517dad6 100644
--- a/archivebox/extractors/__init__.py
+++ b/archivebox/extractors/__init__.py
@@ -31,7 +31,6 @@ from ..logging_util import (
     log_archive_method_started,
     log_archive_method_finished,
 )
-from ..search import write_search_index
 
 from .title import should_save_title, save_title
 from .favicon import should_save_favicon, save_favicon
@@ -110,6 +109,8 @@ def ignore_methods(to_ignore: List[str]) -> Iterable[str]:
 def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None, created_by_id: int | None=None) -> Link:
     """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""
 
+    from ..search import write_search_index
+
     # TODO: Remove when the input is changed to be a snapshot. Suboptimal approach.
     from core.models import Snapshot, ArchiveResult
     try:
diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py
index 1edd3caf..aca651ea 100644
--- a/archivebox/index/__init__.py
+++ b/archivebox/index/__init__.py
@@ -51,7 +51,6 @@ from .sql import (
     write_sql_link_details,
 )
 
-from ..search import search_backend_enabled, query_search_index
 
 ### Link filtering and checking
 
@@ -379,7 +378,10 @@ def q_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type: str='
     return snapshots.filter(q_filter)
 
 def search_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type: str='search') -> QuerySet:
-    if not search_backend_enabled():
+    from plugins_sys.config.apps import SEARCH_BACKEND_CONFIG
+    from ..search import query_search_index
+
+    if not SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENABLED:
         stderr()
         stderr(
                 '[X] The search backend is not enabled, set config.USE_SEARCHING_BACKEND = True',
diff --git a/archivebox/main.py b/archivebox/main.py
index c231d597..ab2b0c9e 100755
--- a/archivebox/main.py
+++ b/archivebox/main.py
@@ -141,8 +141,6 @@ from .logging_util import (
     printable_dependency_version,
 )
 
-from .search import flush_search_index, index_links
-
 
 @enforce_types
 def help(out_dir: Path=OUTPUT_DIR) -> None:
@@ -767,6 +765,8 @@ def remove(filter_str: Optional[str]=None,
 
     to_remove = snapshots.count()
 
+    from .search import flush_search_index
+
     flush_search_index(snapshots=snapshots)
     remove_from_sql_main_index(snapshots=snapshots, out_dir=out_dir)
     all_snapshots = load_main_index(out_dir=out_dir)
@@ -790,6 +790,7 @@ def update(resume: Optional[float]=None,
     """Import any new links from subscriptions and retry any previously failed/skipped links"""
 
     from core.models import ArchiveResult
+    from .search import index_links
 
     check_data_folder(out_dir=out_dir)
     check_dependencies()
diff --git a/archivebox/search/backends/__init__.py b/archivebox/plugins_search/__init__.py
similarity index 100%
rename from archivebox/search/backends/__init__.py
rename to archivebox/plugins_search/__init__.py
diff --git a/archivebox/plugins_search/ripgrep/__init__.py b/archivebox/plugins_search/ripgrep/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/archivebox/plugins_search/ripgrep/apps.py b/archivebox/plugins_search/ripgrep/apps.py
new file mode 100644
index 00000000..4f9b72f8
--- /dev/null
+++ b/archivebox/plugins_search/ripgrep/apps.py
@@ -0,0 +1,62 @@
+__package__ = 'archivebox.plugins_search.ripgrep'
+
+from typing import List, Dict, ClassVar
+# from typing_extensions import Self
+
+from django.conf import settings
+
+# Depends on other PyPI/vendor packages:
+from pydantic import InstanceOf, Field
+from pydantic_pkgr import BinProvider, BinProviderName, ProviderLookupDict, BinName
+
+# Depends on other Django apps:
+from plugantic.base_plugin import BasePlugin
+from plugantic.base_configset import BaseConfigSet, ConfigSectionName
+from plugantic.base_binary import BaseBinary, env, apt, brew
+from plugantic.base_hook import BaseHook
+# from plugantic.base_search import BaseSearchBackend
+
+# Depends on Other Plugins:
+# from plugins_sys.config.apps import SEARCH_BACKEND_CONFIG
+
+###################### Config ##########################
+
+class RipgrepConfig(BaseConfigSet):
+    section: ClassVar[ConfigSectionName] = 'DEPENDENCY_CONFIG'
+
+    RIPGREP_BINARY: str = Field(default='rg')
+
+RIPGREP_CONFIG = RipgrepConfig()
+
+class RipgrepBinary(BaseBinary):
+    name: BinName = RIPGREP_CONFIG.RIPGREP_BINARY
+    binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
+
+    provider_overrides: Dict[BinProviderName, ProviderLookupDict] = {
+        apt.name: {'packages': lambda: ['ripgrep']},
+        brew.name: {'packages': lambda: ['ripgrep']},
+    }
+
+RIPGREP_BINARY = RipgrepBinary()
+
+# TODO:
+# class RipgrepSearchBackend(BaseSearchBackend):
+#     name: str = 'ripgrep'
+
+# RIPGREP_SEARCH_BACKEND = RipgrepSearchBackend()
+
+
+class RipgrepSearchPlugin(BasePlugin):
+    app_label: str = 'ripgrep'
+    verbose_name: str = 'Ripgrep'
+
+    hooks: List[InstanceOf[BaseHook]] = [
+        RIPGREP_CONFIG,
+        RIPGREP_BINARY,
+    ]
+
+
+
+PLUGIN = RipgrepSearchPlugin()
+PLUGIN.register(settings)
+DJANGO_APP = PLUGIN.AppConfig
diff --git a/archivebox/search/backends/ripgrep.py b/archivebox/plugins_search/ripgrep/ripgrep.py
similarity index 100%
rename from archivebox/search/backends/ripgrep.py
rename to archivebox/plugins_search/ripgrep/ripgrep.py
diff --git a/archivebox/search/backends/sonic.py b/archivebox/plugins_search/sonic/sonic.py
similarity index 100%
rename from archivebox/search/backends/sonic.py
rename to archivebox/plugins_search/sonic/sonic.py
diff --git a/archivebox/search/backends/sqlite.py b/archivebox/plugins_search/sqlite/sqlite.py
similarity index 100%
rename from archivebox/search/backends/sqlite.py
rename to archivebox/plugins_search/sqlite/sqlite.py
diff --git a/archivebox/search/__init__.py b/archivebox/search/__init__.py
index c5a9b13c..eab57141 100644
--- a/archivebox/search/__init__.py
+++ b/archivebox/search/__init__.py
@@ -3,24 +3,19 @@ from pathlib import Path
 from importlib import import_module
 
 from django.db.models import QuerySet
+from django.conf import settings
 
 from archivebox.index.schema import Link
 from archivebox.util import enforce_types
-from archivebox.config import stderr, OUTPUT_DIR, USE_INDEXING_BACKEND, USE_SEARCHING_BACKEND, SEARCH_BACKEND_ENGINE
+from archivebox.config import stderr
+
+# from archivebox.plugins_sys.config.apps import settings.CONFIGS.SearchBackendConfig
 
 from .utils import get_indexable_content, log_index_started
 
-def indexing_enabled():
-    return USE_INDEXING_BACKEND
-
-def search_backend_enabled():
-    return USE_SEARCHING_BACKEND
-
-def get_backend():
-    return f'search.backends.{SEARCH_BACKEND_ENGINE}'
 
 def import_backend():
-    backend_string = get_backend()
+    backend_string = f'plugins_search.{settings.CONFIGS.SearchBackendConfig.SEARCH_BACKEND_ENGINE}.{settings.CONFIGS.SearchBackendConfig.SEARCH_BACKEND_ENGINE}'
     try:
         backend = import_module(backend_string)
     except Exception as err:
@@ -28,8 +23,8 @@ def import_backend():
     return backend
 
 @enforce_types
-def write_search_index(link: Link, texts: Union[List[str], None]=None, out_dir: Path=OUTPUT_DIR, skip_text_index: bool=False) -> None:
-    if not indexing_enabled():
+def write_search_index(link: Link, texts: Union[List[str], None]=None, out_dir: Path=settings.DATA_DIR, skip_text_index: bool=False) -> None:
+    if not settings.CONFIGS.SearchBackendConfig.USE_INDEXING_BACKEND:
         return
 
     if not skip_text_index and texts:
@@ -48,10 +43,10 @@ def write_search_index(link: Link, texts: Union[List[str], None]=None, out_dir:
                 )
 
 @enforce_types
-def query_search_index(query: str, out_dir: Path=OUTPUT_DIR) -> QuerySet:
+def query_search_index(query: str, out_dir: Path=settings.DATA_DIR) -> QuerySet:
     from core.models import Snapshot
 
-    if search_backend_enabled():
+    if settings.CONFIGS.SearchBackendConfig.USE_SEARCHING_BACKEND:
         backend = import_backend()
         try:
             snapshot_pks = backend.search(query)
@@ -71,7 +66,7 @@ def query_search_index(query: str, out_dir: Path=OUTPUT_DIR) -> QuerySet:
 
 @enforce_types
 def flush_search_index(snapshots: QuerySet):
-    if not indexing_enabled() or not snapshots:
+    if not settings.CONFIGS.SearchBackendConfig.USE_INDEXING_BACKEND or not snapshots:
         return
     backend = import_backend()
     snapshot_pks = (str(pk) for pk in snapshots.values_list('pk', flat=True))
@@ -85,7 +80,7 @@ def flush_search_index(snapshots: QuerySet):
         )
 
 @enforce_types
-def index_links(links: Union[List[Link],None], out_dir: Path=OUTPUT_DIR):
+def index_links(links: Union[List[Link],None], out_dir: Path=settings.DATA_DIR):
     if not links:
         return