Mirror of https://github.com/ArchiveBox/ArchiveBox.git (synced 2025-05-13 06:34:25 -04:00)
begin migrating search backends to new plugin system

commit c9c163efed (parent 2d19317e3f)
11 changed files with 83 additions and 21 deletions
@@ -46,6 +46,7 @@ PLUGIN_DIRS = {
     'plugins_sys': PACKAGE_DIR / 'plugins_sys',
     'plugins_pkg': PACKAGE_DIR / 'plugins_pkg',
     'plugins_auth': PACKAGE_DIR / 'plugins_auth',
+    'plugins_search': PACKAGE_DIR / 'plugins_search',
     'plugins_extractor': PACKAGE_DIR / 'plugins_extractor',
     'user_plugins': DATA_DIR / 'user_plugins',
 }
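For context on what registering 'plugins_search' in PLUGIN_DIRS buys: ArchiveBox discovers plugin apps by scanning these directories. A minimal sketch of that kind of discovery, assuming each plugin ships an apps.py (the helper below, discover_plugin_apps, is hypothetical, not ArchiveBox's actual loader):

    from pathlib import Path

    # Hypothetical stand-ins for the real PACKAGE_DIR / DATA_DIR constants:
    PACKAGE_DIR = Path(__file__).resolve().parent
    DATA_DIR = Path.cwd()

    PLUGIN_DIRS = {
        'plugins_search': PACKAGE_DIR / 'plugins_search',
        'user_plugins': DATA_DIR / 'user_plugins',
    }

    def discover_plugin_apps(plugin_dirs: dict) -> list:
        """Collect dotted module paths like 'plugins_search.ripgrep' for INSTALLED_APPS."""
        apps = []
        for namespace, base_dir in plugin_dirs.items():
            if not base_dir.is_dir():
                continue
            for subdir in sorted(base_dir.iterdir()):
                # any subdirectory with an apps.py is treated as a plugin app
                if (subdir / 'apps.py').is_file():
                    apps.append(f'{namespace}.{subdir.name}')
        return apps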
@@ -31,7 +31,6 @@ from ..logging_util import (
     log_archive_method_started,
     log_archive_method_finished,
 )
-from ..search import write_search_index
 
 from .title import should_save_title, save_title
 from .favicon import should_save_favicon, save_favicon
@@ -110,6 +109,8 @@ def ignore_methods(to_ignore: List[str]) -> Iterable[str]:
 def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None, created_by_id: int | None=None) -> Link:
     """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""
 
+    from ..search import write_search_index
+
     # TODO: Remove when the input is changed to be a snapshot. Suboptimal approach.
     from core.models import Snapshot, ArchiveResult
     try:
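The import of write_search_index moves from module scope (removed in the earlier hunk) into the body of archive_link(). This is the usual deferred-import pattern: with search backends becoming Django-app plugins, the search package is only safe to import once the app registry is ready. A self-contained miniature of the pattern (stdlib module used purely for illustration):

    # Importing this module never touches `json`; the dependency is only
    # resolved when the function actually runs, not at module-import time.

    def dump_report(data: dict) -> str:
        import json  # deferred: resolved at call time, after setup has finished
        return json.dumps(data, indent=2)

    print(dump_report({'plugins_search': 'ripgrep'}))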
@@ -51,7 +51,6 @@ from .sql import (
     write_sql_link_details,
 )
 
-from ..search import search_backend_enabled, query_search_index
 
 ### Link filtering and checking
 
@@ -379,7 +378,10 @@ def q_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type: str='
     return snapshots.filter(q_filter)
 
 def search_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type: str='search') -> QuerySet:
-    if not search_backend_enabled():
+    from plugins_sys.config.apps import SEARCH_BACKEND_CONFIG
+    from ..search import query_search_index
+
+    if not SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENABLED:
         stderr()
         stderr(
             '[X] The search backend is not enabled, set config.USE_SEARCHING_BACKEND = True',
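After this change search_filter() reads the enabled flag from the typed SEARCH_BACKEND_CONFIG object instead of the old module-level search_backend_enabled() helper, and both imports are deferred into the function body so the plugin registry is ready. A hedged sketch of the resulting control flow, meant to be read in the context of the index package (the body after the enabled-check is an assumption pieced together from the visible call sites, not the literal source):

    from django.db.models import QuerySet

    def search_filter(snapshots: QuerySet, filter_patterns: list, filter_type: str = 'search') -> QuerySet:
        # deferred imports, per the diff above:
        from plugins_sys.config.apps import SEARCH_BACKEND_CONFIG
        from ..search import query_search_index

        if not SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENABLED:
            raise SystemExit('[X] The search backend is not enabled, set config.USE_SEARCHING_BACKEND = True')

        # Assumed: union the hits for each pattern, intersected with the input set.
        qsearch = snapshots.none()
        for pattern in filter_patterns:
            qsearch |= query_search_index(pattern)
        return snapshots & qsearch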
@@ -141,8 +141,6 @@ from .logging_util import (
     printable_dependency_version,
 )
 
-from .search import flush_search_index, index_links
-
 
 @enforce_types
 def help(out_dir: Path=OUTPUT_DIR) -> None:
@@ -767,6 +765,8 @@ def remove(filter_str: Optional[str]=None,
 
     to_remove = snapshots.count()
 
+    from .search import flush_search_index
+
     flush_search_index(snapshots=snapshots)
     remove_from_sql_main_index(snapshots=snapshots, out_dir=out_dir)
     all_snapshots = load_main_index(out_dir=out_dir)
@@ -790,6 +790,7 @@ def update(resume: Optional[float]=None,
     """Import any new links from subscriptions and retry any previously failed/skipped links"""
 
     from core.models import ArchiveResult
+    from .search import index_links
 
     check_data_folder(out_dir=out_dir)
     check_dependencies()
 0 archivebox/plugins_search/ripgrep/__init__.py (new file)
62 archivebox/plugins_search/ripgrep/apps.py (new file)
archivebox/plugins_search/ripgrep/apps.py
@@ -0,0 +1,62 @@
+__package__ = 'archivebox.plugins_search.ripgrep'
+
+from typing import List, Dict, ClassVar
+# from typing_extensions import Self
+
+from django.conf import settings
+
+# Depends on other PyPI/vendor packages:
+from pydantic import InstanceOf, Field
+from pydantic_pkgr import BinProvider, BinProviderName, ProviderLookupDict, BinName
+
+# Depends on other Django apps:
+from plugantic.base_plugin import BasePlugin
+from plugantic.base_configset import BaseConfigSet, ConfigSectionName
+from plugantic.base_binary import BaseBinary, env, apt, brew
+from plugantic.base_hook import BaseHook
+# from plugantic.base_search import BaseSearchBackend
+
+# Depends on Other Plugins:
+# from plugins_sys.config.apps import SEARCH_BACKEND_CONFIG
+
+
+###################### Config ##########################
+
+class RipgrepConfig(BaseConfigSet):
+    section: ClassVar[ConfigSectionName] = 'DEPENDENCY_CONFIG'
+
+    RIPGREP_BINARY: str = Field(default='rg')
+
+RIPGREP_CONFIG = RipgrepConfig()
+
+
+class RipgrepBinary(BaseBinary):
+    name: BinName = RIPGREP_CONFIG.RIPGREP_BINARY
+    binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
+
+    provider_overrides: Dict[BinProviderName, ProviderLookupDict] = {
+        apt.name: {'packages': lambda: ['ripgrep']},
+        brew.name: {'packages': lambda: ['ripgrep']},
+    }
+
+RIPGREP_BINARY = RipgrepBinary()
+
+
+# TODO:
+# class RipgrepSearchBackend(BaseSearchBackend):
+#     name: str = 'ripgrep'
+
+# RIPGREP_SEARCH_BACKEND = RipgrepSearchBackend()
+
+
+class RipgrepSearchPlugin(BasePlugin):
+    app_label: str = 'ripgrep'
+    verbose_name: str = 'Ripgrep'
+
+    hooks: List[InstanceOf[BaseHook]] = [
+        RIPGREP_CONFIG,
+        RIPGREP_BINARY,
+    ]
+
+
+PLUGIN = RipgrepSearchPlugin()
+PLUGIN.register(settings)
+DJANGO_APP = PLUGIN.AppConfig
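The new apps.py stops short of the actual backend: RipgrepSearchBackend is still a commented-out TODO. A plausible sketch of what its search() could do, assuming the eventual BaseSearchBackend contract is roughly "search(text) -> list of snapshot timestamps" and that snapshot output lives under <DATA_DIR>/archive/<timestamp>/ (both are assumptions, not the shipped code):

    import re
    import subprocess
    from pathlib import Path

    # assumed layout: matches paths like .../archive/1712345678.0/index.html
    TIMESTAMP_REGEX = re.compile(r'[\\/]archive[\\/](\d+(?:\.\d+)?)[\\/]')

    def ripgrep_search(text: str, data_dir: Path = Path('.')) -> list:
        """Return unique snapshot timestamps whose archived files contain `text`."""
        proc = subprocess.run(
            ['rg', '--files-with-matches', '--fixed-strings', text, str(data_dir / 'archive')],
            capture_output=True, text=True,
        )
        timestamps: list = []
        for line in proc.stdout.splitlines():
            match = TIMESTAMP_REGEX.search(line)
            if match and match.group(1) not in timestamps:
                timestamps.append(match.group(1))
        return timestamps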
@@ -3,24 +3,19 @@ from pathlib import Path
 from importlib import import_module
 
 from django.db.models import QuerySet
+from django.conf import settings
 
 from archivebox.index.schema import Link
 from archivebox.util import enforce_types
-from archivebox.config import stderr, OUTPUT_DIR, USE_INDEXING_BACKEND, USE_SEARCHING_BACKEND, SEARCH_BACKEND_ENGINE
+from archivebox.config import stderr
+
+# from archivebox.plugins_sys.config.apps import settings.CONFIGS.SearchBackendConfig
+
 
 from .utils import get_indexable_content, log_index_started
 
-def indexing_enabled():
-    return USE_INDEXING_BACKEND
-
-def search_backend_enabled():
-    return USE_SEARCHING_BACKEND
-
-def get_backend():
-    return f'search.backends.{SEARCH_BACKEND_ENGINE}'
-
 def import_backend():
-    backend_string = get_backend()
+    backend_string = f'plugins_search.{settings.CONFIGS.SearchBackendConfig.SEARCH_BACKEND_ENGINE}.{settings.CONFIGS.SearchBackendConfig.SEARCH_BACKEND_ENGINE}'
     try:
         backend = import_module(backend_string)
     except Exception as err:
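Note what the new backend_string resolves to: with the engine set to 'ripgrep', the old 'search.backends.ripgrep' becomes 'plugins_search.ripgrep.ripgrep', i.e. a module named after the engine inside its own plugin package. A standalone illustration (the engine value is just an example):

    from importlib import import_module

    engine = 'ripgrep'  # stand-in for settings.CONFIGS.SearchBackendConfig.SEARCH_BACKEND_ENGINE

    old_style = f'search.backends.{engine}'          # -> 'search.backends.ripgrep'
    new_style = f'plugins_search.{engine}.{engine}'  # -> 'plugins_search.ripgrep.ripgrep'

    # backend = import_module(new_style)  # would load archivebox/plugins_search/ripgrep/ripgrep.py
    print(old_style, '->', new_style)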
@@ -28,8 +23,8 @@ def import_backend():
     return backend
 
 @enforce_types
-def write_search_index(link: Link, texts: Union[List[str], None]=None, out_dir: Path=OUTPUT_DIR, skip_text_index: bool=False) -> None:
-    if not indexing_enabled():
+def write_search_index(link: Link, texts: Union[List[str], None]=None, out_dir: Path=settings.DATA_DIR, skip_text_index: bool=False) -> None:
+    if not settings.CONFIGS.SearchBackendConfig.USE_INDEXING_BACKEND:
         return
 
     if not skip_text_index and texts:
@@ -48,10 +43,10 @@ def write_search_index(link: Link, texts: Union[List[str], None]=None, out_dir:
     )
 
 @enforce_types
-def query_search_index(query: str, out_dir: Path=OUTPUT_DIR) -> QuerySet:
+def query_search_index(query: str, out_dir: Path=settings.DATA_DIR) -> QuerySet:
     from core.models import Snapshot
 
-    if search_backend_enabled():
+    if settings.CONFIGS.SearchBackendConfig.USE_SEARCHING_BACKEND:
         backend = import_backend()
         try:
             snapshot_pks = backend.search(query)
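From the caller's point of view the contract is unchanged: the backend returns primary keys and the function narrows them to a Snapshot QuerySet. A sketch of calling it from a Django shell inside an ArchiveBox data dir (the fields printed are assumptions about the Snapshot model, not taken from this diff):

    from archivebox.search import query_search_index

    results = query_search_index('example.com')  # -> QuerySet[Snapshot]
    for snapshot in results[:10]:
        print(snapshot.timestamp, snapshot.url)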
@@ -71,7 +66,7 @@ def query_search_index(query: str, out_dir: Path=OUTPUT_DIR) -> QuerySet:
 
 @enforce_types
 def flush_search_index(snapshots: QuerySet):
-    if not indexing_enabled() or not snapshots:
+    if not settings.CONFIGS.SearchBackendConfig.USE_INDEXING_BACKEND or not snapshots:
         return
     backend = import_backend()
     snapshot_pks = (str(pk) for pk in snapshots.values_list('pk', flat=True))
@@ -85,7 +80,7 @@ def flush_search_index(snapshots: QuerySet):
     )
 
 @enforce_types
-def index_links(links: Union[List[Link],None], out_dir: Path=OUTPUT_DIR):
+def index_links(links: Union[List[Link],None], out_dir: Path=settings.DATA_DIR):
     if not links:
         return
 