mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-27 13:14:24 -04:00
begin migrating search backends to new plugin system
This commit is contained in:
parent
2d19317e3f
commit
c9c163efed
11 changed files with 83 additions and 21 deletions
0
archivebox/plugins_search/ripgrep/__init__.py
Normal file
0
archivebox/plugins_search/ripgrep/__init__.py
Normal file
62
archivebox/plugins_search/ripgrep/apps.py
Normal file
62
archivebox/plugins_search/ripgrep/apps.py
Normal file
|
@ -0,0 +1,62 @@
|
|||
__package__ = 'archivebox.plugins_search.ripgrep'
|
||||
|
||||
from typing import List, Dict, ClassVar
|
||||
# from typing_extensions import Self
|
||||
|
||||
from django.conf import settings
|
||||
|
||||
# Depends on other PyPI/vendor packages:
|
||||
from pydantic import InstanceOf, Field
|
||||
from pydantic_pkgr import BinProvider, BinProviderName, ProviderLookupDict, BinName
|
||||
|
||||
# Depends on other Django apps:
|
||||
from plugantic.base_plugin import BasePlugin
|
||||
from plugantic.base_configset import BaseConfigSet, ConfigSectionName
|
||||
from plugantic.base_binary import BaseBinary, env, apt, brew
|
||||
from plugantic.base_hook import BaseHook
|
||||
# from plugantic.base_search import BaseSearchBackend
|
||||
|
||||
# Depends on Other Plugins:
|
||||
# from plugins_sys.config.apps import SEARCH_BACKEND_CONFIG
|
||||
|
||||
###################### Config ##########################
|
||||
|
||||
class RipgrepConfig(BaseConfigSet):
    """Config set holding the options for the ripgrep dependency."""

    # Name of the config section these options are grouped under.
    section: ClassVar[ConfigSectionName] = 'DEPENDENCY_CONFIG'

    # Name of the ripgrep executable; defaults to 'rg'.
    RIPGREP_BINARY: str = Field(default='rg')


# Module-level instance; read by RipgrepBinary and listed in the plugin's hooks.
RIPGREP_CONFIG = RipgrepConfig()
|
||||
|
||||
class RipgrepBinary(BaseBinary):
    """Binary definition for the ripgrep executable."""

    # The binary name is read from the config set when this class is defined.
    name: BinName = RIPGREP_CONFIG.RIPGREP_BINARY

    binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]

    # apt and brew are both pointed at the 'ripgrep' package;
    # env gets no override entry.
    provider_overrides: Dict[BinProviderName, ProviderLookupDict] = {
        apt.name: {'packages': lambda: ['ripgrep']},
        brew.name: {'packages': lambda: ['ripgrep']},
    }


# Module-level instance listed in the plugin's hooks.
RIPGREP_BINARY = RipgrepBinary()
|
||||
|
||||
# TODO:
|
||||
# class RipgrepSearchBackend(BaseSearchBackend):
|
||||
# name: str = 'ripgrep'
|
||||
|
||||
# RIPGREP_SEARCH_BACKEND = RipgrepSearchBackend()
|
||||
|
||||
|
||||
class RipgrepSearchPlugin(BasePlugin):
    """Plugin that bundles the ripgrep config set and binary as hooks."""

    app_label: str = 'ripgrep'
    verbose_name: str = 'Ripgrep'

    # Hooks contributed by this plugin (no search backend hook yet, see TODO above).
    hooks: List[InstanceOf[BaseHook]] = [
        RIPGREP_CONFIG,
        RIPGREP_BINARY,
    ]


# Instantiate the plugin and register it against the Django settings object.
PLUGIN = RipgrepSearchPlugin()
PLUGIN.register(settings)

# Expose the plugin's AppConfig under the module-level name DJANGO_APP.
DJANGO_APP = PLUGIN.AppConfig
|
45
archivebox/plugins_search/ripgrep/ripgrep.py
Normal file
45
archivebox/plugins_search/ripgrep/ripgrep.py
Normal file
|
@ -0,0 +1,45 @@
|
|||
import re
|
||||
from subprocess import run, PIPE
|
||||
from typing import List, Generator
|
||||
|
||||
from archivebox.config import ARCHIVE_DIR, RIPGREP_VERSION, SEARCH_BACKEND_TIMEOUT
|
||||
from archivebox.util import enforce_types
|
||||
|
||||
# File extensions that are excluded from ripgrep searches.
RG_IGNORE_EXTENSIONS = ('css', 'js', 'orig', 'svg')

# `--type-add 'ignore:*.{css,js,orig,svg}'` defines a custom ripgrep file type
# named "ignore" covering the extensions above.
RG_ADD_TYPE = '--type-add'
RG_IGNORE_ARGUMENTS = 'ignore:*.{' + ','.join(RG_IGNORE_EXTENSIONS) + '}'

# -i: case insensitive, -l: print only the paths of matching files,
# -Tignore: skip files belonging to the custom "ignore" type.
RG_DEFAULT_ARGUMENTS = "-ilTignore"

# Flag that introduces the search pattern.
RG_REGEX_ARGUMENT = '-e'

# Captures a "<digits>.<digits>" path component enclosed in slashes.
TIMESTAMP_REGEX = r'\/([\d]+\.[\d]+)\/'

# Compiled once at import time so search() can reuse it for every result path.
ts_regex = re.compile(TIMESTAMP_REGEX)
|
||||
|
||||
@enforce_types
def index(snapshot_id: str, texts: List[str]):
    """Do nothing: this backend stores nothing at index time.

    Both arguments are accepted and ignored; search() runs ripgrep over
    ARCHIVE_DIR directly instead of consulting a prebuilt index.
    """
    return None
|
||||
|
||||
@enforce_types
def flush(snapshot_ids: Generator[str, None, None]):
    """Do nothing: this backend keeps no stored entries to remove.

    The generator is accepted and ignored (it is not consumed).
    """
    return None
|
||||
|
||||
@enforce_types
def search(text: str) -> List[str]:
    """Return the primary keys (as strings) of Snapshots whose files match *text*.

    Runs ripgrep over ARCHIVE_DIR with *text* as the ``-e`` pattern, extracts
    the timestamp path component from every matching file path, and looks up
    the Snapshots that have one of those timestamps.

    Raises:
        Exception: if RIPGREP_VERSION is falsy (ripgrep was not detected).
        subprocess.TimeoutExpired: if ripgrep runs longer than
            SEARCH_BACKEND_TIMEOUT seconds.
    """
    if not RIPGREP_VERSION:
        raise Exception("ripgrep binary not found, install ripgrep to use this search backend")

    # Imported inside the function rather than at module import time.
    from core.models import Snapshot

    command = [
        'rg',
        RG_ADD_TYPE, RG_IGNORE_ARGUMENTS,
        RG_DEFAULT_ARGUMENTS,
        RG_REGEX_ARGUMENT, text,
        str(ARCHIVE_DIR),
    ]
    completed = run(command, stdout=PIPE, stderr=PIPE, timeout=SEARCH_BACKEND_TIMEOUT)

    # One matching file path per stdout line; keep the first timestamp-looking
    # component of each path and drop paths that contain none.
    matches = [ts_regex.search(line.decode()) for line in completed.stdout.splitlines()]
    timestamps = {match.group(1) for match in matches if match}

    matching_pks = Snapshot.objects.filter(timestamp__in=timestamps).values_list('pk', flat=True)
    return [str(pk) for pk in matching_pks]
|
Loading…
Add table
Add a link
Reference in a new issue