Mirror of https://github.com/ArchiveBox/ArchiveBox.git (synced 2025-05-13 14:44:29 -04:00)
Merge pull request #570 from ArchiveBox/sonic-search
Commit: 8d103687d0
20 changed files with 406 additions and 12 deletions
@@ -46,7 +46,7 @@ RUN apt-get update -qq \
 # Install apt dependencies
 RUN apt-get update -qq \
     && apt-get install -qq -y --no-install-recommends \
-    wget curl chromium git ffmpeg youtube-dl \
+    wget curl chromium git ffmpeg youtube-dl ripgrep \
     fontconfig fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-kacst fonts-symbola fonts-noto fonts-freefont-ttf \
     && rm -rf /var/lib/apt/lists/*
@@ -1 +0,0 @@
-pip_dist/archivebox.egg-info
@@ -98,7 +98,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
     parser.add_argument(
         '--filter-type',
         type=str,
-        choices=('exact', 'substring', 'domain', 'regex','tag'),
+        choices=('exact', 'substring', 'domain', 'regex', 'tag', 'search'),
         default='exact',
         help='Type of pattern matching to use when filtering URLs',
     )
@@ -91,7 +91,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
     parser.add_argument(
         '--filter-type',
         type=str,
-        choices=('exact', 'substring', 'domain', 'regex'),
+        choices=('exact', 'substring', 'domain', 'regex', 'tag', 'search'),
         default='exact',
         help='Type of pattern matching to use when filtering URLs',
     )
@@ -139,6 +139,18 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
         'GIT_ARGS': {'type': list, 'default': ['--recursive']},
     },
 
+    'SEARCH_BACKEND_CONFIG' : {
+        'USE_INDEXING_BACKEND': {'type': bool, 'default': True},
+        'USE_SEARCHING_BACKEND': {'type': bool, 'default': True},
+        'SEARCH_BACKEND_ENGINE': {'type': str, 'default': 'ripgrep'},
+        'SEARCH_BACKEND_HOST_NAME': {'type': str, 'default': 'localhost'},
+        'SEARCH_BACKEND_PORT': {'type': int, 'default': 1491},
+        'SEARCH_BACKEND_PASSWORD': {'type': str, 'default': 'SecretPassword'},
+        # SONIC
+        'SONIC_COLLECTION': {'type': str, 'default': 'archivebox'},
+        'SONIC_BUCKET': {'type': str, 'default': 'snapshots'},
+    },
+
     'DEPENDENCY_CONFIG': {
         'USE_CURL': {'type': bool, 'default': True},
         'USE_WGET': {'type': bool, 'default': True},

@@ -149,7 +161,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
         'USE_CHROME': {'type': bool, 'default': True},
         'USE_NODE': {'type': bool, 'default': True},
         'USE_YOUTUBEDL': {'type': bool, 'default': True},
 
         'CURL_BINARY': {'type': str, 'default': 'curl'},
         'GIT_BINARY': {'type': str, 'default': 'git'},
         'WGET_BINARY': {'type': str, 'default': 'wget'},
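Note: as a rough sketch (not part of this diff), the new SEARCH_BACKEND_CONFIG options follow the same pattern as the rest of CONFIG_DEFAULTS, so, assuming ArchiveBox's usual behaviour of reading overrides from environment variables of the same name, they could be switched roughly like this (values are illustrative; only 'ripgrep', 1491, and 'SecretPassword' are defaults from this PR):

import os

# hypothetical overrides, set before archivebox.config is loaded:
os.environ['SEARCH_BACKEND_ENGINE'] = 'sonic'             # PR default is 'ripgrep'
os.environ['SEARCH_BACKEND_HOST_NAME'] = 'localhost'
os.environ['SEARCH_BACKEND_PORT'] = '1491'
os.environ['SEARCH_BACKEND_PASSWORD'] = 'SecretPassword'  # should match sonic's auth_password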
@@ -14,6 +14,9 @@ from django import forms
 from core.models import Snapshot, Tag
 from core.forms import AddLinkForm, TagField
 
+from core.utils import get_icons
+from core.mixins import SearchResultsAdminMixin
+
 from index.html import snapshot_icons
 from util import htmldecode, urldecode, ansi_to_html
 from logging_util import printable_filesize

@@ -82,7 +85,7 @@ class SnapshotAdminForm(forms.ModelForm):
         return instance
 
 
-class SnapshotAdmin(admin.ModelAdmin):
+class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
     list_display = ('added', 'title_str', 'url_str', 'files', 'size')
     sort_fields = ('title_str', 'url_str', 'added')
     readonly_fields = ('id', 'url', 'timestamp', 'num_outputs', 'is_archived', 'url_hash', 'added', 'updated')
archivebox/core/mixins.py (new file, 23 lines)
@@ -0,0 +1,23 @@
+from django.contrib import messages
+
+from archivebox.search import query_search_index
+
+class SearchResultsAdminMixin(object):
+    def get_search_results(self, request, queryset, search_term):
+        ''' Enhances the search queryset with results from the search backend.
+        '''
+        qs, use_distinct = \
+            super(SearchResultsAdminMixin, self).get_search_results(
+                request, queryset, search_term)
+
+        search_term = search_term.strip()
+        if not search_term:
+            return qs, use_distinct
+        try:
+            qsearch = query_search_index(search_term)
+        except Exception as err:
+            messages.add_message(request, messages.WARNING, f'Error from the search backend, only showing results from default admin search fields - Error: {err}')
+        else:
+            qs |= qsearch
+        finally:
+            return qs, use_distinct
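Note: get_search_results() follows Django's standard ModelAdmin contract of returning (queryset, use_distinct); the interesting part is the `qs |= qsearch` union. A minimal sketch of what that combination does, assuming an initialized ArchiveBox Django context and a hypothetical query term (the 'title' admin search field is an assumption, not shown in this diff):

from core.models import Snapshot
from archivebox.search import query_search_index

admin_matches = Snapshot.objects.filter(title__icontains='climate')  # default admin field search
backend_matches = query_search_index('climate')                      # full-text hits from the search backend
combined = admin_matches | backend_matches                           # ORed into one queryset, as in the mixin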
@@ -5,10 +5,11 @@ import uuid
 from django.db import models, transaction
 from django.utils.functional import cached_property
 from django.utils.text import slugify
+from django.db.models import Case, When, Value, IntegerField
 
 from ..util import parse_date
 from ..index.schema import Link
-from ..extractors import get_default_archive_methods
+from ..extractors import get_default_archive_methods, ARCHIVE_METHODS_INDEXING_PRECEDENCE
 
 EXTRACTORS = [(extractor[0], extractor[0]) for extractor in get_default_archive_methods()]
 STATUS_CHOICES = [

@@ -91,7 +92,7 @@ class Snapshot(models.Model):
         return {
             key: getattr(self, key)
             if key != 'tags' else self.tags_str()
             for key in args
         }
 
     def as_link(self) -> Link:

@@ -100,7 +101,7 @@ class Snapshot(models.Model):
     def as_link_with_details(self) -> Link:
         from ..index import load_link_details
         return load_link_details(self.as_link())
 
     def tags_str(self) -> str:
         return ','.join(self.tags.order_by('name').values_list('name', flat=True))
 

@@ -157,7 +158,15 @@ class Snapshot(models.Model):
         self.tags.clear()
         self.tags.add(*tags_id)
 
+class ArchiveResultManager(models.Manager):
+    def indexable(self, sorted: bool = True):
+        INDEXABLE_METHODS = [ r[0] for r in ARCHIVE_METHODS_INDEXING_PRECEDENCE ]
+        qs = self.get_queryset().filter(extractor__in=INDEXABLE_METHODS,status='succeeded')
+
+        if sorted:
+            precedence = [ When(extractor=method, then=Value(precedence)) for method, precedence in ARCHIVE_METHODS_INDEXING_PRECEDENCE ]
+            qs = qs.annotate(indexing_precedence=Case(*precedence, default=Value(1000),output_field=IntegerField())).order_by('indexing_precedence')
+        return qs
 class ArchiveResult(models.Model):
     snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE)
     cmd = models.JSONField()

@@ -169,5 +178,7 @@ class ArchiveResult(models.Model):
     status = models.CharField(max_length=16, choices=STATUS_CHOICES)
     extractor = models.CharField(choices=EXTRACTORS, max_length=32)
 
+    objects = ArchiveResultManager()
+
     def __str__(self):
         return self.extractor
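Note: ArchiveResult.objects.indexable() only returns succeeded results from the full-text extractors and, via the Case/When annotation, orders them by ARCHIVE_METHODS_INDEXING_PRECEDENCE. A hedged usage sketch, assuming an initialized ArchiveBox Django context with some archived data:

from core.models import Snapshot, ArchiveResult

snap = Snapshot.objects.first()                                    # any existing snapshot
results = ArchiveResult.objects.indexable().filter(snapshot=snap)
# extractors come back best-first: readability, singlefile, dom, wget
print(list(results.values_list('extractor', flat=True)))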
@@ -23,6 +23,7 @@ from ..logging_util import (
     log_archive_method_started,
     log_archive_method_finished,
 )
+from ..search import write_search_index
 
 from .title import should_save_title, save_title
 from .favicon import should_save_favicon, save_favicon

@@ -38,6 +39,7 @@ from .media import should_save_media, save_media
 from .archive_org import should_save_archive_dot_org, save_archive_dot_org
 from .headers import should_save_headers, save_headers
 
+
 def get_default_archive_methods():
     return [
         ('title', should_save_title, save_title),

@@ -55,6 +57,8 @@ def get_default_archive_methods():
         ('archive_org', should_save_archive_dot_org, save_archive_dot_org),
     ]
 
+ARCHIVE_METHODS_INDEXING_PRECEDENCE = [('readability', 1), ('singlefile', 2), ('dom', 3), ('wget', 4)]
+
 @enforce_types
 def ignore_methods(to_ignore: List[str]):
     ARCHIVE_METHODS = get_default_archive_methods()

@@ -107,6 +111,7 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
                 link.history[method_name].append(result)
 
                 stats[result.status] += 1
+                write_search_index(link=link, texts=result.index_texts)
                 log_archive_method_finished(result)
                 if not skip_index:
                     ArchiveResult.objects.create(snapshot=snapshot, extractor=method_name, cmd=result.cmd, cmd_version=result.cmd_version,
@@ -71,6 +71,7 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
         CURL_BINARY,
         link.url
     ]
+    readability_content = None
     timer = TimedProgress(timeout, prefix=' ')
     try:
         document = get_html(link, out_dir)

@@ -86,8 +87,9 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
         result = run(cmd, cwd=out_dir, timeout=timeout)
         result_json = json.loads(result.stdout)
         output_folder.mkdir(exist_ok=True)
+        readability_content = result_json.pop("textContent")
         atomic_write(str(output_folder / "content.html"), result_json.pop("content"))
-        atomic_write(str(output_folder / "content.txt"), result_json.pop("textContent"))
+        atomic_write(str(output_folder / "content.txt"), readability_content)
         atomic_write(str(output_folder / "article.json"), result_json)
 
         # parse out number of files downloaded from last line of stderr:

@@ -117,5 +119,6 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
         cmd_version=READABILITY_VERSION,
         output=output,
         status=status,
+        index_texts= [readability_content] if readability_content else [],
         **timer.stats,
     )
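Note: a standalone illustration (not from this diff) of the result_json handling above; the dict values are made-up placeholders:

result_json = {"title": "Example", "content": "<div>example html</div>", "textContent": "plain text body"}
readability_content = result_json.pop("textContent")   # written to content.txt and later passed as index_texts
content_html = result_json.pop("content")              # written to content.html
# article.json now only carries the remaining metadata:
assert set(result_json) == {"title"}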
@@ -51,6 +51,8 @@ from .sql import (
     write_sql_link_details,
 )
 
+from ..search import search_backend_enabled, query_search_index
+
 ### Link filtering and checking
 
 @enforce_types

@@ -365,7 +367,7 @@ LINK_FILTERS = {
 }
 
 @enforce_types
-def snapshot_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type: str='exact') -> QuerySet:
+def q_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type: str='exact') -> QuerySet:
     q_filter = Q()
     for pattern in filter_patterns:
         try:

@@ -380,6 +382,31 @@ def snapshot_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type
             raise SystemExit(2)
     return snapshots.filter(q_filter)
 
+def search_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type: str='search') -> QuerySet:
+    if not search_backend_enabled():
+        stderr()
+        stderr(
+                '[X] The search backend is not enabled, set config.USE_SEARCHING_BACKEND = True',
+                color='red',
+            )
+        raise SystemExit(2)
+
+    qsearch = get_empty_snapshot_queryset()
+    for pattern in filter_patterns:
+        try:
+            qsearch |= query_search_index(pattern)
+        except:
+            raise SystemExit(2)
+
+    return snapshots & qsearch
+
+@enforce_types
+def snapshot_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type: str='exact') -> QuerySet:
+    if filter_type != 'search':
+        return q_filter(snapshots, filter_patterns, filter_type)
+    else:
+        return search_filter(snapshots, filter_patterns, filter_type)
+
+
 def get_indexed_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """indexed links without checking archive status or data directory validity"""
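Note: snapshot_filter() now dispatches on filter_type, which is where the new 'search' choice added to --filter-type above ends up. A hedged usage sketch (not from this diff), assuming an initialized ArchiveBox Django context; the patterns are illustrative:

from core.models import Snapshot
from archivebox.index import snapshot_filter

exact = snapshot_filter(Snapshot.objects.all(), ['https://example.com'], filter_type='exact')   # Q()-based q_filter path
fulltext = snapshot_filter(Snapshot.objects.all(), ['climate change'], filter_type='search')    # search backend path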
@@ -39,6 +39,7 @@ class ArchiveResult:
     status: str
     start_ts: datetime
     end_ts: datetime
+    index_texts: Union[List[str], None] = None
     schema: str = 'ArchiveResult'
 
     def __post_init__(self):
@@ -115,6 +115,7 @@ from .logging_util import (
     printable_dependency_version,
 )
 
+from .search import flush_search_index, index_links
 
 ALLOWED_IN_OUTPUT_DIR = {
     'lost+found',

@@ -664,6 +665,7 @@ def remove(filter_str: Optional[str]=None,
 
     to_remove = snapshots.count()
 
+    flush_search_index(snapshots=snapshots)
     remove_from_sql_main_index(snapshots=snapshots, out_dir=out_dir)
     all_snapshots = load_main_index(out_dir=out_dir)
     log_removal_finished(all_snapshots.count(), to_remove)

@@ -709,6 +711,7 @@ def update(resume: Optional[float]=None,
     if index_only:
         for link in all_links:
             write_link_details(link, out_dir=out_dir, skip_sql_index=True)
+        index_links(all_links, out_dir=out_dir)
         return all_links
 
     # Step 2: Run the archive methods for each link
archivebox/search/__init__.py (new file, 110 lines)
@@ -0,0 +1,110 @@
+from typing import List, Union
+from pathlib import Path
+from importlib import import_module
+
+from django.db.models import QuerySet
+
+from archivebox.index.schema import Link
+from archivebox.util import enforce_types
+from archivebox.config import setup_django,stderr, OUTPUT_DIR, USE_INDEXING_BACKEND, USE_SEARCHING_BACKEND, SEARCH_BACKEND_ENGINE
+
+from .utils import get_indexable_content, log_index_started
+
+def indexing_enabled():
+    return USE_INDEXING_BACKEND
+
+def search_backend_enabled():
+    return USE_SEARCHING_BACKEND
+
+def get_backend():
+    return f'search.backends.{SEARCH_BACKEND_ENGINE}'
+
+def import_backend():
+    backend_string = get_backend()
+    try:
+        backend = import_module(backend_string)
+    except Exception as err:
+        raise Exception("Could not load '%s' as a backend: %s" % (backend_string, err))
+    return backend
+
+@enforce_types
+def write_search_index(link: Link, texts: Union[List[str], None]=None, out_dir: Path=OUTPUT_DIR, skip_text_index: bool=False) -> None:
+    if not indexing_enabled():
+        return
+
+    if not skip_text_index and texts:
+        setup_django(out_dir, check_db=True)
+        from core.models import Snapshot
+
+        snap = Snapshot.objects.filter(url=link.url).first()
+        backend = import_backend()
+        if snap:
+            try:
+                backend.index(snapshot_id=str(snap.id), texts=texts)
+            except Exception as err:
+                stderr()
+                stderr(
+                    f'[X] The search backend threw an exception={err}:',
+                    color='red',
+                )
+
+@enforce_types
+def query_search_index(query: str, out_dir: Path=OUTPUT_DIR) -> QuerySet:
+    setup_django(out_dir, check_db=True)
+    from core.models import Snapshot
+
+    if search_backend_enabled():
+        backend = import_backend()
+        try:
+            snapshot_ids = backend.search(query)
+        except Exception as err:
+            stderr()
+            stderr(
+                f'[X] The search backend threw an exception={err}:',
+                color='red',
+            )
+            raise
+        else:
+            # TODO preserve ordering from backend
+            qsearch = Snapshot.objects.filter(pk__in=snapshot_ids)
+            return qsearch
+
+    return Snapshot.objects.none()
+
+@enforce_types
+def flush_search_index(snapshots: QuerySet):
+    if not indexing_enabled() or not snapshots:
+        return
+    backend = import_backend()
+    snapshot_ids=(str(pk) for pk in snapshots.values_list('pk',flat=True))
+    try:
+        backend.flush(snapshot_ids)
+    except Exception as err:
+        stderr()
+        stderr(
+            f'[X] The search backend threw an exception={err}:',
+            color='red',
+        )
+
+@enforce_types
+def index_links(links: Union[List[Link],None], out_dir: Path=OUTPUT_DIR):
+    if not links:
+        return
+
+    setup_django(out_dir=out_dir, check_db=True)
+    from core.models import Snapshot, ArchiveResult
+
+    for link in links:
+        if snap := Snapshot.objects.filter(url=link.url).first():
+            results = ArchiveResult.objects.indexable().filter(snapshot=snap)
+            log_index_started(link.url)
+            try:
+                texts = get_indexable_content(results)
+            except Exception as err:
+                stderr()
+                stderr(
+                    f'[X] An Exception ocurred reading the indexable content={err}:',
+                    color='red',
+                )
+            else:
+                write_search_index(link, texts, out_dir=out_dir)
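Note: since import_backend() resolves 'search.backends.{SEARCH_BACKEND_ENGINE}' by module name and the rest of the code only ever calls index(), search(), and flush() on it, a custom backend is just a module exposing those three functions. A minimal sketch of a hypothetical backend (a do-nothing module named 'noop', not included in this PR; it would live at archivebox/search/backends/noop.py with SEARCH_BACKEND_ENGINE set to 'noop'):

from typing import List, Generator

from archivebox.util import enforce_types

@enforce_types
def index(snapshot_id: str, texts: List[str]):
    # store/refresh the text for one snapshot; this no-op backend simply discards it
    return

@enforce_types
def search(text: str) -> List[str]:
    # must return a list of matching Snapshot primary keys as strings
    return []

@enforce_types
def flush(snapshot_ids: Generator[str, None, None]):
    # remove the given snapshot ids from the index
    return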
archivebox/search/backends/__init__.py (new file, 0 lines)
archivebox/search/backends/ripgrep.py (new file, 47 lines)
@@ -0,0 +1,47 @@
+import re
+from subprocess import run, PIPE, DEVNULL
+from typing import List, Generator
+
+from archivebox.config import setup_django, ARCHIVE_DIR
+from archivebox.util import enforce_types
+
+RG_IGNORE_EXTENSIONS = ('css','js','orig','svg')
+
+RG_ADD_TYPE = '--type-add'
+RG_IGNORE_ARGUMENTS = f"ignore:*.{{{','.join(RG_IGNORE_EXTENSIONS)}}}"
+RG_DEFAULT_ARGUMENTS = "-ilTignore" # Case insensitive(i), matching files results(l)
+RG_REGEX_ARGUMENT = '-e'
+
+TIMESTAMP_REGEX = r'\/([\d]+\.[\d]+)\/'
+
+ts_regex = re.compile(TIMESTAMP_REGEX)
+
+@enforce_types
+def index(snapshot_id: str, texts: List[str]):
+    return
+
+@enforce_types
+def flush(snapshot_ids: Generator[str, None, None]):
+    return
+
+@enforce_types
+def search(text: str) -> List[str]:
+    is_rg_installed = run(['which', 'rg'], stdout=DEVNULL, stderr=DEVNULL)
+    if is_rg_installed.returncode:
+        raise Exception("ripgrep binary not found, install ripgrep to use this search backend")
+
+    setup_django(check_db=True)
+    from core.models import Snapshot
+
+    rg_cmd = ['rg', RG_ADD_TYPE, RG_IGNORE_ARGUMENTS, RG_DEFAULT_ARGUMENTS, RG_REGEX_ARGUMENT, text, str(ARCHIVE_DIR)]
+    rg = run(rg_cmd, stdout=PIPE, stderr=PIPE, timeout=60)
+    file_paths = [p.decode() for p in rg.stdout.splitlines()]
+    timestamps = set()
+    for path in file_paths:
+        if ts := ts_regex.findall(path):
+            timestamps.add(ts[0])
+
+    snap_ids = [str(id) for id in Snapshot.objects.filter(timestamp__in=timestamps).values_list('pk', flat=True)]
+
+    return snap_ids
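Note: with the constants above, the rg_cmd list built in search() expands to roughly the following (the query string and archive path are illustrative placeholders, not values from the code):

rg_cmd = [
    'rg',
    '--type-add', 'ignore:*.{css,js,orig,svg}',   # RG_ADD_TYPE + RG_IGNORE_ARGUMENTS: define an "ignore" file type
    '-ilTignore',                                 # case-insensitive, print matching file paths only, skip the "ignore" type
    '-e', 'some search term',
    '/path/to/data/archive',                      # str(ARCHIVE_DIR)
]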
archivebox/search/backends/sonic.py (new file, 28 lines)
@@ -0,0 +1,28 @@
+from typing import List, Generator
+
+from sonic import IngestClient, SearchClient
+
+from archivebox.util import enforce_types
+from archivebox.config import SEARCH_BACKEND_HOST_NAME, SEARCH_BACKEND_PORT, SEARCH_BACKEND_PASSWORD, SONIC_BUCKET, SONIC_COLLECTION
+
+MAX_SONIC_TEXT_LENGTH = 20000
+
+@enforce_types
+def index(snapshot_id: str, texts: List[str]):
+    with IngestClient(SEARCH_BACKEND_HOST_NAME, SEARCH_BACKEND_PORT, SEARCH_BACKEND_PASSWORD) as ingestcl:
+        for text in texts:
+            chunks = [text[i:i+MAX_SONIC_TEXT_LENGTH] for i in range(0, len(text), MAX_SONIC_TEXT_LENGTH)]
+            for chunk in chunks:
+                ingestcl.push(SONIC_COLLECTION, SONIC_BUCKET, snapshot_id, str(chunk))
+
+@enforce_types
+def search(text: str) -> List[str]:
+    with SearchClient(SEARCH_BACKEND_HOST_NAME, SEARCH_BACKEND_PORT, SEARCH_BACKEND_PASSWORD) as querycl:
+        snap_ids = querycl.query(SONIC_COLLECTION, SONIC_BUCKET, text)
+    return snap_ids
+
+@enforce_types
+def flush(snapshot_ids: Generator[str, None, None]):
+    with IngestClient(SEARCH_BACKEND_HOST_NAME, SEARCH_BACKEND_PORT, SEARCH_BACKEND_PASSWORD) as ingestcl:
+        for id in snapshot_ids:
+            ingestcl.flush_object(SONIC_COLLECTION, SONIC_BUCKET, str(id))
archivebox/search/utils.py (new file, 44 lines)
@@ -0,0 +1,44 @@
+from django.db.models import QuerySet
+
+from archivebox.util import enforce_types
+from archivebox.config import ANSI
+
+def log_index_started(url):
+    print('{green}[*] Indexing url: {} in the search index {reset}'.format(url, **ANSI))
+    print( )
+
+def get_file_result_content(res, extra_path, use_pwd=False):
+    if use_pwd:
+        fpath = f'{res.pwd}/{res.output}'
+    else:
+        fpath = f'{res.output}'
+
+    if extra_path:
+        fpath = f'{fpath}/{extra_path}'
+
+    with open(fpath, 'r') as file:
+        data = file.read()
+    if data:
+        return [data]
+    return []
+
+
+# This should be abstracted by a plugin interface for extractors
+@enforce_types
+def get_indexable_content(results: QuerySet):
+    if not results:
+        return []
+    # Only use the first method available
+    res, method = results.first(), results.first().extractor
+    if method not in ('readability', 'singlefile', 'dom', 'wget'):
+        return []
+    # This should come from a plugin interface
+
+    if method == 'readability':
+        return get_file_result_content(res, 'content.txt')
+    elif method == 'singlefile':
+        return get_file_result_content(res, '')
+    elif method == 'dom':
+        return get_file_result_content(res,'',use_pwd=True)
+    elif method == 'wget':
+        return get_file_result_content(res,'',use_pwd=True)
@@ -23,6 +23,7 @@ services:
             - SHOW_PROGRESS=False
         volumes:
             - ./data:/data
 
+
 
     # Optional Addons: tweak these examples as needed for your specific use case

@@ -73,3 +74,14 @@ services:
    #     volumes:
    #         ./data:/archivebox
    #         ./data/wayback:/webarchive
+
+    # Example: Run sonic search backend
+    # sonic:
+    #    image: valeriansaliou/sonic:v1.3.0
+    #    ports:
+    #        - 1491:1491
+    #    environment:
+    #        - SEARCH_BACKEND_PASSWORD=SecretPassword
+    #    volumes:
+    #        - ./etc/sonic/config.cfg:/etc/sonic.cfg
+    #        - ./data:/var/lib/sonic/store/
etc/sonic/config.cfg (new file, 66 lines)
@@ -0,0 +1,66 @@
+# Sonic
+# Fast, lightweight and schema-less search backend
+# Configuration file
+# Example: https://github.com/valeriansaliou/sonic/blob/master/config.cfg
+
+
+[server]
+
+log_level = "debug"
+
+
+[channel]
+
+inet = "0.0.0.0:1491"
+tcp_timeout = 300
+
+auth_password = "${env.SEARCH_BACKEND_PASSWORD}"
+
+[channel.search]
+
+query_limit_default = 65535
+query_limit_maximum = 65535
+query_alternates_try = 10
+
+suggest_limit_default = 5
+suggest_limit_maximum = 20
+
+
+[store]
+
+[store.kv]
+
+path = "/var/lib/sonic/store/kv/"
+
+retain_word_objects = 100000
+
+[store.kv.pool]
+
+inactive_after = 1800
+
+[store.kv.database]
+
+flush_after = 900
+
+compress = true
+parallelism = 2
+max_files = 100
+max_compactions = 1
+max_flushes = 1
+write_buffer = 16384
+write_ahead_log = true
+
+[store.fst]
+
+path = "/var/lib/sonic/store/fst/"
+
+[store.fst.pool]
+
+inactive_after = 300
+
+[store.fst.graph]
+
+consolidate_after = 180
+
+max_size = 2048
+max_words = 250000