From b1f70b219736378170c1dcda1131792bf83c1830 Mon Sep 17 00:00:00 2001
From: JDC
Date: Tue, 17 Nov 2020 18:42:57 -0500
Subject: [PATCH 01/27] Initial implementation

---
 archivebox.egg-info                  |  1 -
 archivebox/core/admin.py             |  5 +++-
 archivebox/core/mixins.py            | 21 +++++++++++++++
 archivebox/extractors/__init__.py    |  2 ++
 archivebox/extractors/readability.py |  7 +++--
 archivebox/index/schema.py           |  1 +
 archivebox/search/__init__.py        | 40 ++++++++++++++++++++++++++++
 7 files changed, 73 insertions(+), 4 deletions(-)
 delete mode 120000 archivebox.egg-info
 create mode 100644 archivebox/core/mixins.py
 create mode 100644 archivebox/search/__init__.py

diff --git a/archivebox.egg-info b/archivebox.egg-info
deleted file mode 120000
index 8ce20dd2..00000000
--- a/archivebox.egg-info
+++ /dev/null
@@ -1 +0,0 @@
-pip_dist/archivebox.egg-info
\ No newline at end of file
diff --git a/archivebox/core/admin.py b/archivebox/core/admin.py
index 5d3db409..e078bdaf 100644
--- a/archivebox/core/admin.py
+++ b/archivebox/core/admin.py
@@ -14,6 +14,9 @@ from django import forms
 from core.models import Snapshot, Tag
 from core.forms import AddLinkForm, TagField
+from core.utils import get_icons
+from core.mixins import SearchResultsAdminMixin
+
 from index.html import snapshot_icons
 from util import htmldecode, urldecode, ansi_to_html
 from logging_util import printable_filesize
@@ -82,7 +85,7 @@ class SnapshotAdminForm(forms.ModelForm):
         return instance
 
-class SnapshotAdmin(admin.ModelAdmin):
+class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
     list_display = ('added', 'title_str', 'url_str', 'files', 'size')
     sort_fields = ('title_str', 'url_str', 'added')
     readonly_fields = ('id', 'url', 'timestamp', 'num_outputs', 'is_archived', 'url_hash', 'added', 'updated')
diff --git a/archivebox/core/mixins.py b/archivebox/core/mixins.py
new file mode 100644
index 00000000..28f79b38
--- /dev/null
+++ b/archivebox/core/mixins.py
@@ -0,0 +1,21 @@
+from django.db.models import Q, Case, When, Value, IntegerField
+
+from archivebox.search import search_index
+
+class SearchResultsAdminMixin(object):
+    def get_search_results(self, request, queryset, search_term):
+        ''' Show exact match for title and slug at top of admin search results.
+        '''
+        qs, use_distinct = \
+            super(SearchResultsAdminMixin, self).get_search_results(
+                request, queryset, search_term)
+
+        search_term = search_term.strip()
+        if not search_term:
+            return qs, use_distinct
+
+        snapshot_ids = search_index(search_term)
+        qsearch = queryset.filter(id__in=snapshot_ids)
+        qs |= qsearch
+
+        return qs, use_distinct
\ No newline at end of file
diff --git a/archivebox/extractors/__init__.py b/archivebox/extractors/__init__.py
index ef5ef446..0cf6d90d 100644
--- a/archivebox/extractors/__init__.py
+++ b/archivebox/extractors/__init__.py
@@ -23,6 +23,7 @@ from ..logging_util import (
     log_archive_method_started,
     log_archive_method_finished,
 )
+from ..search import write_search_index
 
 from .title import should_save_title, save_title
 from .favicon import should_save_favicon, save_favicon
@@ -107,6 +108,7 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
                     link.history[method_name].append(result)
 
                     stats[result.status] += 1
+                    write_search_index(link=link, texts=result.index_texts)
                     log_archive_method_finished(result)
                     if not skip_index:
                         ArchiveResult.objects.create(snapshot=snapshot, extractor=method_name, cmd=result.cmd, cmd_version=result.cmd_version,
diff --git a/archivebox/extractors/readability.py b/archivebox/extractors/readability.py
index bd45e9d5..9da620b4 100644
--- a/archivebox/extractors/readability.py
+++ b/archivebox/extractors/readability.py
@@ -71,6 +71,7 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
         CURL_BINARY,
         link.url
     ]
+    readability_content = None
     timer = TimedProgress(timeout, prefix=' ')
     try:
         document = get_html(link, out_dir)
@@ -86,8 +87,9 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
         result = run(cmd, cwd=out_dir, timeout=timeout)
         result_json = json.loads(result.stdout)
         output_folder.mkdir(exist_ok=True)
+        readability_content = result_json.pop("textContent")
         atomic_write(str(output_folder / "content.html"), result_json.pop("content"))
-        atomic_write(str(output_folder / "content.txt"), result_json.pop("textContent"))
+        atomic_write(str(output_folder / "content.txt"), readability_content)
         atomic_write(str(output_folder / "article.json"), result_json)
 
         # parse out number of files downloaded from last line of stderr:
@@ -117,5 +119,6 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
         cmd_version=READABILITY_VERSION,
         output=output,
         status=status,
-        **timer.stats,
+        index_texts= [readability_content] if readability_content else [],
+        **timer.stats,
     )
diff --git a/archivebox/index/schema.py b/archivebox/index/schema.py
index 90021e0b..bc3a25da 100644
--- a/archivebox/index/schema.py
+++ b/archivebox/index/schema.py
@@ -39,6 +39,7 @@ class ArchiveResult:
     status: str
     start_ts: datetime
     end_ts: datetime
+    index_texts: Union[List[str], None] = None
     schema: str = 'ArchiveResult'
 
     def __post_init__(self):
diff --git a/archivebox/search/__init__.py b/archivebox/search/__init__.py
new file mode 100644
index 00000000..f503e9fa
--- /dev/null
+++ b/archivebox/search/__init__.py
@@ -0,0 +1,40 @@
+from typing import List, Optional, Union
+from pathlib import Path
+
+from sonic import IngestClient, SearchClient
+
+from ..index.schema import Link, ArchiveResult
+from ..util import enforce_types
+from ..config import setup_django, OUTPUT_DIR
+
+
+@enforce_types
+def write_sonic_index(snapshot_id: str, texts: List[str]):
+    # TODO add variables to localhost, port, password, bucket, collection
+    with IngestClient("localhost", 1491, "SecretPassword") as ingestcl:
+        for text in texts:
+            ingestcl.push("archivebox", "snapshots", snapshot_id, str(text))
+
+@enforce_types
+def search_sonic_index(text: str) -> List:
+    with SearchClient("localhost", 1491, "SecretPassword") as querycl:
+        snap_ids = querycl.query("archivebox", "snapshots", text)
+    return snap_ids
+
+
+@enforce_types
+def search_index(text: str) -> List:
+    # get backend
+    return search_sonic_index(text)
+
+
+@enforce_types
+def write_search_index(link: Link, texts: Union[List[str], None]=None, out_dir: Path=OUTPUT_DIR, skip_text_index: bool=False) -> None:
+    setup_django(out_dir, check_db=True)
+    from core.models import Snapshot
+
+    if not skip_text_index and texts:
+        snap = Snapshot.objects.filter(url=link.url).first()
+        if snap:
+            # get backend
+            write_sonic_index(str(snap.id), texts)
\ No newline at end of file

From 5f6673c72c472ce23f192e7661ec449134fbf463 Mon Sep 17 00:00:00 2001
From: JDC
Date: Wed, 18 Nov 2020 17:54:13 -0500
Subject: [PATCH 02/27] Implement backend architecture for search engines

---
 archivebox/core/mixins.py               | 20 ++++----
 archivebox/search/__init__.py           | 65 +++++++++++++++-----------
 archivebox/search/backends/__init__.py  |  0
 archivebox/search/backends/sonic.py     | 19 ++++++++
 4 files changed, 69 insertions(+), 35 deletions(-)
 create mode 100644 archivebox/search/backends/__init__.py
 create mode 100644 archivebox/search/backends/sonic.py

diff --git a/archivebox/core/mixins.py b/archivebox/core/mixins.py
index 28f79b38..afae2d78 100644
--- a/archivebox/core/mixins.py
+++ b/archivebox/core/mixins.py
@@ -1,10 +1,10 @@
-from django.db.models import Q, Case, When, Value, IntegerField
+from django.contrib import messages
 
-from archivebox.search import search_index
+from archivebox.search import query_search_index
 
 class SearchResultsAdminMixin(object):
     def get_search_results(self, request, queryset, search_term):
-        ''' Show exact match for title and slug at top of admin search results.
+        ''' Enhances the search queryset with results from the search backend.
         '''
         qs, use_distinct = \
             super(SearchResultsAdminMixin, self).get_search_results(
                 request, queryset, search_term)
@@ -13,9 +13,13 @@ class SearchResultsAdminMixin(object):
         search_term = search_term.strip()
         if not search_term:
             return qs, use_distinct
+        try:
+            snapshot_ids = query_search_index(search_term)
+        except Exception as err:
+            messages.add_message(request, messages.WARNING, f'Error from the search backend, only showing results from default admin search fields - Error: {err}')
+        else:
+            qsearch = queryset.filter(id__in=snapshot_ids)
+            qs |= qsearch
 
-        snapshot_ids = search_index(search_term)
-        qsearch = queryset.filter(id__in=snapshot_ids)
-        qs |= qsearch
-
-        return qs, use_distinct
\ No newline at end of file
+        finally:
+            return qs, use_distinct
diff --git a/archivebox/search/__init__.py b/archivebox/search/__init__.py
index f503e9fa..6e604224 100644
--- a/archivebox/search/__init__.py
+++ b/archivebox/search/__init__.py
@@ -1,40 +1,51 @@
-from typing import List, Optional, Union
+from typing import List, Union
 from pathlib import Path
-
-from sonic import IngestClient, SearchClient
-
-from ..index.schema import Link, ArchiveResult
-from ..util import enforce_types
-from ..config import setup_django, OUTPUT_DIR
+from importlib import import_module
 
-@enforce_types
-def write_sonic_index(snapshot_id: str, texts: List[str]):
-    # TODO add variables to localhost, port, password, bucket, collection
-    with IngestClient("localhost", 1491, "SecretPassword") as ingestcl:
-        for text in texts:
-            ingestcl.push("archivebox", "snapshots", snapshot_id, str(text))
-
-@enforce_types
-def search_sonic_index(text: str) -> List:
-    with SearchClient("localhost", 1491, "SecretPassword") as querycl:
-        snap_ids = querycl.query("archivebox", "snapshots", text)
-    return snap_ids
+from archivebox.index.schema import Link
+from archivebox.util import enforce_types
+from archivebox.config import setup_django, OUTPUT_DIR
 
-@enforce_types
-def search_index(text: str) -> List:
-    # get backend
-    return search_sonic_index(text)
+def indexing_enabled():
+    return True
+    # return FULLTEXT_INDEXING_ENABLED
 
+def search_backend_enabled():
+    return True
+    # return FULLTEXT_SEARCH_ENABLED
+
+def get_backend():
+    return 'search.backends.sonic'
+
+def import_backend():
+    backend_string = get_backend()
+    try:
+        backend = import_module(backend_string)
+    except Exception as err:
+        raise Exception("Could not load '%s' as a backend: %s" % (backend_string, err))
+    return backend
 
 @enforce_types
 def write_search_index(link: Link, texts: Union[List[str], None]=None, out_dir: Path=OUTPUT_DIR, skip_text_index: bool=False) -> None:
-    setup_django(out_dir, check_db=True)
-    from core.models import Snapshot
+    if not indexing_enabled():
+        return
 
     if not skip_text_index and texts:
+        setup_django(out_dir, check_db=True)
+        from core.models import Snapshot
+
         snap = Snapshot.objects.filter(url=link.url).first()
+        backend = import_backend()
         if snap:
-            # get backend
-            write_sonic_index(str(snap.id), texts)
\ No newline at end of file
+            backend.index(snapshot_id=str(snap.id), texts=texts)
+
+@enforce_types
+def query_search_index(text: str) -> List:
+    if search_backend_enabled():
+        backend = import_backend()
+        return backend.search(text)
+    else:
+        return []
+    
\ No newline at end of file
diff --git a/archivebox/search/backends/__init__.py b/archivebox/search/backends/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/archivebox/search/backends/sonic.py b/archivebox/search/backends/sonic.py
new file mode 100644
index 00000000..28725f27
--- /dev/null
+++ b/archivebox/search/backends/sonic.py
@@ -0,0 +1,19 @@
+from typing import List
+
+from sonic import IngestClient, SearchClient
+
+from archivebox.util import enforce_types
+
+@enforce_types
+def index(snapshot_id: str, texts: List[str]):
+    # TODO add variables to localhost, port, password, bucket, collection
+    with IngestClient("localhost", 1491, "SecretPassword") as ingestcl:
+        for text in texts:
+            ingestcl.push("archivebox", "snapshots", snapshot_id, str(text))
+
+@enforce_types
+def search(text: str) -> List:
+    with SearchClient("localhost", 1491, "SecretPassword") as querycl:
+        snap_ids = querycl.query("archivebox", "snapshots", text)
+    return snap_ids
+    
\ No newline at end of file
''' qs, use_distinct = \ super(SearchResultsAdminMixin, self).get_search_results( @@ -13,9 +13,13 @@ class SearchResultsAdminMixin(object): search_term = search_term.strip() if not search_term: return qs, use_distinct + try: + snapshot_ids = query_search_index(search_term) + except Exception as err: + messages.add_message(request, messages.WARNING, f'Error from the search backend, only showing results from default admin search fields - Error: {err}') + else: + qsearch = queryset.filter(id__in=snapshot_ids) + qs |= qsearch - snapshot_ids = search_index(search_term) - qsearch = queryset.filter(id__in=snapshot_ids) - qs |= qsearch - - return qs, use_distinct \ No newline at end of file + finally: + return qs, use_distinct diff --git a/archivebox/search/__init__.py b/archivebox/search/__init__.py index f503e9fa..6e604224 100644 --- a/archivebox/search/__init__.py +++ b/archivebox/search/__init__.py @@ -1,40 +1,51 @@ -from typing import List, Optional, Union +from typing import List, Union from pathlib import Path - -from sonic import IngestClient, SearchClient - -from ..index.schema import Link, ArchiveResult -from ..util import enforce_types -from ..config import setup_django, OUTPUT_DIR +from importlib import import_module -@enforce_types -def write_sonic_index(snapshot_id: str, texts: List[str]): - # TODO add variables to localhost, port, password, bucket, collection - with IngestClient("localhost", 1491, "SecretPassword") as ingestcl: - for text in texts: - ingestcl.push("archivebox", "snapshots", snapshot_id, str(text)) - -@enforce_types -def search_sonic_index(text: str) -> List: - with SearchClient("localhost", 1491, "SecretPassword") as querycl: - snap_ids = querycl.query("archivebox", "snapshots", text) - return snap_ids +from archivebox.index.schema import Link +from archivebox.util import enforce_types +from archivebox.config import setup_django, OUTPUT_DIR -@enforce_types -def search_index(text: str) -> List: - # get backend - return search_sonic_index(text) +def indexing_enabled(): + return True + # return FULLTEXT_INDEXING_ENABLED +def search_backend_enabled(): + return True + # return FULLTEXT_SEARCH_ENABLED + +def get_backend(): + return 'search.backends.sonic' + +def import_backend(): + backend_string = get_backend() + try: + backend = import_module(backend_string) + except Exception as err: + raise Exception("Could not load '%s' as a backend: %s" % (backend_string, err)) + return backend @enforce_types def write_search_index(link: Link, texts: Union[List[str], None]=None, out_dir: Path=OUTPUT_DIR, skip_text_index: bool=False) -> None: - setup_django(out_dir, check_db=True) - from core.models import Snapshot + if not indexing_enabled(): + return if not skip_text_index and texts: + setup_django(out_dir, check_db=True) + from core.models import Snapshot + snap = Snapshot.objects.filter(url=link.url).first() + backend = import_backend() if snap: - # get backend - write_sonic_index(str(snap.id), texts) \ No newline at end of file + backend.index(snapshot_id=str(snap.id), texts=texts) + +@enforce_types +def query_search_index(text: str) -> List: + if search_backend_enabled(): + backend = import_backend() + return backend.search(text) + else: + return [] + \ No newline at end of file diff --git a/archivebox/search/backends/__init__.py b/archivebox/search/backends/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/archivebox/search/backends/sonic.py b/archivebox/search/backends/sonic.py new file mode 100644 index 00000000..28725f27 --- /dev/null +++ 
b/archivebox/search/backends/sonic.py @@ -0,0 +1,19 @@ +from typing import List + +from sonic import IngestClient, SearchClient + +from archivebox.util import enforce_types + +@enforce_types +def index(snapshot_id: str, texts: List[str]): + # TODO add variables to localhost, port, password, bucket, collection + with IngestClient("localhost", 1491, "SecretPassword") as ingestcl: + for text in texts: + ingestcl.push("archivebox", "snapshots", snapshot_id, str(text)) + +@enforce_types +def search(text: str) -> List: + with SearchClient("localhost", 1491, "SecretPassword") as querycl: + snap_ids = querycl.query("archivebox", "snapshots", text) + return snap_ids + \ No newline at end of file From c2c01af3adfd69c1984b5c6b2cdc1aa59b08c32b Mon Sep 17 00:00:00 2001 From: JDC Date: Thu, 19 Nov 2020 08:06:13 -0500 Subject: [PATCH 03/27] Add config for search backend --- archivebox/config.py | 14 +++++++++++++- archivebox/search/__init__.py | 11 ++++------- archivebox/search/backends/sonic.py | 11 ++++++----- 3 files changed, 23 insertions(+), 13 deletions(-) diff --git a/archivebox/config.py b/archivebox/config.py index 47049342..0ca2d7d9 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -139,6 +139,18 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = { 'GIT_ARGS': {'type': list, 'default': ['--recursive']}, }, + 'SEARCH_BACKEND_CONFIG' : { + 'USE_INDEXING_BACKEND': {'type': bool, 'default': True}, + 'USE_SEARCHING_BACKEND': {'type': bool, 'default': True}, + 'SEARCH_BACKEND_ENGINE': {'type': str, 'default': 'sonic'}, + 'SEARCH_BACKEND_HOST_NAME': {'type': str, 'default': 'localhost'}, + 'SEARCH_BACKEND_PORT': {'type': int, 'default': 1491}, + 'SEARCH_BACKEND_PASSWORD': {'type': str, 'default': 'SecretPassword'}, + # SONIC + 'SONIC_BUCKET': {'type': str, 'default': 'archivebox'}, + 'SONIC_COLLECTION': {'type': str, 'default': 'snapshots'}, + }, + 'DEPENDENCY_CONFIG': { 'USE_CURL': {'type': bool, 'default': True}, 'USE_WGET': {'type': bool, 'default': True}, @@ -149,7 +161,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = { 'USE_CHROME': {'type': bool, 'default': True}, 'USE_NODE': {'type': bool, 'default': True}, 'USE_YOUTUBEDL': {'type': bool, 'default': True}, - + 'CURL_BINARY': {'type': str, 'default': 'curl'}, 'GIT_BINARY': {'type': str, 'default': 'git'}, 'WGET_BINARY': {'type': str, 'default': 'wget'}, diff --git a/archivebox/search/__init__.py b/archivebox/search/__init__.py index 6e604224..7db4af46 100644 --- a/archivebox/search/__init__.py +++ b/archivebox/search/__init__.py @@ -5,19 +5,16 @@ from importlib import import_module from archivebox.index.schema import Link from archivebox.util import enforce_types -from archivebox.config import setup_django, OUTPUT_DIR - +from archivebox.config import setup_django, OUTPUT_DIR, USE_INDEXING_BACKEND, USE_SEARCHING_BACKEND, SEARCH_BACKEND_ENGINE def indexing_enabled(): - return True - # return FULLTEXT_INDEXING_ENABLED + return USE_INDEXING_BACKEND def search_backend_enabled(): - return True - # return FULLTEXT_SEARCH_ENABLED + return USE_SEARCHING_BACKEND def get_backend(): - return 'search.backends.sonic' + return f'search.backends.{SEARCH_BACKEND_ENGINE}' def import_backend(): backend_string = get_backend() diff --git a/archivebox/search/backends/sonic.py b/archivebox/search/backends/sonic.py index 28725f27..e062f9e1 100644 --- a/archivebox/search/backends/sonic.py +++ b/archivebox/search/backends/sonic.py @@ -3,17 +3,18 @@ from typing import List from sonic import IngestClient, SearchClient from archivebox.util import 
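Since these settings go through ArchiveBox's normal config machinery, they should be overridable via environment variables (or the collection's config file) like any other option; for example, to point at a remote sonic instance (hypothetical values shown):

    export SEARCH_BACKEND_ENGINE=sonic
    export SEARCH_BACKEND_HOST_NAME=sonic.example.com
    export SEARCH_BACKEND_PORT=1491
    export SEARCH_BACKEND_PASSWORD=SecretPassword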
From 47daa038eb61674df22345e99201472ea770762c Mon Sep 17 00:00:00 2001
From: JDC
Date: Thu, 19 Nov 2020 16:45:12 -0500
Subject: [PATCH 04/27] Implement flush for search backend after remove command

---
 archivebox/config.py                |  4 ++--
 archivebox/core/mixins.py           |  2 +-
 archivebox/main.py                  |  2 ++
 archivebox/search/__init__.py       |  9 ++++++++-
 archivebox/search/backends/sonic.py | 11 ++++++---
 5 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/archivebox/config.py b/archivebox/config.py
index 0ca2d7d9..ee2f0b4a 100644
--- a/archivebox/config.py
+++ b/archivebox/config.py
@@ -147,8 +147,8 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
         'SEARCH_BACKEND_PORT': {'type': int, 'default': 1491},
         'SEARCH_BACKEND_PASSWORD': {'type': str, 'default': 'SecretPassword'},
         # SONIC
-        'SONIC_BUCKET': {'type': str, 'default': 'archivebox'},
-        'SONIC_COLLECTION': {'type': str, 'default': 'snapshots'},
+        'SONIC_COLLECTION': {'type': str, 'default': 'archivebox'},
+        'SONIC_BUCKET': {'type': str, 'default': 'snapshots'},
     },
 
     'DEPENDENCY_CONFIG': {
diff --git a/archivebox/core/mixins.py b/archivebox/core/mixins.py
index afae2d78..b361790a 100644
--- a/archivebox/core/mixins.py
+++ b/archivebox/core/mixins.py
@@ -18,7 +18,7 @@ class SearchResultsAdminMixin(object):
         except Exception as err:
             messages.add_message(request, messages.WARNING, f'Error from the search backend, only showing results from default admin search fields - Error: {err}')
         else:
-            qsearch = queryset.filter(id__in=snapshot_ids)
+            qsearch = queryset.filter(pk__in=snapshot_ids)
             qs |= qsearch
 
         finally:
diff --git a/archivebox/main.py b/archivebox/main.py
index cbbd2218..504cd670 100644
--- a/archivebox/main.py
+++ b/archivebox/main.py
@@ -115,6 +115,7 @@ from .logging_util import (
     printable_dependency_version,
 )
 
+from .search import flush_search_index
 
 ALLOWED_IN_OUTPUT_DIR = {
     'lost+found',
@@ -665,6 +666,7 @@ def remove(filter_str: Optional[str]=None,
     to_remove = snapshots.count()
 
     remove_from_sql_main_index(snapshots=snapshots, out_dir=out_dir)
+    flush_search_index(snapshot_ids=[str(pk) for pk in snapshots.values_list('pk',flat=True)])
     all_snapshots = load_main_index(out_dir=out_dir)
     log_removal_finished(all_snapshots.count(), to_remove)
diff --git a/archivebox/search/__init__.py b/archivebox/search/__init__.py
index 7db4af46..93245bda 100644
--- a/archivebox/search/__init__.py
+++ b/archivebox/search/__init__.py
@@ -45,4 +45,11 @@ def query_search_index(text: str) -> List:
         return backend.search(text)
     else:
         return []
-    
\ No newline at end of file
+
+@enforce_types
+def flush_search_index(snapshot_ids: List[str]):
+    if not indexing_enabled() or not snapshot_ids:
+        return
+    backend = import_backend()
+    backend.flush(snapshot_ids)
+    
\ No newline at end of file
diff --git a/archivebox/search/backends/sonic.py b/archivebox/search/backends/sonic.py
index e062f9e1..8fd93ae8 100644
--- a/archivebox/search/backends/sonic.py
+++ b/archivebox/search/backends/sonic.py
@@ -10,11 +10,16 @@ from archivebox.config import SEARCH_BACKEND_HOST_NAME, SEARCH_BACKEND_PORT, SEA
 def index(snapshot_id: str, texts: List[str]):
     with IngestClient(SEARCH_BACKEND_HOST_NAME, SEARCH_BACKEND_PORT, SEARCH_BACKEND_PASSWORD) as ingestcl:
         for text in texts:
-            ingestcl.push(SONIC_BUCKET, SONIC_COLLECTION, snapshot_id, str(text))
+            ingestcl.push(SONIC_COLLECTION, SONIC_BUCKET, snapshot_id, str(text))
 
 @enforce_types
 def search(text: str) -> List:
     with SearchClient(SEARCH_BACKEND_HOST_NAME, SEARCH_BACKEND_PORT, SEARCH_BACKEND_PASSWORD) as querycl:
-        snap_ids = querycl.query(SONIC_BUCKET, SONIC_COLLECTION, text)
+        snap_ids = querycl.query(SONIC_COLLECTION, SONIC_BUCKET, text)
     return snap_ids
-    
\ No newline at end of file
+
+@enforce_types
+def flush(snapshot_ids: List[str]):
+    with IngestClient(SEARCH_BACKEND_HOST_NAME, SEARCH_BACKEND_PORT, SEARCH_BACKEND_PASSWORD) as ingestcl:
+        for id in snapshot_ids:
+            ingestcl.flush_object(SONIC_COLLECTION, SONIC_BUCKET, str(id))

From f383648ffc80e64bfa399efc5e1b7766fe7de3dd Mon Sep 17 00:00:00 2001
From: JDC
Date: Thu, 19 Nov 2020 17:33:53 -0500
Subject: [PATCH 05/27] Use a generator for snapshot flush from index

---
 archivebox/main.py                  | 2 +-
 archivebox/search/__init__.py       | 7 +++----
 archivebox/search/backends/sonic.py | 6 +++---
 3 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/archivebox/main.py b/archivebox/main.py
index 504cd670..7d13a5c4 100644
--- a/archivebox/main.py
+++ b/archivebox/main.py
@@ -666,7 +666,7 @@ def remove(filter_str: Optional[str]=None,
     to_remove = snapshots.count()
 
     remove_from_sql_main_index(snapshots=snapshots, out_dir=out_dir)
-    flush_search_index(snapshot_ids=[str(pk) for pk in snapshots.values_list('pk',flat=True)])
+    flush_search_index(snapshot_ids=(str(pk) for pk in snapshots.values_list('pk',flat=True)))
     all_snapshots = load_main_index(out_dir=out_dir)
     log_removal_finished(all_snapshots.count(), to_remove)
diff --git a/archivebox/search/__init__.py b/archivebox/search/__init__.py
index 93245bda..59bb6fe5 100644
--- a/archivebox/search/__init__.py
+++ b/archivebox/search/__init__.py
@@ -1,4 +1,4 @@
-from typing import List, Union
+from typing import List, Union, Generator
 from pathlib import Path
 from importlib import import_module
 
@@ -39,7 +39,7 @@ def write_search_index(link: Link, texts: Union[List[str], None]=None, out_dir:
             backend.index(snapshot_id=str(snap.id), texts=texts)
 
 @enforce_types
-def query_search_index(text: str) -> List:
+def query_search_index(text: str) -> List[str]:
     if search_backend_enabled():
         backend = import_backend()
         return backend.search(text)
@@ -47,9 +47,8 @@ def query_search_index(text: str) -> List[str]:
         return []
 
 @enforce_types
-def flush_search_index(snapshot_ids: List[str]):
+def flush_search_index(snapshot_ids: Generator[str, None, None]):
     if not indexing_enabled() or not snapshot_ids:
         return
     backend = import_backend()
     backend.flush(snapshot_ids)
-    
\ No newline at end of file
diff --git a/archivebox/search/backends/sonic.py b/archivebox/search/backends/sonic.py
index 8fd93ae8..7dc4d5b0 100644
--- a/archivebox/search/backends/sonic.py
+++ b/archivebox/search/backends/sonic.py
@@ -1,4 +1,4 @@
-from typing import List
+from typing import List, Generator
 
 from sonic import IngestClient, SearchClient
 
@@ -13,13 +13,13 @@ def index(snapshot_id: str, texts: List[str]):
             ingestcl.push(SONIC_COLLECTION, SONIC_BUCKET, snapshot_id, str(text))
 
 @enforce_types
-def search(text: str) -> List:
+def search(text: str) -> List[str]:
     with SearchClient(SEARCH_BACKEND_HOST_NAME, SEARCH_BACKEND_PORT, SEARCH_BACKEND_PASSWORD) as querycl:
         snap_ids = querycl.query(SONIC_COLLECTION, SONIC_BUCKET, text)
     return snap_ids
 
 @enforce_types
-def flush(snapshot_ids: List[str]):
+def flush(snapshot_ids: Generator[str, None, None]):
     with IngestClient(SEARCH_BACKEND_HOST_NAME, SEARCH_BACKEND_PORT, SEARCH_BACKEND_PASSWORD) as ingestcl:
         for id in snapshot_ids:
             ingestcl.flush_object(SONIC_COLLECTION, SONIC_BUCKET, str(id))
From 823df34080a0ac8aa9cc6d4e9d689a3d4cf84309 Mon Sep 17 00:00:00 2001
From: JDC
Date: Thu, 19 Nov 2020 18:19:33 -0500
Subject: [PATCH 06/27] Use QuerySets for search backend API instead of pks

---
 archivebox/core/mixins.py     |  4 +---
 archivebox/main.py            |  2 +-
 archivebox/search/__init__.py | 19 ++++++++++++++-----
 3 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/archivebox/core/mixins.py b/archivebox/core/mixins.py
index b361790a..d1203745 100644
--- a/archivebox/core/mixins.py
+++ b/archivebox/core/mixins.py
@@ -14,12 +14,10 @@ class SearchResultsAdminMixin(object):
         if not search_term:
             return qs, use_distinct
         try:
-            snapshot_ids = query_search_index(search_term)
+            qsearch = query_search_index(search_term)
         except Exception as err:
             messages.add_message(request, messages.WARNING, f'Error from the search backend, only showing results from default admin search fields - Error: {err}')
         else:
-            qsearch = queryset.filter(pk__in=snapshot_ids)
             qs |= qsearch
-
         finally:
             return qs, use_distinct
diff --git a/archivebox/main.py b/archivebox/main.py
index 7d13a5c4..d533d58d 100644
--- a/archivebox/main.py
+++ b/archivebox/main.py
@@ -666,7 +666,7 @@ def remove(filter_str: Optional[str]=None,
     to_remove = snapshots.count()
 
     remove_from_sql_main_index(snapshots=snapshots, out_dir=out_dir)
-    flush_search_index(snapshot_ids=(str(pk) for pk in snapshots.values_list('pk',flat=True)))
+    flush_search_index(snapshots=snapshots)
     all_snapshots = load_main_index(out_dir=out_dir)
     log_removal_finished(all_snapshots.count(), to_remove)
diff --git a/archivebox/search/__init__.py b/archivebox/search/__init__.py
index 59bb6fe5..15efffb0 100644
--- a/archivebox/search/__init__.py
+++ b/archivebox/search/__init__.py
@@ -1,7 +1,8 @@
-from typing import List, Union, Generator
+from typing import List, Union
 from pathlib import Path
 from importlib import import_module
 
+from django.db.models import QuerySet
 
 from archivebox.index.schema import Link
 from archivebox.util import enforce_types
@@ -39,16 +40,24 @@ def write_search_index(link: Link, texts: Union[List[str], None]=None, out_dir:
             backend.index(snapshot_id=str(snap.id), texts=texts)
 
 @enforce_types
-def query_search_index(text: str) -> List[str]:
+def query_search_index(query: str, out_dir: Path=OUTPUT_DIR) -> QuerySet:
     if search_backend_enabled():
+        setup_django(out_dir, check_db=True)
+        from core.models import Snapshot
+
         backend = import_backend()
-        return backend.search(text)
+        snapshot_ids = backend.search(query)
+        # TODO preserve ordering from backend
+        qsearch = Snapshot.objects.filter(pk__in=snapshot_ids)
+        return qsearch
     else:
         return []
 
 @enforce_types
-def flush_search_index(snapshot_ids: Generator[str, None, None]):
-    if not indexing_enabled() or not snapshot_ids:
+def flush_search_index(snapshots: QuerySet):
+    if not indexing_enabled() or not snapshots:
        return
     backend = import_backend()
+    snapshot_ids=(str(pk) for pk in snapshots.values_list('pk',flat=True))
+
     backend.flush(snapshot_ids)

From fb67d6684c4ba229450767ab8afef2a7b158cd99 Mon Sep 17 00:00:00 2001
From: JDC
Date: Thu, 19 Nov 2020 21:53:22 -0500
Subject: [PATCH 07/27] fix: Return empty QuerySet instead of list

---
 archivebox/search/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/archivebox/search/__init__.py b/archivebox/search/__init__.py
index 15efffb0..2a1f4dcd 100644
--- a/archivebox/search/__init__.py
+++ b/archivebox/search/__init__.py
@@ -51,7 +51,7 @@ def query_search_index(query: str, out_dir: Path=OUTPUT_DIR) -> QuerySet:
         qsearch = Snapshot.objects.filter(pk__in=snapshot_ids)
         return qsearch
     else:
-        return []
+        return Snapshot.objects.none()
 
 @enforce_types
 def flush_search_index(snapshots: QuerySet):

From 0f7dba07dfe673d5915c1bfb344a24b4cb027e84 Mon Sep 17 00:00:00 2001
From: JDC
Date: Thu, 19 Nov 2020 23:39:28 -0500
Subject: [PATCH 08/27] feat: add search filter-type to list command

---
 archivebox/cli/archivebox_list.py |  2 +-
 archivebox/index/__init__.py      | 34 ++++++++++++++++++++++++++++++-
 2 files changed, 34 insertions(+), 2 deletions(-)

diff --git a/archivebox/cli/archivebox_list.py b/archivebox/cli/archivebox_list.py
index 140810a6..3838cf60 100644
--- a/archivebox/cli/archivebox_list.py
+++ b/archivebox/cli/archivebox_list.py
@@ -98,7 +98,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
     parser.add_argument(
         '--filter-type',
         type=str,
-        choices=('exact', 'substring', 'domain', 'regex','tag'),
+        choices=('exact', 'substring', 'domain', 'regex', 'tag', 'search'),
         default='exact',
         help='Type of pattern matching to use when filtering URLs',
     )
diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py
index 3a066e18..34e2c5ff 100644
--- a/archivebox/index/__init__.py
+++ b/archivebox/index/__init__.py
@@ -51,6 +51,8 @@ from .sql import (
     write_sql_link_details,
 )
 
+from ..search import search_backend_enabled, query_search_index
+
 ### Link filtering and checking
 
 @enforce_types
@@ -365,7 +367,7 @@ LINK_FILTERS = {
 }
 
 @enforce_types
-def snapshot_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type: str='exact') -> QuerySet:
+def q_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type: str='exact') -> QuerySet:
     q_filter = Q()
     for pattern in filter_patterns:
         try:
@@ -380,6 +382,36 @@ def snapshot_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type
             raise SystemExit(2)
     return snapshots.filter(q_filter)
 
+def search_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type: str='search') -> QuerySet:
+    if not search_backend_enabled():
+        stderr()
+        stderr(
+                '[X] The search backend is not enabled',
+                color='red',
+            )
+        raise SystemExit(2)
+
+    qsearch = get_empty_snapshot_queryset()
+    for pattern in filter_patterns:
+        try:
+            qsearch |= query_search_index(pattern)
+        except Exception as err:
+            stderr()
+            stderr(
+                f'[X] The search backend threw an exception={err}:',
+                color='red',
+            )
+            raise SystemExit(2)
+
+    return snapshots & qsearch
+
+@enforce_types
+def snapshot_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type: str='exact') -> QuerySet:
+    if filter_type != 'search':
+        return q_filter(snapshots, filter_patterns, filter_type)
+    else:
+        return search_filter(snapshots, filter_patterns, filter_type)
+
 
 def get_indexed_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """indexed links without checking archive status or data directory validity"""
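With snapshot_filter() now dispatching to search_filter(), full-text queries become available from the CLI; a query along these lines should hit the search backend (assuming it is enabled and reachable):

    archivebox list --filter-type=search 'some phrase that appears in an archived page'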
From 0773f12034239304aea3dbccf61edcf0392201f4 Mon Sep 17 00:00:00 2001
From: JDC
Date: Fri, 20 Nov 2020 10:29:28 -0500
Subject: [PATCH 09/27] Add sonic to docker-compose

---
 docker-compose.yml   | 11 ++++++++
 etc/sonic/config.cfg | 66 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 77 insertions(+)
 create mode 100644 etc/sonic/config.cfg

diff --git a/docker-compose.yml b/docker-compose.yml
index 5fe91026..4e121621 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -21,8 +21,19 @@ services:
         environment:
             - USE_COLOR=True
             - SHOW_PROGRESS=False
+            - SEARCH_BACKEND_HOST_NAME=sonic
         volumes:
             - ./data:/data
+        depends_on:
+            - sonic
+
+    sonic:
+        image: valeriansaliou/sonic:v1.3.0
+        ports:
+            - 1491:1491
+        volumes:
+            - ./etc/sonic/config.cfg:/etc/sonic.cfg
+            - ./data:/var/lib/sonic/store/
 
 
     # Optional Addons: tweak these examples as needed for your specific use case
diff --git a/etc/sonic/config.cfg b/etc/sonic/config.cfg
new file mode 100644
index 00000000..b3dd5898
--- /dev/null
+++ b/etc/sonic/config.cfg
@@ -0,0 +1,66 @@
+# Sonic
+# Fast, lightweight and schema-less search backend
+# Configuration file
+# Example: https://github.com/valeriansaliou/sonic/blob/master/config.cfg
+
+
+[server]
+
+log_level = "debug"
+
+
+[channel]
+
+inet = "0.0.0.0:1491"
+tcp_timeout = 300
+
+auth_password = "SecretPassword"
+
+[channel.search]
+
+query_limit_default = 10
+query_limit_maximum = 100
+query_alternates_try = 4
+
+suggest_limit_default = 5
+suggest_limit_maximum = 20
+
+
+[store]
+
+[store.kv]
+
+path = "/var/lib/sonic/store/kv/"
+
+retain_word_objects = 1000
+
+[store.kv.pool]
+
+inactive_after = 1800
+
+[store.kv.database]
+
+flush_after = 900
+
+compress = true
+parallelism = 2
+max_files = 100
+max_compactions = 1
+max_flushes = 1
+write_buffer = 16384
+write_ahead_log = true
+
+[store.fst]
+
+path = "/var/lib/sonic/store/fst/"
+
+[store.fst.pool]
+
+inactive_after = 300
+
+[store.fst.graph]
+
+consolidate_after = 180
+
+max_size = 2048
+max_words = 250000

From a38e3e0c90ad8954dfe151e83c68af9c04cf4f42 Mon Sep 17 00:00:00 2001
From: JDC
Date: Fri, 20 Nov 2020 11:51:44 -0500
Subject: [PATCH 10/27] Get search backend password from env var SEARCH_BACKEND_PASSWORD

---
 docker-compose.yml   | 3 +++
 etc/sonic/config.cfg | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/docker-compose.yml b/docker-compose.yml
index 4e121621..29fc6f7a 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -22,6 +22,7 @@ services:
             - USE_COLOR=True
             - SHOW_PROGRESS=False
             - SEARCH_BACKEND_HOST_NAME=sonic
+            - SEARCH_BACKEND_PASSWORD=SecretPassword
         volumes:
             - ./data:/data
         depends_on:
@@ -30,6 +31,8 @@ services:
         image: valeriansaliou/sonic:v1.3.0
         ports:
             - 1491:1491
+        environment:
+            - SEARCH_BACKEND_PASSWORD=SecretPassword
         volumes:
             - ./etc/sonic/config.cfg:/etc/sonic.cfg
             - ./data:/var/lib/sonic/store/
diff --git a/etc/sonic/config.cfg b/etc/sonic/config.cfg
index b3dd5898..4fb374b4 100644
--- a/etc/sonic/config.cfg
+++ b/etc/sonic/config.cfg
@@ -14,7 +14,7 @@ log_level = "debug"
 inet = "0.0.0.0:1491"
 tcp_timeout = 300
 
-auth_password = "SecretPassword"
+auth_password = "${env.SEARCH_BACKEND_PASSWORD}"
 
 [channel.search]

From 9bd40ed7f6055f1a60597eb63836984dec6651fb Mon Sep 17 00:00:00 2001
From: JDC
Date: Fri, 20 Nov 2020 15:27:39 -0500
Subject: [PATCH 11/27] Max out number of queries

---
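Sonic's search channel truncates results at query_limit_maximum (previously 100, returning 10 per query by default), which would silently cap how many snapshots a search could ever return; raising both limits to 65535 effectively removes the cap for this use case.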
 etc/sonic/config.cfg | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/etc/sonic/config.cfg b/etc/sonic/config.cfg
index 4fb374b4..45806ed1 100644
--- a/etc/sonic/config.cfg
+++ b/etc/sonic/config.cfg
@@ -18,9 +18,9 @@ auth_password = "${env.SEARCH_BACKEND_PASSWORD}"
 
 [channel.search]
 
-query_limit_default = 10
-query_limit_maximum = 100
-query_alternates_try = 4
+query_limit_default = 65535
+query_limit_maximum = 65535
+query_alternates_try = 10
 
 suggest_limit_default = 5
 suggest_limit_maximum = 20

From 0ed53cc1177484b7dbdf2a3aefe4a4c18a2c4ced Mon Sep 17 00:00:00 2001
From: JDC
Date: Sat, 21 Nov 2020 08:22:18 -0500
Subject: [PATCH 12/27] Add search filter type for `update`

---
 archivebox/cli/archivebox_update.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/archivebox/cli/archivebox_update.py b/archivebox/cli/archivebox_update.py
index aa8cae1b..d9a94235 100644
--- a/archivebox/cli/archivebox_update.py
+++ b/archivebox/cli/archivebox_update.py
@@ -91,7 +91,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
     parser.add_argument(
         '--filter-type',
         type=str,
-        choices=('exact', 'substring', 'domain', 'regex'),
+        choices=('exact', 'substring', 'domain', 'regex', 'search'),
         default='exact',
         help='Type of pattern matching to use when filtering URLs',
     )

From 4eeedae8151c6677253b509ddcb7ec2e9086284d Mon Sep 17 00:00:00 2001
From: JDC
Date: Sat, 21 Nov 2020 09:37:13 -0500
Subject: [PATCH 13/27] Exception handling for indexing and searching

---
 archivebox/index/__init__.py  |  9 ++-----
 archivebox/search/__init__.py | 50 +++++++++++++++++++++++++----------
 2 files changed, 38 insertions(+), 21 deletions(-)

diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py
index 34e2c5ff..bf1d0c6a 100644
--- a/archivebox/index/__init__.py
+++ b/archivebox/index/__init__.py
@@ -386,7 +386,7 @@ def search_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type:
     if not search_backend_enabled():
         stderr()
         stderr(
-                '[X] The search backend is not enabled',
+                '[X] The search backend is not enabled, set config.USE_SEARCHING_BACKEND = True',
                 color='red',
             )
         raise SystemExit(2)
@@ -395,12 +395,7 @@ def search_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type:
     for pattern in filter_patterns:
         try:
             qsearch |= query_search_index(pattern)
-        except Exception as err:
-            stderr()
-            stderr(
-                f'[X] The search backend threw an exception={err}:',
-                color='red',
-            )
+        except:
             raise SystemExit(2)
 
     return snapshots & qsearch
diff --git a/archivebox/search/__init__.py b/archivebox/search/__init__.py
index 2a1f4dcd..fdf19a89 100644
--- a/archivebox/search/__init__.py
+++ b/archivebox/search/__init__.py
@@ -6,7 +6,7 @@ from django.db.models import QuerySet
 
 from archivebox.index.schema import Link
 from archivebox.util import enforce_types
-from archivebox.config import setup_django, OUTPUT_DIR, USE_INDEXING_BACKEND, USE_SEARCHING_BACKEND, SEARCH_BACKEND_ENGINE
+from archivebox.config import setup_django,stderr, OUTPUT_DIR, USE_INDEXING_BACKEND, USE_SEARCHING_BACKEND, SEARCH_BACKEND_ENGINE
 
 def indexing_enabled():
     return USE_INDEXING_BACKEND
@@ -37,21 +37,37 @@ def write_search_index(link: Link, texts: Union[List[str], None]=None, out_dir:
         snap = Snapshot.objects.filter(url=link.url).first()
         backend = import_backend()
         if snap:
-            backend.index(snapshot_id=str(snap.id), texts=texts)
+            try:
+                backend.index(snapshot_id=str(snap.id), texts=texts)
+            except Exception as err:
+                stderr()
+                stderr(
+                    f'[X] The search backend threw an exception={err}:',
+                    color='red',
+                )
 
 @enforce_types
-def query_search_index(query: str, out_dir: Path=OUTPUT_DIR) -> QuerySet:
-    if search_backend_enabled():
-        setup_django(out_dir, check_db=True)
-        from core.models import Snapshot
+def query_search_index(query: str, out_dir: Path=OUTPUT_DIR) -> QuerySet:
+    setup_django(out_dir, check_db=True)
+    from core.models import Snapshot
 
+    if search_backend_enabled():
         backend = import_backend()
-        snapshot_ids = backend.search(query)
-        # TODO preserve ordering from backend
-        qsearch = Snapshot.objects.filter(pk__in=snapshot_ids)
-        return qsearch
-    else:
-        return Snapshot.objects.none()
+        try:
+            snapshot_ids = backend.search(query)
+        except Exception as err:
+            stderr()
+            stderr(
+                f'[X] The search backend threw an exception={err}:',
+                color='red',
+            )
+            raise
+        else:
+            # TODO preserve ordering from backend
+            qsearch = Snapshot.objects.filter(pk__in=snapshot_ids)
+            return qsearch
+
+    return Snapshot.objects.none()
 
 @enforce_types
 def flush_search_index(snapshots: QuerySet):
@@ -59,5 +75,11 @@ def flush_search_index(snapshots: QuerySet):
         return
     backend = import_backend()
     snapshot_ids=(str(pk) for pk in snapshots.values_list('pk',flat=True))
-
-    backend.flush(snapshot_ids)
+    try:
+        backend.flush(snapshot_ids)
+    except Exception as err:
+        stderr()
+        stderr(
+            f'[X] The search backend threw an exception={err}:',
+            color='red',
+        )
From 70cc0c1950c4fb4bdc8edbc3f932d9500cf35283 Mon Sep 17 00:00:00 2001
From: JDC
Date: Sat, 21 Nov 2020 13:02:35 -0500
Subject: [PATCH 14/27] Add search filter-type

---
 archivebox/cli/archivebox_update.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/archivebox/cli/archivebox_update.py b/archivebox/cli/archivebox_update.py
index d9a94235..aa8cae1b 100644
--- a/archivebox/cli/archivebox_update.py
+++ b/archivebox/cli/archivebox_update.py
@@ -91,7 +91,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
     parser.add_argument(
         '--filter-type',
         type=str,
-        choices=('exact', 'substring', 'domain', 'regex', 'search'),
+        choices=('exact', 'substring', 'domain', 'regex'),
         default='exact',
         help='Type of pattern matching to use when filtering URLs',
     )

From c5b1b91708b9a66eb508b8d22f7686f6711c5747 Mon Sep 17 00:00:00 2001
From: JDC
Date: Sat, 21 Nov 2020 13:02:58 -0500
Subject: [PATCH 15/27] fix: flush_search_index must be called before removing snapshots

---
 archivebox/main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/archivebox/main.py b/archivebox/main.py
index d533d58d..73278702 100644
--- a/archivebox/main.py
+++ b/archivebox/main.py
@@ -665,8 +665,8 @@ def remove(filter_str: Optional[str]=None,
 
     to_remove = snapshots.count()
 
-    remove_from_sql_main_index(snapshots=snapshots, out_dir=out_dir)
     flush_search_index(snapshots=snapshots)
+    remove_from_sql_main_index(snapshots=snapshots, out_dir=out_dir)
     all_snapshots = load_main_index(out_dir=out_dir)
     log_removal_finished(all_snapshots.count(), to_remove)

From 8484bdb9739a949311fd666eb7c7fe7f5fde6f3d Mon Sep 17 00:00:00 2001
From: JDC
Date: Sat, 21 Nov 2020 13:06:51 -0500
Subject: [PATCH 16/27] Fix add search filter to update

---
 archivebox/cli/archivebox_update.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/archivebox/cli/archivebox_update.py b/archivebox/cli/archivebox_update.py
index aa8cae1b..d9a94235 100644
--- a/archivebox/cli/archivebox_update.py
+++ b/archivebox/cli/archivebox_update.py
@@ -91,7 +91,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
     parser.add_argument(
         '--filter-type',
         type=str,
-        choices=('exact', 'substring', 'domain', 'regex'),
+        choices=('exact', 'substring', 'domain', 'regex', 'search'),
         default='exact',
         help='Type of pattern matching to use when filtering URLs',
     )

From 95382b381203e92dda76286a30a934ca2cca1ba5 Mon Sep 17 00:00:00 2001
From: JDC
Date: Sun, 22 Nov 2020 20:56:24 -0500
Subject: [PATCH 17/27] Add ripgrep rg search backend and set as default

---
 Dockerfile                            |  2 +-
 archivebox/config.py                  |  2 +-
 archivebox/search/backends/ripgrep.py | 43 +++++++++++++++++++++++++++
 docker-compose.yml                    | 24 +++++++--------
 4 files changed, 56 insertions(+), 15 deletions(-)
 create mode 100644 archivebox/search/backends/ripgrep.py

diff --git a/Dockerfile b/Dockerfile
index 33d4a488..20a410e2 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -46,7 +46,7 @@ RUN apt-get update -qq \
 # Install apt dependencies
 RUN apt-get update -qq \
     && apt-get install -qq -y --no-install-recommends \
-       wget curl chromium git ffmpeg youtube-dl \
+       wget curl chromium git ffmpeg youtube-dl ripgrep \
        fontconfig fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-kacst fonts-symbola fonts-noto fonts-freefont-ttf \
     && rm -rf /var/lib/apt/lists/*

diff --git a/archivebox/config.py b/archivebox/config.py
index ee2f0b4a..846df0c9 100644
--- a/archivebox/config.py
+++ b/archivebox/config.py
@@ -142,7 +142,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
     'SEARCH_BACKEND_CONFIG' : {
         'USE_INDEXING_BACKEND': {'type': bool, 'default': True},
         'USE_SEARCHING_BACKEND': {'type': bool, 'default': True},
-        'SEARCH_BACKEND_ENGINE': {'type': str, 'default': 'sonic'},
+        'SEARCH_BACKEND_ENGINE': {'type': str, 'default': 'ripgrep'},
         'SEARCH_BACKEND_HOST_NAME': {'type': str, 'default': 'localhost'},
         'SEARCH_BACKEND_PORT': {'type': int, 'default': 1491},
         'SEARCH_BACKEND_PASSWORD': {'type': str, 'default': 'SecretPassword'},
diff --git a/archivebox/search/backends/ripgrep.py b/archivebox/search/backends/ripgrep.py
new file mode 100644
index 00000000..cd9ecfee
--- /dev/null
+++ b/archivebox/search/backends/ripgrep.py
@@ -0,0 +1,43 @@
+import re
+from subprocess import run, PIPE, DEVNULL
+from typing import List, Generator
+
+from archivebox.config import setup_django, ARCHIVE_DIR, ARCHIVE_DIR_NAME
+from archivebox.util import enforce_types
+
+DEFAULT_ARGUMENTS = '-ilt' # Case insensitive, matching files, types
+DEFAULT_EXTENSIONS = 'html'
+REGEX_ARGUMENT = '-e'
+
+TIMESTAMP_REGEX = r'\/([\d]+\.[\d]+)\/'
+
+ts_regex = re.compile(TIMESTAMP_REGEX)
+
+@enforce_types
+def index(snapshot_id: str, texts: List[str]):
+    return
+
+@enforce_types
+def flush(snapshot_ids: Generator[str, None, None]):
+    return
+
+@enforce_types
+def search(text: str) -> List[str]:
+    is_rg_installed = run(['which', 'rg'], stdout=DEVNULL, stderr=DEVNULL)
+    if is_rg_installed.returncode:
+        raise Exception("rg binary not found, install ripgrep to use this backend")
+
+    setup_django(check_db=True)
+    from core.models import Snapshot
+
+    rg = run(['rg',DEFAULT_ARGUMENTS, DEFAULT_EXTENSIONS, REGEX_ARGUMENT, text, str(ARCHIVE_DIR)],stdout=PIPE, stderr=PIPE, timeout=60)
+    file_paths = [p.decode().replace(str(ARCHIVE_DIR_NAME), '') for p in rg.stdout.splitlines()]
+    timestamps = set()
+    for path in file_paths:
+        if ts := ts_regex.findall(path):
+            timestamps.add(ts[0])
+
+    snap_ids = [str(id) for id in Snapshot.objects.filter(timestamp__in=timestamps).values_list('pk', flat=True)]
+
+    return snap_ids
+
diff --git a/docker-compose.yml b/docker-compose.yml
index 29fc6f7a..c76f734a 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -21,21 +21,8 @@ services:
         environment:
             - USE_COLOR=True
             - SHOW_PROGRESS=False
-            - SEARCH_BACKEND_HOST_NAME=sonic
-            - SEARCH_BACKEND_PASSWORD=SecretPassword
         volumes:
             - ./data:/data
-        depends_on:
-            - sonic
-
-    sonic:
-        image: valeriansaliou/sonic:v1.3.0
-        ports:
-            - 1491:1491
-        environment:
-            - SEARCH_BACKEND_PASSWORD=SecretPassword
-        volumes:
-            - ./etc/sonic/config.cfg:/etc/sonic.cfg
-            - ./data:/var/lib/sonic/store/
 
 
 
@@ -87,3 +74,14 @@ services:
     #     volumes:
     #         ./data:/archivebox
     #         ./data/wayback:/webarchive
+
+    # Example: Run sonic search backend
+    # sonic:
+    #     image: valeriansaliou/sonic:v1.3.0
+    #     ports:
+    #         - 1491:1491
+    #     environment:
+    #         - SEARCH_BACKEND_PASSWORD=SecretPassword
+    #     volumes:
+    #         - ./etc/sonic/config.cfg:/etc/sonic.cfg
+    #         - ./data:/var/lib/sonic/store/
\ No newline at end of file
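For reference, the ripgrep backend above shells out to rg and maps matching file paths back to snapshot timestamps; the command it builds is roughly equivalent to running the following by hand against the archive directory (path shown is illustrative):

    rg -ilt html -e 'search term' ./archive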
From 23a9beb4e00ad954af8476c3e3c71e9d068f00a1 Mon Sep 17 00:00:00 2001
From: JDC
Date: Mon, 23 Nov 2020 08:26:12 -0500
Subject: [PATCH 18/27] Add ignored extensions in ripgrep search

---
 archivebox/search/backends/ripgrep.py | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/archivebox/search/backends/ripgrep.py b/archivebox/search/backends/ripgrep.py
index cd9ecfee..07292e37 100644
--- a/archivebox/search/backends/ripgrep.py
+++ b/archivebox/search/backends/ripgrep.py
@@ -2,12 +2,15 @@ import re
 from subprocess import run, PIPE, DEVNULL
 from typing import List, Generator
 
-from archivebox.config import setup_django, ARCHIVE_DIR, ARCHIVE_DIR_NAME
+from archivebox.config import setup_django, ARCHIVE_DIR
 from archivebox.util import enforce_types
 
-DEFAULT_ARGUMENTS = '-ilt' # Case insensitive, matching files, types
-DEFAULT_EXTENSIONS = 'html'
-REGEX_ARGUMENT = '-e'
+RG_IGNORE_EXTENSIONS = ('css','js','orig','svg')
+
+RG_ADD_TYPE = '--type-add'
+RG_IGNORE_ARGUMENTS = f"ignore:*.{{{','.join(RG_IGNORE_EXTENSIONS)}}}"
+RG_DEFAULT_ARGUMENTS = "-ilTignore" # Case insensitive(i), matching files results(l)
+RG_REGEX_ARGUMENT = '-e'
 
 TIMESTAMP_REGEX = r'\/([\d]+\.[\d]+)\/'
 
@@ -28,13 +31,14 @@ def search(text: str) -> List[str]:
     is_rg_installed = run(['which', 'rg'], stdout=DEVNULL, stderr=DEVNULL)
     if is_rg_installed.returncode:
-        raise Exception("rg binary not found, install ripgrep to use this backend")
+        raise Exception("ripgrep binary not found, install ripgrep to use this search backend")
 
     setup_django(check_db=True)
     from core.models import Snapshot
 
-    rg = run(['rg',DEFAULT_ARGUMENTS, DEFAULT_EXTENSIONS, REGEX_ARGUMENT, text, str(ARCHIVE_DIR)],stdout=PIPE, stderr=PIPE, timeout=60)
-    file_paths = [p.decode().replace(str(ARCHIVE_DIR_NAME), '') for p in rg.stdout.splitlines()]
+    rg_cmd = ['rg', RG_ADD_TYPE, RG_IGNORE_ARGUMENTS, RG_DEFAULT_ARGUMENTS, RG_REGEX_ARGUMENT, text, str(ARCHIVE_DIR)]
+    rg = run(rg_cmd, stdout=PIPE, stderr=PIPE, timeout=60)
+    file_paths = [p.decode() for p in rg.stdout.splitlines()]
     timestamps = set()
     for path in file_paths:
         if ts := ts_regex.findall(path):

From 7903db6dfb15b7f6d601885b8920f7539a8cdec7 Mon Sep 17 00:00:00 2001
From: JDC
Date: Mon, 23 Nov 2020 13:04:38 -0500
Subject: [PATCH 19/27] Add ArchiveResult Manager and sorted indexable filter

---
 archivebox/core/models.py         | 17 ++++++++++++++---
 archivebox/extractors/__init__.py |  3 +++
 2 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/archivebox/core/models.py b/archivebox/core/models.py
index 5555c798..fe2d05ab 100644
--- a/archivebox/core/models.py
+++ b/archivebox/core/models.py
@@ -5,10 +5,11 @@ import uuid
 
 from django.db import models, transaction
 from django.utils.functional import cached_property
 from django.utils.text import slugify
+from django.db.models import Case, When, Value, IntegerField
 
 from ..util import parse_date
 from ..index.schema import Link
-from ..extractors import get_default_archive_methods
+from ..extractors import get_default_archive_methods, ARCHIVE_METHODS_INDEXING_PRECEDENCE
 
 EXTRACTORS = [(extractor[0], extractor[0]) for extractor in get_default_archive_methods()]
 STATUS_CHOICES = [
@@ -91,7 +92,7 @@ class Snapshot(models.Model):
         return {
             key: getattr(self, key)
             if key != 'tags' else self.tags_str()
-            for key in args 
+            for key in args
         }
 
     def as_link(self) -> Link:
@@ -100,7 +101,7 @@ class Snapshot(models.Model):
     def as_link_with_details(self) -> Link:
         from ..index import load_link_details
         return load_link_details(self.as_link())
-    
+
     def tags_str(self) -> str:
         return ','.join(self.tags.order_by('name').values_list('name', flat=True))
 
@@ -157,7 +158,15 @@ class Snapshot(models.Model):
             self.tags.clear()
         self.tags.add(*tags_id)
 
+class ArchiveResultManager(models.Manager):
+    def indexable(self, sorted: bool = True):
+        INDEXABLE_METHODS = [ r[0] for r in ARCHIVE_METHODS_INDEXING_PRECEDENCE ]
+        qs = self.get_queryset().filter(extractor__in=INDEXABLE_METHODS,status='succeeded')
 
+        if sorted:
+            precedence = [ When(extractor=method, then=Value(precedence)) for method, precedence in ARCHIVE_METHODS_INDEXING_PRECEDENCE ]
+            qs = qs.annotate(indexing_precedence=Case(*precedence, default=Value(1000),output_field=IntegerField())).order_by('indexing_precedence')
+        return qs
 class ArchiveResult(models.Model):
     snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE)
     cmd = models.JSONField()
@@ -169,5 +178,7 @@ class ArchiveResult(models.Model):
     status = models.CharField(max_length=16, choices=STATUS_CHOICES)
     extractor = models.CharField(choices=EXTRACTORS, max_length=32)
 
+    objects = ArchiveResultManager()
+
     def __str__(self):
         return self.extractor
diff --git a/archivebox/extractors/__init__.py b/archivebox/extractors/__init__.py
index 0cf6d90d..ceef3b51 100644
--- a/archivebox/extractors/__init__.py
+++ b/archivebox/extractors/__init__.py
@@ -39,6 +39,7 @@ from .media import should_save_media, save_media
 from .archive_org import should_save_archive_dot_org, save_archive_dot_org
 from .headers import should_save_headers, save_headers
 
+
 def get_default_archive_methods():
     return [
         ('title', should_save_title, save_title),
@@ -56,6 +57,8 @@ def get_default_archive_methods():
         ('archive_org', should_save_archive_dot_org, save_archive_dot_org),
     ]
 
+ARCHIVE_METHODS_INDEXING_PRECEDENCE = [('readability', 1), ('singlefile', 2), ('dom', 3), ('wget', 4)]
+
 @enforce_types
 def ignore_methods(to_ignore: List[str]):
     ARCHIVE_METHODS = get_default_archive_methods()

From 273c9d91c6dfddfdb25888173e50786c28c242b3 Mon Sep 17 00:00:00 2001
From: JDC
Date: Mon, 23 Nov 2020 13:41:35 -0500
Subject: [PATCH 20/27] Add tag filter to update command

---
 archivebox/cli/archivebox_update.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/archivebox/cli/archivebox_update.py b/archivebox/cli/archivebox_update.py
index d9a94235..6748096e 100644
--- a/archivebox/cli/archivebox_update.py
+++ b/archivebox/cli/archivebox_update.py
@@ -91,7 +91,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
     parser.add_argument(
         '--filter-type',
         type=str,
-        choices=('exact', 'substring', 'domain', 'regex', 'search'),
+        choices=('exact', 'substring', 'domain', 'regex', 'tag', 'search'),
         default='exact',
         help='Type of pattern matching to use when filtering URLs',
     )
From caf4660ac86153632c76de247b6ac8579d06de31 Mon Sep 17 00:00:00 2001
From: JDC
Date: Mon, 23 Nov 2020 15:51:59 -0500
Subject: [PATCH 21/27] Add indexing to update command and utilities

---
 archivebox/main.py            |  3 ++-
 archivebox/search/__init__.py | 16 +++++++++++++++
 archivebox/search/utils.py    | 38 +++++++++++++++++++++++++++++++++++
 3 files changed, 56 insertions(+), 1 deletion(-)
 create mode 100644 archivebox/search/utils.py

diff --git a/archivebox/main.py b/archivebox/main.py
index 73278702..bb24d124 100644
--- a/archivebox/main.py
+++ b/archivebox/main.py
@@ -115,7 +115,7 @@ from .logging_util import (
     printable_dependency_version,
 )
 
-from .search import flush_search_index
+from .search import flush_search_index, index_links
 
 ALLOWED_IN_OUTPUT_DIR = {
     'lost+found',
@@ -711,6 +711,7 @@ def update(resume: Optional[float]=None,
     if index_only:
         for link in all_links:
             write_link_details(link, out_dir=out_dir, skip_sql_index=True)
+        index_links(all_links, out_dir=out_dir)
         return all_links
 
     # Step 2: Run the archive methods for each link
diff --git a/archivebox/search/__init__.py b/archivebox/search/__init__.py
index fdf19a89..537fa1ff 100644
--- a/archivebox/search/__init__.py
+++ b/archivebox/search/__init__.py
@@ -8,6 +8,8 @@ from archivebox.index.schema import Link
 from archivebox.util import enforce_types
 from archivebox.config import setup_django,stderr, OUTPUT_DIR, USE_INDEXING_BACKEND, USE_SEARCHING_BACKEND, SEARCH_BACKEND_ENGINE
 
+from .utils import get_indexable_content
+
 def indexing_enabled():
     return USE_INDEXING_BACKEND
 
@@ -83,3 +85,17 @@ def flush_search_index(snapshots: QuerySet):
             f'[X] The search backend threw an exception={err}:',
             color='red',
         )
+
+@enforce_types
+def index_links(links: Union[List[Link],None], out_dir: Path=OUTPUT_DIR):
+    if not links:
+        return
+
+    setup_django(out_dir=out_dir, check_db=True)
+    from core.models import Snapshot, ArchiveResult
+
+    for link in links:
+        if snap := Snapshot.objects.filter(url=link.url).first():
+            results = ArchiveResult.objects.indexable().filter(snapshot=snap)
+            texts = get_indexable_content(results)
+            write_search_index(link,texts,out_dir=out_dir)
diff --git a/archivebox/search/utils.py b/archivebox/search/utils.py
new file mode 100644
index 00000000..f2d86b2c
--- /dev/null
+++ b/archivebox/search/utils.py
@@ -0,0 +1,38 @@
+from django.db.models import QuerySet
+
+from archivebox.util import enforce_types
+
+def get_file_result_content(res, extra_path, use_pwd=False):
+    if use_pwd:
+        fpath = f'{res.pwd}/{res.output}'
+    else:
+        fpath = f'{res.output}'
+
+    if extra_path:
+        fpath = f'{fpath}/{extra_path}'
+
+    with open(fpath, 'r') as file:
+        data = file.read().replace('\n', '')
+    if data:
+        return [data]
+    return []
+
+
+# This should be abstracted by a plugin interface for extractors
+@enforce_types
+def get_indexable_content(results: QuerySet):
+    if not results:
+        return []
+    # Only use the first method available
+    res, method = results.first(), results.first().extractor
+    if method not in ('readability', 'singlefile', 'dom', 'wget'):
+        return []
+    # This should come from a plugin interface
+    if method == 'readability':
+        return get_file_result_content(res, 'content.txt')
+    elif method == 'singlefile':
+        return get_file_result_content(res, '')
+    elif method == 'dom':
+        return get_file_result_content(res,'',use_pwd=True)
+    elif method == 'wget':
+        return get_file_result_content(res,'',use_pwd=True)
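Because update() now calls index_links() on its index_only path, an existing collection can presumably be backfilled into the search index without re-fetching any content, e.g.:

    archivebox update --index-only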
From 0acf479b70421553b721f6ef040039fcf5362f7b Mon Sep 17 00:00:00 2001
From: JDC
Date: Mon, 23 Nov 2020 16:54:27 -0500
Subject: [PATCH 22/27] Partition long strings in chunks for sonic

---
 archivebox/search/__init__.py       | 2 +-
 archivebox/search/backends/sonic.py | 9 +++++++--
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/archivebox/search/__init__.py b/archivebox/search/__init__.py
index 537fa1ff..fa5d564d 100644
--- a/archivebox/search/__init__.py
+++ b/archivebox/search/__init__.py
@@ -98,4 +98,4 @@ def index_links(links: Union[List[Link],None], out_dir: Path=OUTPUT_DIR):
         if snap := Snapshot.objects.filter(url=link.url).first():
             results = ArchiveResult.objects.indexable().filter(snapshot=snap)
             texts = get_indexable_content(results)
-            write_search_index(link,texts,out_dir=out_dir)
+            write_search_index(link, texts, out_dir=out_dir)
diff --git a/archivebox/search/backends/sonic.py b/archivebox/search/backends/sonic.py
index 7dc4d5b0..affe9d20 100644
--- a/archivebox/search/backends/sonic.py
+++ b/archivebox/search/backends/sonic.py
@@ -5,13 +5,18 @@ from sonic import IngestClient, SearchClient
 from archivebox.util import enforce_types
 from archivebox.config import SEARCH_BACKEND_HOST_NAME, SEARCH_BACKEND_PORT, SEARCH_BACKEND_PASSWORD, SONIC_BUCKET, SONIC_COLLECTION
 
+MAX_SONIC_TEXT_LENGTH = 1000
 
 @enforce_types
 def index(snapshot_id: str, texts: List[str]):
     with IngestClient(SEARCH_BACKEND_HOST_NAME, SEARCH_BACKEND_PORT, SEARCH_BACKEND_PASSWORD) as ingestcl:
         for text in texts:
-            ingestcl.push(SONIC_COLLECTION, SONIC_BUCKET, snapshot_id, str(text))
-
+            if len(text) < MAX_SONIC_TEXT_LENGTH:
+                ingestcl.push(SONIC_COLLECTION, SONIC_BUCKET, snapshot_id, str(text))
+            else:
+                chunks = [text[i:i+MAX_SONIC_TEXT_LENGTH] for i in range(0, len(text), MAX_SONIC_TEXT_LENGTH)]
+                for chunk in chunks:
+                    ingestcl.push(SONIC_COLLECTION, SONIC_BUCKET, snapshot_id, str(chunk))
 @enforce_types
 def search(text: str) -> List[str]:
     with SearchClient(SEARCH_BACKEND_HOST_NAME, SEARCH_BACKEND_PORT, SEARCH_BACKEND_PASSWORD) as querycl:

From db9c2edccc5dc136bd79a3568574b67b4a63600b Mon Sep 17 00:00:00 2001
From: JDC
Date: Mon, 23 Nov 2020 17:23:26 -0500
Subject: [PATCH 23/27] Add log print for url indexing

---
 archivebox/search/__init__.py | 3 ++-
 archivebox/search/utils.py    | 8 +++++++-
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/archivebox/search/__init__.py b/archivebox/search/__init__.py
index fa5d564d..a262d926 100644
--- a/archivebox/search/__init__.py
+++ b/archivebox/search/__init__.py
@@ -8,7 +8,7 @@ from archivebox.index.schema import Link
 from archivebox.util import enforce_types
 from archivebox.config import setup_django,stderr, OUTPUT_DIR, USE_INDEXING_BACKEND, USE_SEARCHING_BACKEND, SEARCH_BACKEND_ENGINE
 
-from .utils import get_indexable_content
+from .utils import get_indexable_content, log_index_started
 
 def indexing_enabled():
     return USE_INDEXING_BACKEND
@@ -98,4 +98,5 @@ def index_links(links: Union[List[Link],None], out_dir: Path=OUTPUT_DIR):
         if snap := Snapshot.objects.filter(url=link.url).first():
             results = ArchiveResult.objects.indexable().filter(snapshot=snap)
             texts = get_indexable_content(results)
+            log_index_started(link.url)
             write_search_index(link, texts, out_dir=out_dir)
diff --git a/archivebox/search/utils.py b/archivebox/search/utils.py
index f2d86b2c..55c97e75 100644
--- a/archivebox/search/utils.py
+++ b/archivebox/search/utils.py
@@ -1,6 +1,11 @@
 from django.db.models import QuerySet
 
 from archivebox.util import enforce_types
+from archivebox.config import ANSI
+
+def log_index_started(url):
+    print('{green}[*] Indexing url: {} in the search index {reset}'.format(url, **ANSI))
+    print( )
 
 def get_file_result_content(res, extra_path, use_pwd=False):
     if use_pwd:
@@ -12,7 +17,7 @@ def get_file_result_content(res, extra_path, use_pwd=False):
         fpath = f'{fpath}/{extra_path}'
 
     with open(fpath, 'r') as file:
-        data = file.read().replace('\n', '')
+        data = file.read()
     if data:
         return [data]
     return []
@@ -28,6 +33,7 @@ def get_indexable_content(results: QuerySet):
     if method not in ('readability', 'singlefile', 'dom', 'wget'):
         return []
     # This should come from a plugin interface
+
     if method == 'readability':
         return get_file_result_content(res, 'content.txt')
     elif method == 'singlefile':

From 15fbd81480536bd7223096446b27f8666d7057e4 Mon Sep 17 00:00:00 2001
From: JDC
Date: Mon, 23 Nov 2020 18:17:07 -0500
Subject: [PATCH 24/27] Change MAX_SONIC_TEXT_LENGTH

---
 archivebox/search/backends/sonic.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/archivebox/search/backends/sonic.py b/archivebox/search/backends/sonic.py
index affe9d20..e34c6535 100644
--- a/archivebox/search/backends/sonic.py
+++ b/archivebox/search/backends/sonic.py
@@ -5,7 +5,7 @@ from sonic import IngestClient, SearchClient
 from archivebox.util import enforce_types
 from archivebox.config import SEARCH_BACKEND_HOST_NAME, SEARCH_BACKEND_PORT, SEARCH_BACKEND_PASSWORD, SONIC_BUCKET, SONIC_COLLECTION
 
-MAX_SONIC_TEXT_LENGTH = 1000
+MAX_SONIC_TEXT_LENGTH = 20000
 
 @enforce_types
 def index(snapshot_id: str, texts: List[str]):

From b1d70185ed0bf53b446da0ab54ae4bcf5fc6cb27 Mon Sep 17 00:00:00 2001
From: JDC
Date: Mon, 23 Nov 2020 18:33:32 -0500
Subject: [PATCH 25/27] Increase word_objects for Sonic default config

---
 etc/sonic/config.cfg | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/etc/sonic/config.cfg b/etc/sonic/config.cfg
index 45806ed1..10fbda53 100644
--- a/etc/sonic/config.cfg
+++ b/etc/sonic/config.cfg
@@ -32,7 +32,7 @@ suggest_limit_maximum = 20
 
 path = "/var/lib/sonic/store/kv/"
 
-retain_word_objects = 1000
+retain_word_objects = 100000
 
 [store.kv.pool]

From 5a6b814c7935ccc1571abd8d5b2487186cac96c7 Mon Sep 17 00:00:00 2001
From: jdcaballerov
Date: Tue, 24 Nov 2020 09:35:06 -0500
Subject: [PATCH 26/27] Add exception handling for indexable content reader

---
 archivebox/search/__init__.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/archivebox/search/__init__.py b/archivebox/search/__init__.py
index a262d926..ebeebcd0 100644
--- a/archivebox/search/__init__.py
+++ b/archivebox/search/__init__.py
@@ -97,6 +97,14 @@ def index_links(links: Union[List[Link],None], out_dir: Path=OUTPUT_DIR):
     for link in links:
         if snap := Snapshot.objects.filter(url=link.url).first():
             results = ArchiveResult.objects.indexable().filter(snapshot=snap)
-            texts = get_indexable_content(results)
             log_index_started(link.url)
-            write_search_index(link, texts, out_dir=out_dir)
+            try:
+                texts = get_indexable_content(results)
+            except Exception as err:
+                stderr()
+                stderr(
+                    f'[X] An Exception occurred reading the indexable content={err}:',
+                    color='red',
+                )
+            else:
+                write_search_index(link, texts, out_dir=out_dir)
\ No newline at end of file

From 172197ae01c080874ec83b190e536d986c6603c5 Mon Sep 17 00:00:00 2001
From: jdcaballerov
Date: Thu, 26 Nov 2020 18:12:54 -0500
Subject: [PATCH 27/27] refactor: Remove if LENGTH and use text chunker for every input

---
 archivebox/search/backends/sonic.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/archivebox/search/backends/sonic.py b/archivebox/search/backends/sonic.py
index e34c6535..f0beaddd 100644
--- a/archivebox/search/backends/sonic.py
+++ b/archivebox/search/backends/sonic.py
@@ -11,12 +11,10 @@ MAX_SONIC_TEXT_LENGTH = 20000
 def index(snapshot_id: str, texts: List[str]):
     with IngestClient(SEARCH_BACKEND_HOST_NAME, SEARCH_BACKEND_PORT, SEARCH_BACKEND_PASSWORD) as ingestcl:
         for text in texts:
-            if len(text) < MAX_SONIC_TEXT_LENGTH:
-                ingestcl.push(SONIC_COLLECTION, SONIC_BUCKET, snapshot_id, str(text))
-            else:
-                chunks = [text[i:i+MAX_SONIC_TEXT_LENGTH] for i in range(0, len(text), MAX_SONIC_TEXT_LENGTH)]
-                for chunk in chunks:
-                    ingestcl.push(SONIC_COLLECTION, SONIC_BUCKET, snapshot_id, str(chunk))
+            chunks = [text[i:i+MAX_SONIC_TEXT_LENGTH] for i in range(0, len(text), MAX_SONIC_TEXT_LENGTH)]
+            for chunk in chunks:
+                ingestcl.push(SONIC_COLLECTION, SONIC_BUCKET, snapshot_id, str(chunk))
+
 @enforce_types
 def search(text: str) -> List[str]:
     with SearchClient(SEARCH_BACKEND_HOST_NAME, SEARCH_BACKEND_PORT, SEARCH_BACKEND_PASSWORD) as querycl: