From b1f70b219736378170c1dcda1131792bf83c1830 Mon Sep 17 00:00:00 2001
From: JDC
Date: Tue, 17 Nov 2020 18:42:57 -0500
Subject: [PATCH 01/27] Initial implementation

---
 archivebox.egg-info                  |  1 -
 archivebox/core/admin.py             |  5 +++-
 archivebox/core/mixins.py            | 21 +++++++++++++++
 archivebox/extractors/__init__.py    |  2 ++
 archivebox/extractors/readability.py |  7 +++--
 archivebox/index/schema.py           |  1 +
 archivebox/search/__init__.py        | 40 ++++++++++++++++++++++++++++
 7 files changed, 73 insertions(+), 4 deletions(-)
 delete mode 120000 archivebox.egg-info
 create mode 100644 archivebox/core/mixins.py
 create mode 100644 archivebox/search/__init__.py

diff --git a/archivebox.egg-info b/archivebox.egg-info
deleted file mode 120000
index 8ce20dd2..00000000
--- a/archivebox.egg-info
+++ /dev/null
@@ -1 +0,0 @@
-pip_dist/archivebox.egg-info
\ No newline at end of file
diff --git a/archivebox/core/admin.py b/archivebox/core/admin.py
index 5d3db409..e078bdaf 100644
--- a/archivebox/core/admin.py
+++ b/archivebox/core/admin.py
@@ -14,6 +14,9 @@ from django import forms
 from core.models import Snapshot, Tag
 from core.forms import AddLinkForm, TagField
+from core.utils import get_icons
+from core.mixins import SearchResultsAdminMixin
+
 from index.html import snapshot_icons
 from util import htmldecode, urldecode, ansi_to_html
 from logging_util import printable_filesize
@@ -82,7 +85,7 @@ class SnapshotAdminForm(forms.ModelForm):
         return instance
 
-class SnapshotAdmin(admin.ModelAdmin):
+class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
     list_display = ('added', 'title_str', 'url_str', 'files', 'size')
     sort_fields = ('title_str', 'url_str', 'added')
     readonly_fields = ('id', 'url', 'timestamp', 'num_outputs', 'is_archived', 'url_hash', 'added', 'updated')
diff --git a/archivebox/core/mixins.py b/archivebox/core/mixins.py
new file mode 100644
index 00000000..28f79b38
--- /dev/null
+++ b/archivebox/core/mixins.py
@@ -0,0 +1,21 @@
+from django.db.models import Q, Case, When, Value, IntegerField
+
+from archivebox.search import search_index
+
+class SearchResultsAdminMixin(object):
+    def get_search_results(self, request, queryset, search_term):
+        ''' Show exact match for title and slug at top of admin search results.
+        '''
+        qs, use_distinct = \
+            super(SearchResultsAdminMixin, self).get_search_results(
+                request, queryset, search_term)
+
+        search_term = search_term.strip()
+        if not search_term:
+            return qs, use_distinct
+
+        snapshot_ids = search_index(search_term)
+        qsearch = queryset.filter(id__in=snapshot_ids)
+        qs |= qsearch
+
+        return qs, use_distinct
\ No newline at end of file
diff --git a/archivebox/extractors/__init__.py b/archivebox/extractors/__init__.py
index ef5ef446..0cf6d90d 100644
--- a/archivebox/extractors/__init__.py
+++ b/archivebox/extractors/__init__.py
@@ -23,6 +23,7 @@ from ..logging_util import (
     log_archive_method_started,
     log_archive_method_finished,
 )
+from ..search import write_search_index
 
 from .title import should_save_title, save_title
 from .favicon import should_save_favicon, save_favicon
@@ -107,6 +108,7 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
                     link.history[method_name].append(result)
 
                     stats[result.status] += 1
+                    write_search_index(link=link, texts=result.index_texts)
                     log_archive_method_finished(result)
                     if not skip_index:
                         ArchiveResult.objects.create(snapshot=snapshot, extractor=method_name, cmd=result.cmd, cmd_version=result.cmd_version,
diff --git a/archivebox/extractors/readability.py b/archivebox/extractors/readability.py
index bd45e9d5..9da620b4 100644
--- a/archivebox/extractors/readability.py
+++ b/archivebox/extractors/readability.py
@@ -71,6 +71,7 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
         CURL_BINARY,
         link.url
     ]
+    readability_content = None
     timer = TimedProgress(timeout, prefix=' ')
     try:
         document = get_html(link, out_dir)
@@ -86,8 +87,9 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
         result = run(cmd, cwd=out_dir, timeout=timeout)
         result_json = json.loads(result.stdout)
         output_folder.mkdir(exist_ok=True)
+        readability_content = result_json.pop("textContent")
         atomic_write(str(output_folder / "content.html"), result_json.pop("content"))
-        atomic_write(str(output_folder / "content.txt"), result_json.pop("textContent"))
+        atomic_write(str(output_folder / "content.txt"), readability_content)
         atomic_write(str(output_folder / "article.json"), result_json)
 
         # parse out number of files downloaded from last line of stderr:
@@ -117,5 +119,6 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
         cmd_version=READABILITY_VERSION,
         output=output,
         status=status,
-        **timer.stats,
+        index_texts= [readability_content] if readability_content else [],
+        **timer.stats,
     )
diff --git a/archivebox/index/schema.py b/archivebox/index/schema.py
index 90021e0b..bc3a25da 100644
--- a/archivebox/index/schema.py
+++ b/archivebox/index/schema.py
@@ -39,6 +39,7 @@ class ArchiveResult:
     status: str
     start_ts: datetime
     end_ts: datetime
+    index_texts: Union[List[str], None] = None
     schema: str = 'ArchiveResult'
 
     def __post_init__(self):
diff --git a/archivebox/search/__init__.py b/archivebox/search/__init__.py
new file mode 100644
index 00000000..f503e9fa
--- /dev/null
+++ b/archivebox/search/__init__.py
@@ -0,0 +1,40 @@
+from typing import List, Optional, Union
+from pathlib import Path
+
+from sonic import IngestClient, SearchClient
+
+from ..index.schema import Link, ArchiveResult
+from ..util import enforce_types
+from ..config import setup_django, OUTPUT_DIR
+
+
+@enforce_types
+def write_sonic_index(snapshot_id: str, texts: List[str]):
+    # TODO add variables to localhost, port, password, bucket, collection
+    with IngestClient("localhost", 1491, "SecretPassword") as ingestcl:
+        for text in texts:
+            ingestcl.push("archivebox", "snapshots", snapshot_id, str(text))
+
+@enforce_types
+def search_sonic_index(text: str) -> List:
+    with SearchClient("localhost", 1491, "SecretPassword") as querycl:
+        snap_ids = querycl.query("archivebox", "snapshots", text)
+    return snap_ids
+
+
+@enforce_types
+def search_index(text: str) -> List:
+    # get backend
+    return search_sonic_index(text)
+
+
+@enforce_types
+def write_search_index(link: Link, texts: Union[List[str], None]=None, out_dir: Path=OUTPUT_DIR, skip_text_index: bool=False) -> None:
+    setup_django(out_dir, check_db=True)
+    from core.models import Snapshot
+
+    if not skip_text_index and texts:
+        snap = Snapshot.objects.filter(url=link.url).first()
+        if snap:
+            # get backend
+            write_sonic_index(str(snap.id), texts)
\ No newline at end of file

From 5f6673c72c472ce23f192e7661ec449134fbf463 Mon Sep 17 00:00:00 2001
From: JDC
Date: Wed, 18 Nov 2020 17:54:13 -0500
Subject: [PATCH 02/27] Implement backend architecture for search engines

---
 archivebox/core/mixins.py               | 20 ++++----
 archivebox/search/__init__.py           | 65 +++++++++++++++-----------
 archivebox/search/backends/__init__.py  |  0
 archivebox/search/backends/sonic.py     | 19 ++++++++
 4 files changed, 69 insertions(+), 35 deletions(-)
 create mode 100644 archivebox/search/backends/__init__.py
 create mode 100644 archivebox/search/backends/sonic.py

diff --git a/archivebox/core/mixins.py b/archivebox/core/mixins.py
index 28f79b38..afae2d78 100644
--- a/archivebox/core/mixins.py
+++ b/archivebox/core/mixins.py
@@ -1,10 +1,10 @@
-from django.db.models import Q, Case, When, Value, IntegerField
+from django.contrib import messages
 
-from archivebox.search import search_index
+from archivebox.search import query_search_index
 
 class SearchResultsAdminMixin(object):
     def get_search_results(self, request, queryset, search_term):
-        ''' Show exact match for title and slug at top of admin search results.
+        ''' Enhances the search queryset with results from the search backend.
         '''
         qs, use_distinct = \
             super(SearchResultsAdminMixin, self).get_search_results(
                 request, queryset, search_term)
@@ -13,9 +13,13 @@ class SearchResultsAdminMixin(object):
         search_term = search_term.strip()
         if not search_term:
             return qs, use_distinct
+        try:
+            snapshot_ids = query_search_index(search_term)
+        except Exception as err:
+            messages.add_message(request, messages.WARNING, f'Error from the search backend, only showing results from default admin search fields - Error: {err}')
+        else:
+            qsearch = queryset.filter(id__in=snapshot_ids)
+            qs |= qsearch
 
-        snapshot_ids = search_index(search_term)
-        qsearch = queryset.filter(id__in=snapshot_ids)
-        qs |= qsearch
-
-        return qs, use_distinct
\ No newline at end of file
+        finally:
+            return qs, use_distinct
diff --git a/archivebox/search/__init__.py b/archivebox/search/__init__.py
index f503e9fa..6e604224 100644
--- a/archivebox/search/__init__.py
+++ b/archivebox/search/__init__.py
@@ -1,40 +1,51 @@
-from typing import List, Optional, Union
+from typing import List, Union
 from pathlib import Path
-
-from sonic import IngestClient, SearchClient
-
-from ..index.schema import Link, ArchiveResult
-from ..util import enforce_types
-from ..config import setup_django, OUTPUT_DIR
+from importlib import import_module
 
-@enforce_types
-def write_sonic_index(snapshot_id: str, texts: List[str]):
-    # TODO add variables to localhost, port, password, bucket, collection
-    with IngestClient("localhost", 1491, "SecretPassword") as ingestcl:
-        for text in texts:
-            ingestcl.push("archivebox", "snapshots", snapshot_id, str(text))
-
-@enforce_types
-def search_sonic_index(text: str) -> List:
-    with SearchClient("localhost", 1491, "SecretPassword") as querycl:
-        snap_ids = querycl.query("archivebox", "snapshots", text)
-    return snap_ids
+from archivebox.index.schema import Link
+from archivebox.util import enforce_types
+from archivebox.config import setup_django, OUTPUT_DIR
 
-@enforce_types
-def search_index(text: str) -> List:
-    # get backend
-    return search_sonic_index(text)
+def indexing_enabled():
+    return True
+    # return FULLTEXT_INDEXING_ENABLED
 
+def search_backend_enabled():
+    return True
+    # return FULLTEXT_SEARCH_ENABLED
+
+def get_backend():
+    return 'search.backends.sonic'
+
+def import_backend():
+    backend_string = get_backend()
+    try:
+        backend = import_module(backend_string)
+    except Exception as err:
+        raise Exception("Could not load '%s' as a backend: %s" % (backend_string, err))
+    return backend
 
 @enforce_types
 def write_search_index(link: Link, texts: Union[List[str], None]=None, out_dir: Path=OUTPUT_DIR, skip_text_index: bool=False) -> None:
-    setup_django(out_dir, check_db=True)
-    from core.models import Snapshot
+    if not indexing_enabled():
+        return
 
     if not skip_text_index and texts:
+        setup_django(out_dir, check_db=True)
+        from core.models import Snapshot
+
         snap = Snapshot.objects.filter(url=link.url).first()
+        backend = import_backend()
         if snap:
-            # get backend
-            write_sonic_index(str(snap.id), texts)
\ No newline at end of file
+            backend.index(snapshot_id=str(snap.id), texts=texts)
+
+@enforce_types
+def query_search_index(text: str) -> List:
+    if search_backend_enabled():
+        backend = import_backend()
+        return backend.search(text)
+    else:
+        return []
+    
\ No newline at end of file
diff --git a/archivebox/search/backends/__init__.py b/archivebox/search/backends/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/archivebox/search/backends/sonic.py b/archivebox/search/backends/sonic.py
new file mode 100644
index 00000000..28725f27
--- /dev/null
+++ b/archivebox/search/backends/sonic.py
@@ -0,0 +1,19 @@
+from typing import List
+
+from sonic import IngestClient, SearchClient
+
+from archivebox.util import enforce_types
+
+@enforce_types
+def index(snapshot_id: str, texts: List[str]):
+    # TODO add variables to localhost, port, password, bucket, collection
+    with IngestClient("localhost", 1491, "SecretPassword") as ingestcl:
+        for text in texts:
+            ingestcl.push("archivebox", "snapshots", snapshot_id, str(text))
+
+@enforce_types
+def search(text: str) -> List:
+    with SearchClient("localhost", 1491, "SecretPassword") as querycl:
+        snap_ids = querycl.query("archivebox", "snapshots", text)
+    return snap_ids
+    
\ No newline at end of file
''' qs, use_distinct = \ super(SearchResultsAdminMixin, self).get_search_results( @@ -13,9 +13,13 @@ class SearchResultsAdminMixin(object): search_term = search_term.strip() if not search_term: return qs, use_distinct + try: + snapshot_ids = query_search_index(search_term) + except Exception as err: + messages.add_message(request, messages.WARNING, f'Error from the search backend, only showing results from default admin search fields - Error: {err}') + else: + qsearch = queryset.filter(id__in=snapshot_ids) + qs |= qsearch - snapshot_ids = search_index(search_term) - qsearch = queryset.filter(id__in=snapshot_ids) - qs |= qsearch - - return qs, use_distinct \ No newline at end of file + finally: + return qs, use_distinct diff --git a/archivebox/search/__init__.py b/archivebox/search/__init__.py index f503e9fa..6e604224 100644 --- a/archivebox/search/__init__.py +++ b/archivebox/search/__init__.py @@ -1,40 +1,51 @@ -from typing import List, Optional, Union +from typing import List, Union from pathlib import Path - -from sonic import IngestClient, SearchClient - -from ..index.schema import Link, ArchiveResult -from ..util import enforce_types -from ..config import setup_django, OUTPUT_DIR +from importlib import import_module -@enforce_types -def write_sonic_index(snapshot_id: str, texts: List[str]): - # TODO add variables to localhost, port, password, bucket, collection - with IngestClient("localhost", 1491, "SecretPassword") as ingestcl: - for text in texts: - ingestcl.push("archivebox", "snapshots", snapshot_id, str(text)) - -@enforce_types -def search_sonic_index(text: str) -> List: - with SearchClient("localhost", 1491, "SecretPassword") as querycl: - snap_ids = querycl.query("archivebox", "snapshots", text) - return snap_ids +from archivebox.index.schema import Link +from archivebox.util import enforce_types +from archivebox.config import setup_django, OUTPUT_DIR -@enforce_types -def search_index(text: str) -> List: - # get backend - return search_sonic_index(text) +def indexing_enabled(): + return True + # return FULLTEXT_INDEXING_ENABLED +def search_backend_enabled(): + return True + # return FULLTEXT_SEARCH_ENABLED + +def get_backend(): + return 'search.backends.sonic' + +def import_backend(): + backend_string = get_backend() + try: + backend = import_module(backend_string) + except Exception as err: + raise Exception("Could not load '%s' as a backend: %s" % (backend_string, err)) + return backend @enforce_types def write_search_index(link: Link, texts: Union[List[str], None]=None, out_dir: Path=OUTPUT_DIR, skip_text_index: bool=False) -> None: - setup_django(out_dir, check_db=True) - from core.models import Snapshot + if not indexing_enabled(): + return if not skip_text_index and texts: + setup_django(out_dir, check_db=True) + from core.models import Snapshot + snap = Snapshot.objects.filter(url=link.url).first() + backend = import_backend() if snap: - # get backend - write_sonic_index(str(snap.id), texts) \ No newline at end of file + backend.index(snapshot_id=str(snap.id), texts=texts) + +@enforce_types +def query_search_index(text: str) -> List: + if search_backend_enabled(): + backend = import_backend() + return backend.search(text) + else: + return [] + \ No newline at end of file diff --git a/archivebox/search/backends/__init__.py b/archivebox/search/backends/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/archivebox/search/backends/sonic.py b/archivebox/search/backends/sonic.py new file mode 100644 index 00000000..28725f27 --- /dev/null +++ 
b/archivebox/search/backends/sonic.py @@ -0,0 +1,19 @@ +from typing import List + +from sonic import IngestClient, SearchClient + +from archivebox.util import enforce_types + +@enforce_types +def index(snapshot_id: str, texts: List[str]): + # TODO add variables to localhost, port, password, bucket, collection + with IngestClient("localhost", 1491, "SecretPassword") as ingestcl: + for text in texts: + ingestcl.push("archivebox", "snapshots", snapshot_id, str(text)) + +@enforce_types +def search(text: str) -> List: + with SearchClient("localhost", 1491, "SecretPassword") as querycl: + snap_ids = querycl.query("archivebox", "snapshots", text) + return snap_ids + \ No newline at end of file From c2c01af3adfd69c1984b5c6b2cdc1aa59b08c32b Mon Sep 17 00:00:00 2001 From: JDC Date: Thu, 19 Nov 2020 08:06:13 -0500 Subject: [PATCH 03/27] Add config for search backend --- archivebox/config.py | 14 +++++++++++++- archivebox/search/__init__.py | 11 ++++------- archivebox/search/backends/sonic.py | 11 ++++++----- 3 files changed, 23 insertions(+), 13 deletions(-) diff --git a/archivebox/config.py b/archivebox/config.py index 47049342..0ca2d7d9 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -139,6 +139,18 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = { 'GIT_ARGS': {'type': list, 'default': ['--recursive']}, }, + 'SEARCH_BACKEND_CONFIG' : { + 'USE_INDEXING_BACKEND': {'type': bool, 'default': True}, + 'USE_SEARCHING_BACKEND': {'type': bool, 'default': True}, + 'SEARCH_BACKEND_ENGINE': {'type': str, 'default': 'sonic'}, + 'SEARCH_BACKEND_HOST_NAME': {'type': str, 'default': 'localhost'}, + 'SEARCH_BACKEND_PORT': {'type': int, 'default': 1491}, + 'SEARCH_BACKEND_PASSWORD': {'type': str, 'default': 'SecretPassword'}, + # SONIC + 'SONIC_BUCKET': {'type': str, 'default': 'archivebox'}, + 'SONIC_COLLECTION': {'type': str, 'default': 'snapshots'}, + }, + 'DEPENDENCY_CONFIG': { 'USE_CURL': {'type': bool, 'default': True}, 'USE_WGET': {'type': bool, 'default': True}, @@ -149,7 +161,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = { 'USE_CHROME': {'type': bool, 'default': True}, 'USE_NODE': {'type': bool, 'default': True}, 'USE_YOUTUBEDL': {'type': bool, 'default': True}, - + 'CURL_BINARY': {'type': str, 'default': 'curl'}, 'GIT_BINARY': {'type': str, 'default': 'git'}, 'WGET_BINARY': {'type': str, 'default': 'wget'}, diff --git a/archivebox/search/__init__.py b/archivebox/search/__init__.py index 6e604224..7db4af46 100644 --- a/archivebox/search/__init__.py +++ b/archivebox/search/__init__.py @@ -5,19 +5,16 @@ from importlib import import_module from archivebox.index.schema import Link from archivebox.util import enforce_types -from archivebox.config import setup_django, OUTPUT_DIR - +from archivebox.config import setup_django, OUTPUT_DIR, USE_INDEXING_BACKEND, USE_SEARCHING_BACKEND, SEARCH_BACKEND_ENGINE def indexing_enabled(): - return True - # return FULLTEXT_INDEXING_ENABLED + return USE_INDEXING_BACKEND def search_backend_enabled(): - return True - # return FULLTEXT_SEARCH_ENABLED + return USE_SEARCHING_BACKEND def get_backend(): - return 'search.backends.sonic' + return f'search.backends.{SEARCH_BACKEND_ENGINE}' def import_backend(): backend_string = get_backend() diff --git a/archivebox/search/backends/sonic.py b/archivebox/search/backends/sonic.py index 28725f27..e062f9e1 100644 --- a/archivebox/search/backends/sonic.py +++ b/archivebox/search/backends/sonic.py @@ -3,17 +3,18 @@ from typing import List from sonic import IngestClient, SearchClient from archivebox.util import 
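Since these settings go through ArchiveBox's normal config machinery, they should be overridable via environment variables (or the collection's config file) like any other option; for example, to point at a remote sonic instance (hypothetical values shown):

    export SEARCH_BACKEND_ENGINE=sonic
    export SEARCH_BACKEND_HOST_NAME=sonic.example.com
    export SEARCH_BACKEND_PORT=1491
    export SEARCH_BACKEND_PASSWORD=SecretPassword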
From 47daa038eb61674df22345e99201472ea770762c Mon Sep 17 00:00:00 2001
From: JDC
Date: Thu, 19 Nov 2020 16:45:12 -0500
Subject: [PATCH 04/27] Implement flush for search backend after remove command

---
 archivebox/config.py                |  4 ++--
 archivebox/core/mixins.py           |  2 +-
 archivebox/main.py                  |  2 ++
 archivebox/search/__init__.py       |  9 ++++++++-
 archivebox/search/backends/sonic.py | 11 ++++++---
 5 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/archivebox/config.py b/archivebox/config.py
index 0ca2d7d9..ee2f0b4a 100644
--- a/archivebox/config.py
+++ b/archivebox/config.py
@@ -147,8 +147,8 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
         'SEARCH_BACKEND_PORT': {'type': int, 'default': 1491},
         'SEARCH_BACKEND_PASSWORD': {'type': str, 'default': 'SecretPassword'},
         # SONIC
-        'SONIC_BUCKET': {'type': str, 'default': 'archivebox'},
-        'SONIC_COLLECTION': {'type': str, 'default': 'snapshots'},
+        'SONIC_COLLECTION': {'type': str, 'default': 'archivebox'},
+        'SONIC_BUCKET': {'type': str, 'default': 'snapshots'},
     },
 
     'DEPENDENCY_CONFIG': {
diff --git a/archivebox/core/mixins.py b/archivebox/core/mixins.py
index afae2d78..b361790a 100644
--- a/archivebox/core/mixins.py
+++ b/archivebox/core/mixins.py
@@ -18,7 +18,7 @@ class SearchResultsAdminMixin(object):
         except Exception as err:
             messages.add_message(request, messages.WARNING, f'Error from the search backend, only showing results from default admin search fields - Error: {err}')
         else:
-            qsearch = queryset.filter(id__in=snapshot_ids)
+            qsearch = queryset.filter(pk__in=snapshot_ids)
             qs |= qsearch
 
         finally:
diff --git a/archivebox/main.py b/archivebox/main.py
index cbbd2218..504cd670 100644
--- a/archivebox/main.py
+++ b/archivebox/main.py
@@ -115,6 +115,7 @@ from .logging_util import (
     printable_dependency_version,
 )
 
+from .search import flush_search_index
 
 ALLOWED_IN_OUTPUT_DIR = {
     'lost+found',
@@ -665,6 +666,7 @@ def remove(filter_str: Optional[str]=None,
     to_remove = snapshots.count()
 
     remove_from_sql_main_index(snapshots=snapshots, out_dir=out_dir)
+    flush_search_index(snapshot_ids=[str(pk) for pk in snapshots.values_list('pk',flat=True)])
     all_snapshots = load_main_index(out_dir=out_dir)
     log_removal_finished(all_snapshots.count(), to_remove)
diff --git a/archivebox/search/__init__.py b/archivebox/search/__init__.py
index 7db4af46..93245bda 100644
--- a/archivebox/search/__init__.py
+++ b/archivebox/search/__init__.py
@@ -45,4 +45,11 @@ def query_search_index(text: str) -> List:
         return backend.search(text)
     else:
         return []
-    
\ No newline at end of file
+
+@enforce_types
+def flush_search_index(snapshot_ids: List[str]):
+    if not indexing_enabled() or not snapshot_ids:
+        return
+    backend = import_backend()
+    backend.flush(snapshot_ids)
+    
\ No newline at end of file
diff --git a/archivebox/search/backends/sonic.py b/archivebox/search/backends/sonic.py
index e062f9e1..8fd93ae8 100644
--- a/archivebox/search/backends/sonic.py
+++ b/archivebox/search/backends/sonic.py
@@ -10,11 +10,16 @@ from archivebox.config import SEARCH_BACKEND_HOST_NAME, SEARCH_BACKEND_PORT, SEA
 def index(snapshot_id: str, texts: List[str]):
     with IngestClient(SEARCH_BACKEND_HOST_NAME, SEARCH_BACKEND_PORT, SEARCH_BACKEND_PASSWORD) as ingestcl:
         for text in texts:
-            ingestcl.push(SONIC_BUCKET, SONIC_COLLECTION, snapshot_id, str(text))
+            ingestcl.push(SONIC_COLLECTION, SONIC_BUCKET, snapshot_id, str(text))
 
 @enforce_types
 def search(text: str) -> List:
     with SearchClient(SEARCH_BACKEND_HOST_NAME, SEARCH_BACKEND_PORT, SEARCH_BACKEND_PASSWORD) as querycl:
-        snap_ids = querycl.query(SONIC_BUCKET, SONIC_COLLECTION, text)
+        snap_ids = querycl.query(SONIC_COLLECTION, SONIC_BUCKET, text)
     return snap_ids
-    
\ No newline at end of file
+
+@enforce_types
+def flush(snapshot_ids: List[str]):
+    with IngestClient(SEARCH_BACKEND_HOST_NAME, SEARCH_BACKEND_PORT, SEARCH_BACKEND_PASSWORD) as ingestcl:
+        for id in snapshot_ids:
+            ingestcl.flush_object(SONIC_COLLECTION, SONIC_BUCKET, str(id))

From f383648ffc80e64bfa399efc5e1b7766fe7de3dd Mon Sep 17 00:00:00 2001
From: JDC
Date: Thu, 19 Nov 2020 17:33:53 -0500
Subject: [PATCH 05/27] Use a generator for snapshot flush from index

---
 archivebox/main.py                  | 2 +-
 archivebox/search/__init__.py       | 7 +++----
 archivebox/search/backends/sonic.py | 6 +++---
 3 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/archivebox/main.py b/archivebox/main.py
index 504cd670..7d13a5c4 100644
--- a/archivebox/main.py
+++ b/archivebox/main.py
@@ -666,7 +666,7 @@ def remove(filter_str: Optional[str]=None,
     to_remove = snapshots.count()
 
     remove_from_sql_main_index(snapshots=snapshots, out_dir=out_dir)
-    flush_search_index(snapshot_ids=[str(pk) for pk in snapshots.values_list('pk',flat=True)])
+    flush_search_index(snapshot_ids=(str(pk) for pk in snapshots.values_list('pk',flat=True)))
     all_snapshots = load_main_index(out_dir=out_dir)
     log_removal_finished(all_snapshots.count(), to_remove)
diff --git a/archivebox/search/__init__.py b/archivebox/search/__init__.py
index 93245bda..59bb6fe5 100644
--- a/archivebox/search/__init__.py
+++ b/archivebox/search/__init__.py
@@ -1,4 +1,4 @@
-from typing import List, Union
+from typing import List, Union, Generator
 from pathlib import Path
 from importlib import import_module
 
@@ -39,7 +39,7 @@ def write_search_index(link: Link, texts: Union[List[str], None]=None, out_dir:
             backend.index(snapshot_id=str(snap.id), texts=texts)
 
 @enforce_types
-def query_search_index(text: str) -> List:
+def query_search_index(text: str) -> List[str]:
     if search_backend_enabled():
         backend = import_backend()
         return backend.search(text)
@@ -47,9 +47,8 @@ def query_search_index(text: str) -> List[str]:
         return []
 
 @enforce_types
-def flush_search_index(snapshot_ids: List[str]):
+def flush_search_index(snapshot_ids: Generator[str, None, None]):
     if not indexing_enabled() or not snapshot_ids:
         return
     backend = import_backend()
     backend.flush(snapshot_ids)
-    
\ No newline at end of file
diff --git a/archivebox/search/backends/sonic.py b/archivebox/search/backends/sonic.py
index 8fd93ae8..7dc4d5b0 100644
--- a/archivebox/search/backends/sonic.py
+++ b/archivebox/search/backends/sonic.py
@@ -1,4 +1,4 @@
-from typing import List
+from typing import List, Generator
 
 from sonic import IngestClient, SearchClient
 
@@ -13,13 +13,13 @@ def index(snapshot_id: str, texts: List[str]):
             ingestcl.push(SONIC_COLLECTION, SONIC_BUCKET, snapshot_id, str(text))
 
 @enforce_types
-def search(text: str) -> List:
+def search(text: str) -> List[str]:
     with SearchClient(SEARCH_BACKEND_HOST_NAME, SEARCH_BACKEND_PORT, SEARCH_BACKEND_PASSWORD) as querycl:
         snap_ids = querycl.query(SONIC_COLLECTION, SONIC_BUCKET, text)
     return snap_ids
 
 @enforce_types
-def flush(snapshot_ids: List[str]):
+def flush(snapshot_ids: Generator[str, None, None]):
     with IngestClient(SEARCH_BACKEND_HOST_NAME, SEARCH_BACKEND_PORT, SEARCH_BACKEND_PASSWORD) as ingestcl:
         for id in snapshot_ids:
             ingestcl.flush_object(SONIC_COLLECTION, SONIC_BUCKET, str(id))
From 823df34080a0ac8aa9cc6d4e9d689a3d4cf84309 Mon Sep 17 00:00:00 2001
From: JDC
Date: Thu, 19 Nov 2020 18:19:33 -0500
Subject: [PATCH 06/27] Use QuerySets for search backend API instead of pks

---
 archivebox/core/mixins.py     |  4 +---
 archivebox/main.py            |  2 +-
 archivebox/search/__init__.py | 19 ++++++++++++++-----
 3 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/archivebox/core/mixins.py b/archivebox/core/mixins.py
index b361790a..d1203745 100644
--- a/archivebox/core/mixins.py
+++ b/archivebox/core/mixins.py
@@ -14,12 +14,10 @@ class SearchResultsAdminMixin(object):
         if not search_term:
             return qs, use_distinct
         try:
-            snapshot_ids = query_search_index(search_term)
+            qsearch = query_search_index(search_term)
         except Exception as err:
             messages.add_message(request, messages.WARNING, f'Error from the search backend, only showing results from default admin search fields - Error: {err}')
         else:
-            qsearch = queryset.filter(pk__in=snapshot_ids)
             qs |= qsearch
-
         finally:
             return qs, use_distinct
diff --git a/archivebox/main.py b/archivebox/main.py
index 7d13a5c4..d533d58d 100644
--- a/archivebox/main.py
+++ b/archivebox/main.py
@@ -666,7 +666,7 @@ def remove(filter_str: Optional[str]=None,
     to_remove = snapshots.count()
 
     remove_from_sql_main_index(snapshots=snapshots, out_dir=out_dir)
-    flush_search_index(snapshot_ids=(str(pk) for pk in snapshots.values_list('pk',flat=True)))
+    flush_search_index(snapshots=snapshots)
     all_snapshots = load_main_index(out_dir=out_dir)
     log_removal_finished(all_snapshots.count(), to_remove)
diff --git a/archivebox/search/__init__.py b/archivebox/search/__init__.py
index 59bb6fe5..15efffb0 100644
--- a/archivebox/search/__init__.py
+++ b/archivebox/search/__init__.py
@@ -1,7 +1,8 @@
-from typing import List, Union, Generator
+from typing import List, Union
 from pathlib import Path
 from importlib import import_module
 
+from django.db.models import QuerySet
 
 from archivebox.index.schema import Link
 from archivebox.util import enforce_types
@@ -39,16 +40,24 @@ def write_search_index(link: Link, texts: Union[List[str], None]=None, out_dir:
             backend.index(snapshot_id=str(snap.id), texts=texts)
 
 @enforce_types
-def query_search_index(text: str) -> List[str]:
+def query_search_index(query: str, out_dir: Path=OUTPUT_DIR) -> QuerySet:
     if search_backend_enabled():
+        setup_django(out_dir, check_db=True)
+        from core.models import Snapshot
+
         backend = import_backend()
-        return backend.search(text)
+        snapshot_ids = backend.search(query)
+        # TODO preserve ordering from backend
+        qsearch = Snapshot.objects.filter(pk__in=snapshot_ids)
+        return qsearch
     else:
         return []
 
 @enforce_types
-def flush_search_index(snapshot_ids: Generator[str, None, None]):
-    if not indexing_enabled() or not snapshot_ids:
+def flush_search_index(snapshots: QuerySet):
+    if not indexing_enabled() or not snapshots:
        return
     backend = import_backend()
+    snapshot_ids=(str(pk) for pk in snapshots.values_list('pk',flat=True))
+
     backend.flush(snapshot_ids)

From fb67d6684c4ba229450767ab8afef2a7b158cd99 Mon Sep 17 00:00:00 2001
From: JDC
Date: Thu, 19 Nov 2020 21:53:22 -0500
Subject: [PATCH 07/27] fix: Return empty QuerySet instead of list

---
 archivebox/search/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/archivebox/search/__init__.py b/archivebox/search/__init__.py
index 15efffb0..2a1f4dcd 100644
--- a/archivebox/search/__init__.py
+++ b/archivebox/search/__init__.py
@@ -51,7 +51,7 @@ def query_search_index(query: str, out_dir: Path=OUTPUT_DIR) -> QuerySet:
         qsearch = Snapshot.objects.filter(pk__in=snapshot_ids)
         return qsearch
     else:
-        return []
+        return Snapshot.objects.none()
 
 @enforce_types
 def flush_search_index(snapshots: QuerySet):

From 0f7dba07dfe673d5915c1bfb344a24b4cb027e84 Mon Sep 17 00:00:00 2001
From: JDC
Date: Thu, 19 Nov 2020 23:39:28 -0500
Subject: [PATCH 08/27] feat: add search filter-type to list command

---
 archivebox/cli/archivebox_list.py |  2 +-
 archivebox/index/__init__.py      | 34 ++++++++++++++++++++++++++++++-
 2 files changed, 34 insertions(+), 2 deletions(-)

diff --git a/archivebox/cli/archivebox_list.py b/archivebox/cli/archivebox_list.py
index 140810a6..3838cf60 100644
--- a/archivebox/cli/archivebox_list.py
+++ b/archivebox/cli/archivebox_list.py
@@ -98,7 +98,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
     parser.add_argument(
         '--filter-type',
         type=str,
-        choices=('exact', 'substring', 'domain', 'regex','tag'),
+        choices=('exact', 'substring', 'domain', 'regex', 'tag', 'search'),
         default='exact',
         help='Type of pattern matching to use when filtering URLs',
     )
diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py
index 3a066e18..34e2c5ff 100644
--- a/archivebox/index/__init__.py
+++ b/archivebox/index/__init__.py
@@ -51,6 +51,8 @@ from .sql import (
     write_sql_link_details,
 )
 
+from ..search import search_backend_enabled, query_search_index
+
 ### Link filtering and checking
 
 @enforce_types
@@ -365,7 +367,7 @@ LINK_FILTERS = {
 }
 
 @enforce_types
-def snapshot_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type: str='exact') -> QuerySet:
+def q_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type: str='exact') -> QuerySet:
     q_filter = Q()
     for pattern in filter_patterns:
         try:
@@ -380,6 +382,36 @@ def snapshot_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type
             raise SystemExit(2)
     return snapshots.filter(q_filter)
 
+def search_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type: str='search') -> QuerySet:
+    if not search_backend_enabled():
+        stderr()
+        stderr(
+                '[X] The search backend is not enabled',
+                color='red',
+            )
+        raise SystemExit(2)
+
+    qsearch = get_empty_snapshot_queryset()
+    for pattern in filter_patterns:
+        try:
+            qsearch |= query_search_index(pattern)
+        except Exception as err:
+            stderr()
+            stderr(
+                f'[X] The search backend threw an exception={err}:',
+                color='red',
+            )
+            raise SystemExit(2)
+
+    return snapshots & qsearch
+
+@enforce_types
+def snapshot_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type: str='exact') -> QuerySet:
+    if filter_type != 'search':
+        return q_filter(snapshots, filter_patterns, filter_type)
+    else:
+        return search_filter(snapshots, filter_patterns, filter_type)
+
 
 def get_indexed_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """indexed links without checking archive status or data directory validity"""
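With snapshot_filter() now dispatching to search_filter(), full-text queries become available from the CLI; a query along these lines should hit the search backend (assuming it is enabled and reachable):

    archivebox list --filter-type=search 'some phrase that appears in an archived page'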
From 0773f12034239304aea3dbccf61edcf0392201f4 Mon Sep 17 00:00:00 2001
From: JDC
Date: Fri, 20 Nov 2020 10:29:28 -0500
Subject: [PATCH 09/27] Add sonic to docker-compose

---
 docker-compose.yml   | 11 ++++++++
 etc/sonic/config.cfg | 66 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 77 insertions(+)
 create mode 100644 etc/sonic/config.cfg

diff --git a/docker-compose.yml b/docker-compose.yml
index 5fe91026..4e121621 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -21,8 +21,19 @@ services:
         environment:
             - USE_COLOR=True
             - SHOW_PROGRESS=False
+            - SEARCH_BACKEND_HOST_NAME=sonic
         volumes:
             - ./data:/data
+        depends_on:
+            - sonic
+
+    sonic:
+        image: valeriansaliou/sonic:v1.3.0
+        ports:
+            - 1491:1491
+        volumes:
+            - ./etc/sonic/config.cfg:/etc/sonic.cfg
+            - ./data:/var/lib/sonic/store/
 
 
     # Optional Addons: tweak these examples as needed for your specific use case
diff --git a/etc/sonic/config.cfg b/etc/sonic/config.cfg
new file mode 100644
index 00000000..b3dd5898
--- /dev/null
+++ b/etc/sonic/config.cfg
@@ -0,0 +1,66 @@
+# Sonic
+# Fast, lightweight and schema-less search backend
+# Configuration file
+# Example: https://github.com/valeriansaliou/sonic/blob/master/config.cfg
+
+
+[server]
+
+log_level = "debug"
+
+
+[channel]
+
+inet = "0.0.0.0:1491"
+tcp_timeout = 300
+
+auth_password = "SecretPassword"
+
+[channel.search]
+
+query_limit_default = 10
+query_limit_maximum = 100
+query_alternates_try = 4
+
+suggest_limit_default = 5
+suggest_limit_maximum = 20
+
+
+[store]
+
+[store.kv]
+
+path = "/var/lib/sonic/store/kv/"
+
+retain_word_objects = 1000
+
+[store.kv.pool]
+
+inactive_after = 1800
+
+[store.kv.database]
+
+flush_after = 900
+
+compress = true
+parallelism = 2
+max_files = 100
+max_compactions = 1
+max_flushes = 1
+write_buffer = 16384
+write_ahead_log = true
+
+[store.fst]
+
+path = "/var/lib/sonic/store/fst/"
+
+[store.fst.pool]
+
+inactive_after = 300
+
+[store.fst.graph]
+
+consolidate_after = 180
+
+max_size = 2048
+max_words = 250000

From a38e3e0c90ad8954dfe151e83c68af9c04cf4f42 Mon Sep 17 00:00:00 2001
From: JDC
Date: Fri, 20 Nov 2020 11:51:44 -0500
Subject: [PATCH 10/27] Get search backend password from env var SEARCH_BACKEND_PASSWORD

---
 docker-compose.yml   | 3 +++
 etc/sonic/config.cfg | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/docker-compose.yml b/docker-compose.yml
index 4e121621..29fc6f7a 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -22,6 +22,7 @@ services:
             - USE_COLOR=True
             - SHOW_PROGRESS=False
             - SEARCH_BACKEND_HOST_NAME=sonic
+            - SEARCH_BACKEND_PASSWORD=SecretPassword
         volumes:
             - ./data:/data
         depends_on:
@@ -30,6 +31,8 @@ services:
         image: valeriansaliou/sonic:v1.3.0
         ports:
             - 1491:1491
+        environment:
+            - SEARCH_BACKEND_PASSWORD=SecretPassword
         volumes:
             - ./etc/sonic/config.cfg:/etc/sonic.cfg
             - ./data:/var/lib/sonic/store/
diff --git a/etc/sonic/config.cfg b/etc/sonic/config.cfg
index b3dd5898..4fb374b4 100644
--- a/etc/sonic/config.cfg
+++ b/etc/sonic/config.cfg
@@ -14,7 +14,7 @@ log_level = "debug"
 inet = "0.0.0.0:1491"
 tcp_timeout = 300
 
-auth_password = "SecretPassword"
+auth_password = "${env.SEARCH_BACKEND_PASSWORD}"
 
 [channel.search]

From 9bd40ed7f6055f1a60597eb63836984dec6651fb Mon Sep 17 00:00:00 2001
From: JDC
Date: Fri, 20 Nov 2020 15:27:39 -0500
Subject: [PATCH 11/27] Max out number of queries

---
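Sonic's search channel truncates results at query_limit_maximum (previously 100, returning 10 per query by default), which would silently cap how many snapshots a search could ever return; raising both limits to 65535 effectively removes the cap for this use case.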
 etc/sonic/config.cfg | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/etc/sonic/config.cfg b/etc/sonic/config.cfg
index 4fb374b4..45806ed1 100644
--- a/etc/sonic/config.cfg
+++ b/etc/sonic/config.cfg
@@ -18,9 +18,9 @@ auth_password = "${env.SEARCH_BACKEND_PASSWORD}"
 
 [channel.search]
 
-query_limit_default = 10
-query_limit_maximum = 100
-query_alternates_try = 4
+query_limit_default = 65535
+query_limit_maximum = 65535
+query_alternates_try = 10
 
 suggest_limit_default = 5
 suggest_limit_maximum = 20

From 0ed53cc1177484b7dbdf2a3aefe4a4c18a2c4ced Mon Sep 17 00:00:00 2001
From: JDC
Date: Sat, 21 Nov 2020 08:22:18 -0500
Subject: [PATCH 12/27] Add search filter type for `update`

---
 archivebox/cli/archivebox_update.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/archivebox/cli/archivebox_update.py b/archivebox/cli/archivebox_update.py
index aa8cae1b..d9a94235 100644
--- a/archivebox/cli/archivebox_update.py
+++ b/archivebox/cli/archivebox_update.py
@@ -91,7 +91,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
     parser.add_argument(
         '--filter-type',
         type=str,
-        choices=('exact', 'substring', 'domain', 'regex'),
+        choices=('exact', 'substring', 'domain', 'regex', 'search'),
         default='exact',
         help='Type of pattern matching to use when filtering URLs',
     )

From 4eeedae8151c6677253b509ddcb7ec2e9086284d Mon Sep 17 00:00:00 2001
From: JDC
Date: Sat, 21 Nov 2020 09:37:13 -0500
Subject: [PATCH 13/27] Exception handling for indexing and searching

---
 archivebox/index/__init__.py  |  9 ++-----
 archivebox/search/__init__.py | 50 +++++++++++++++++++++++++----------
 2 files changed, 38 insertions(+), 21 deletions(-)

diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py
index 34e2c5ff..bf1d0c6a 100644
--- a/archivebox/index/__init__.py
+++ b/archivebox/index/__init__.py
@@ -386,7 +386,7 @@ def search_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type:
     if not search_backend_enabled():
         stderr()
         stderr(
-                '[X] The search backend is not enabled',
+                '[X] The search backend is not enabled, set config.USE_SEARCHING_BACKEND = True',
                 color='red',
             )
         raise SystemExit(2)
@@ -395,12 +395,7 @@ def search_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type:
     for pattern in filter_patterns:
         try:
             qsearch |= query_search_index(pattern)
-        except Exception as err:
-            stderr()
-            stderr(
-                f'[X] The search backend threw an exception={err}:',
-                color='red',
-            )
+        except:
             raise SystemExit(2)
 
     return snapshots & qsearch
diff --git a/archivebox/search/__init__.py b/archivebox/search/__init__.py
index 2a1f4dcd..fdf19a89 100644
--- a/archivebox/search/__init__.py
+++ b/archivebox/search/__init__.py
@@ -6,7 +6,7 @@ from django.db.models import QuerySet
 
 from archivebox.index.schema import Link
 from archivebox.util import enforce_types
-from archivebox.config import setup_django, OUTPUT_DIR, USE_INDEXING_BACKEND, USE_SEARCHING_BACKEND, SEARCH_BACKEND_ENGINE
+from archivebox.config import setup_django,stderr, OUTPUT_DIR, USE_INDEXING_BACKEND, USE_SEARCHING_BACKEND, SEARCH_BACKEND_ENGINE
 
 def indexing_enabled():
     return USE_INDEXING_BACKEND
@@ -37,21 +37,37 @@ def write_search_index(link: Link, texts: Union[List[str], None]=None, out_dir:
         snap = Snapshot.objects.filter(url=link.url).first()
         backend = import_backend()
         if snap:
-            backend.index(snapshot_id=str(snap.id), texts=texts)
+            try:
+                backend.index(snapshot_id=str(snap.id), texts=texts)
+            except Exception as err:
+                stderr()
+                stderr(
+                    f'[X] The search backend threw an exception={err}:',
+                    color='red',
+                )
 
 @enforce_types
-def query_search_index(query: str, out_dir: Path=OUTPUT_DIR) -> QuerySet:
-    if search_backend_enabled():
-        setup_django(out_dir, check_db=True)
-        from core.models import Snapshot
+def query_search_index(query: str, out_dir: Path=OUTPUT_DIR) -> QuerySet:
+    setup_django(out_dir, check_db=True)
+    from core.models import Snapshot
 
+    if search_backend_enabled():
         backend = import_backend()
-        snapshot_ids = backend.search(query)
-        # TODO preserve ordering from backend
-        qsearch = Snapshot.objects.filter(pk__in=snapshot_ids)
-        return qsearch
-    else:
-        return Snapshot.objects.none()
+        try:
+            snapshot_ids = backend.search(query)
+        except Exception as err:
+            stderr()
+            stderr(
+                f'[X] The search backend threw an exception={err}:',
+                color='red',
+            )
+            raise
+        else:
+            # TODO preserve ordering from backend
+            qsearch = Snapshot.objects.filter(pk__in=snapshot_ids)
+            return qsearch
+
+    return Snapshot.objects.none()
 
 @enforce_types
 def flush_search_index(snapshots: QuerySet):
@@ -59,5 +75,11 @@ def flush_search_index(snapshots: QuerySet):
         return
     backend = import_backend()
     snapshot_ids=(str(pk) for pk in snapshots.values_list('pk',flat=True))
-
-    backend.flush(snapshot_ids)
+    try:
+        backend.flush(snapshot_ids)
+    except Exception as err:
+        stderr()
+        stderr(
+            f'[X] The search backend threw an exception={err}:',
+            color='red',
+        )
From 70cc0c1950c4fb4bdc8edbc3f932d9500cf35283 Mon Sep 17 00:00:00 2001
From: JDC
Date: Sat, 21 Nov 2020 13:02:35 -0500
Subject: [PATCH 14/27] Add search filter-type

---
 archivebox/cli/archivebox_update.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/archivebox/cli/archivebox_update.py b/archivebox/cli/archivebox_update.py
index d9a94235..aa8cae1b 100644
--- a/archivebox/cli/archivebox_update.py
+++ b/archivebox/cli/archivebox_update.py
@@ -91,7 +91,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
     parser.add_argument(
         '--filter-type',
         type=str,
-        choices=('exact', 'substring', 'domain', 'regex', 'search'),
+        choices=('exact', 'substring', 'domain', 'regex'),
         default='exact',
         help='Type of pattern matching to use when filtering URLs',
     )

From c5b1b91708b9a66eb508b8d22f7686f6711c5747 Mon Sep 17 00:00:00 2001
From: JDC
Date: Sat, 21 Nov 2020 13:02:58 -0500
Subject: [PATCH 15/27] fix: flush_search_index must be called before removing snapshots

---
 archivebox/main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/archivebox/main.py b/archivebox/main.py
index d533d58d..73278702 100644
--- a/archivebox/main.py
+++ b/archivebox/main.py
@@ -665,8 +665,8 @@ def remove(filter_str: Optional[str]=None,
 
     to_remove = snapshots.count()
 
-    remove_from_sql_main_index(snapshots=snapshots, out_dir=out_dir)
     flush_search_index(snapshots=snapshots)
+    remove_from_sql_main_index(snapshots=snapshots, out_dir=out_dir)
     all_snapshots = load_main_index(out_dir=out_dir)
     log_removal_finished(all_snapshots.count(), to_remove)

From 8484bdb9739a949311fd666eb7c7fe7f5fde6f3d Mon Sep 17 00:00:00 2001
From: JDC
Date: Sat, 21 Nov 2020 13:06:51 -0500
Subject: [PATCH 16/27] Fix add search filter to update

---
 archivebox/cli/archivebox_update.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/archivebox/cli/archivebox_update.py b/archivebox/cli/archivebox_update.py
index aa8cae1b..d9a94235 100644
--- a/archivebox/cli/archivebox_update.py
+++ b/archivebox/cli/archivebox_update.py
@@ -91,7 +91,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
     parser.add_argument(
         '--filter-type',
         type=str,
-        choices=('exact', 'substring', 'domain', 'regex'),
+        choices=('exact', 'substring', 'domain', 'regex', 'search'),
         default='exact',
         help='Type of pattern matching to use when filtering URLs',
     )

From 95382b381203e92dda76286a30a934ca2cca1ba5 Mon Sep 17 00:00:00 2001
From: JDC
Date: Sun, 22 Nov 2020 20:56:24 -0500
Subject: [PATCH 17/27] Add ripgrep rg search backend and set as default

---
 Dockerfile                            |  2 +-
 archivebox/config.py                  |  2 +-
 archivebox/search/backends/ripgrep.py | 43 +++++++++++++++++++++++++++
 docker-compose.yml                    | 24 +++++++--------
 4 files changed, 56 insertions(+), 15 deletions(-)
 create mode 100644 archivebox/search/backends/ripgrep.py

diff --git a/Dockerfile b/Dockerfile
index 33d4a488..20a410e2 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -46,7 +46,7 @@ RUN apt-get update -qq \
 # Install apt dependencies
 RUN apt-get update -qq \
     && apt-get install -qq -y --no-install-recommends \
-       wget curl chromium git ffmpeg youtube-dl \
+       wget curl chromium git ffmpeg youtube-dl ripgrep \
        fontconfig fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-kacst fonts-symbola fonts-noto fonts-freefont-ttf \
     && rm -rf /var/lib/apt/lists/*

diff --git a/archivebox/config.py b/archivebox/config.py
index ee2f0b4a..846df0c9 100644
--- a/archivebox/config.py
+++ b/archivebox/config.py
@@ -142,7 +142,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
     'SEARCH_BACKEND_CONFIG' : {
         'USE_INDEXING_BACKEND': {'type': bool, 'default': True},
         'USE_SEARCHING_BACKEND': {'type': bool, 'default': True},
-        'SEARCH_BACKEND_ENGINE': {'type': str, 'default': 'sonic'},
+        'SEARCH_BACKEND_ENGINE': {'type': str, 'default': 'ripgrep'},
         'SEARCH_BACKEND_HOST_NAME': {'type': str, 'default': 'localhost'},
         'SEARCH_BACKEND_PORT': {'type': int, 'default': 1491},
         'SEARCH_BACKEND_PASSWORD': {'type': str, 'default': 'SecretPassword'},
diff --git a/archivebox/search/backends/ripgrep.py b/archivebox/search/backends/ripgrep.py
new file mode 100644
index 00000000..cd9ecfee
--- /dev/null
+++ b/archivebox/search/backends/ripgrep.py
@@ -0,0 +1,43 @@
+import re
+from subprocess import run, PIPE, DEVNULL
+from typing import List, Generator
+
+from archivebox.config import setup_django, ARCHIVE_DIR, ARCHIVE_DIR_NAME
+from archivebox.util import enforce_types
+
+DEFAULT_ARGUMENTS = '-ilt' # Case insensitive, matching files, types
+DEFAULT_EXTENSIONS = 'html'
+REGEX_ARGUMENT = '-e'
+
+TIMESTAMP_REGEX = r'\/([\d]+\.[\d]+)\/'
+
+ts_regex = re.compile(TIMESTAMP_REGEX)
+
+@enforce_types
+def index(snapshot_id: str, texts: List[str]):
+    return
+
+@enforce_types
+def flush(snapshot_ids: Generator[str, None, None]):
+    return
+
+@enforce_types
+def search(text: str) -> List[str]:
+    is_rg_installed = run(['which', 'rg'], stdout=DEVNULL, stderr=DEVNULL)
+    if is_rg_installed.returncode:
+        raise Exception("rg binary not found, install ripgrep to use this backend")
+
+    setup_django(check_db=True)
+    from core.models import Snapshot
+
+    rg = run(['rg',DEFAULT_ARGUMENTS, DEFAULT_EXTENSIONS, REGEX_ARGUMENT, text, str(ARCHIVE_DIR)],stdout=PIPE, stderr=PIPE, timeout=60)
+    file_paths = [p.decode().replace(str(ARCHIVE_DIR_NAME), '') for p in rg.stdout.splitlines()]
+    timestamps = set()
+    for path in file_paths:
+        if ts := ts_regex.findall(path):
+            timestamps.add(ts[0])
+
+    snap_ids = [str(id) for id in Snapshot.objects.filter(timestamp__in=timestamps).values_list('pk', flat=True)]
+
+    return snap_ids
+
diff --git a/docker-compose.yml b/docker-compose.yml
index 29fc6f7a..c76f734a 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -21,21 +21,8 @@ services:
         environment:
             - USE_COLOR=True
             - SHOW_PROGRESS=False
-            - SEARCH_BACKEND_HOST_NAME=sonic
-            - SEARCH_BACKEND_PASSWORD=SecretPassword
         volumes:
             - ./data:/data
-        depends_on:
-            - sonic
-
-    sonic:
-        image: valeriansaliou/sonic:v1.3.0
-        ports:
-            - 1491:1491
-        environment:
-            - SEARCH_BACKEND_PASSWORD=SecretPassword
-        volumes:
-            - ./etc/sonic/config.cfg:/etc/sonic.cfg
-            - ./data:/var/lib/sonic/store/
 
 
 
@@ -87,3 +74,14 @@ services:
     #     volumes:
     #         ./data:/archivebox
     #         ./data/wayback:/webarchive
+
+    # Example: Run sonic search backend
+    # sonic:
+    #     image: valeriansaliou/sonic:v1.3.0
+    #     ports:
+    #         - 1491:1491
+    #     environment:
+    #         - SEARCH_BACKEND_PASSWORD=SecretPassword
+    #     volumes:
+    #         - ./etc/sonic/config.cfg:/etc/sonic.cfg
+    #         - ./data:/var/lib/sonic/store/
\ No newline at end of file
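For reference, the ripgrep backend above shells out to rg and maps matching file paths back to snapshot timestamps; the command it builds is roughly equivalent to running the following by hand against the archive directory (path shown is illustrative):

    rg -ilt html -e 'search term' ./archive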
From 23a9beb4e00ad954af8476c3e3c71e9d068f00a1 Mon Sep 17 00:00:00 2001
From: JDC
Date: Mon, 23 Nov 2020 08:26:12 -0500
Subject: [PATCH 18/27] Add ignored extensions in ripgrep search

---
 archivebox/search/backends/ripgrep.py | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/archivebox/search/backends/ripgrep.py b/archivebox/search/backends/ripgrep.py
index cd9ecfee..07292e37 100644
--- a/archivebox/search/backends/ripgrep.py
+++ b/archivebox/search/backends/ripgrep.py
@@ -2,12 +2,15 @@ import re
 from subprocess import run, PIPE, DEVNULL
 from typing import List, Generator
 
-from archivebox.config import setup_django, ARCHIVE_DIR, ARCHIVE_DIR_NAME
+from archivebox.config import setup_django, ARCHIVE_DIR
 from archivebox.util import enforce_types
 
-DEFAULT_ARGUMENTS = '-ilt' # Case insensitive, matching files, types
-DEFAULT_EXTENSIONS = 'html'
-REGEX_ARGUMENT = '-e'
+RG_IGNORE_EXTENSIONS = ('css','js','orig','svg')
+
+RG_ADD_TYPE = '--type-add'
+RG_IGNORE_ARGUMENTS = f"ignore:*.{{{','.join(RG_IGNORE_EXTENSIONS)}}}"
+RG_DEFAULT_ARGUMENTS = "-ilTignore" # Case insensitive(i), matching files results(l)
+RG_REGEX_ARGUMENT = '-e'
 
 TIMESTAMP_REGEX = r'\/([\d]+\.[\d]+)\/'
 
@@ -28,13 +31,14 @@ def search(text: str) -> List[str]:
     is_rg_installed = run(['which', 'rg'], stdout=DEVNULL, stderr=DEVNULL)
     if is_rg_installed.returncode:
-        raise Exception("rg binary not found, install ripgrep to use this backend")
+        raise Exception("ripgrep binary not found, install ripgrep to use this search backend")
 
     setup_django(check_db=True)
     from core.models import Snapshot
 
-    rg = run(['rg',DEFAULT_ARGUMENTS, DEFAULT_EXTENSIONS, REGEX_ARGUMENT, text, str(ARCHIVE_DIR)],stdout=PIPE, stderr=PIPE, timeout=60)
-    file_paths = [p.decode().replace(str(ARCHIVE_DIR_NAME), '') for p in rg.stdout.splitlines()]
+    rg_cmd = ['rg', RG_ADD_TYPE, RG_IGNORE_ARGUMENTS, RG_DEFAULT_ARGUMENTS, RG_REGEX_ARGUMENT, text, str(ARCHIVE_DIR)]
+    rg = run(rg_cmd, stdout=PIPE, stderr=PIPE, timeout=60)
+    file_paths = [p.decode() for p in rg.stdout.splitlines()]
     timestamps = set()
     for path in file_paths:
         if ts := ts_regex.findall(path):

From 7903db6dfb15b7f6d601885b8920f7539a8cdec7 Mon Sep 17 00:00:00 2001
From: JDC
Date: Mon, 23 Nov 2020 13:04:38 -0500
Subject: [PATCH 19/27] Add ArchiveResult Manager and sorted indexable filter

---
 archivebox/core/models.py         | 17 ++++++++++++++---
 archivebox/extractors/__init__.py |  3 +++
 2 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/archivebox/core/models.py b/archivebox/core/models.py
index 5555c798..fe2d05ab 100644
--- a/archivebox/core/models.py
+++ b/archivebox/core/models.py
@@ -5,10 +5,11 @@ import uuid
 
 from django.db import models, transaction
 from django.utils.functional import cached_property
 from django.utils.text import slugify
+from django.db.models import Case, When, Value, IntegerField
 
 from ..util import parse_date
 from ..index.schema import Link
-from ..extractors import get_default_archive_methods
+from ..extractors import get_default_archive_methods, ARCHIVE_METHODS_INDEXING_PRECEDENCE
 
 EXTRACTORS = [(extractor[0], extractor[0]) for extractor in get_default_archive_methods()]
 STATUS_CHOICES = [
@@ -91,7 +92,7 @@ class Snapshot(models.Model):
         return {
             key: getattr(self, key)
             if key != 'tags' else self.tags_str()
-            for key in args 
+            for key in args
         }
 
     def as_link(self) -> Link:
@@ -100,7 +101,7 @@ class Snapshot(models.Model):
     def as_link_with_details(self) -> Link:
         from ..index import load_link_details
         return load_link_details(self.as_link())
-    
+
     def tags_str(self) -> str:
         return ','.join(self.tags.order_by('name').values_list('name', flat=True))
 
@@ -157,7 +158,15 @@ class Snapshot(models.Model):
             self.tags.clear()
         self.tags.add(*tags_id)
 
+class ArchiveResultManager(models.Manager):
+    def indexable(self, sorted: bool = True):
+        INDEXABLE_METHODS = [ r[0] for r in ARCHIVE_METHODS_INDEXING_PRECEDENCE ]
+        qs = self.get_queryset().filter(extractor__in=INDEXABLE_METHODS,status='succeeded')
 
+        if sorted:
+            precedence = [ When(extractor=method, then=Value(precedence)) for method, precedence in ARCHIVE_METHODS_INDEXING_PRECEDENCE ]
+            qs = qs.annotate(indexing_precedence=Case(*precedence, default=Value(1000),output_field=IntegerField())).order_by('indexing_precedence')
+        return qs
 class ArchiveResult(models.Model):
     snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE)
     cmd = models.JSONField()
@@ -169,5 +178,7 @@ class ArchiveResult(models.Model):
     status = models.CharField(max_length=16, choices=STATUS_CHOICES)
     extractor = models.CharField(choices=EXTRACTORS, max_length=32)
 
+    objects = ArchiveResultManager()
+
     def __str__(self):
         return self.extractor
diff --git a/archivebox/extractors/__init__.py b/archivebox/extractors/__init__.py
index 0cf6d90d..ceef3b51 100644
--- a/archivebox/extractors/__init__.py
+++ b/archivebox/extractors/__init__.py
@@ -39,6 +39,7 @@ from .media import should_save_media, save_media
 from .archive_org import should_save_archive_dot_org, save_archive_dot_org
 from .headers import should_save_headers, save_headers
 
+
 def get_default_archive_methods():
     return [
         ('title', should_save_title, save_title),
@@ -56,6 +57,8 @@ def get_default_archive_methods():
         ('archive_org', should_save_archive_dot_org, save_archive_dot_org),
     ]
 
+ARCHIVE_METHODS_INDEXING_PRECEDENCE = [('readability', 1), ('singlefile', 2), ('dom', 3), ('wget', 4)]
+
 @enforce_types
 def ignore_methods(to_ignore: List[str]):
     ARCHIVE_METHODS = get_default_archive_methods()

From 273c9d91c6dfddfdb25888173e50786c28c242b3 Mon Sep 17 00:00:00 2001
From: JDC
Date: Mon, 23 Nov 2020 13:41:35 -0500
Subject: [PATCH 20/27] Add tag filter to update command

---
 archivebox/cli/archivebox_update.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/archivebox/cli/archivebox_update.py b/archivebox/cli/archivebox_update.py
index d9a94235..6748096e 100644
--- a/archivebox/cli/archivebox_update.py
+++ b/archivebox/cli/archivebox_update.py
@@ -91,7 +91,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
     parser.add_argument(
         '--filter-type',
         type=str,
-        choices=('exact', 'substring', 'domain', 'regex', 'search'),
+        choices=('exact', 'substring', 'domain', 'regex', 'tag', 'search'),
         default='exact',
         help='Type of pattern matching to use when filtering URLs',
     )
From caf4660ac86153632c76de247b6ac8579d06de31 Mon Sep 17 00:00:00 2001
From: JDC
Date: Mon, 23 Nov 2020 15:51:59 -0500
Subject: [PATCH 21/27] Add indexing to update command and utilities

---
 archivebox/main.py            |  3 ++-
 archivebox/search/__init__.py | 16 +++++++++++++++
 archivebox/search/utils.py    | 38 +++++++++++++++++++++++++++++++++++
 3 files changed, 56 insertions(+), 1 deletion(-)
 create mode 100644 archivebox/search/utils.py

diff --git a/archivebox/main.py b/archivebox/main.py
index 73278702..bb24d124 100644
--- a/archivebox/main.py
+++ b/archivebox/main.py
@@ -115,7 +115,7 @@ from .logging_util import (
     printable_dependency_version,
 )
 
-from .search import flush_search_index
+from .search import flush_search_index, index_links
 
 ALLOWED_IN_OUTPUT_DIR = {
     'lost+found',
@@ -711,6 +711,7 @@ def update(resume: Optional[float]=None,
     if index_only:
         for link in all_links:
             write_link_details(link, out_dir=out_dir, skip_sql_index=True)
+        index_links(all_links, out_dir=out_dir)
         return all_links
 
     # Step 2: Run the archive methods for each link
diff --git a/archivebox/search/__init__.py b/archivebox/search/__init__.py
index fdf19a89..537fa1ff 100644
--- a/archivebox/search/__init__.py
+++ b/archivebox/search/__init__.py
@@ -8,6 +8,8 @@ from archivebox.index.schema import Link
 from archivebox.util import enforce_types
 from archivebox.config import setup_django,stderr, OUTPUT_DIR, USE_INDEXING_BACKEND, USE_SEARCHING_BACKEND, SEARCH_BACKEND_ENGINE
 
+from .utils import get_indexable_content
+
 def indexing_enabled():
     return USE_INDEXING_BACKEND
 
@@ -83,3 +85,17 @@ def flush_search_index(snapshots: QuerySet):
             f'[X] The search backend threw an exception={err}:',
             color='red',
         )
+
+@enforce_types
+def index_links(links: Union[List[Link],None], out_dir: Path=OUTPUT_DIR):
+    if not links:
+        return
+
+    setup_django(out_dir=out_dir, check_db=True)
+    from core.models import Snapshot, ArchiveResult
+
+    for link in links:
+        if snap := Snapshot.objects.filter(url=link.url).first():
+            results = ArchiveResult.objects.indexable().filter(snapshot=snap)
+            texts = get_indexable_content(results)
+            write_search_index(link,texts,out_dir=out_dir)
diff --git a/archivebox/search/utils.py b/archivebox/search/utils.py
new file mode 100644
index 00000000..f2d86b2c
--- /dev/null
+++ b/archivebox/search/utils.py
@@ -0,0 +1,38 @@
+from django.db.models import QuerySet
+
+from archivebox.util import enforce_types
+
+def get_file_result_content(res, extra_path, use_pwd=False):
+    if use_pwd:
+        fpath = f'{res.pwd}/{res.output}'
+    else:
+        fpath = f'{res.output}'
+
+    if extra_path:
+        fpath = f'{fpath}/{extra_path}'
+
+    with open(fpath, 'r') as file:
+        data = file.read().replace('\n', '')
+    if data:
+        return [data]
+    return []
+
+
+# This should be abstracted by a plugin interface for extractors
+@enforce_types
+def get_indexable_content(results: QuerySet):
+    if not results:
+        return []
+    # Only use the first method available
+    res, method = results.first(), results.first().extractor
+    if method not in ('readability', 'singlefile', 'dom', 'wget'):
+        return []
+    # This should come from a plugin interface
+    if method == 'readability':
+        return get_file_result_content(res, 'content.txt')
+    elif method == 'singlefile':
+        return get_file_result_content(res, '')
+    elif method == 'dom':
+        return get_file_result_content(res,'',use_pwd=True)
+    elif method == 'wget':
+        return get_file_result_content(res,'',use_pwd=True)
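Because update() now calls index_links() on its index_only path, an existing collection can presumably be backfilled into the search index without re-fetching any content, e.g.:

    archivebox update --index-only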
From 0acf479b70421553b721f6ef040039fcf5362f7b Mon Sep 17 00:00:00 2001
From: JDC
Date: Mon, 23 Nov 2020 16:54:27 -0500
Subject: [PATCH 22/27] Partition long strings in chunks for sonic

---
 archivebox/search/__init__.py       | 2 +-
 archivebox/search/backends/sonic.py | 9 +++++++--
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/archivebox/search/__init__.py b/archivebox/search/__init__.py
index 537fa1ff..fa5d564d 100644
--- a/archivebox/search/__init__.py
+++ b/archivebox/search/__init__.py
@@ -98,4 +98,4 @@ def index_links(links: Union[List[Link],None], out_dir: Path=OUTPUT_DIR):
         if snap := Snapshot.objects.filter(url=link.url).first():
             results = ArchiveResult.objects.indexable().filter(snapshot=snap)
             texts = get_indexable_content(results)
-            write_search_index(link,texts,out_dir=out_dir)
+            write_search_index(link, texts, out_dir=out_dir)
diff --git a/archivebox/search/backends/sonic.py b/archivebox/search/backends/sonic.py
index 7dc4d5b0..affe9d20 100644
--- a/archivebox/search/backends/sonic.py
+++ b/archivebox/search/backends/sonic.py
@@ -5,13 +5,18 @@ from sonic import IngestClient, SearchClient
 from archivebox.util import enforce_types
 from archivebox.config import SEARCH_BACKEND_HOST_NAME, SEARCH_BACKEND_PORT, SEARCH_BACKEND_PASSWORD, SONIC_BUCKET, SONIC_COLLECTION
 
+MAX_SONIC_TEXT_LENGTH = 1000
 
 @enforce_types
 def index(snapshot_id: str, texts: List[str]):
     with IngestClient(SEARCH_BACKEND_HOST_NAME, SEARCH_BACKEND_PORT, SEARCH_BACKEND_PASSWORD) as ingestcl:
         for text in texts:
-            ingestcl.push(SONIC_COLLECTION, SONIC_BUCKET, snapshot_id, str(text))
-
+            if len(text) < MAX_SONIC_TEXT_LENGTH:
+                ingestcl.push(SONIC_COLLECTION, SONIC_BUCKET, snapshot_id, str(text))
+            else:
+                chunks = [text[i:i+MAX_SONIC_TEXT_LENGTH] for i in range(0, len(text), MAX_SONIC_TEXT_LENGTH)]
+                for chunk in chunks:
+                    ingestcl.push(SONIC_COLLECTION, SONIC_BUCKET, snapshot_id, str(chunk))
 @enforce_types
 def search(text: str) -> List[str]:
     with SearchClient(SEARCH_BACKEND_HOST_NAME, SEARCH_BACKEND_PORT, SEARCH_BACKEND_PASSWORD) as querycl:

From db9c2edccc5dc136bd79a3568574b67b4a63600b Mon Sep 17 00:00:00 2001
From: JDC
Date: Mon, 23 Nov 2020 17:23:26 -0500
Subject: [PATCH 23/27] Add log print for url indexing

---
 archivebox/search/__init__.py | 3 ++-
 archivebox/search/utils.py    | 8 +++++++-
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/archivebox/search/__init__.py b/archivebox/search/__init__.py
index fa5d564d..a262d926 100644
--- a/archivebox/search/__init__.py
+++ b/archivebox/search/__init__.py
@@ -8,7 +8,7 @@ from archivebox.index.schema import Link
 from archivebox.util import enforce_types
 from archivebox.config import setup_django,stderr, OUTPUT_DIR, USE_INDEXING_BACKEND, USE_SEARCHING_BACKEND, SEARCH_BACKEND_ENGINE
 
-from .utils import get_indexable_content
+from .utils import get_indexable_content, log_index_started
 
 def indexing_enabled():
     return USE_INDEXING_BACKEND
@@ -98,4 +98,5 @@ def index_links(links: Union[List[Link],None], out_dir: Path=OUTPUT_DIR):
         if snap := Snapshot.objects.filter(url=link.url).first():
             results = ArchiveResult.objects.indexable().filter(snapshot=snap)
             texts = get_indexable_content(results)
+            log_index_started(link.url)
             write_search_index(link, texts, out_dir=out_dir)
diff --git a/archivebox/search/utils.py b/archivebox/search/utils.py
index f2d86b2c..55c97e75 100644
--- a/archivebox/search/utils.py
+++ b/archivebox/search/utils.py
@@ -1,6 +1,11 @@
 from django.db.models import QuerySet
 
 from archivebox.util import enforce_types
+from archivebox.config import ANSI
+
+def log_index_started(url):
+    print('{green}[*] Indexing url: {} in the search index {reset}'.format(url, **ANSI))
+    print( )
 
 def get_file_result_content(res, extra_path, use_pwd=False):
     if use_pwd:
@@ -12,7 +17,7 @@ def get_file_result_content(res, extra_path, use_pwd=False):
         fpath = f'{fpath}/{extra_path}'
 
     with open(fpath, 'r') as file:
-        data = file.read().replace('\n', '')
+        data = file.read()
     if data:
         return [data]
     return []
@@ -28,6 +33,7 @@ def get_indexable_content(results: QuerySet):
     if method not in ('readability', 'singlefile', 'dom', 'wget'):
         return []
     # This should come from a plugin interface
+
     if method == 'readability':
         return get_file_result_content(res, 'content.txt')
     elif method == 'singlefile':

From 15fbd81480536bd7223096446b27f8666d7057e4 Mon Sep 17 00:00:00 2001
From: JDC
Date: Mon, 23 Nov 2020 18:17:07 -0500
Subject: [PATCH 24/27] Change MAX_SONIC_TEXT_LENGTH

---
 archivebox/search/backends/sonic.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/archivebox/search/backends/sonic.py b/archivebox/search/backends/sonic.py
index affe9d20..e34c6535 100644
--- a/archivebox/search/backends/sonic.py
+++ b/archivebox/search/backends/sonic.py
@@ -5,7 +5,7 @@ from sonic import IngestClient, SearchClient
 from archivebox.util import enforce_types
 from archivebox.config import SEARCH_BACKEND_HOST_NAME, SEARCH_BACKEND_PORT, SEARCH_BACKEND_PASSWORD, SONIC_BUCKET, SONIC_COLLECTION
 
-MAX_SONIC_TEXT_LENGTH = 1000
+MAX_SONIC_TEXT_LENGTH = 20000
 
 @enforce_types
 def index(snapshot_id: str, texts: List[str]):

From b1d70185ed0bf53b446da0ab54ae4bcf5fc6cb27 Mon Sep 17 00:00:00 2001
From: JDC
Date: Mon, 23 Nov 2020 18:33:32 -0500
Subject: [PATCH 25/27] Increase word_objects for Sonic default config

---
 etc/sonic/config.cfg | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/etc/sonic/config.cfg b/etc/sonic/config.cfg
index 45806ed1..10fbda53 100644
--- a/etc/sonic/config.cfg
+++ b/etc/sonic/config.cfg
@@ -32,7 +32,7 @@ suggest_limit_maximum = 20
 
 path = "/var/lib/sonic/store/kv/"
 
-retain_word_objects = 1000
+retain_word_objects = 100000
 
 [store.kv.pool]

From 5a6b814c7935ccc1571abd8d5b2487186cac96c7 Mon Sep 17 00:00:00 2001
From: jdcaballerov
Date: Tue, 24 Nov 2020 09:35:06 -0500
Subject: [PATCH 26/27] Add exception handling for indexable content reader

---
 archivebox/search/__init__.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/archivebox/search/__init__.py b/archivebox/search/__init__.py
index a262d926..ebeebcd0 100644
--- a/archivebox/search/__init__.py
+++ b/archivebox/search/__init__.py
@@ -97,6 +97,14 @@ def index_links(links: Union[List[Link],None], out_dir: Path=OUTPUT_DIR):
     for link in links:
         if snap := Snapshot.objects.filter(url=link.url).first():
             results = ArchiveResult.objects.indexable().filter(snapshot=snap)
-            texts = get_indexable_content(results)
             log_index_started(link.url)
-            write_search_index(link, texts, out_dir=out_dir)
+            try:
+                texts = get_indexable_content(results)
+            except Exception as err:
+                stderr()
+                stderr(
+                    f'[X] An Exception occurred reading the indexable content={err}:',
+                    color='red',
+                )
+            else:
+                write_search_index(link, texts, out_dir=out_dir)
\ No newline at end of file

From 172197ae01c080874ec83b190e536d986c6603c5 Mon Sep 17 00:00:00 2001
From: jdcaballerov
Date: Thu, 26 Nov 2020 18:12:54 -0500
Subject: [PATCH 27/27] refactor: Remove if LENGTH and use text chunker for every input

---
 archivebox/search/backends/sonic.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/archivebox/search/backends/sonic.py b/archivebox/search/backends/sonic.py
index e34c6535..f0beaddd 100644
--- a/archivebox/search/backends/sonic.py
+++ b/archivebox/search/backends/sonic.py
@@ -11,12 +11,10 @@ MAX_SONIC_TEXT_LENGTH = 20000
 def index(snapshot_id: str, texts: List[str]):
     with IngestClient(SEARCH_BACKEND_HOST_NAME, SEARCH_BACKEND_PORT, SEARCH_BACKEND_PASSWORD) as ingestcl:
         for text in texts:
-            if len(text) < MAX_SONIC_TEXT_LENGTH:
-                ingestcl.push(SONIC_COLLECTION, SONIC_BUCKET, snapshot_id, str(text))
-            else:
-                chunks = [text[i:i+MAX_SONIC_TEXT_LENGTH] for i in range(0, len(text), MAX_SONIC_TEXT_LENGTH)]
-                for chunk in chunks:
-                    ingestcl.push(SONIC_COLLECTION, SONIC_BUCKET, snapshot_id, str(chunk))
+            chunks = [text[i:i+MAX_SONIC_TEXT_LENGTH] for i in range(0, len(text), MAX_SONIC_TEXT_LENGTH)]
+            for chunk in chunks:
+                ingestcl.push(SONIC_COLLECTION, SONIC_BUCKET, snapshot_id, str(chunk))
+
 @enforce_types
 def search(text: str) -> List[str]:
     with SearchClient(SEARCH_BACKEND_HOST_NAME, SEARCH_BACKEND_PORT, SEARCH_BACKEND_PASSWORD) as querycl: