Mirror of https://github.com/ArchiveBox/ArchiveBox.git (synced 2025-05-13 14:44:29 -04:00)
Merge pull request #570 from ArchiveBox/sonic-search
Commit: 8d103687d0
20 changed files with 406 additions and 12 deletions
@@ -46,7 +46,7 @@ RUN apt-get update -qq \
 # Install apt dependencies
 RUN apt-get update -qq \
     && apt-get install -qq -y --no-install-recommends \
-    wget curl chromium git ffmpeg youtube-dl \
+    wget curl chromium git ffmpeg youtube-dl ripgrep \
     fontconfig fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-kacst fonts-symbola fonts-noto fonts-freefont-ttf \
     && rm -rf /var/lib/apt/lists/*
@@ -1 +0,0 @@
-pip_dist/archivebox.egg-info
@@ -98,7 +98,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
     parser.add_argument(
         '--filter-type',
         type=str,
-        choices=('exact', 'substring', 'domain', 'regex','tag'),
+        choices=('exact', 'substring', 'domain', 'regex', 'tag', 'search'),
         default='exact',
         help='Type of pattern matching to use when filtering URLs',
     )
@@ -91,7 +91,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
     parser.add_argument(
         '--filter-type',
         type=str,
-        choices=('exact', 'substring', 'domain', 'regex'),
+        choices=('exact', 'substring', 'domain', 'regex', 'tag', 'search'),
         default='exact',
         help='Type of pattern matching to use when filtering URLs',
     )
@@ -139,6 +139,18 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
         'GIT_ARGS': {'type': list, 'default': ['--recursive']},
     },
 
+    'SEARCH_BACKEND_CONFIG' : {
+        'USE_INDEXING_BACKEND': {'type': bool, 'default': True},
+        'USE_SEARCHING_BACKEND': {'type': bool, 'default': True},
+        'SEARCH_BACKEND_ENGINE': {'type': str, 'default': 'ripgrep'},
+        'SEARCH_BACKEND_HOST_NAME': {'type': str, 'default': 'localhost'},
+        'SEARCH_BACKEND_PORT': {'type': int, 'default': 1491},
+        'SEARCH_BACKEND_PASSWORD': {'type': str, 'default': 'SecretPassword'},
+        # SONIC
+        'SONIC_COLLECTION': {'type': str, 'default': 'archivebox'},
+        'SONIC_BUCKET': {'type': str, 'default': 'snapshots'},
+    },
+
     'DEPENDENCY_CONFIG': {
         'USE_CURL': {'type': bool, 'default': True},
         'USE_WGET': {'type': bool, 'default': True},

@@ -149,7 +161,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
         'USE_CHROME': {'type': bool, 'default': True},
         'USE_NODE': {'type': bool, 'default': True},
         'USE_YOUTUBEDL': {'type': bool, 'default': True},
 
         'CURL_BINARY': {'type': str, 'default': 'curl'},
         'GIT_BINARY': {'type': str, 'default': 'git'},
         'WGET_BINARY': {'type': str, 'default': 'wget'},
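Note: as a rough sketch (not part of this diff), the new SEARCH_BACKEND_CONFIG options follow the same pattern as the rest of CONFIG_DEFAULTS, so, assuming ArchiveBox's usual behaviour of reading overrides from environment variables of the same name, they could be switched roughly like this (values are illustrative; only 'ripgrep', 1491, and 'SecretPassword' are defaults from this PR):

import os

# hypothetical overrides, set before archivebox.config is loaded:
os.environ['SEARCH_BACKEND_ENGINE'] = 'sonic'             # PR default is 'ripgrep'
os.environ['SEARCH_BACKEND_HOST_NAME'] = 'localhost'
os.environ['SEARCH_BACKEND_PORT'] = '1491'
os.environ['SEARCH_BACKEND_PASSWORD'] = 'SecretPassword'  # should match sonic's auth_password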
@@ -14,6 +14,9 @@ from django import forms
 from core.models import Snapshot, Tag
 from core.forms import AddLinkForm, TagField
 
+from core.utils import get_icons
+from core.mixins import SearchResultsAdminMixin
+
 from index.html import snapshot_icons
 from util import htmldecode, urldecode, ansi_to_html
 from logging_util import printable_filesize

@@ -82,7 +85,7 @@ class SnapshotAdminForm(forms.ModelForm):
         return instance
 
 
-class SnapshotAdmin(admin.ModelAdmin):
+class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
     list_display = ('added', 'title_str', 'url_str', 'files', 'size')
     sort_fields = ('title_str', 'url_str', 'added')
     readonly_fields = ('id', 'url', 'timestamp', 'num_outputs', 'is_archived', 'url_hash', 'added', 'updated')
archivebox/core/mixins.py (new file, 23 lines)
@@ -0,0 +1,23 @@
+from django.contrib import messages
+
+from archivebox.search import query_search_index
+
+class SearchResultsAdminMixin(object):
+    def get_search_results(self, request, queryset, search_term):
+        ''' Enhances the search queryset with results from the search backend.
+        '''
+        qs, use_distinct = \
+            super(SearchResultsAdminMixin, self).get_search_results(
+                request, queryset, search_term)
+
+        search_term = search_term.strip()
+        if not search_term:
+            return qs, use_distinct
+        try:
+            qsearch = query_search_index(search_term)
+        except Exception as err:
+            messages.add_message(request, messages.WARNING, f'Error from the search backend, only showing results from default admin search fields - Error: {err}')
+        else:
+            qs |= qsearch
+        finally:
+            return qs, use_distinct
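Note: get_search_results() follows Django's standard ModelAdmin contract of returning (queryset, use_distinct); the interesting part is the `qs |= qsearch` union. A minimal sketch of what that combination does, assuming an initialized ArchiveBox Django context and a hypothetical query term (the 'title' admin search field is an assumption, not shown in this diff):

from core.models import Snapshot
from archivebox.search import query_search_index

admin_matches = Snapshot.objects.filter(title__icontains='climate')  # default admin field search
backend_matches = query_search_index('climate')                      # full-text hits from the search backend
combined = admin_matches | backend_matches                           # ORed into one queryset, as in the mixin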
@@ -5,10 +5,11 @@ import uuid
 from django.db import models, transaction
 from django.utils.functional import cached_property
 from django.utils.text import slugify
+from django.db.models import Case, When, Value, IntegerField
 
 from ..util import parse_date
 from ..index.schema import Link
-from ..extractors import get_default_archive_methods
+from ..extractors import get_default_archive_methods, ARCHIVE_METHODS_INDEXING_PRECEDENCE
 
 EXTRACTORS = [(extractor[0], extractor[0]) for extractor in get_default_archive_methods()]
 STATUS_CHOICES = [

@@ -91,7 +92,7 @@ class Snapshot(models.Model):
         return {
             key: getattr(self, key)
             if key != 'tags' else self.tags_str()
             for key in args
         }
 
     def as_link(self) -> Link:

@@ -100,7 +101,7 @@ class Snapshot(models.Model):
     def as_link_with_details(self) -> Link:
         from ..index import load_link_details
         return load_link_details(self.as_link())
 
     def tags_str(self) -> str:
         return ','.join(self.tags.order_by('name').values_list('name', flat=True))
 

@@ -157,7 +158,15 @@ class Snapshot(models.Model):
         self.tags.clear()
         self.tags.add(*tags_id)
 
+class ArchiveResultManager(models.Manager):
+    def indexable(self, sorted: bool = True):
+        INDEXABLE_METHODS = [ r[0] for r in ARCHIVE_METHODS_INDEXING_PRECEDENCE ]
+        qs = self.get_queryset().filter(extractor__in=INDEXABLE_METHODS,status='succeeded')
+
+        if sorted:
+            precedence = [ When(extractor=method, then=Value(precedence)) for method, precedence in ARCHIVE_METHODS_INDEXING_PRECEDENCE ]
+            qs = qs.annotate(indexing_precedence=Case(*precedence, default=Value(1000),output_field=IntegerField())).order_by('indexing_precedence')
+        return qs
 class ArchiveResult(models.Model):
     snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE)
     cmd = models.JSONField()

@@ -169,5 +178,7 @@ class ArchiveResult(models.Model):
     status = models.CharField(max_length=16, choices=STATUS_CHOICES)
     extractor = models.CharField(choices=EXTRACTORS, max_length=32)
 
+    objects = ArchiveResultManager()
+
     def __str__(self):
         return self.extractor
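Note: ArchiveResult.objects.indexable() only returns succeeded results from the full-text extractors and, via the Case/When annotation, orders them by ARCHIVE_METHODS_INDEXING_PRECEDENCE. A hedged usage sketch, assuming an initialized ArchiveBox Django context with some archived data:

from core.models import Snapshot, ArchiveResult

snap = Snapshot.objects.first()                                    # any existing snapshot
results = ArchiveResult.objects.indexable().filter(snapshot=snap)
# extractors come back best-first: readability, singlefile, dom, wget
print(list(results.values_list('extractor', flat=True)))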
@@ -23,6 +23,7 @@ from ..logging_util import (
     log_archive_method_started,
     log_archive_method_finished,
 )
+from ..search import write_search_index
 
 from .title import should_save_title, save_title
 from .favicon import should_save_favicon, save_favicon

@@ -38,6 +39,7 @@ from .media import should_save_media, save_media
 from .archive_org import should_save_archive_dot_org, save_archive_dot_org
 from .headers import should_save_headers, save_headers
 
+
 def get_default_archive_methods():
     return [
         ('title', should_save_title, save_title),

@@ -55,6 +57,8 @@ def get_default_archive_methods():
         ('archive_org', should_save_archive_dot_org, save_archive_dot_org),
     ]
 
+ARCHIVE_METHODS_INDEXING_PRECEDENCE = [('readability', 1), ('singlefile', 2), ('dom', 3), ('wget', 4)]
+
 @enforce_types
 def ignore_methods(to_ignore: List[str]):
     ARCHIVE_METHODS = get_default_archive_methods()

@@ -107,6 +111,7 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
                 link.history[method_name].append(result)
 
                 stats[result.status] += 1
+                write_search_index(link=link, texts=result.index_texts)
                 log_archive_method_finished(result)
                 if not skip_index:
                     ArchiveResult.objects.create(snapshot=snapshot, extractor=method_name, cmd=result.cmd, cmd_version=result.cmd_version,
@@ -71,6 +71,7 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
         CURL_BINARY,
         link.url
     ]
+    readability_content = None
     timer = TimedProgress(timeout, prefix=' ')
     try:
         document = get_html(link, out_dir)

@@ -86,8 +87,9 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
         result = run(cmd, cwd=out_dir, timeout=timeout)
         result_json = json.loads(result.stdout)
         output_folder.mkdir(exist_ok=True)
+        readability_content = result_json.pop("textContent")
         atomic_write(str(output_folder / "content.html"), result_json.pop("content"))
-        atomic_write(str(output_folder / "content.txt"), result_json.pop("textContent"))
+        atomic_write(str(output_folder / "content.txt"), readability_content)
         atomic_write(str(output_folder / "article.json"), result_json)
 
         # parse out number of files downloaded from last line of stderr:

@@ -117,5 +119,6 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
         cmd_version=READABILITY_VERSION,
         output=output,
         status=status,
+        index_texts= [readability_content] if readability_content else [],
         **timer.stats,
     )
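Note: a standalone illustration (not from this diff) of the result_json handling above; the dict values are made-up placeholders:

result_json = {"title": "Example", "content": "<div>example html</div>", "textContent": "plain text body"}
readability_content = result_json.pop("textContent")   # written to content.txt and later passed as index_texts
content_html = result_json.pop("content")              # written to content.html
# article.json now only carries the remaining metadata:
assert set(result_json) == {"title"}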
@@ -51,6 +51,8 @@ from .sql import (
     write_sql_link_details,
 )
 
+from ..search import search_backend_enabled, query_search_index
+
 ### Link filtering and checking
 
 @enforce_types

@@ -365,7 +367,7 @@ LINK_FILTERS = {
 }
 
 @enforce_types
-def snapshot_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type: str='exact') -> QuerySet:
+def q_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type: str='exact') -> QuerySet:
     q_filter = Q()
     for pattern in filter_patterns:
         try:

@@ -380,6 +382,31 @@ def snapshot_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type
             raise SystemExit(2)
     return snapshots.filter(q_filter)
 
+def search_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type: str='search') -> QuerySet:
+    if not search_backend_enabled():
+        stderr()
+        stderr(
+                '[X] The search backend is not enabled, set config.USE_SEARCHING_BACKEND = True',
+                color='red',
+            )
+        raise SystemExit(2)
+
+    qsearch = get_empty_snapshot_queryset()
+    for pattern in filter_patterns:
+        try:
+            qsearch |= query_search_index(pattern)
+        except:
+            raise SystemExit(2)
+
+    return snapshots & qsearch
+
+@enforce_types
+def snapshot_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type: str='exact') -> QuerySet:
+    if filter_type != 'search':
+        return q_filter(snapshots, filter_patterns, filter_type)
+    else:
+        return search_filter(snapshots, filter_patterns, filter_type)
+
+
 def get_indexed_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """indexed links without checking archive status or data directory validity"""
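Note: snapshot_filter() now dispatches on filter_type, which is where the new 'search' choice added to --filter-type above ends up. A hedged usage sketch (not from this diff), assuming an initialized ArchiveBox Django context; the patterns are illustrative:

from core.models import Snapshot
from archivebox.index import snapshot_filter

exact = snapshot_filter(Snapshot.objects.all(), ['https://example.com'], filter_type='exact')   # Q()-based q_filter path
fulltext = snapshot_filter(Snapshot.objects.all(), ['climate change'], filter_type='search')    # search backend path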
@@ -39,6 +39,7 @@ class ArchiveResult:
     status: str
     start_ts: datetime
     end_ts: datetime
+    index_texts: Union[List[str], None] = None
     schema: str = 'ArchiveResult'
 
     def __post_init__(self):
@@ -115,6 +115,7 @@ from .logging_util import (
     printable_dependency_version,
 )
 
+from .search import flush_search_index, index_links
 
 ALLOWED_IN_OUTPUT_DIR = {
     'lost+found',

@@ -664,6 +665,7 @@ def remove(filter_str: Optional[str]=None,
 
     to_remove = snapshots.count()
 
+    flush_search_index(snapshots=snapshots)
     remove_from_sql_main_index(snapshots=snapshots, out_dir=out_dir)
     all_snapshots = load_main_index(out_dir=out_dir)
     log_removal_finished(all_snapshots.count(), to_remove)

@@ -709,6 +711,7 @@ def update(resume: Optional[float]=None,
     if index_only:
         for link in all_links:
             write_link_details(link, out_dir=out_dir, skip_sql_index=True)
+        index_links(all_links, out_dir=out_dir)
         return all_links
 
     # Step 2: Run the archive methods for each link
archivebox/search/__init__.py (new file, 110 lines)
@@ -0,0 +1,110 @@
+from typing import List, Union
+from pathlib import Path
+from importlib import import_module
+
+from django.db.models import QuerySet
+
+from archivebox.index.schema import Link
+from archivebox.util import enforce_types
+from archivebox.config import setup_django,stderr, OUTPUT_DIR, USE_INDEXING_BACKEND, USE_SEARCHING_BACKEND, SEARCH_BACKEND_ENGINE
+
+from .utils import get_indexable_content, log_index_started
+
+def indexing_enabled():
+    return USE_INDEXING_BACKEND
+
+def search_backend_enabled():
+    return USE_SEARCHING_BACKEND
+
+def get_backend():
+    return f'search.backends.{SEARCH_BACKEND_ENGINE}'
+
+def import_backend():
+    backend_string = get_backend()
+    try:
+        backend = import_module(backend_string)
+    except Exception as err:
+        raise Exception("Could not load '%s' as a backend: %s" % (backend_string, err))
+    return backend
+
+@enforce_types
+def write_search_index(link: Link, texts: Union[List[str], None]=None, out_dir: Path=OUTPUT_DIR, skip_text_index: bool=False) -> None:
+    if not indexing_enabled():
+        return
+
+    if not skip_text_index and texts:
+        setup_django(out_dir, check_db=True)
+        from core.models import Snapshot
+
+        snap = Snapshot.objects.filter(url=link.url).first()
+        backend = import_backend()
+        if snap:
+            try:
+                backend.index(snapshot_id=str(snap.id), texts=texts)
+            except Exception as err:
+                stderr()
+                stderr(
+                    f'[X] The search backend threw an exception={err}:',
+                    color='red',
+                )
+
+@enforce_types
+def query_search_index(query: str, out_dir: Path=OUTPUT_DIR) -> QuerySet:
+    setup_django(out_dir, check_db=True)
+    from core.models import Snapshot
+
+    if search_backend_enabled():
+        backend = import_backend()
+        try:
+            snapshot_ids = backend.search(query)
+        except Exception as err:
+            stderr()
+            stderr(
+                f'[X] The search backend threw an exception={err}:',
+                color='red',
+            )
+            raise
+        else:
+            # TODO preserve ordering from backend
+            qsearch = Snapshot.objects.filter(pk__in=snapshot_ids)
+            return qsearch
+
+    return Snapshot.objects.none()
+
+@enforce_types
+def flush_search_index(snapshots: QuerySet):
+    if not indexing_enabled() or not snapshots:
+        return
+    backend = import_backend()
+    snapshot_ids=(str(pk) for pk in snapshots.values_list('pk',flat=True))
+    try:
+        backend.flush(snapshot_ids)
+    except Exception as err:
+        stderr()
+        stderr(
+            f'[X] The search backend threw an exception={err}:',
+            color='red',
+        )
+
+@enforce_types
+def index_links(links: Union[List[Link],None], out_dir: Path=OUTPUT_DIR):
+    if not links:
+        return
+
+    setup_django(out_dir=out_dir, check_db=True)
+    from core.models import Snapshot, ArchiveResult
+
+    for link in links:
+        if snap := Snapshot.objects.filter(url=link.url).first():
+            results = ArchiveResult.objects.indexable().filter(snapshot=snap)
+            log_index_started(link.url)
+            try:
+                texts = get_indexable_content(results)
+            except Exception as err:
+                stderr()
+                stderr(
+                    f'[X] An Exception ocurred reading the indexable content={err}:',
+                    color='red',
+                )
+            else:
+                write_search_index(link, texts, out_dir=out_dir)
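Note: since import_backend() resolves 'search.backends.{SEARCH_BACKEND_ENGINE}' by module name and the rest of the code only ever calls index(), search(), and flush() on it, a custom backend is just a module exposing those three functions. A minimal sketch of a hypothetical backend (a do-nothing module named 'noop', not included in this PR; it would live at archivebox/search/backends/noop.py with SEARCH_BACKEND_ENGINE set to 'noop'):

from typing import List, Generator

from archivebox.util import enforce_types

@enforce_types
def index(snapshot_id: str, texts: List[str]):
    # store/refresh the text for one snapshot; this no-op backend simply discards it
    return

@enforce_types
def search(text: str) -> List[str]:
    # must return a list of matching Snapshot primary keys as strings
    return []

@enforce_types
def flush(snapshot_ids: Generator[str, None, None]):
    # remove the given snapshot ids from the index
    return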
archivebox/search/backends/__init__.py (new file, 0 lines)
archivebox/search/backends/ripgrep.py (new file, 47 lines)
@@ -0,0 +1,47 @@
+import re
+from subprocess import run, PIPE, DEVNULL
+from typing import List, Generator
+
+from archivebox.config import setup_django, ARCHIVE_DIR
+from archivebox.util import enforce_types
+
+RG_IGNORE_EXTENSIONS = ('css','js','orig','svg')
+
+RG_ADD_TYPE = '--type-add'
+RG_IGNORE_ARGUMENTS = f"ignore:*.{{{','.join(RG_IGNORE_EXTENSIONS)}}}"
+RG_DEFAULT_ARGUMENTS = "-ilTignore" # Case insensitive(i), matching files results(l)
+RG_REGEX_ARGUMENT = '-e'
+
+TIMESTAMP_REGEX = r'\/([\d]+\.[\d]+)\/'
+
+ts_regex = re.compile(TIMESTAMP_REGEX)
+
+@enforce_types
+def index(snapshot_id: str, texts: List[str]):
+    return
+
+@enforce_types
+def flush(snapshot_ids: Generator[str, None, None]):
+    return
+
+@enforce_types
+def search(text: str) -> List[str]:
+    is_rg_installed = run(['which', 'rg'], stdout=DEVNULL, stderr=DEVNULL)
+    if is_rg_installed.returncode:
+        raise Exception("ripgrep binary not found, install ripgrep to use this search backend")
+
+    setup_django(check_db=True)
+    from core.models import Snapshot
+
+    rg_cmd = ['rg', RG_ADD_TYPE, RG_IGNORE_ARGUMENTS, RG_DEFAULT_ARGUMENTS, RG_REGEX_ARGUMENT, text, str(ARCHIVE_DIR)]
+    rg = run(rg_cmd, stdout=PIPE, stderr=PIPE, timeout=60)
+    file_paths = [p.decode() for p in rg.stdout.splitlines()]
+    timestamps = set()
+    for path in file_paths:
+        if ts := ts_regex.findall(path):
+            timestamps.add(ts[0])
+
+    snap_ids = [str(id) for id in Snapshot.objects.filter(timestamp__in=timestamps).values_list('pk', flat=True)]
+
+    return snap_ids
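Note: with the constants above, the rg_cmd list built in search() expands to roughly the following (the query string and archive path are illustrative placeholders, not values from the code):

rg_cmd = [
    'rg',
    '--type-add', 'ignore:*.{css,js,orig,svg}',   # RG_ADD_TYPE + RG_IGNORE_ARGUMENTS: define an "ignore" file type
    '-ilTignore',                                 # case-insensitive, print matching file paths only, skip the "ignore" type
    '-e', 'some search term',
    '/path/to/data/archive',                      # str(ARCHIVE_DIR)
]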
archivebox/search/backends/sonic.py (new file, 28 lines)
@@ -0,0 +1,28 @@
+from typing import List, Generator
+
+from sonic import IngestClient, SearchClient
+
+from archivebox.util import enforce_types
+from archivebox.config import SEARCH_BACKEND_HOST_NAME, SEARCH_BACKEND_PORT, SEARCH_BACKEND_PASSWORD, SONIC_BUCKET, SONIC_COLLECTION
+
+MAX_SONIC_TEXT_LENGTH = 20000
+
+@enforce_types
+def index(snapshot_id: str, texts: List[str]):
+    with IngestClient(SEARCH_BACKEND_HOST_NAME, SEARCH_BACKEND_PORT, SEARCH_BACKEND_PASSWORD) as ingestcl:
+        for text in texts:
+            chunks = [text[i:i+MAX_SONIC_TEXT_LENGTH] for i in range(0, len(text), MAX_SONIC_TEXT_LENGTH)]
+            for chunk in chunks:
+                ingestcl.push(SONIC_COLLECTION, SONIC_BUCKET, snapshot_id, str(chunk))
+
+@enforce_types
+def search(text: str) -> List[str]:
+    with SearchClient(SEARCH_BACKEND_HOST_NAME, SEARCH_BACKEND_PORT, SEARCH_BACKEND_PASSWORD) as querycl:
+        snap_ids = querycl.query(SONIC_COLLECTION, SONIC_BUCKET, text)
+    return snap_ids
+
+@enforce_types
+def flush(snapshot_ids: Generator[str, None, None]):
+    with IngestClient(SEARCH_BACKEND_HOST_NAME, SEARCH_BACKEND_PORT, SEARCH_BACKEND_PASSWORD) as ingestcl:
+        for id in snapshot_ids:
+            ingestcl.flush_object(SONIC_COLLECTION, SONIC_BUCKET, str(id))
archivebox/search/utils.py (new file, 44 lines)
@@ -0,0 +1,44 @@
+from django.db.models import QuerySet
+
+from archivebox.util import enforce_types
+from archivebox.config import ANSI
+
+def log_index_started(url):
+    print('{green}[*] Indexing url: {} in the search index {reset}'.format(url, **ANSI))
+    print( )
+
+def get_file_result_content(res, extra_path, use_pwd=False):
+    if use_pwd:
+        fpath = f'{res.pwd}/{res.output}'
+    else:
+        fpath = f'{res.output}'
+
+    if extra_path:
+        fpath = f'{fpath}/{extra_path}'
+
+    with open(fpath, 'r') as file:
+        data = file.read()
+    if data:
+        return [data]
+    return []
+
+
+# This should be abstracted by a plugin interface for extractors
+@enforce_types
+def get_indexable_content(results: QuerySet):
+    if not results:
+        return []
+    # Only use the first method available
+    res, method = results.first(), results.first().extractor
+    if method not in ('readability', 'singlefile', 'dom', 'wget'):
+        return []
+    # This should come from a plugin interface
+
+    if method == 'readability':
+        return get_file_result_content(res, 'content.txt')
+    elif method == 'singlefile':
+        return get_file_result_content(res, '')
+    elif method == 'dom':
+        return get_file_result_content(res,'',use_pwd=True)
+    elif method == 'wget':
+        return get_file_result_content(res,'',use_pwd=True)
@@ -23,6 +23,7 @@ services:
             - SHOW_PROGRESS=False
         volumes:
             - ./data:/data
 
+
 
     # Optional Addons: tweak these examples as needed for your specific use case

@@ -73,3 +74,14 @@ services:
    #     volumes:
    #         ./data:/archivebox
    #         ./data/wayback:/webarchive
+
+    # Example: Run sonic search backend
+    # sonic:
+    #    image: valeriansaliou/sonic:v1.3.0
+    #    ports:
+    #        - 1491:1491
+    #    environment:
+    #        - SEARCH_BACKEND_PASSWORD=SecretPassword
+    #    volumes:
+    #        - ./etc/sonic/config.cfg:/etc/sonic.cfg
+    #        - ./data:/var/lib/sonic/store/
etc/sonic/config.cfg (new file, 66 lines)
@@ -0,0 +1,66 @@
+# Sonic
+# Fast, lightweight and schema-less search backend
+# Configuration file
+# Example: https://github.com/valeriansaliou/sonic/blob/master/config.cfg
+
+
+[server]
+
+log_level = "debug"
+
+
+[channel]
+
+inet = "0.0.0.0:1491"
+tcp_timeout = 300
+
+auth_password = "${env.SEARCH_BACKEND_PASSWORD}"
+
+[channel.search]
+
+query_limit_default = 65535
+query_limit_maximum = 65535
+query_alternates_try = 10
+
+suggest_limit_default = 5
+suggest_limit_maximum = 20
+
+
+[store]
+
+[store.kv]
+
+path = "/var/lib/sonic/store/kv/"
+
+retain_word_objects = 100000
+
+[store.kv.pool]
+
+inactive_after = 1800
+
+[store.kv.database]
+
+flush_after = 900
+
+compress = true
+parallelism = 2
+max_files = 100
+max_compactions = 1
+max_flushes = 1
+write_buffer = 16384
+write_ahead_log = true
+
+[store.fst]
+
+path = "/var/lib/sonic/store/fst/"
+
+[store.fst.pool]
+
+inactive_after = 300
+
+[store.fst.graph]
+
+consolidate_after = 180
+
+max_size = 2048
+max_words = 250000