Add indexing to update command and utilities

2025-05-14 15:14:31 -04:00 · 2020-11-23 15:51:59 -05:00 · 2020-11-23 15:51:59 -05:00 · caf4660ac8
commit caf4660ac8
parent 273c9d91c6
3 changed files with 56 additions and 1 deletions
--- a/archivebox/main.py
+++ b/archivebox/main.py
@ -115,7 +115,7 @@ from .logging_util import (
    printable_dependency_version,
 )
-from .search import flush_search_index
+from .search import flush_search_index, index_links
 ALLOWED_IN_OUTPUT_DIR = {
    'lost+found',
@ -711,6 +711,7 @@ def update(resume: Optional[float]=None,
    if index_only:
        for link in all_links:
            write_link_details(link, out_dir=out_dir, skip_sql_index=True)
        index_links(all_links, out_dir=out_dir)
        return all_links
    # Step 2: Run the archive methods for each link
--- a/archivebox/search/__init__.py
+++ b/archivebox/search/__init__.py
@ -8,6 +8,8 @@ from archivebox.index.schema import Link
 from archivebox.util import enforce_types
 from archivebox.config import setup_django,stderr, OUTPUT_DIR, USE_INDEXING_BACKEND, USE_SEARCHING_BACKEND, SEARCH_BACKEND_ENGINE
 from .utils import get_indexable_content
 def indexing_enabled():
    """Return the ``USE_INDEXING_BACKEND`` setting imported from ``archivebox.config``."""
    return USE_INDEXING_BACKEND
@ -83,3 +85,17 @@ def flush_search_index(snapshots: QuerySet):
            f'[X] The search backend threw an exception={err}:',
        color='red',
        )
@enforce_types
def index_links(links: Union[List[Link],None], out_dir: Path=OUTPUT_DIR):
    """Write the indexable text of each given link to the search index.

    For every link, the Snapshot with the same URL is looked up; links
    without a Snapshot are skipped.  The snapshot's indexable ArchiveResults
    are passed to ``get_indexable_content`` and the resulting texts are handed
    to ``write_search_index``.

    Args:
        links: Links to index.  ``None`` or an empty list is a no-op.
        out_dir: Archive data directory used to set up Django and passed on
            to ``write_search_index``.

    Returns:
        None.
    """
    if not links:
        return

    setup_django(out_dir=out_dir, check_db=True)
    # NOTE(review): imported lazily, presumably because the models are only
    # importable once setup_django() has run -- confirm.
    from core.models import Snapshot, ArchiveResult

    for link in links:
        snap = Snapshot.objects.filter(url=link.url).first()
        if not snap:
            continue

        results = ArchiveResult.objects.indexable().filter(snapshot=snap)
        try:
            texts = get_indexable_content(results)
        except (OSError, UnicodeDecodeError) as err:
            # A missing or unreadable output file for one snapshot must not
            # abort indexing of all the remaining links.
            stderr(
                f'[X] Failed to read the indexable content for {link.url}, exception={err}:',
                color='red',
            )
            continue

        write_search_index(link, texts, out_dir=out_dir)
--- a/archivebox/search/utils.py
+++ b/archivebox/search/utils.py
@ -0,0 +1,38 @@
 from django.db.models import QuerySet
 from archivebox.util import enforce_types
 def get_file_result_content(res, extra_path, use_pwd=False):
     """Read the output file of an archive result as one line of text.

     Args:
         res: Object exposing ``output`` (and ``pwd`` when ``use_pwd`` is
             true); both are interpolated into the file path.
         extra_path: Path appended below ``res.output``; ignored when empty.
         use_pwd: When true the path is prefixed with ``res.pwd``.

     Returns:
         A one-element list holding the file's text with newlines replaced by
         single spaces, or an empty list when the file holds no
         non-whitespace text.

     Raises:
         OSError: The file cannot be opened.
         UnicodeDecodeError: The file is not valid UTF-8.
     """
     base = f'{res.pwd}/{res.output}' if use_pwd else f'{res.output}'
     fpath = f'{base}/{extra_path}' if extra_path else base

     # Decode explicitly as UTF-8 so the result does not depend on the
     # platform's default locale encoding.
     with open(fpath, 'r', encoding='utf-8') as file:
         # Replace newlines with a space (not the empty string) so the last
         # word of one line is not fused with the first word of the next.
         text = file.read().replace('\n', ' ')

     # Whitespace-only content has nothing worth indexing.
     if text.strip():
         return [text]
     return []
 # This should be abstracted by a plugin interface for extractors
@enforce_types
def get_indexable_content(results: QuerySet):
    """Return the indexable text for the first result in ``results``.

    Only the first result is considered.  Its ``extractor`` name selects
    where the output file lives; the file is then read with
    ``get_file_result_content``.

    Args:
        results: QuerySet of archive results exposing ``extractor``,
            ``output`` and ``pwd``.

    Returns:
        A list of text strings (possibly empty).  Empty when there are no
        results or when the first result's extractor is not one of
        ``readability``, ``singlefile``, ``dom`` or ``wget``.
    """
    if results is None:
        return []

    # Only use the first method available.  Fetch it once: calling first()
    # twice would issue the query twice.
    res = results.first()
    if res is None:
        return []

    # This should come from a plugin interface.
    # extractor name -> (extra_path below res.output, prefix with res.pwd?)
    sources = {
        'readability': ('content.txt', False),
        'singlefile': ('', False),
        'dom': ('', True),
        'wget': ('', True),
    }
    source = sources.get(res.extractor)
    if source is None:
        return []

    extra_path, use_pwd = source
    return get_file_result_content(res, extra_path, use_pwd=use_pwd)