mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-14 15:14:31 -04:00
Add indexing to update command and utilities
This commit is contained in:
parent
273c9d91c6
commit
caf4660ac8
3 changed files with 56 additions and 1 deletions
|
@ -115,7 +115,7 @@ from .logging_util import (
|
||||||
printable_dependency_version,
|
printable_dependency_version,
|
||||||
)
|
)
|
||||||
|
|
||||||
from .search import flush_search_index
|
from .search import flush_search_index, index_links
|
||||||
|
|
||||||
ALLOWED_IN_OUTPUT_DIR = {
|
ALLOWED_IN_OUTPUT_DIR = {
|
||||||
'lost+found',
|
'lost+found',
|
||||||
|
@ -711,6 +711,7 @@ def update(resume: Optional[float]=None,
|
||||||
if index_only:
|
if index_only:
|
||||||
for link in all_links:
|
for link in all_links:
|
||||||
write_link_details(link, out_dir=out_dir, skip_sql_index=True)
|
write_link_details(link, out_dir=out_dir, skip_sql_index=True)
|
||||||
|
index_links(all_links, out_dir=out_dir)
|
||||||
return all_links
|
return all_links
|
||||||
|
|
||||||
# Step 2: Run the archive methods for each link
|
# Step 2: Run the archive methods for each link
|
||||||
|
|
|
@ -8,6 +8,8 @@ from archivebox.index.schema import Link
|
||||||
from archivebox.util import enforce_types
|
from archivebox.util import enforce_types
|
||||||
from archivebox.config import setup_django,stderr, OUTPUT_DIR, USE_INDEXING_BACKEND, USE_SEARCHING_BACKEND, SEARCH_BACKEND_ENGINE
|
from archivebox.config import setup_django,stderr, OUTPUT_DIR, USE_INDEXING_BACKEND, USE_SEARCHING_BACKEND, SEARCH_BACKEND_ENGINE
|
||||||
|
|
||||||
|
from .utils import get_indexable_content
|
||||||
|
|
||||||
def indexing_enabled():
|
def indexing_enabled():
|
||||||
return USE_INDEXING_BACKEND
|
return USE_INDEXING_BACKEND
|
||||||
|
|
||||||
|
@ -83,3 +85,17 @@ def flush_search_index(snapshots: QuerySet):
|
||||||
f'[X] The search backend threw an exception={err}:',
|
f'[X] The search backend threw an exception={err}:',
|
||||||
color='red',
|
color='red',
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@enforce_types
def index_links(links: Union[List[Link],None], out_dir: Path=OUTPUT_DIR):
    """Write search-index entries for every link that has a stored Snapshot.

    Does nothing when ``links`` is empty or ``None``. For each link, the
    Snapshot whose ``url`` equals ``link.url`` is looked up; links without
    one are skipped. The snapshot's indexable ArchiveResults are turned into
    text via ``get_indexable_content`` and handed to ``write_search_index``.
    """
    if not links:
        return

    # The ORM models are imported only after setup_django() has run.
    setup_django(out_dir=out_dir, check_db=True)
    from core.models import Snapshot, ArchiveResult

    for link in links:
        snapshot = Snapshot.objects.filter(url=link.url).first()
        if not snapshot:
            # No matching snapshot for this URL: nothing to index.
            continue

        indexable_results = ArchiveResult.objects.indexable().filter(snapshot=snapshot)
        content = get_indexable_content(indexable_results)
        write_search_index(link, content, out_dir=out_dir)
|
||||||
|
|
38
archivebox/search/utils.py
Normal file
38
archivebox/search/utils.py
Normal file
|
@ -0,0 +1,38 @@
|
||||||
|
from django.db.models import QuerySet
|
||||||
|
|
||||||
|
from archivebox.util import enforce_types
|
||||||
|
|
||||||
|
def get_file_result_content(res, extra_path, use_pwd=False):
    """Read the output file of an archive result as indexable text.

    Args:
        res: object exposing ``output`` (and ``pwd`` when ``use_pwd`` is true).
        extra_path: optional path component appended after ``res.output``;
            ignored when empty.
        use_pwd: when true, ``res.output`` is resolved under ``res.pwd``.

    Returns:
        A one-element list with the file's text (newlines turned into
        spaces), or an empty list when the file holds nothing but whitespace.

    Raises:
        OSError: if the file cannot be opened (e.g. it does not exist).
    """
    if use_pwd:
        fpath = f'{res.pwd}/{res.output}'
    else:
        fpath = f'{res.output}'

    if extra_path:
        fpath = f'{fpath}/{extra_path}'

    # Decode explicitly as UTF-8 instead of the platform default, and replace
    # undecodable bytes so one oddly-encoded page cannot abort indexing.
    with open(fpath, 'r', encoding='utf-8', errors='replace') as file:
        # Newlines become spaces (not ''), otherwise the last word of a line
        # is fused with the first word of the next one.
        data = file.read().replace('\n', ' ')

    if data.strip():
        return [data]
    return []
|
||||||
|
|
||||||
|
|
||||||
|
# This should be abstracted by a plugin interface for extractors
|
||||||
|
@enforce_types
def get_indexable_content(results: QuerySet):
    """Return the indexable text for the first archive result in ``results``.

    Only the first result is considered. Its ``extractor`` name selects how
    the output file is located; extractors other than 'readability',
    'singlefile', 'dom' and 'wget' yield an empty list.

    Returns:
        Whatever ``get_file_result_content`` returns for the chosen result,
        or ``[]`` when there is no result or the extractor is unsupported.
    """
    if not results:
        return []

    # Only use the first method available. Fetch it once so the result and
    # its extractor name always come from the same row.
    res = results.first()
    if res is None:
        return []

    # This should come from a plugin interface
    # extractor name -> (extra_path, use_pwd)
    content_sources = {
        'readability': ('content.txt', False),
        'singlefile': ('', False),
        'dom': ('', True),
        'wget': ('', True),
    }
    source = content_sources.get(res.extractor)
    if source is None:
        return []

    extra_path, use_pwd = source
    return get_file_result_content(res, extra_path, use_pwd=use_pwd)
|
Loading…
Add table
Add a link
Reference in a new issue