Mirror of https://github.com/ArchiveBox/ArchiveBox.git (synced 2025-05-15 23:54:27 -04:00)
refactor: Initial and dirty refactor to replace link with snapshot. Barely functional add command
This commit is contained in:
parent 8e2270e21b
commit 8c4ae73d65
13 changed files with 246 additions and 233 deletions
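In short, this commit starts moving the add/archive pipeline off the in-memory Link schema object and onto the core.models.Snapshot Django model: the parsers yield unsaved Snapshot instances, the extractors accept a Snapshot directly, and per-snapshot details are written from the model. A minimal sketch of the intended call shape (not part of the commit; the URL and field values are made up, and it assumes the Snapshot model and archive_snapshot() introduced in the hunks below):

# Sketch only: the Link -> Snapshot switch in one picture.
from core.models import Snapshot
from archivebox.extractors import archive_snapshot   # replaces the old archive_link()

snapshot = Snapshot(url='https://example.com', timestamp='1600000000.0', title=None)
snapshot.save()   # or write_snapshot_to_index(snapshot) from index/sql.py, which also dedupes the timestamp
archive_snapshot(snapshot, overwrite=False, methods=['title', 'wget'])
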
@@ -21,7 +21,7 @@ from util import htmldecode, urldecode, ansi_to_html
from logging_util import printable_filesize
from main import add, remove
from config import OUTPUT_DIR
-from extractors import archive_links
+from extractors import archive_snapshots

# TODO: https://stackoverflow.com/questions/40760880/add-custom-button-to-django-admin-panel

@@ -1,6 +1,7 @@
__package__ = 'archivebox.core'

import uuid
+from pathlib import Path

from django.db import models, transaction
from django.utils.functional import cached_property

@@ -9,9 +10,10 @@ from django.db.models import Case, When, Value, IntegerField
from ..util import parse_date
from ..index.schema import Link
from ..extractors import get_default_archive_methods, ARCHIVE_METHODS_INDEXING_PRECEDENCE
from ..config import CONFIG

-EXTRACTORS = [(extractor[0], extractor[0]) for extractor in get_default_archive_methods()]
+#EXTRACTORS = [(extractor[0], extractor[0]) for extractor in get_default_archive_methods()]
+EXTRACTORS = ["title", "wget"]
STATUS_CHOICES = [
    ("succeeded", "succeeded"),
    ("failed", "failed"),

@@ -89,6 +91,7 @@ class Snapshot(models.Model):
        title = self.title or '-'
        return f'[{self.timestamp}] {self.url[:64]} ({title[:64]})'

    @classmethod
    def from_json(cls, info: dict):
        info = {k: v for k, v in info.items() if k in cls.keys}

@@ -133,8 +136,9 @@ class Snapshot(models.Model):
        return self.as_link().base_url

    @cached_property
-   def link_dir(self):
-       return self.as_link().link_dir
+   def snapshot_dir(self):
+       from ..config import CONFIG
+       return str(Path(CONFIG['ARCHIVE_DIR']) / self.timestamp)

    @cached_property
    def archive_path(self):

@@ -158,6 +162,16 @@ class Snapshot(models.Model):
            return self.history['title'][-1].output.strip()
        return None

+   def _asdict(self):
+       return {
+           "id": str(self.id),
+           "url": self.url,
+           "timestamp": self.timestamp,
+           "title": self.title,
+           "added": self.added,
+           "updated": self.updated,
+       }
+
    def save_tags(self, tags=()):
        tags_id = []
        for tag in tags:

@@ -168,6 +182,7 @@ class Snapshot(models.Model):

class ArchiveResultManager(models.Manager):
    def indexable(self, sorted: bool = True):
        from ..extractors import ARCHIVE_METHODS_INDEXING_PRECEDENCE

        INDEXABLE_METHODS = [ r[0] for r in ARCHIVE_METHODS_INDEXING_PRECEDENCE ]
        qs = self.get_queryset().filter(extractor__in=INDEXABLE_METHODS,status='succeeded')

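The new Snapshot._asdict() above deliberately returns only the flat database fields (no tags, sources, or history), which is what write_json_snapshot_details() in index/json.py serializes later in this diff. A rough sketch of the dict it produces (illustrative values only, assuming the model fields shown above):

# Sketch only: the kind of dict Snapshot._asdict() returns (values are made up).
snapshot = Snapshot(url='https://example.com', timestamp='1600000000.0', title='Example')
snapshot._asdict()
# {
#     "id": "1f0c5a3e-...",        # str(self.id); a stringified primary key (UUID in this model)
#     "url": "https://example.com",
#     "timestamp": "1600000000.0",
#     "title": "Example",
#     "added": ...,                # added/updated are datetimes managed by the model
#     "updated": ...,
# }
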
@@ -4,19 +4,20 @@ import os
from pathlib import Path

from typing import Optional, List, Iterable, Union
-from datetime import datetime
-from django.db.models import QuerySet
-
-from ..index.schema import Link
-from ..index.sql import write_link_to_sql_index
+from datetime import datetime
+from django.db.models import QuerySet, Model
+
+from ..index.sql import write_snapshot_to_index
from ..index import (
-   load_link_details,
-   write_link_details,
+   load_snapshot_details,
+   write_snapshot_details,
)
from ..util import enforce_types
from ..logging_util import (
    log_archiving_started,
    log_archiving_paused,
    log_archiving_finished,
    log_link_archiving_started,
    log_link_archiving_finished,

@@ -67,15 +68,9 @@ def ignore_methods(to_ignore: List[str]):
    return list(methods)

@enforce_types
-def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None) -> Link:
+def archive_snapshot(snapshot: Model, overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None) -> Model:
    """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""

-   # TODO: Remove when the input is changed to be a snapshot. Suboptimal approach.
-   from core.models import Snapshot, ArchiveResult
-   try:
-       snapshot = Snapshot.objects.get(url=link.url) # TODO: This will be unnecessary once everything is a snapshot
-   except Snapshot.DoesNotExist:
-       snapshot = write_link_to_sql_index(link)
+   from core.models import ArchiveResult

    ARCHIVE_METHODS = get_default_archive_methods()

@@ -85,33 +80,34 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
        if method[0] in methods
    ]

-   out_dir = out_dir or Path(link.link_dir)
+   out_dir = out_dir or Path(snapshot.snapshot_dir)
    try:
        is_new = not Path(out_dir).exists()
        if is_new:
            os.makedirs(out_dir)
+           details = {"history": {}}
+           write_snapshot_details(snapshot, out_dir=out_dir, skip_sql_index=False)
+       else:
+           details = list(load_snapshot_details(snapshot))

-       link = load_link_details(link, out_dir=out_dir)
-       write_link_details(link, out_dir=out_dir, skip_sql_index=False)
-       log_link_archiving_started(link, out_dir, is_new)
-       link = link.overwrite(updated=datetime.now())
+       #log_link_archiving_started(link, out_dir, is_new)
        stats = {'skipped': 0, 'succeeded': 0, 'failed': 0}

        for method_name, should_run, method_function in ARCHIVE_METHODS:
            try:
-               if method_name not in link.history:
-                   link.history[method_name] = []
+               if method_name not in details["history"]:
+                   details["history"][method_name] = []

-               if should_run(link, out_dir) or overwrite:
+               if should_run(snapshot, out_dir) or overwrite:
                    log_archive_method_started(method_name)

-                   result = method_function(link=link, out_dir=out_dir)
+                   result = method_function(snapshot=snapshot, out_dir=out_dir)

-                   link.history[method_name].append(result)
+                   details["history"][method_name].append(result)

                    stats[result.status] += 1
                    log_archive_method_finished(result)
-                   write_search_index(link=link, texts=result.index_texts)
+                   write_search_index(snapshot=snapshot, texts=result.index_texts)
                    ArchiveResult.objects.create(snapshot=snapshot, extractor=method_name, cmd=result.cmd, cmd_version=result.cmd_version,
                                                 output=result.output, pwd=result.pwd, start_ts=result.start_ts, end_ts=result.end_ts, status=result.status)

@@ -121,7 +117,7 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
            except Exception as e:
                raise Exception('Exception in archive_methods.save_{}(Link(url={}))'.format(
                    method_name,
-                   link.url,
+                   snapshot.url,
                )) from e

        # print(' ', stats)

@@ -129,17 +125,17 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
        try:
            latest_title = link.history['title'][-1].output.strip()
            if latest_title and len(latest_title) >= len(link.title or ''):
-               link = link.overwrite(title=latest_title)
+               snapshot.title = latest_title
        except Exception:
            pass

-       write_link_details(link, out_dir=out_dir, skip_sql_index=False)
+       write_snapshot_details(snapshot, out_dir=out_dir, skip_sql_index=False)

-       log_link_archiving_finished(link, link.link_dir, is_new, stats)
+       log_link_archiving_finished(snapshot, snapshot.snapshot_dir, is_new, stats)

    except KeyboardInterrupt:
        try:
-           write_link_details(link, out_dir=link.link_dir)
+           write_snapshot_details(snapshot, out_dir=link.link_dir)
        except:
            pass
        raise

@@ -148,35 +144,29 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
        print(' ! Failed to archive link: {}: {}'.format(err.__class__.__name__, err))
        raise

-   return link
+   return snapshot

@enforce_types
-def archive_links(all_links: Union[Iterable[Link], QuerySet], overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None) -> List[Link]:
+def archive_snapshots(all_snapshots: Union[QuerySet, List[Model]], overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None) -> QuerySet:

-   if type(all_links) is QuerySet:
-       num_links: int = all_links.count()
-       get_link = lambda x: x.as_link()
-       all_links = all_links.iterator()
-   else:
-       num_links: int = len(all_links)
-       get_link = lambda x: x
+   all_snapshots = list(all_snapshots)
+   num_snapshots: int = len(all_snapshots)

-   if num_links == 0:
+   if num_snapshots == 0:
        return []

-   log_archiving_started(num_links)
+   log_archiving_started(num_snapshots)
    idx: int = 0
    try:
-       for link in all_links:
+       for snapshot in all_snapshots:
            idx += 1
-           to_archive = get_link(link)
-           archive_link(to_archive, overwrite=overwrite, methods=methods, out_dir=Path(link.link_dir))
+           archive_snapshot(snapshot, overwrite=overwrite, methods=methods, out_dir=Path(snapshot.snapshot_dir))
    except KeyboardInterrupt:
-       log_archiving_paused(num_links, idx, link.timestamp)
+       log_archiving_paused(num_snapshots, idx, snapshot.timestamp)
        raise SystemExit(0)
    except BaseException:
        print()
        raise

-   log_archiving_finished(num_links)
-   return all_links
+   log_archiving_finished(num_snapshots)
+   return all_snapshots

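For orientation, a sketch of how the reworked entry points above are meant to be called (names taken from this diff; the URL and the existence of the row are hypothetical, and the QuerySet handling is simplified):

# Sketch only: exercising archive_snapshot()/archive_snapshots() as refactored above.
from core.models import Snapshot
from archivebox.extractors import archive_snapshot, archive_snapshots

snapshot = Snapshot.objects.get(url='https://example.com')            # hypothetical existing row
archive_snapshot(snapshot, overwrite=False, methods=['title', 'wget'])

archive_snapshots(Snapshot.objects.all(), overwrite=False)            # a whole QuerySet / list of model instances
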
@@ -5,7 +5,9 @@ from html.parser import HTMLParser
from pathlib import Path
from typing import Optional

-from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
+from django.db.models import Model
+
+from ..index.schema import ArchiveResult, ArchiveOutput, ArchiveError
from ..util import (
    enforce_types,
    is_static_file,

@@ -61,12 +63,12 @@ class TitleParser(HTMLParser):


@enforce_types
-def should_save_title(link: Link, out_dir: Optional[str]=None) -> bool:
+def should_save_title(snapshot: Model, out_dir: Optional[str]=None) -> bool:
    # if link already has valid title, skip it
-   if link.title and not link.title.lower().startswith('http'):
+   if snapshot.title and not snapshot.title.lower().startswith('http'):
        return False

-   if is_static_file(link.url):
+   if is_static_file(snapshot.url):
        return False

    return SAVE_TITLE

@@ -77,7 +79,7 @@ def extract_title_with_regex(html):
    return output

@enforce_types
-def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+def save_title(snapshot: Model, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
    """try to guess the page's title from its content"""

    from core.models import Snapshot

@@ -89,12 +91,12 @@ def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -
        '--max-time', str(timeout),
        *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
        *([] if CHECK_SSL_VALIDITY else ['--insecure']),
-       link.url,
+       snapshot.url,
    ]
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
-       html = download_url(link.url, timeout=timeout)
+       html = download_url(snapshot.url, timeout=timeout)
        try:
            # try using relatively strict html parser first
            parser = TitleParser()

@@ -108,10 +110,11 @@ def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -

        # if title is better than the one in the db, update db with new title
        if isinstance(output, str) and output:
-           if not link.title or len(output) >= len(link.title):
-               Snapshot.objects.filter(url=link.url,
-                                       timestamp=link.timestamp)\
+           if not snapshot.title or len(output) >= len(snapshot.title):
+               Snapshot.objects.filter(url=snapshot.url,
+                                       timestamp=snapshot.timestamp)\
                    .update(title=output)
+               snapshot.title = output
        else:
            raise ArchiveError('Unable to detect page title')
    except Exception as err:

@@ -10,7 +10,7 @@ from typing import List, Tuple, Dict, Optional, Iterable
from collections import OrderedDict
from contextlib import contextmanager
from urllib.parse import urlparse
-from django.db.models import QuerySet, Q
+from django.db.models import QuerySet, Q, Model

from ..util import (
    scheme,

@@ -39,15 +39,15 @@ from ..logging_util import (

from .schema import Link, ArchiveResult
from .html import (
-   write_html_link_details,
+   write_html_snapshot_details,
)
from .json import (
-   parse_json_link_details,
-   write_json_link_details,
+   parse_json_snapshot_details,
+   write_json_snapshot_details,
)
from .sql import (
    write_sql_main_index,
-   write_sql_link_details,
+   write_sql_snapshot_details,
)

from ..search import search_backend_enabled, query_search_index

@@ -55,10 +55,12 @@ from ..search import search_backend_enabled, query_search_index

### Link filtering and checking

@enforce_types
-def merge_links(a: Link, b: Link) -> Link:
-   """deterministially merge two links, favoring longer field values over shorter,
+def merge_snapshots(a: Model, b: Model) -> Model:
+   """deterministially merge two snapshots, favoring longer field values over shorter,
    and "cleaner" values over worse ones.
+   TODO: Check if this makes sense with the new setup
    """
+   return a
    assert a.base_url == b.base_url, f'Cannot merge two links with different URLs ({a.base_url} != {b.base_url})'

    # longest url wins (because a fuzzy url will always be shorter)

@@ -109,55 +111,55 @@ def merge_links(a: Link, b: Link) -> Link:
            key=lambda result: result.start_ts,
        )))

-   return Link(
+   return Snapshot(
        url=url,
        timestamp=timestamp,
        title=title,
-       tags=tags,
-       sources=sources,
-       history=history,
+       #sources=sources,
+       #history=history,
    )


@enforce_types
-def validate_links(links: Iterable[Link]) -> List[Link]:
+def validate_snapshots(snapshots: List[Model]) -> List[Model]:
    timer = TimedProgress(TIMEOUT * 4)
    try:
-       links = archivable_links(links)  # remove chrome://, about:, mailto: etc.
-       links = sorted_links(links)      # deterministically sort the links based on timestamp, url
-       links = fix_duplicate_links(links)  # merge/dedupe duplicate timestamps & urls
+       snapshots = archivable_snapshots(snapshots)  # remove chrome://, about:, mailto: etc.
+       snapshots = sorted_snapshots(snapshots)      # deterministically sort the links based on timestamp, url
+       snapshots = fix_duplicate_snapshots(snapshots)  # merge/dedupe duplicate timestamps & urls
    finally:
        timer.end()

-   return list(links)
+   return list(snapshots)

@enforce_types
-def archivable_links(links: Iterable[Link]) -> Iterable[Link]:
+def archivable_snapshots(snapshots: Iterable[Model]) -> Iterable[Model]:
    """remove chrome://, about:// or other schemed links that cant be archived"""
-   for link in links:
+   for snapshot in snapshots:
        try:
-           urlparse(link.url)
+           urlparse(snapshot.url)
        except ValueError:
            continue
-       if scheme(link.url) not in ('http', 'https', 'ftp'):
+       if scheme(snapshot.url) not in ('http', 'https', 'ftp'):
            continue
-       if URL_BLACKLIST_PTN and URL_BLACKLIST_PTN.search(link.url):
+       if URL_BLACKLIST_PTN and URL_BLACKLIST_PTN.search(snapshot.url):
            continue

-       yield link
+       yield snapshot


@enforce_types
-def fix_duplicate_links(sorted_links: Iterable[Link]) -> Iterable[Link]:
+def fix_duplicate_snapshots(sorted_snapshots: Iterable[Model]) -> Iterable[Model]:
    """
    ensures that all non-duplicate links have monotonically increasing timestamps
+   TODO: Review how to do this with the new snapshots refactor
    """
+   # from core.models import Snapshot

+   return sorted_snapshots
    unique_urls: OrderedDict[str, Link] = OrderedDict()

-   for link in sorted_links:
-       if link.url in unique_urls:
+   for snapshot in sorted_snapshots:
+       if snapshot.url in unique_urls:
            # merge with any other links that share the same url
            link = merge_links(unique_urls[link.url], link)
        unique_urls[link.url] = link

@@ -166,9 +168,9 @@ def fix_duplicate_links(sorted_links: Iterable[Link]) -> Iterable[Link]:


@enforce_types
-def sorted_links(links: Iterable[Link]) -> Iterable[Link]:
-   sort_func = lambda link: (link.timestamp.split('.', 1)[0], link.url)
-   return sorted(links, key=sort_func, reverse=True)
+def sorted_snapshots(snapshots: Iterable[Model]) -> Iterable[Model]:
+   sort_func = lambda snapshot: (snapshot.timestamp.split('.', 1)[0], snapshot.url)
+   return sorted(snapshots, key=sort_func, reverse=True)


@enforce_types

@@ -222,14 +224,14 @@ def timed_index_update(out_path: Path):


@enforce_types
-def write_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR) -> None:
+def write_main_index(snapshots: List[Model], out_dir: Path=OUTPUT_DIR) -> None:
    """Writes links to sqlite3 file for a given list of links"""

-   log_indexing_process_started(len(links))
+   log_indexing_process_started(len(snapshots))

    try:
        with timed_index_update(out_dir / SQL_INDEX_FILENAME):
-           write_sql_main_index(links, out_dir=out_dir)
+           write_sql_main_index(snapshots, out_dir=out_dir)
            os.chmod(out_dir / SQL_INDEX_FILENAME, int(OUTPUT_PERMISSIONS, base=8)) # set here because we don't write it with atomic writes

    except (KeyboardInterrupt, SystemExit):

@@ -244,7 +246,10 @@ def write_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR) -> None:

@enforce_types
def load_main_index(out_dir: Path=OUTPUT_DIR, warn: bool=True) -> List[Link]:
-   """parse and load existing index with any new links from import_path merged in"""
+   """
+   Returns all of the snapshots currently in index
+   """
    setup_django(out_dir, check_db=True)
    from core.models import Snapshot
    try:
        return Snapshot.objects.all()

@@ -265,88 +270,62 @@ def load_main_index_meta(out_dir: Path=OUTPUT_DIR) -> Optional[dict]:

@enforce_types
-def parse_links_from_source(source_path: str, root_url: Optional[str]=None) -> Tuple[List[Link], List[Link]]:
+def parse_snapshots_from_source(source_path: str, root_url: Optional[str]=None) -> List[Model]:

-   from ..parsers import parse_links
+   from ..parsers import parse_snapshots

-   new_links: List[Link] = []
+   new_links: List[Model] = []

    # parse and validate the import file
-   raw_links, parser_name = parse_links(source_path, root_url=root_url)
-   new_links = validate_links(raw_links)
+   raw_snapshots, parser_name = parse_snapshots(source_path, root_url=root_url)
+   new_snapshots = validate_snapshots(raw_snapshots)

    if parser_name:
-       num_parsed = len(raw_links)
+       num_parsed = len(raw_snapshots)
        log_parsing_finished(num_parsed, parser_name)

-   return new_links
+   return new_snapshots

@enforce_types
-def fix_duplicate_links_in_index(snapshots: QuerySet, links: Iterable[Link]) -> Iterable[Link]:
+def filter_new_urls(snapshots: QuerySet,
+                    new_snapshots: List) -> List:
    """
-   Given a list of in-memory Links, dedupe and merge them with any conflicting Snapshots in the DB.
+   Returns a list of Snapshots corresponding to the urls that were not present in the index
    """
-   unique_urls: OrderedDict[str, Link] = OrderedDict()
+   urls = {snapshot.url: snapshot for snapshot in new_snapshots}
+   filtered_snapshots = snapshots.filter(url__in=urls.keys())

-   for link in links:
-       index_link = snapshots.filter(url=link.url)
-       if index_link:
-           link = merge_links(index_link[0].as_link(), link)
+   for found_snapshot in filtered_snapshots:
+       urls.pop(found_snapshot.url)

-       unique_urls[link.url] = link
+   log_deduping_finished(len(urls.keys()))

-   return unique_urls.values()
-
-@enforce_types
-def dedupe_links(snapshots: QuerySet,
-                 new_links: List[Link]) -> List[Link]:
-   """
-   The validation of links happened at a different stage. This method will
-   focus on actual deduplication and timestamp fixing.
-   """
-
-   # merge existing links in out_dir and new links
-   dedup_links = fix_duplicate_links_in_index(snapshots, new_links)
-
-   new_links = [
-       link for link in new_links
-       if not snapshots.filter(url=link.url).exists()
-   ]
-
-   dedup_links_dict = {link.url: link for link in dedup_links}
-
-   # Replace links in new_links with the dedup version
-   for i in range(len(new_links)):
-       if new_links[i].url in dedup_links_dict.keys():
-           new_links[i] = dedup_links_dict[new_links[i].url]
-   log_deduping_finished(len(new_links))
-
-   return new_links
+   return list(urls.values())

### Link Details Index

@enforce_types
-def write_link_details(link: Link, out_dir: Optional[str]=None, skip_sql_index: bool=False) -> None:
-   out_dir = out_dir or link.link_dir
+def write_snapshot_details(snapshot: List[Model], out_dir: Optional[str]=None, skip_sql_index: bool=False) -> None:
+   out_dir = out_dir or snapshot.snapshot_dir

-   write_json_link_details(link, out_dir=out_dir)
-   write_html_link_details(link, out_dir=out_dir)
+   write_json_snapshot_details(snapshot, out_dir=out_dir)
+   #write_html_snapshot_details(snapshot, out_dir=out_dir) TODO: Refactor html code too
    if not skip_sql_index:
-       write_sql_link_details(link)
+       write_sql_snapshot_details(snapshot)


@enforce_types
-def load_link_details(link: Link, out_dir: Optional[str]=None) -> Link:
+def load_snapshot_details(snapshot: Model, out_dir: Optional[str]=None) -> Model:
    """check for an existing link archive in the given directory,
    and load+merge it into the given link dict
    """
-   out_dir = out_dir or link.link_dir
+   out_dir = out_dir or snapshot.snapshot_dir

-   existing_link = parse_json_link_details(out_dir)
-   if existing_link:
-       return merge_links(existing_link, link)
+   existing_snapshot = parse_json_snapshot_details(out_dir)
+   if existing_snapshot:
+       return merge_snapshots(existing_snapshot, snapshot)

-   return link
+   return snapshot

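A rough illustration of what the new filter_new_urls() above is meant to do (hypothetical URLs; assumes the Snapshot model from core/models.py): given the existing index and a batch of freshly parsed, unsaved snapshots, it keeps only the ones whose URLs are not already indexed.

# Sketch only: expected behaviour of filter_new_urls() as defined above.
from core.models import Snapshot
from archivebox.index import filter_new_urls

all_snapshots = Snapshot.objects.all()                 # the existing index
parsed = [
    Snapshot(url='https://example.com'),               # assume this URL is already indexed
    Snapshot(url='https://example.org/new-page'),      # and this one is not
]
new_snapshots = filter_new_urls(all_snapshots, parsed)
# -> only the Snapshot for https://example.org/new-page is returned
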
@@ -5,6 +5,7 @@ from typing import List, Optional, Iterator, Mapping
from pathlib import Path

from django.utils.html import format_html
+from django.db.models import Model
from collections import defaultdict

from .schema import Link

@@ -71,8 +72,8 @@ def main_index_template(links: List[Link], template: str=MAIN_INDEX_TEMPLATE) ->
### Link Details Index

@enforce_types
-def write_html_link_details(link: Link, out_dir: Optional[str]=None) -> None:
-   out_dir = out_dir or link.link_dir
+def write_html_snapshot_details(snapshot: Model, out_dir: Optional[str]=None) -> None:
+   out_dir = out_dir or snapshot.snapshot_dir

    rendered_html = link_details_template(link)
    atomic_write(str(Path(out_dir) / HTML_INDEX_FILENAME), rendered_html)

@@ -7,6 +7,7 @@ from pathlib import Path

from datetime import datetime
from typing import List, Optional, Iterator, Any, Union
+from django.db.models import Model

from .schema import Link
from ..system import atomic_write

@@ -81,16 +82,17 @@ def parse_json_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[Link]:
### Link Details Index

@enforce_types
-def write_json_link_details(link: Link, out_dir: Optional[str]=None) -> None:
-   """write a json file with some info about the link"""
+def write_json_snapshot_details(snapshot: Model, out_dir: Optional[str]=None) -> None:
+   """write a json file with some info about the snapshot"""

-   out_dir = out_dir or link.link_dir
+   out_dir = out_dir or snapshot.snapshot_dir
    path = Path(out_dir) / JSON_INDEX_FILENAME
-   atomic_write(str(path), link._asdict(extended=True))
+   print(snapshot._asdict())
+   atomic_write(str(path), snapshot._asdict())


@enforce_types
-def parse_json_link_details(out_dir: Union[Path, str], guess: Optional[bool]=False) -> Optional[Link]:
+def parse_json_link_details(out_dir: Union[Path, str], guess: Optional[bool]=False) -> Optional[Model]:
    """load the json link index from a given directory"""
    existing_index = Path(out_dir) / JSON_INDEX_FILENAME
    if existing_index.exists():

@@ -102,16 +104,31 @@ def parse_json_link_details(out_dir: Union[Path, str], guess: Optional[bool]=Fal
                pass
    return None

+@enforce_types
+def load_snapshot_details(snapshot: Model, out_dir: Path):
+   """
+   Loads the detail from the local json index
+   """
+   existing_index = Path(out_dir) / JSON_INDEX_FILENAME
+   if existing_index.exists():
+       with open(existing_index, 'r', encoding='utf-8') as f:
+           try:
+               return pyjson.load(f)
+           except pyjson.JSONDecodeError:
+               pass
+   return None
+

@enforce_types
-def parse_json_links_details(out_dir: Union[Path, str]) -> Iterator[Link]:
+def parse_json_snapshot_details(out_dir: Union[Path, str]) -> Iterator[Link]:
    """read through all the archive data folders and return the parsed links"""

    for entry in os.scandir(Path(out_dir) / ARCHIVE_DIR_NAME):
        if entry.is_dir(follow_symlinks=True):
            if (Path(entry.path) / 'index.json').exists():
                try:
-                   link = parse_json_link_details(entry.path)
+                   link = parse_json_snapshot_details(entry.path)
                except KeyError:
                    link = None
                if link:

@@ -3,8 +3,9 @@ __package__ = 'archivebox.index'

from io import StringIO
from pathlib import Path
from typing import List, Tuple, Iterator
-from django.db.models import QuerySet
+from django.db.models import QuerySet, Model
from django.db import transaction
+from datetime import datetime

from .schema import Link
from ..util import enforce_types

@@ -28,21 +29,20 @@ def remove_from_sql_main_index(snapshots: QuerySet, out_dir: Path=OUTPUT_DIR) ->
    snapshots.delete()

@enforce_types
-def write_link_to_sql_index(link: Link):
+def write_snapshot_to_index(snapshot: Model):
    from core.models import Snapshot
-   info = {k: v for k, v in link._asdict().items() if k in Snapshot.keys}
-   tags = info.pop("tags")
-   if tags is None:
-       tags = []

    try:
-       info["timestamp"] = Snapshot.objects.get(url=link.url).timestamp
+       timestamp = Snapshot.objects.get(url=snapshot.url).timestamp
    except Snapshot.DoesNotExist:
-       while Snapshot.objects.filter(timestamp=info["timestamp"]).exists():
-           info["timestamp"] = str(float(info["timestamp"]) + 1.0)
+       timestamp = snapshot.timestamp
+       if not timestamp:
+           timestamp = str(datetime.now().timestamp())
+       while Snapshot.objects.filter(timestamp=timestamp).exists():
+           print("the timestamp is: ", timestamp)
+           timestamp = str(float(timestamp) + 1.0)

-   snapshot, _ = Snapshot.objects.update_or_create(url=link.url, defaults=info)
-   snapshot.save_tags(tags)
+   snapshot.timestamp = timestamp
+   snapshot.save()
    return snapshot

@@ -50,27 +50,29 @@ def write_link_to_sql_index(link: Link):
def write_sql_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR) -> None:
    with transaction.atomic():
        for link in links:
-           write_link_to_sql_index(link)
+           write_snapshot_to_index(link)


@enforce_types
-def write_sql_link_details(link: Link, out_dir: Path=OUTPUT_DIR) -> None:
+def write_sql_snapshot_details(snapshot: Model, out_dir: Path=OUTPUT_DIR) -> None:
    from core.models import Snapshot

    with transaction.atomic():
        try:
-           snap = Snapshot.objects.get(url=link.url)
+           snap = Snapshot.objects.get(url=snapshot.url)
        except Snapshot.DoesNotExist:
-           snap = write_link_to_sql_index(link)
-       snap.title = link.title
+           snap = write_snapshot_to_sql_index(snapshot)
+       snap.title = snapshot.title

-       tag_set = (
-           set(tag.strip() for tag in (link.tags or '').split(','))
-       )
-       tag_list = list(tag_set) or []
+       # TODO: If there are actual tags, this will break
+       #tag_set = (
+       #    set(tag.strip() for tag in (snapshot.tags.all() or '').split(','))
+       #)
+       #tag_list = list(tag_set) or []

        snap.save()
-       snap.save_tags(tag_list)
+       #snap.save_tags(tag_list)
        return snap

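The timestamp handling in write_snapshot_to_index() above keeps the timestamp usable as a unique key by bumping it one second at a time until no existing Snapshot claims it. A standalone sketch of that idea (the helper name is made up; the query is the same one the diff uses):

# Sketch only: the collision-avoidance loop used by write_snapshot_to_index() above.
from datetime import datetime
from core.models import Snapshot

def unique_timestamp(candidate=None):              # hypothetical helper, not part of the diff
    timestamp = candidate or str(datetime.now().timestamp())
    while Snapshot.objects.filter(timestamp=timestamp).exists():
        timestamp = str(float(timestamp) + 1.0)    # bump by one second until the value is free
    return timestamp
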
@@ -29,8 +29,9 @@ from .util import enforce_types # type: ignore
from .system import get_dir_size, dedupe_cron_jobs, CRON_COMMENT
from .index import (
    load_main_index,
-   parse_links_from_source,
-   dedupe_links,
+   get_empty_snapshot_queryset,
+   parse_snapshots_from_source,
+   filter_new_urls,
    write_main_index,
    snapshot_filter,
    get_indexed_folders,

@@ -44,11 +45,11 @@ from .index import (
    get_corrupted_folders,
    get_unrecognized_folders,
    fix_invalid_folder_locations,
-   write_link_details,
+   write_snapshot_details,
)
from .index.json import (
    parse_json_main_index,
-   parse_json_links_details,
+   parse_json_snapshot_details,
    generate_json_index_from_links,
)
from .index.sql import (

@@ -60,7 +61,7 @@ from .index.html import (
    generate_index_from_links,
)
from .index.csv import links_to_csv
-from .extractors import archive_links, archive_link, ignore_methods
+from .extractors import archive_snapshots, archive_snapshot, ignore_methods
from .config import (
    stderr,
    hint,

@@ -538,6 +539,7 @@ def add(urls: Union[str, List[str]],
        extractors: str="",
        out_dir: Path=OUTPUT_DIR) -> List[Link]:
    """Add a new URL or list of URLs to your archive"""
+   from core.models import Snapshot

    assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)'

@@ -549,8 +551,8 @@ def add(urls: Union[str, List[str]],
    # Load list of links from the existing index
    check_data_folder(out_dir=out_dir)
    check_dependencies()
-   new_links: List[Link] = []
-   all_links = load_main_index(out_dir=out_dir)
+   new_snapshots: List[Snapshot] = []
+   all_snapshots = load_main_index(out_dir=out_dir)

    log_importing_started(urls=urls, depth=depth, index_only=index_only)
    if isinstance(urls, str):

@@ -560,20 +562,21 @@ def add(urls: Union[str, List[str]],
        # save verbatim args to sources
        write_ahead_log = save_text_as_source('\n'.join(urls), filename='{ts}-import.txt', out_dir=out_dir)

-   new_links += parse_links_from_source(write_ahead_log, root_url=None)
+   new_snapshots += parse_snapshots_from_source(write_ahead_log, root_url=None)

    # If we're going one level deeper, download each link and look for more links
-   new_links_depth = []
-   if new_links and depth == 1:
-       log_crawl_started(new_links)
-       for new_link in new_links:
-           downloaded_file = save_file_as_source(new_link.url, filename=f'{new_link.timestamp}-crawl-{new_link.domain}.txt', out_dir=out_dir)
-           new_links_depth += parse_links_from_source(downloaded_file, root_url=new_link.url)
+   new_snapshots_depth = []
+   if new_snapshots and depth == 1:
+       log_crawl_started(new_snapshots)
+       for new_snapshot in new_snapshots:
+           # TODO: Check if we need to add domain to the Snapshot model
+           downloaded_file = save_file_as_source(new_snapshot.url, filename=f'{new_snapshot.timestamp}-crawl-{new_snapshot.url}.txt', out_dir=out_dir)
+           new_snapshots_depth += parse_links_from_source(downloaded_file, root_url=new_snapshot.url)

-   imported_links = list({link.url: link for link in (new_links + new_links_depth)}.values())
-   new_links = dedupe_links(all_links, imported_links)
+   imported_snapshots = [Snapshot(url=snapshot.url) for snapshot in new_snapshots + new_snapshots_depth]
+   new_snapshots = filter_new_urls(all_snapshots, imported_snapshots)

-   write_main_index(links=new_links, out_dir=out_dir)
+   write_main_index(snapshots=new_snapshots, out_dir=out_dir)
    all_links = load_main_index(out_dir=out_dir)

    if index_only:

@@ -586,13 +589,13 @@ def add(urls: Union[str, List[str]],
    if extractors:
        archive_kwargs["methods"] = extractors
    if update_all:
-       archive_links(all_links, overwrite=overwrite, **archive_kwargs)
+       archive_snapshots(all_snapshots, overwrite=overwrite, **archive_kwargs)
    elif overwrite:
-       archive_links(imported_links, overwrite=True, **archive_kwargs)
-   elif new_links:
-       archive_links(new_links, overwrite=False, **archive_kwargs)
+       archive_snapshots(imported_snapshots, overwrite=True, **archive_kwargs)
+   elif new_snapshots:
+       archive_snapshots(new_snapshots, overwrite=False, **archive_kwargs)

-   return all_links
+   return all_snapshots

@enforce_types
def remove(filter_str: Optional[str]=None,

@@ -711,7 +714,7 @@ def update(resume: Optional[float]=None,

    if index_only:
        for link in all_links:
-           write_link_details(link, out_dir=out_dir, skip_sql_index=True)
+           write_snapshot_details(link, out_dir=out_dir, skip_sql_index=True)
        index_links(all_links, out_dir=out_dir)
        return all_links

@@ -733,7 +736,7 @@ def update(resume: Optional[float]=None,
    if extractors:
        archive_kwargs["methods"] = extractors

-   archive_links(to_archive, overwrite=overwrite, **archive_kwargs)
+   archive_snapshots(to_archive, overwrite=overwrite, **archive_kwargs)

    # Step 4: Re-write links index with updated titles, icons, and resources
    all_links = load_main_index(out_dir=out_dir)

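Read together, the reworked add() above now flows roughly as sketched below (a simplification of the hunks, not runnable on its own: out_dir, all_snapshots, and the helper functions come from main.py itself, and the URL is made up):

# Sketch only: the rough order of operations in the refactored add().
urls = ['https://example.com']
write_ahead_log = save_text_as_source('\n'.join(urls), filename='{ts}-import.txt', out_dir=out_dir)

new_snapshots = parse_snapshots_from_source(write_ahead_log, root_url=None)    # parsers now yield Snapshot objects
imported_snapshots = [Snapshot(url=s.url) for s in new_snapshots]
new_snapshots = filter_new_urls(all_snapshots, imported_snapshots)             # keep only URLs not already indexed

write_main_index(snapshots=new_snapshots, out_dir=out_dir)                     # persists via write_snapshot_to_index()
archive_snapshots(new_snapshots, overwrite=False)                              # run the extractors per snapshot
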
@@ -14,6 +14,8 @@ from typing import IO, Tuple, List, Optional

from datetime import datetime
from pathlib import Path

+from django.db.models import Model
+
from ..system import atomic_write
from ..config import (
    ANSI,

@@ -84,7 +86,7 @@ def parse_links_memory(urls: List[str], root_url: Optional[str]=None):


@enforce_types
-def parse_links(source_file: str, root_url: Optional[str]=None) -> Tuple[List[Link], str]:
+def parse_snapshots(source_file: str, root_url: Optional[str]=None) -> Tuple[List[Model], str]:
    """parse a list of URLs with their metadata from an
    RSS feed, bookmarks export, or text file
    """

@@ -93,27 +95,27 @@ def parse_links(source_file: str, root_url: Optional[str]=None) -> Tuple[List[Li

    timer = TimedProgress(TIMEOUT * 4)
    with open(source_file, 'r', encoding='utf-8') as file:
-       links, parser = run_parser_functions(file, timer, root_url=root_url)
+       snapshots, parser = run_parser_functions(file, timer, root_url=root_url)

    timer.end()
    if parser is None:
        return [], 'Failed to parse'
-   return links, parser
+   return snapshots, parser


-def run_parser_functions(to_parse: IO[str], timer, root_url: Optional[str]=None) -> Tuple[List[Link], Optional[str]]:
-   most_links: List[Link] = []
+def run_parser_functions(to_parse: IO[str], timer, root_url: Optional[str]=None) -> Tuple[List[Model], Optional[str]]:
+   most_snapshots: List[Model] = []
    best_parser_name = None

    for parser_name, parser_func in PARSERS:
        try:
-           parsed_links = list(parser_func(to_parse, root_url=root_url))
-           if not parsed_links:
+           parsed_snapshots = list(parser_func(to_parse, root_url=root_url))
+           if not parsed_snapshots:
                raise Exception('no links found')

            # print(f'[√] Parser {parser_name} succeeded: {len(parsed_links)} links parsed')
-           if len(parsed_links) > len(most_links):
-               most_links = parsed_links
+           if len(parsed_snapshots) > len(most_snapshots):
+               most_snapshots = parsed_snapshots
                best_parser_name = parser_name

        except Exception as err:   # noqa

@@ -125,7 +127,7 @@ def run_parser_functions(to_parse: IO[str], timer, root_url: Optional[str]=None)
            # raise
            pass
    timer.end()
-   return most_links, best_parser_name
+   return most_snapshots, best_parser_name


@enforce_types

@@ -31,6 +31,7 @@ class HrefParser(HTMLParser):
@enforce_types
def parse_generic_html_export(html_file: IO[str], root_url: Optional[str]=None, **_kwargs) -> Iterable[Link]:
    """Parse Generic HTML for href tags and use only the url (support for title coming later)"""
+   from core.models import Snapshot

    html_file.seek(0)
    for line in html_file:

@@ -44,10 +45,10 @@ def parse_generic_html_export(html_file: IO[str], root_url: Optional[str]=None,
            url = urljoin(root_url, url)

            for archivable_url in re.findall(URL_REGEX, url):
-               yield Link(
+               yield Snapshot(
                    url=htmldecode(archivable_url),
                    timestamp=str(datetime.now().timestamp()),
                    title=None,
-                   tags=None,
-                   sources=[html_file.name],
+                   #tags=None,
+                   #sources=[html_file.name],
                )

@@ -18,6 +18,8 @@ from ..util import (
@enforce_types
def parse_generic_txt_export(text_file: IO[str], **_kwargs) -> Iterable[Link]:
    """Parse raw links from each line in a text file"""
+   # TODO: Check if we should add sources list to the database
+   from core.models import Snapshot

    text_file.seek(0)
    for line in text_file.readlines():

@@ -40,22 +42,22 @@ def parse_generic_txt_export(text_file: IO[str], **_kwargs) -> Iterable[Link]:

        # otherwise look for anything that looks like a URL in the line
        for url in re.findall(URL_REGEX, line):
-           yield Link(
+           yield Snapshot(
                url=htmldecode(url),
                timestamp=str(datetime.now().timestamp()),
                title=None,
-               tags=None,
-               sources=[text_file.name],
+               #tags=None,
+               #sources=[text_file.name],
            )

        # look inside the URL for any sub-urls, e.g. for archive.org links
        # https://web.archive.org/web/20200531203453/https://www.reddit.com/r/socialism/comments/gu24ke/nypd_officers_claim_they_are_protecting_the_rule/fsfq0sw/
        # -> https://www.reddit.com/r/socialism/comments/gu24ke/nypd_officers_claim_they_are_protecting_the_rule/fsfq0sw/
        for url in re.findall(URL_REGEX, line[1:]):
-           yield Link(
+           yield Snapshot(
                url=htmldecode(url),
                timestamp=str(datetime.now().timestamp()),
                title=None,
-               tags=None,
-               sources=[text_file.name],
+               #tags=None,
+               #sources=[text_file.name],
            )

@@ -2,7 +2,7 @@ from typing import List, Union
from pathlib import Path
from importlib import import_module

-from django.db.models import QuerySet
+from django.db.models import QuerySet, Model

from archivebox.index.schema import Link
from archivebox.util import enforce_types

@@ -28,18 +28,16 @@ def import_backend():
    return backend

@enforce_types
-def write_search_index(link: Link, texts: Union[List[str], None]=None, out_dir: Path=OUTPUT_DIR, skip_text_index: bool=False) -> None:
+def write_search_index(snapshot: Model, texts: Union[List[str], None]=None, out_dir: Path=OUTPUT_DIR, skip_text_index: bool=False) -> None:
    if not indexing_enabled():
        return

    if not skip_text_index and texts:
-       from core.models import Snapshot
-
-       snap = Snapshot.objects.filter(url=link.url).first()
        backend = import_backend()
-       if snap:
-           try:
-               backend.index(snapshot_id=str(snap.id), texts=texts)
+       try:
+           backend.index(snapshot_id=str(snapshot.id), texts=texts)
        except Exception as err:
            stderr()
            stderr(
