Mirror of https://github.com/ArchiveBox/ArchiveBox.git, synced 2025-05-16 08:04:26 -04:00
refactor: Initial and dirty refactor to replace link with snapshot. Barely functional add command
This commit is contained in:
parent 8e2270e21b
commit 8c4ae73d65
13 changed files with 246 additions and 233 deletions
@@ -21,7 +21,7 @@ from util import htmldecode, urldecode, ansi_to_html
 from logging_util import printable_filesize
 from main import add, remove
 from config import OUTPUT_DIR
-from extractors import archive_links
+from extractors import archive_snapshots
 
 # TODO: https://stackoverflow.com/questions/40760880/add-custom-button-to-django-admin-panel
 
@@ -1,6 +1,7 @@
 __package__ = 'archivebox.core'
 
 import uuid
+from pathlib import Path
 
 from django.db import models, transaction
 from django.utils.functional import cached_property
@@ -9,9 +10,10 @@ from django.db.models import Case, When, Value, IntegerField
 
 from ..util import parse_date
 from ..index.schema import Link
-from ..extractors import get_default_archive_methods, ARCHIVE_METHODS_INDEXING_PRECEDENCE
+from ..config import CONFIG
 
-EXTRACTORS = [(extractor[0], extractor[0]) for extractor in get_default_archive_methods()]
+#EXTRACTORS = [(extractor[0], extractor[0]) for extractor in get_default_archive_methods()]
+EXTRACTORS = ["title", "wget"]
 STATUS_CHOICES = [
     ("succeeded", "succeeded"),
     ("failed", "failed"),
@@ -89,6 +91,7 @@ class Snapshot(models.Model):
         title = self.title or '-'
         return f'[{self.timestamp}] {self.url[:64]} ({title[:64]})'
 
+
     @classmethod
     def from_json(cls, info: dict):
         info = {k: v for k, v in info.items() if k in cls.keys}
@@ -133,8 +136,9 @@ class Snapshot(models.Model):
         return self.as_link().base_url
 
     @cached_property
-    def link_dir(self):
-        return self.as_link().link_dir
+    def snapshot_dir(self):
+        from ..config import CONFIG
+        return str(Path(CONFIG['ARCHIVE_DIR']) / self.timestamp)
 
     @cached_property
     def archive_path(self):
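The new snapshot_dir property stops going through the legacy Link object and derives the output folder directly from the configured archive directory plus the snapshot's timestamp. A minimal standalone sketch of that rule (the archive_dir/timestamp arguments here are illustrative stand-ins, not ArchiveBox APIs):

from pathlib import Path

def snapshot_dir_for(archive_dir: str, timestamp: str) -> str:
    # Mirrors the new Snapshot.snapshot_dir property: one folder per snapshot,
    # named after its timestamp, directly under the archive directory.
    return str(Path(archive_dir) / timestamp)

print(snapshot_dir_for("/data/archive", "1611234567.0"))  # /data/archive/1611234567.0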
@@ -158,6 +162,16 @@ class Snapshot(models.Model):
             return self.history['title'][-1].output.strip()
         return None
 
+    def _asdict(self):
+        return {
+            "id": str(self.id),
+            "url": self.url,
+            "timestamp": self.timestamp,
+            "title": self.title,
+            "added": self.added,
+            "updated": self.updated,
+        }
+
     def save_tags(self, tags=()):
         tags_id = []
         for tag in tags:
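The new _asdict() is what the JSON detail index is written from later in this commit. A rough standalone equivalent, using a plain class as a stand-in for the Django model and only the six fields named in the diff:

import json
import uuid
from datetime import datetime

class FakeSnapshot:
    # Stand-in for core.models.Snapshot with just the fields _asdict() touches.
    def __init__(self, url, timestamp, title=None):
        self.id = uuid.uuid4()
        self.url = url
        self.timestamp = timestamp
        self.title = title
        self.added = datetime.now()
        self.updated = None

    def _asdict(self):
        return {
            "id": str(self.id),
            "url": self.url,
            "timestamp": self.timestamp,
            "title": self.title,
            "added": self.added,
            "updated": self.updated,
        }

snap = FakeSnapshot("https://example.com", str(datetime.now().timestamp()))
print(json.dumps(snap._asdict(), default=str, indent=2))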
@@ -168,6 +182,7 @@ class Snapshot(models.Model):
 
 class ArchiveResultManager(models.Manager):
     def indexable(self, sorted: bool = True):
+        from ..extractors import ARCHIVE_METHODS_INDEXING_PRECEDENCE
         INDEXABLE_METHODS = [ r[0] for r in ARCHIVE_METHODS_INDEXING_PRECEDENCE ]
         qs = self.get_queryset().filter(extractor__in=INDEXABLE_METHODS,status='succeeded')
 
@@ -4,19 +4,20 @@ import os
 from pathlib import Path
 
 from typing import Optional, List, Iterable, Union
-from datetime import datetime
-from django.db.models import QuerySet
 
-from ..index.schema import Link
-from ..index.sql import write_link_to_sql_index
+from datetime import datetime
+from django.db.models import QuerySet, Model
+
+from ..index.sql import write_snapshot_to_index
 from ..index import (
-    load_link_details,
-    write_link_details,
+    load_snapshot_details,
+    write_snapshot_details,
 )
 from ..util import enforce_types
 from ..logging_util import (
     log_archiving_started,
     log_archiving_paused,
     log_archiving_finished,
     log_link_archiving_started,
     log_link_archiving_finished,
@@ -67,15 +68,9 @@ def ignore_methods(to_ignore: List[str]):
     return list(methods)
 
 @enforce_types
-def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None) -> Link:
+def archive_snapshot(snapshot: Model, overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None) -> Model:
     """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""
+    from core.models import ArchiveResult
 
-    # TODO: Remove when the input is changed to be a snapshot. Suboptimal approach.
-    from core.models import Snapshot, ArchiveResult
-    try:
-        snapshot = Snapshot.objects.get(url=link.url) # TODO: This will be unnecessary once everything is a snapshot
-    except Snapshot.DoesNotExist:
-        snapshot = write_link_to_sql_index(link)
-
     ARCHIVE_METHODS = get_default_archive_methods()
 
@@ -85,33 +80,34 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
             if method[0] in methods
         ]
 
-    out_dir = out_dir or Path(link.link_dir)
+    out_dir = out_dir or Path(snapshot.snapshot_dir)
     try:
         is_new = not Path(out_dir).exists()
         if is_new:
             os.makedirs(out_dir)
+            details = {"history": {}}
+            write_snapshot_details(snapshot, out_dir=out_dir, skip_sql_index=False)
+        else:
+            details = list(load_snapshot_details(snapshot))
 
-        link = load_link_details(link, out_dir=out_dir)
-        write_link_details(link, out_dir=out_dir, skip_sql_index=False)
-        log_link_archiving_started(link, out_dir, is_new)
-        link = link.overwrite(updated=datetime.now())
+        #log_link_archiving_started(link, out_dir, is_new)
         stats = {'skipped': 0, 'succeeded': 0, 'failed': 0}
 
         for method_name, should_run, method_function in ARCHIVE_METHODS:
             try:
-                if method_name not in link.history:
-                    link.history[method_name] = []
+                if method_name not in details["history"]:
+                    details["history"][method_name] = []
 
-                if should_run(link, out_dir) or overwrite:
+                if should_run(snapshot, out_dir) or overwrite:
                     log_archive_method_started(method_name)
 
-                    result = method_function(link=link, out_dir=out_dir)
+                    result = method_function(snapshot=snapshot, out_dir=out_dir)
 
-                    link.history[method_name].append(result)
+                    details["history"][method_name].append(result)
 
                     stats[result.status] += 1
                     log_archive_method_finished(result)
-                    write_search_index(link=link, texts=result.index_texts)
+                    write_search_index(snapshot=snapshot, texts=result.index_texts)
                     ArchiveResult.objects.create(snapshot=snapshot, extractor=method_name, cmd=result.cmd, cmd_version=result.cmd_version,
                                                  output=result.output, pwd=result.pwd, start_ts=result.start_ts, end_ts=result.end_ts, status=result.status)
 
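The rewritten loop keeps per-extractor history in a plain details["history"] dict instead of on the old Link object, and counts outcomes into stats. A stripped-down sketch of that bookkeeping, with made-up extractor functions standing in for the real archive methods:

from collections import namedtuple

Result = namedtuple("Result", "status output")

def fake_title(url):
    return Result("succeeded", "Example Domain")

def fake_wget(url):
    return Result("failed", None)

ARCHIVE_METHODS = [("title", fake_title), ("wget", fake_wget)]

def run_methods(url):
    details = {"history": {}}
    stats = {"skipped": 0, "succeeded": 0, "failed": 0}
    for method_name, method_function in ARCHIVE_METHODS:
        # same shape as the loop above: per-method history list + status tally
        details["history"].setdefault(method_name, [])
        result = method_function(url)
        details["history"][method_name].append(result)
        stats[result.status] += 1
    return details, stats

print(run_methods("https://example.com")[1])  # {'skipped': 0, 'succeeded': 1, 'failed': 1}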
@@ -121,7 +117,7 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
             except Exception as e:
                 raise Exception('Exception in archive_methods.save_{}(Link(url={}))'.format(
                     method_name,
-                    link.url,
+                    snapshot.url,
                 )) from e
 
         # print(' ', stats)
@@ -129,17 +125,17 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
         try:
             latest_title = link.history['title'][-1].output.strip()
             if latest_title and len(latest_title) >= len(link.title or ''):
-                link = link.overwrite(title=latest_title)
+                snapshot.title = latest_title
         except Exception:
             pass
 
-        write_link_details(link, out_dir=out_dir, skip_sql_index=False)
+        write_snapshot_details(snapshot, out_dir=out_dir, skip_sql_index=False)
 
-        log_link_archiving_finished(link, link.link_dir, is_new, stats)
+        log_link_archiving_finished(snapshot, snapshot.snapshot_dir, is_new, stats)
 
     except KeyboardInterrupt:
         try:
-            write_link_details(link, out_dir=link.link_dir)
+            write_snapshot_details(snapshot, out_dir=link.link_dir)
         except:
             pass
         raise
@@ -148,35 +144,29 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
         print(' ! Failed to archive link: {}: {}'.format(err.__class__.__name__, err))
         raise
 
-    return link
+    return snapshot
 
 @enforce_types
-def archive_links(all_links: Union[Iterable[Link], QuerySet], overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None) -> List[Link]:
+def archive_snapshots(all_snapshots: Union[QuerySet, List[Model]], overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None) -> QuerySet:
 
-    if type(all_links) is QuerySet:
-        num_links: int = all_links.count()
-        get_link = lambda x: x.as_link()
-        all_links = all_links.iterator()
-    else:
-        num_links: int = len(all_links)
-        get_link = lambda x: x
+    all_snapshots = list(all_snapshots)
+    num_snapshots: int = len(all_snapshots)
 
-    if num_links == 0:
+    if num_snapshots == 0:
         return []
 
-    log_archiving_started(num_links)
+    log_archiving_started(num_snapshots)
     idx: int = 0
     try:
-        for link in all_links:
+        for snapshot in all_snapshots:
             idx += 1
-            to_archive = get_link(link)
-            archive_link(to_archive, overwrite=overwrite, methods=methods, out_dir=Path(link.link_dir))
+            archive_snapshot(snapshot, overwrite=overwrite, methods=methods, out_dir=Path(snapshot.snapshot_dir))
     except KeyboardInterrupt:
-        log_archiving_paused(num_links, idx, link.timestamp)
+        log_archiving_paused(num_snapshots, idx, snapshot.timestamp)
         raise SystemExit(0)
     except BaseException:
         print()
         raise
 
-    log_archiving_finished(num_links)
-    return all_links
+    log_archiving_finished(num_snapshots)
+    return all_snapshots
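archive_snapshots now flattens whatever it is given (QuerySet or list) and archives one snapshot at a time, pausing cleanly on Ctrl-C. A rough outline of that calling pattern under the same assumptions, with placeholder callables instead of real Snapshot rows:

def archive_all(snapshots, archive_one):
    # Same shape as the new archive_snapshots(): flatten to a list, archive
    # each item, and report where we stopped if interrupted.
    snapshots = list(snapshots)
    if not snapshots:
        return []
    done = 0
    try:
        for snapshot in snapshots:
            done += 1
            archive_one(snapshot)
    except KeyboardInterrupt:
        print(f"paused after {done}/{len(snapshots)}")
        raise SystemExit(0)
    return snapshots

archive_all(["https://example.com"], print)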
@@ -5,7 +5,9 @@ from html.parser import HTMLParser
 from pathlib import Path
 from typing import Optional
 
-from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
+from django.db.models import Model
+
+from ..index.schema import ArchiveResult, ArchiveOutput, ArchiveError
 from ..util import (
     enforce_types,
     is_static_file,
@@ -61,12 +63,12 @@ class TitleParser(HTMLParser):
 
 
 @enforce_types
-def should_save_title(link: Link, out_dir: Optional[str]=None) -> bool:
+def should_save_title(snapshot: Model, out_dir: Optional[str]=None) -> bool:
     # if link already has valid title, skip it
-    if link.title and not link.title.lower().startswith('http'):
+    if snapshot.title and not snapshot.title.lower().startswith('http'):
        return False
 
-    if is_static_file(link.url):
+    if is_static_file(snapshot.url):
        return False
 
    return SAVE_TITLE
@@ -77,7 +79,7 @@ def extract_title_with_regex(html):
     return output
 
 @enforce_types
-def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+def save_title(snapshot: Model, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
     """try to guess the page's title from its content"""
 
     from core.models import Snapshot
@@ -89,12 +91,12 @@ def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -
         '--max-time', str(timeout),
         *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
         *([] if CHECK_SSL_VALIDITY else ['--insecure']),
-        link.url,
+        snapshot.url,
     ]
     status = 'succeeded'
     timer = TimedProgress(timeout, prefix=' ')
     try:
-        html = download_url(link.url, timeout=timeout)
+        html = download_url(snapshot.url, timeout=timeout)
         try:
             # try using relatively strict html parser first
             parser = TitleParser()
@@ -108,10 +110,11 @@ def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -
 
         # if title is better than the one in the db, update db with new title
         if isinstance(output, str) and output:
-            if not link.title or len(output) >= len(link.title):
-                Snapshot.objects.filter(url=link.url,
-                                        timestamp=link.timestamp)\
+            if not snapshot.title or len(output) >= len(snapshot.title):
+                Snapshot.objects.filter(url=snapshot.url,
+                                        timestamp=snapshot.timestamp)\
                         .update(title=output)
+                snapshot.title = output
         else:
             raise ArchiveError('Unable to detect page title')
     except Exception as err:
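save_title now writes the better title back onto the snapshot object as well as updating the DB row. The "only replace if at least as long" rule from this hunk, as a tiny standalone helper (helper name is made up for illustration):

def pick_title(current, extracted):
    # Keep the freshly extracted title only if there is one and it is not
    # shorter than what is already stored (same rule as save_title above).
    if extracted and (not current or len(extracted) >= len(current)):
        return extracted
    return current

assert pick_title(None, "Example Domain") == "Example Domain"
assert pick_title("A much longer stored title", "Short") == "A much longer stored title"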
@@ -10,7 +10,7 @@ from typing import List, Tuple, Dict, Optional, Iterable
 from collections import OrderedDict
 from contextlib import contextmanager
 from urllib.parse import urlparse
-from django.db.models import QuerySet, Q
+from django.db.models import QuerySet, Q, Model
 
 from ..util import (
     scheme,
@@ -39,15 +39,15 @@ from ..logging_util import (
 
 from .schema import Link, ArchiveResult
 from .html import (
-    write_html_link_details,
+    write_html_snapshot_details,
 )
 from .json import (
-    parse_json_link_details,
-    write_json_link_details,
+    parse_json_snapshot_details,
+    write_json_snapshot_details,
 )
 from .sql import (
     write_sql_main_index,
-    write_sql_link_details,
+    write_sql_snapshot_details,
 )
 
 from ..search import search_backend_enabled, query_search_index
@@ -55,10 +55,12 @@ from ..search import search_backend_enabled, query_search_index
 ### Link filtering and checking
 
 @enforce_types
-def merge_links(a: Link, b: Link) -> Link:
-    """deterministially merge two links, favoring longer field values over shorter,
+def merge_snapshots(a: Model, b: Model) -> Model:
+    """deterministially merge two snapshots, favoring longer field values over shorter,
     and "cleaner" values over worse ones.
+    TODO: Check if this makes sense with the new setup
     """
+    return a
     assert a.base_url == b.base_url, f'Cannot merge two links with different URLs ({a.base_url} != {b.base_url})'
 
     # longest url wins (because a fuzzy url will always be shorter)
@@ -109,55 +111,55 @@ def merge_links(a: Link, b: Link) -> Link:
             key=lambda result: result.start_ts,
         )))
 
-    return Link(
+    return Snapshot(
         url=url,
         timestamp=timestamp,
         title=title,
         tags=tags,
-        sources=sources,
-        history=history,
+        #sources=sources,
+        #history=history,
     )
 
 
 @enforce_types
-def validate_links(links: Iterable[Link]) -> List[Link]:
+def validate_snapshots(snapshots: List[Model]) -> List[Model]:
     timer = TimedProgress(TIMEOUT * 4)
     try:
-        links = archivable_links(links)  # remove chrome://, about:, mailto: etc.
-        links = sorted_links(links)  # deterministically sort the links based on timestamp, url
-        links = fix_duplicate_links(links)  # merge/dedupe duplicate timestamps & urls
+        snapshots = archivable_snapshots(snapshots)  # remove chrome://, about:, mailto: etc.
+        snapshots = sorted_snapshots(snapshots)  # deterministically sort the links based on timestamp, url
+        snapshots = fix_duplicate_snapshots(snapshots)  # merge/dedupe duplicate timestamps & urls
     finally:
         timer.end()
 
-    return list(links)
+    return list(snapshots)
 
 @enforce_types
-def archivable_links(links: Iterable[Link]) -> Iterable[Link]:
+def archivable_snapshots(snapshots: Iterable[Model]) -> Iterable[Model]:
     """remove chrome://, about:// or other schemed links that cant be archived"""
-    for link in links:
+    for snapshot in snapshots:
         try:
-            urlparse(link.url)
+            urlparse(snapshot.url)
         except ValueError:
             continue
-        if scheme(link.url) not in ('http', 'https', 'ftp'):
+        if scheme(snapshot.url) not in ('http', 'https', 'ftp'):
             continue
-        if URL_BLACKLIST_PTN and URL_BLACKLIST_PTN.search(link.url):
+        if URL_BLACKLIST_PTN and URL_BLACKLIST_PTN.search(snapshot.url):
             continue
 
-        yield link
+        yield snapshot
 
 
 @enforce_types
-def fix_duplicate_links(sorted_links: Iterable[Link]) -> Iterable[Link]:
+def fix_duplicate_snapshots(sorted_snapshots: Iterable[Model]) -> Iterable[Model]:
     """
     ensures that all non-duplicate links have monotonically increasing timestamps
+    TODO: Review how to do this with the new snapshots refactor
     """
-    # from core.models import Snapshot
+    return sorted_snapshots
 
     unique_urls: OrderedDict[str, Link] = OrderedDict()
 
-    for link in sorted_links:
-        if link.url in unique_urls:
+    for snapshot in sorted_snapshots:
+        if snapshot.url in unique_urls:
             # merge with any other links that share the same url
             link = merge_links(unique_urls[link.url], link)
         unique_urls[link.url] = link
@@ -166,9 +168,9 @@ def fix_duplicate_links(sorted_links: Iterable[Link]) -> Iterable[Link]:
 
 
 @enforce_types
-def sorted_links(links: Iterable[Link]) -> Iterable[Link]:
-    sort_func = lambda link: (link.timestamp.split('.', 1)[0], link.url)
-    return sorted(links, key=sort_func, reverse=True)
+def sorted_snapshots(snapshots: Iterable[Model]) -> Iterable[Model]:
+    sort_func = lambda snapshot: (snapshot.timestamp.split('.', 1)[0], snapshot.url)
+    return sorted(snapshots, key=sort_func, reverse=True)
 
 
 @enforce_types
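sorted_snapshots keeps the old deterministic ordering: key on the integer part of the timestamp, then the URL, newest first. A self-contained version of the same key, using plain dicts in place of model instances:

def sort_key(snapshot: dict):
    # Integer part of the timestamp first, then URL; reverse=True puts newest first.
    return (snapshot["timestamp"].split(".", 1)[0], snapshot["url"])

snapshots = [
    {"timestamp": "1611111111.2", "url": "https://b.example"},
    {"timestamp": "1611111111.9", "url": "https://a.example"},
    {"timestamp": "1622222222.0", "url": "https://c.example"},
]
for snap in sorted(snapshots, key=sort_key, reverse=True):
    print(snap["url"])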
@@ -222,14 +224,14 @@ def timed_index_update(out_path: Path):
 
 
 @enforce_types
-def write_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR) -> None:
+def write_main_index(snapshots: List[Model], out_dir: Path=OUTPUT_DIR) -> None:
     """Writes links to sqlite3 file for a given list of links"""
 
-    log_indexing_process_started(len(links))
+    log_indexing_process_started(len(snapshots))
 
     try:
         with timed_index_update(out_dir / SQL_INDEX_FILENAME):
-            write_sql_main_index(links, out_dir=out_dir)
+            write_sql_main_index(snapshots, out_dir=out_dir)
             os.chmod(out_dir / SQL_INDEX_FILENAME, int(OUTPUT_PERMISSIONS, base=8)) # set here because we don't write it with atomic writes
 
     except (KeyboardInterrupt, SystemExit):
@@ -244,7 +246,10 @@ def write_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR) -> None:
 
 @enforce_types
 def load_main_index(out_dir: Path=OUTPUT_DIR, warn: bool=True) -> List[Link]:
-    """parse and load existing index with any new links from import_path merged in"""
+    """
+    Returns all of the snapshots currently in index
+    """
+    setup_django(out_dir, check_db=True)
     from core.models import Snapshot
     try:
         return Snapshot.objects.all()
@@ -265,88 +270,62 @@ def load_main_index_meta(out_dir: Path=OUTPUT_DIR) -> Optional[dict]:
 
 
 @enforce_types
-def parse_links_from_source(source_path: str, root_url: Optional[str]=None) -> Tuple[List[Link], List[Link]]:
+def parse_snapshots_from_source(source_path: str, root_url: Optional[str]=None) -> List[Model]:
 
-    from ..parsers import parse_links
+    from ..parsers import parse_snapshots
 
-    new_links: List[Link] = []
+    new_links: List[Model] = []
 
     # parse and validate the import file
-    raw_links, parser_name = parse_links(source_path, root_url=root_url)
-    new_links = validate_links(raw_links)
+    raw_snapshots, parser_name = parse_snapshots(source_path, root_url=root_url)
+    new_snapshots = validate_snapshots(raw_snapshots)
 
     if parser_name:
-        num_parsed = len(raw_links)
+        num_parsed = len(raw_snapshots)
         log_parsing_finished(num_parsed, parser_name)
 
-    return new_links
+    return new_snapshots
 
 @enforce_types
-def fix_duplicate_links_in_index(snapshots: QuerySet, links: Iterable[Link]) -> Iterable[Link]:
-    """
-    Given a list of in-memory Links, dedupe and merge them with any conflicting Snapshots in the DB.
-    """
-    unique_urls: OrderedDict[str, Link] = OrderedDict()
-
-    for link in links:
-        index_link = snapshots.filter(url=link.url)
-        if index_link:
-            link = merge_links(index_link[0].as_link(), link)
-
-        unique_urls[link.url] = link
-
-    return unique_urls.values()
-
-@enforce_types
-def dedupe_links(snapshots: QuerySet,
-                 new_links: List[Link]) -> List[Link]:
-    """
-    The validation of links happened at a different stage. This method will
-    focus on actual deduplication and timestamp fixing.
-    """
-
-    # merge existing links in out_dir and new links
-    dedup_links = fix_duplicate_links_in_index(snapshots, new_links)
-
-    new_links = [
-        link for link in new_links
-        if not snapshots.filter(url=link.url).exists()
-    ]
-
-    dedup_links_dict = {link.url: link for link in dedup_links}
-
-    # Replace links in new_links with the dedup version
-    for i in range(len(new_links)):
-        if new_links[i].url in dedup_links_dict.keys():
-            new_links[i] = dedup_links_dict[new_links[i].url]
-    log_deduping_finished(len(new_links))
-
-    return new_links
+def filter_new_urls(snapshots: QuerySet,
+                    new_snapshots: List) -> List:
+    """
+    Returns a list of Snapshots corresponding to the urls that were not present in the index
+    """
+    urls = {snapshot.url: snapshot for snapshot in new_snapshots}
+    filtered_snapshots = snapshots.filter(url__in=urls.keys())
+
+    for found_snapshot in filtered_snapshots:
+        urls.pop(found_snapshot.url)
+
+    log_deduping_finished(len(urls.keys()))
+
+    return list(urls.values())
 
 ### Link Details Index
 
 @enforce_types
-def write_link_details(link: Link, out_dir: Optional[str]=None, skip_sql_index: bool=False) -> None:
-    out_dir = out_dir or link.link_dir
+def write_snapshot_details(snapshot: List[Model], out_dir: Optional[str]=None, skip_sql_index: bool=False) -> None:
+    out_dir = out_dir or snapshot.snapshot_dir
 
-    write_json_link_details(link, out_dir=out_dir)
-    write_html_link_details(link, out_dir=out_dir)
+    write_json_snapshot_details(snapshot, out_dir=out_dir)
+    #write_html_snapshot_details(snapshot, out_dir=out_dir) TODO: Refactor html code too
     if not skip_sql_index:
-        write_sql_link_details(link)
+        write_sql_snapshot_details(snapshot)
 
 
 @enforce_types
-def load_link_details(link: Link, out_dir: Optional[str]=None) -> Link:
+def load_snapshot_details(snapshot: Model, out_dir: Optional[str]=None) -> Model:
     """check for an existing link archive in the given directory,
     and load+merge it into the given link dict
     """
-    out_dir = out_dir or link.link_dir
+    out_dir = out_dir or snapshot.snapshot_dir
 
-    existing_link = parse_json_link_details(out_dir)
-    if existing_link:
-        return merge_links(existing_link, link)
+    existing_snapshot = parse_json_snapshot_details(out_dir)
+    if existing_snapshot:
+        return merge_snapshots(existing_snapshot, snapshot)
 
-    return link
+    return snapshot
 
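filter_new_urls replaces the old dedupe_links / fix_duplicate_links_in_index pair with a much simpler rule: build a url-to-snapshot map from the parsed input, drop every url the index already has, and return the remainder. The same logic against plain Python containers (the real version runs a single filter(url__in=...) query instead of a set intersection):

def filter_new_urls(existing_urls, new_snapshots):
    # new_snapshots: parsed, not-yet-saved snapshot-like dicts with a "url" key
    urls = {snapshot["url"]: snapshot for snapshot in new_snapshots}
    for url in set(urls) & set(existing_urls):
        urls.pop(url)
    return list(urls.values())

existing = {"https://already.archived"}
parsed = [{"url": "https://already.archived"}, {"url": "https://brand.new"}]
print(filter_new_urls(existing, parsed))  # [{'url': 'https://brand.new'}]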
@@ -5,6 +5,7 @@ from typing import List, Optional, Iterator, Mapping
 from pathlib import Path
 
 from django.utils.html import format_html
+from django.db.models import Model
 from collections import defaultdict
 
 from .schema import Link
@@ -71,8 +72,8 @@ def main_index_template(links: List[Link], template: str=MAIN_INDEX_TEMPLATE) ->
 ### Link Details Index
 
 @enforce_types
-def write_html_link_details(link: Link, out_dir: Optional[str]=None) -> None:
-    out_dir = out_dir or link.link_dir
+def write_html_snapshot_details(snapshot: Model, out_dir: Optional[str]=None) -> None:
+    out_dir = out_dir or snapshot.snapshot_dir
 
     rendered_html = link_details_template(link)
     atomic_write(str(Path(out_dir) / HTML_INDEX_FILENAME), rendered_html)
@@ -7,6 +7,7 @@ from pathlib import Path
 
 from datetime import datetime
 from typing import List, Optional, Iterator, Any, Union
+from django.db.models import Model
 
 from .schema import Link
 from ..system import atomic_write
@@ -81,16 +82,17 @@ def parse_json_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[Link]:
 ### Link Details Index
 
 @enforce_types
-def write_json_link_details(link: Link, out_dir: Optional[str]=None) -> None:
-    """write a json file with some info about the link"""
+def write_json_snapshot_details(snapshot: Model, out_dir: Optional[str]=None) -> None:
+    """write a json file with some info about the snapshot"""
 
-    out_dir = out_dir or link.link_dir
+    out_dir = out_dir or snapshot.snapshot_dir
     path = Path(out_dir) / JSON_INDEX_FILENAME
-    atomic_write(str(path), link._asdict(extended=True))
+    print(snapshot._asdict())
+    atomic_write(str(path), snapshot._asdict())
 
 
 @enforce_types
-def parse_json_link_details(out_dir: Union[Path, str], guess: Optional[bool]=False) -> Optional[Link]:
+def parse_json_link_details(out_dir: Union[Path, str], guess: Optional[bool]=False) -> Optional[Model]:
     """load the json link index from a given directory"""
     existing_index = Path(out_dir) / JSON_INDEX_FILENAME
     if existing_index.exists():
@@ -102,16 +104,31 @@ def parse_json_link_details(out_dir: Union[Path, str], guess: Optional[bool]=Fal
             pass
     return None
 
+@enforce_types
+def load_snapshot_details(snapshot: Model, out_dir: Path):
+    """
+    Loads the detail from the local json index
+    """
+    existing_index = Path(out_dir) / JSON_INDEX_FILENAME
+    if existing_index.exists():
+        with open(existing_index, 'r', encoding='utf-8') as f:
+            try:
+                return pyjson.load(f)
+            except pyjson.JSONDecodeError:
+                pass
+    return None
+
+
 
 @enforce_types
-def parse_json_links_details(out_dir: Union[Path, str]) -> Iterator[Link]:
+def parse_json_snapshot_details(out_dir: Union[Path, str]) -> Iterator[Link]:
     """read through all the archive data folders and return the parsed links"""
 
     for entry in os.scandir(Path(out_dir) / ARCHIVE_DIR_NAME):
         if entry.is_dir(follow_symlinks=True):
             if (Path(entry.path) / 'index.json').exists():
                 try:
-                    link = parse_json_link_details(entry.path)
+                    link = parse_json_snapshot_details(entry.path)
                 except KeyError:
                     link = None
                 if link:
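The new load_snapshot_details simply reads index.json out of the snapshot folder and tolerates a missing or corrupt file. Roughly the same behaviour as a standalone function (the index.json filename comes from the hunk above; the stdlib json module stands in for the pyjson alias used in this file):

import json
from pathlib import Path

def load_details(out_dir):
    # Return the parsed index.json for a snapshot folder, or None if it is
    # missing or unreadable -- same contract as load_snapshot_details().
    index_file = Path(out_dir) / "index.json"
    if index_file.exists():
        try:
            return json.loads(index_file.read_text(encoding="utf-8"))
        except json.JSONDecodeError:
            pass
    return None

print(load_details("/tmp/nonexistent-snapshot"))  # None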
@@ -3,8 +3,9 @@ __package__ = 'archivebox.index'
 from io import StringIO
 from pathlib import Path
 from typing import List, Tuple, Iterator
-from django.db.models import QuerySet
+from django.db.models import QuerySet, Model
 from django.db import transaction
+from datetime import datetime
 
 from .schema import Link
 from ..util import enforce_types
@@ -28,21 +29,20 @@ def remove_from_sql_main_index(snapshots: QuerySet, out_dir: Path=OUTPUT_DIR) ->
         snapshots.delete()
 
 @enforce_types
-def write_link_to_sql_index(link: Link):
+def write_snapshot_to_index(snapshot: Model):
     from core.models import Snapshot
-    info = {k: v for k, v in link._asdict().items() if k in Snapshot.keys}
-    tags = info.pop("tags")
-    if tags is None:
-        tags = []
 
     try:
-        info["timestamp"] = Snapshot.objects.get(url=link.url).timestamp
+        timestamp = Snapshot.objects.get(url=snapshot.url).timestamp
     except Snapshot.DoesNotExist:
-        while Snapshot.objects.filter(timestamp=info["timestamp"]).exists():
-            info["timestamp"] = str(float(info["timestamp"]) + 1.0)
+        timestamp = snapshot.timestamp
+        if not timestamp:
+            timestamp = str(datetime.now().timestamp())
+        while Snapshot.objects.filter(timestamp=timestamp).exists():
+            print("the timestamp is: ", timestamp)
+            timestamp = str(float(timestamp) + 1.0)
 
-    snapshot, _ = Snapshot.objects.update_or_create(url=link.url, defaults=info)
-    snapshot.save_tags(tags)
+    snapshot.timestamp = timestamp
+    snapshot.save()
     return snapshot
 
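write_snapshot_to_index now owns timestamp assignment: reuse the timestamp of an existing row for the same URL, otherwise fall back to now() and bump by one second until the value is unique. The collision loop in isolation, with a plain set standing in for the Snapshot.objects.filter(timestamp=...).exists() query:

from datetime import datetime

def unique_timestamp(wanted, taken):
    # `taken` stands in for the timestamps already present in the index.
    timestamp = wanted or str(datetime.now().timestamp())
    while timestamp in taken:
        timestamp = str(float(timestamp) + 1.0)
    return timestamp

taken = {"1600000000.0", "1600000001.0"}
print(unique_timestamp("1600000000.0", taken))  # 1600000002.0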
@@ -50,27 +50,29 @@ def write_link_to_sql_index(link: Link):
 def write_sql_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR) -> None:
     with transaction.atomic():
         for link in links:
-            write_link_to_sql_index(link)
+            write_snapshot_to_index(link)
 
 
 @enforce_types
-def write_sql_link_details(link: Link, out_dir: Path=OUTPUT_DIR) -> None:
+def write_sql_snapshot_details(snapshot: Model, out_dir: Path=OUTPUT_DIR) -> None:
     from core.models import Snapshot
 
     with transaction.atomic():
         try:
-            snap = Snapshot.objects.get(url=link.url)
+            snap = Snapshot.objects.get(url=snapshot.url)
         except Snapshot.DoesNotExist:
-            snap = write_link_to_sql_index(link)
-        snap.title = link.title
+            snap = write_snapshot_to_sql_index(snapshot)
+        snap.title = snapshot.title
 
-        tag_set = (
-            set(tag.strip() for tag in (link.tags or '').split(','))
-        )
-        tag_list = list(tag_set) or []
+        # TODO: If there are actual tags, this will break
+        #tag_set = (
+        #    set(tag.strip() for tag in (snapshot.tags.all() or '').split(','))
+        #)
+        #tag_list = list(tag_set) or []
 
         snap.save()
-        snap.save_tags(tag_list)
+        #snap.save_tags(tag_list)
+        return snap
 
@@ -29,8 +29,9 @@ from .util import enforce_types # type: ignore
 from .system import get_dir_size, dedupe_cron_jobs, CRON_COMMENT
 from .index import (
     load_main_index,
-    parse_links_from_source,
-    dedupe_links,
+    get_empty_snapshot_queryset,
+    parse_snapshots_from_source,
+    filter_new_urls,
     write_main_index,
     snapshot_filter,
     get_indexed_folders,
@@ -44,11 +45,11 @@ from .index import (
     get_corrupted_folders,
     get_unrecognized_folders,
     fix_invalid_folder_locations,
-    write_link_details,
+    write_snapshot_details,
 )
 from .index.json import (
     parse_json_main_index,
-    parse_json_links_details,
+    parse_json_snapshot_details,
     generate_json_index_from_links,
 )
 from .index.sql import (
@@ -60,7 +61,7 @@ from .index.html import (
     generate_index_from_links,
 )
 from .index.csv import links_to_csv
-from .extractors import archive_links, archive_link, ignore_methods
+from .extractors import archive_snapshots, archive_snapshot, ignore_methods
 from .config import (
     stderr,
     hint,
@@ -538,6 +539,7 @@ def add(urls: Union[str, List[str]],
         extractors: str="",
         out_dir: Path=OUTPUT_DIR) -> List[Link]:
     """Add a new URL or list of URLs to your archive"""
+    from core.models import Snapshot
 
     assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)'
 
@@ -549,8 +551,8 @@ def add(urls: Union[str, List[str]],
     # Load list of links from the existing index
     check_data_folder(out_dir=out_dir)
     check_dependencies()
-    new_links: List[Link] = []
-    all_links = load_main_index(out_dir=out_dir)
+    new_snapshots: List[Snapshot] = []
+    all_snapshots = load_main_index(out_dir=out_dir)
 
     log_importing_started(urls=urls, depth=depth, index_only=index_only)
     if isinstance(urls, str):
@@ -560,20 +562,21 @@ def add(urls: Union[str, List[str]],
     # save verbatim args to sources
     write_ahead_log = save_text_as_source('\n'.join(urls), filename='{ts}-import.txt', out_dir=out_dir)
 
-    new_links += parse_links_from_source(write_ahead_log, root_url=None)
+    new_snapshots += parse_snapshots_from_source(write_ahead_log, root_url=None)
 
     # If we're going one level deeper, download each link and look for more links
-    new_links_depth = []
-    if new_links and depth == 1:
-        log_crawl_started(new_links)
-        for new_link in new_links:
-            downloaded_file = save_file_as_source(new_link.url, filename=f'{new_link.timestamp}-crawl-{new_link.domain}.txt', out_dir=out_dir)
-            new_links_depth += parse_links_from_source(downloaded_file, root_url=new_link.url)
+    new_snapshots_depth = []
+    if new_snapshots and depth == 1:
+        log_crawl_started(new_snapshots)
+        for new_snapshot in new_snapshots:
+            # TODO: Check if we need to add domain to the Snapshot model
+            downloaded_file = save_file_as_source(new_snapshot.url, filename=f'{new_snapshot.timestamp}-crawl-{new_snapshot.url}.txt', out_dir=out_dir)
+            new_snapshots_depth += parse_links_from_source(downloaded_file, root_url=new_snapshot.url)
 
-    imported_links = list({link.url: link for link in (new_links + new_links_depth)}.values())
-    new_links = dedupe_links(all_links, imported_links)
+    imported_snapshots = [Snapshot(url=snapshot.url) for snapshot in new_snapshots + new_snapshots_depth]
+    new_snapshots = filter_new_urls(all_snapshots, imported_snapshots)
 
-    write_main_index(links=new_links, out_dir=out_dir)
+    write_main_index(snapshots=new_snapshots, out_dir=out_dir)
     all_links = load_main_index(out_dir=out_dir)
 
     if index_only:
@@ -586,13 +589,13 @@ def add(urls: Union[str, List[str]],
     if extractors:
         archive_kwargs["methods"] = extractors
     if update_all:
-        archive_links(all_links, overwrite=overwrite, **archive_kwargs)
+        archive_snapshots(all_snapshots, overwrite=overwrite, **archive_kwargs)
     elif overwrite:
-        archive_links(imported_links, overwrite=True, **archive_kwargs)
-    elif new_links:
-        archive_links(new_links, overwrite=False, **archive_kwargs)
+        archive_snapshots(imported_snapshots, overwrite=True, **archive_kwargs)
+    elif new_snapshots:
+        archive_snapshots(new_snapshots, overwrite=False, **archive_kwargs)
 
-    return all_links
+    return all_snapshots
 
 @enforce_types
 def remove(filter_str: Optional[str]=None,
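Taken together, the add command now goes parse sources, filter against the index, write the main index, then archive. A rough outline of that pipeline as a standalone function (all callables and names here are illustrative stubs, not the real add() signature):

def add(urls, existing_urls, parse, archive_one):
    # Same pipeline as the new add(): parse sources into snapshot dicts,
    # keep only urls the index does not know yet, then archive those.
    new_snapshots = parse(urls)
    new_snapshots = [s for s in new_snapshots if s["url"] not in existing_urls]
    for snapshot in new_snapshots:      # stands in for write_main_index + archive_snapshots
        archive_one(snapshot)
    return new_snapshots

added = add(
    ["https://example.com"],
    existing_urls={"https://already.archived"},
    parse=lambda urls: [{"url": u} for u in urls],
    archive_one=print,
)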
@@ -711,7 +714,7 @@ def update(resume: Optional[float]=None,
 
     if index_only:
         for link in all_links:
-            write_link_details(link, out_dir=out_dir, skip_sql_index=True)
+            write_snapshot_details(link, out_dir=out_dir, skip_sql_index=True)
         index_links(all_links, out_dir=out_dir)
         return all_links
 
@@ -733,7 +736,7 @@ def update(resume: Optional[float]=None,
     if extractors:
         archive_kwargs["methods"] = extractors
 
-    archive_links(to_archive, overwrite=overwrite, **archive_kwargs)
+    archive_snapshots(to_archive, overwrite=overwrite, **archive_kwargs)
 
     # Step 4: Re-write links index with updated titles, icons, and resources
     all_links = load_main_index(out_dir=out_dir)
@@ -14,6 +14,8 @@ from typing import IO, Tuple, List, Optional
 from datetime import datetime
 from pathlib import Path
 
+from django.db.models import Model
+
 from ..system import atomic_write
 from ..config import (
     ANSI,
@@ -84,7 +86,7 @@ def parse_links_memory(urls: List[str], root_url: Optional[str]=None):
 
 
 @enforce_types
-def parse_links(source_file: str, root_url: Optional[str]=None) -> Tuple[List[Link], str]:
+def parse_snapshots(source_file: str, root_url: Optional[str]=None) -> Tuple[List[Model], str]:
     """parse a list of URLs with their metadata from an
     RSS feed, bookmarks export, or text file
     """
@@ -93,27 +95,27 @@ def parse_links(source_file: str, root_url: Optional[str]=None) -> Tuple[List[Li
 
     timer = TimedProgress(TIMEOUT * 4)
     with open(source_file, 'r', encoding='utf-8') as file:
-        links, parser = run_parser_functions(file, timer, root_url=root_url)
+        snapshots, parser = run_parser_functions(file, timer, root_url=root_url)
 
     timer.end()
     if parser is None:
         return [], 'Failed to parse'
-    return links, parser
+    return snapshots, parser
 
 
-def run_parser_functions(to_parse: IO[str], timer, root_url: Optional[str]=None) -> Tuple[List[Link], Optional[str]]:
-    most_links: List[Link] = []
+def run_parser_functions(to_parse: IO[str], timer, root_url: Optional[str]=None) -> Tuple[List[Model], Optional[str]]:
+    most_snapshots: List[Model] = []
     best_parser_name = None
 
     for parser_name, parser_func in PARSERS:
         try:
-            parsed_links = list(parser_func(to_parse, root_url=root_url))
-            if not parsed_links:
+            parsed_snapshots = list(parser_func(to_parse, root_url=root_url))
+            if not parsed_snapshots:
                 raise Exception('no links found')
 
             # print(f'[√] Parser {parser_name} succeeded: {len(parsed_links)} links parsed')
-            if len(parsed_links) > len(most_links):
-                most_links = parsed_links
+            if len(parsed_snapshots) > len(most_snapshots):
+                most_snapshots = parsed_snapshots
                 best_parser_name = parser_name
 
         except Exception as err:   # noqa
@@ -125,7 +127,7 @@ def run_parser_functions(to_parse: IO[str], timer, root_url: Optional[str]=None)
             # raise
             pass
     timer.end()
-    return most_links, best_parser_name
+    return most_snapshots, best_parser_name
 
 
 @enforce_types
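run_parser_functions still tries every registered parser and keeps whichever yields the most entries. The selection logic on its own, with two toy parsers instead of the real PARSERS registry:

def best_parse(text, parsers):
    most, best_name = [], None
    for name, parse in parsers:
        try:
            parsed = list(parse(text))
            if not parsed:
                raise Exception("no links found")
            if len(parsed) > len(most):
                most, best_name = parsed, name
        except Exception:
            continue
    return most, best_name

parsers = [
    ("first_line_only", lambda text: text.splitlines()[:1]),
    ("every_line", lambda text: [l for l in text.splitlines() if l.strip()]),
]
print(best_parse("https://a.example\nhttps://b.example\n", parsers)[1])  # every_line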
@@ -31,6 +31,7 @@ class HrefParser(HTMLParser):
 @enforce_types
 def parse_generic_html_export(html_file: IO[str], root_url: Optional[str]=None, **_kwargs) -> Iterable[Link]:
     """Parse Generic HTML for href tags and use only the url (support for title coming later)"""
+    from core.models import Snapshot
 
     html_file.seek(0)
     for line in html_file:
@@ -44,10 +45,10 @@ def parse_generic_html_export(html_file: IO[str], root_url: Optional[str]=None,
                 url = urljoin(root_url, url)
 
             for archivable_url in re.findall(URL_REGEX, url):
-                yield Link(
+                yield Snapshot(
                     url=htmldecode(archivable_url),
                     timestamp=str(datetime.now().timestamp()),
                     title=None,
-                    tags=None,
-                    sources=[html_file.name],
+                    #tags=None,
+                    #sources=[html_file.name],
                 )
@@ -18,6 +18,8 @@ from ..util import (
 @enforce_types
 def parse_generic_txt_export(text_file: IO[str], **_kwargs) -> Iterable[Link]:
     """Parse raw links from each line in a text file"""
+    # TODO: Check if we should add sources list to the database
+    from core.models import Snapshot
 
     text_file.seek(0)
     for line in text_file.readlines():
@@ -40,22 +42,22 @@ def parse_generic_txt_export(text_file: IO[str], **_kwargs) -> Iterable[Link]:
 
         # otherwise look for anything that looks like a URL in the line
         for url in re.findall(URL_REGEX, line):
-            yield Link(
+            yield Snapshot(
                 url=htmldecode(url),
                 timestamp=str(datetime.now().timestamp()),
                 title=None,
-                tags=None,
-                sources=[text_file.name],
+                #tags=None,
+                #sources=[text_file.name],
             )
 
         # look inside the URL for any sub-urls, e.g. for archive.org links
        # https://web.archive.org/web/20200531203453/https://www.reddit.com/r/socialism/comments/gu24ke/nypd_officers_claim_they_are_protecting_the_rule/fsfq0sw/
        # -> https://www.reddit.com/r/socialism/comments/gu24ke/nypd_officers_claim_they_are_protecting_the_rule/fsfq0sw/
         for url in re.findall(URL_REGEX, line[1:]):
-            yield Link(
+            yield Snapshot(
                 url=htmldecode(url),
                 timestamp=str(datetime.now().timestamp()),
                 title=None,
-                tags=None,
-                sources=[text_file.name],
+                #tags=None,
+                #sources=[text_file.name],
             )
@@ -2,7 +2,7 @@ from typing import List, Union
 from pathlib import Path
 from importlib import import_module
 
-from django.db.models import QuerySet
+from django.db.models import QuerySet, Model
 
 from archivebox.index.schema import Link
 from archivebox.util import enforce_types
@@ -28,24 +28,22 @@ def import_backend():
     return backend
 
 @enforce_types
-def write_search_index(link: Link, texts: Union[List[str], None]=None, out_dir: Path=OUTPUT_DIR, skip_text_index: bool=False) -> None:
+def write_search_index(snapshot: Model, texts: Union[List[str], None]=None, out_dir: Path=OUTPUT_DIR, skip_text_index: bool=False) -> None:
     if not indexing_enabled():
         return
 
     if not skip_text_index and texts:
         from core.models import Snapshot
 
-        snap = Snapshot.objects.filter(url=link.url).first()
         backend = import_backend()
-        if snap:
-            try:
-                backend.index(snapshot_id=str(snap.id), texts=texts)
-            except Exception as err:
-                stderr()
-                stderr(
-                    f'[X] The search backend threw an exception={err}:',
-                    color='red',
-                )
+        try:
+            backend.index(snapshot_id=str(snapshot.id), texts=texts)
+        except Exception as err:
+            stderr()
+            stderr(
+                f'[X] The search backend threw an exception={err}:',
+                color='red',
+            )
 
 @enforce_types
 def query_search_index(query: str, out_dir: Path=OUTPUT_DIR) -> QuerySet:
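write_search_index no longer re-queries the Snapshot table by URL; it trusts the snapshot object it is handed and only wraps the backend call in a try/except. Roughly the caller-side contract, sketched with a dummy backend in place of the real search backend module:

class DummyBackend:
    def index(self, snapshot_id, texts):
        print(f"indexing {snapshot_id}: {len(texts)} text(s)")

def write_search_index(snapshot_id, texts, backend, enabled=True):
    if not enabled or not texts:
        return
    try:
        backend.index(snapshot_id=snapshot_id, texts=texts)
    except Exception as err:
        print(f"[X] The search backend threw an exception={err}")

write_search_index("some-uuid", ["page body text"], DummyBackend())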