better loading and saving mechanism for index storage

Nick Sweeting 2019-04-17 02:25:28 -04:00
parent c95f893b61
commit 9ce47431da
14 changed files with 395 additions and 238 deletions


@@ -1,33 +1,28 @@
import os
import json
from datetime import datetime
from string import Template
from typing import List, Tuple, Iterator, Optional, Mapping, Iterable
from typing import List, Tuple, Optional, Iterable
from collections import OrderedDict
from .schema import Link, ArchiveResult
from .config import (
OUTPUT_DIR,
TEMPLATES_DIR,
VERSION,
GIT_SHA,
FOOTER_INFO,
TIMEOUT,
URL_BLACKLIST_PTN,
ANSI,
stderr,
)
from .storage.html import write_html_main_index, write_html_link_details
from .storage.json import (
parse_json_main_index,
write_json_main_index,
parse_json_link_details,
write_json_link_details,
)
from .util import (
scheme,
ts_to_date,
urlencode,
htmlencode,
urldecode,
wget_output_path,
enforce_types,
TimedProgress,
copy_and_overwrite,
atomic_write,
ExtendedEncoder,
)
@@ -40,8 +35,6 @@ from .logs import (
log_parsing_finished,
)
TITLE_LOADING_MSG = 'Not yet archived...'
### Link filtering and checking
@@ -53,8 +46,10 @@ def merge_links(a: Link, b: Link) -> Link:
"""
assert a.base_url == b.base_url, 'Cannot merge two links with different URLs'
# longest url wins (because a fuzzy url will always be shorter)
url = a.url if len(a.url) > len(b.url) else b.url
# best title based on length and quality
possible_titles = [
title
for title in (a.title, b.title)
@@ -66,20 +61,24 @@ def merge_links(a: Link, b: Link) -> Link:
elif len(possible_titles) == 1:
title = possible_titles[0]
# earliest valid timestamp
timestamp = (
a.timestamp
if float(a.timestamp or 0) < float(b.timestamp or 0) else
b.timestamp
)
# all unique, truthy tags
tags_set = (
set(tag.strip() for tag in (a.tags or '').split(','))
| set(tag.strip() for tag in (b.tags or '').split(','))
)
tags = ','.join(tags_set) or None
# all unique source entries
sources = list(set(a.sources + b.sources))
# all unique history entries for the combined archive methods
all_methods = set(list(a.history.keys()) + list(b.history.keys()))
history = {
method: (a.history.get(method) or []) + (b.history.get(method) or [])
@@ -95,7 +94,6 @@ def merge_links(a: Link, b: Link) -> Link:
key=lambda result: result.start_ts,
)))
return Link(
url=url,
timestamp=timestamp,
@@ -105,6 +103,8 @@ def merge_links(a: Link, b: Link) -> Link:
history=history,
)
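
A minimal sketch of what the merge rules above produce, assuming Link accepts the keyword fields shown in the return statement (url, timestamp, title, tags, sources, history); the values are hypothetical, not from the repo:

# Hypothetical illustration of merge_links(); both URLs share the same base_url,
# so the assert at the top of the function passes.
a = Link(url='http://example.com/post', timestamp='1555471000.0',
         title='Example', tags='news', sources=['feed.rss'], history={})
b = Link(url='https://example.com/post', timestamp='1555472000.0',
         title='Example: a longer, better title', tags='tech',
         sources=['bookmarks.html'], history={})
merged = merge_links(a, b)
# merged.url       == 'https://example.com/post'   # longest url wins
# merged.timestamp == '1555471000.0'               # earliest valid timestamp
# merged.title     is the longer title, per the length/quality rule above
# merged.tags      is 'news,tech' or 'tech,news'   # set order is not guaranteed
# merged.sources   contains both entries exactly once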
@enforce_types
def validate_links(links: Iterable[Link]) -> Iterable[Link]:
links = archivable_links(links) # remove chrome://, about:, mailto: etc.
links = sorted_links(links) # deterministically sort the links based on timestamp, url
@@ -121,6 +121,8 @@ def validate_links(links: Iterable[Link]) -> Iterable[Link]:
return links
@enforce_types
def archivable_links(links: Iterable[Link]) -> Iterable[Link]:
"""remove chrome://, about:// or other schemed links that cant be archived"""
for link in links:
@@ -130,6 +132,7 @@ def archivable_links(links: Iterable[Link]) -> Iterable[Link]:
yield link
@enforce_types
def uniquefied_links(sorted_links: Iterable[Link]) -> Iterable[Link]:
"""
ensures that all non-duplicate links have monotonically increasing timestamps
@@ -153,12 +156,14 @@ def uniquefied_links(sorted_links: Iterable[Link]) -> Iterable[Link]:
return unique_timestamps.values()
@enforce_types
def sorted_links(links: Iterable[Link]) -> Iterable[Link]:
sort_func = lambda link: (link.timestamp.split('.', 1)[0], link.url)
return sorted(links, key=sort_func, reverse=True)
def links_after_timestamp(links: Iterable[Link], resume: float=None) -> Iterable[Link]:
@enforce_types
def links_after_timestamp(links: Iterable[Link], resume: Optional[float]=None) -> Iterable[Link]:
if not resume:
yield from links
return
@@ -171,6 +176,7 @@ def links_after_timestamp(links: Iterable[Link], resume: float=None) -> Iterable
print('Resume value and all timestamp values must be valid numbers.')
@enforce_types
def lowest_uniq_timestamp(used_timestamps: OrderedDict, timestamp: str) -> str:
"""resolve duplicate timestamps by appending a decimal 1234, 1234 -> 1234.1, 1234.2"""
@@ -190,10 +196,10 @@ def lowest_uniq_timestamp(used_timestamps: OrderedDict, timestamp: str) -> str:
### Homepage index for all the links
### Main Links Index
@enforce_types
def write_links_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool=False) -> None:
def write_main_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool=False) -> None:
"""create index.html file for a given list of links"""
log_indexing_process_started()
@@ -201,7 +207,7 @@ def write_links_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool
log_indexing_started(out_dir, 'index.json')
timer = TimedProgress(TIMEOUT * 2, prefix=' ')
try:
write_json_links_index(links, out_dir=out_dir)
write_json_main_index(links, out_dir=out_dir)
finally:
timer.end()
log_indexing_finished(out_dir, 'index.json')
@@ -209,19 +215,19 @@ def write_links_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool
log_indexing_started(out_dir, 'index.html')
timer = TimedProgress(TIMEOUT * 2, prefix=' ')
try:
write_html_links_index(links, out_dir=out_dir, finished=finished)
write_html_main_index(links, out_dir=out_dir, finished=finished)
finally:
timer.end()
log_indexing_finished(out_dir, 'index.html')
@enforce_types
def load_links_index(out_dir: str=OUTPUT_DIR, import_path: Optional[str]=None) -> Tuple[List[Link], List[Link]]:
def load_main_index(out_dir: str=OUTPUT_DIR, import_path: Optional[str]=None) -> Tuple[List[Link], List[Link]]:
"""parse and load existing index with any new links from import_path merged in"""
existing_links: List[Link] = []
if out_dir:
existing_links = list(parse_json_links_index(out_dir))
existing_links = list(parse_json_main_index(out_dir))
new_links: List[Link] = []
if import_path:
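
A short sketch of how the renamed top-level index functions fit together; the call site is an assumption, and the (all_links, new_links) unpacking is inferred from the Tuple[List[Link], List[Link]] signature:

# Hypothetical call site, e.g. inside an 'add' style command:
all_links, new_links = load_main_index(out_dir=OUTPUT_DIR, import_path='bookmarks.html')
write_main_index(all_links, out_dir=OUTPUT_DIR)                 # index.json + index.html, status 'running'
# ...archive new_links here...
write_main_index(all_links, out_dir=OUTPUT_DIR, finished=True)  # final pass marks the index 'finished'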
@@ -242,108 +248,16 @@ def load_links_index(out_dir: str=OUTPUT_DIR, import_path: Optional[str]=None) -
@enforce_types
def write_json_links_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:
"""write the json link index to a given path"""
def patch_main_index(link: Link, out_dir: str=OUTPUT_DIR) -> None:
"""hack to in-place update one row's info in the generated index files"""
assert isinstance(links, List), 'Links must be a list, not a generator.'
assert not links or isinstance(links[0].history, dict)
assert not links or isinstance(links[0].sources, list)
# TODO: remove this ASAP, it's ugly, error-prone, and potentially dangerous
if links and links[0].history.get('title'):
assert isinstance(links[0].history['title'][0], ArchiveResult)
if links and links[0].sources:
assert isinstance(links[0].sources[0], str)
path = os.path.join(out_dir, 'index.json')
index_json = {
'info': 'ArchiveBox Index',
'source': 'https://github.com/pirate/ArchiveBox',
'docs': 'https://github.com/pirate/ArchiveBox/wiki',
'version': VERSION,
'num_links': len(links),
'updated': datetime.now(),
'links': links,
}
atomic_write(index_json, path)
@enforce_types
def parse_json_links_index(out_dir: str=OUTPUT_DIR) -> Iterator[Link]:
"""parse a archive index json file and return the list of links"""
index_path = os.path.join(out_dir, 'index.json')
if os.path.exists(index_path):
with open(index_path, 'r', encoding='utf-8') as f:
links = json.load(f)['links']
for link_json in links:
yield Link.from_json(link_json)
return ()
@enforce_types
def write_html_links_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool=False) -> None:
"""write the html link index to a given path"""
copy_and_overwrite(
os.path.join(TEMPLATES_DIR, 'static'),
os.path.join(out_dir, 'static'),
)
atomic_write('User-agent: *\nDisallow: /', os.path.join(out_dir, 'robots.txt'))
with open(os.path.join(TEMPLATES_DIR, 'index.html'), 'r', encoding='utf-8') as f:
index_html = f.read()
with open(os.path.join(TEMPLATES_DIR, 'index_row.html'), 'r', encoding='utf-8') as f:
link_row_html = f.read()
link_rows = []
for link in links:
template_row_vars: Mapping[str, str] = {
**derived_link_info(link),
'title': (
link.title
or (link.base_url if link.is_archived else TITLE_LOADING_MSG)
),
'tags': (link.tags or '') + (' {}'.format(link.extension) if link.is_static else ''),
'favicon_url': (
os.path.join('archive', link.timestamp, 'favicon.ico')
# if link['is_archived'] else 'data:image/gif;base64,R0lGODlhAQABAAD/ACwAAAAAAQABAAACADs='
),
'archive_url': urlencode(
wget_output_path(link) or 'index.html'
),
}
link_rows.append(Template(link_row_html).substitute(**template_row_vars))
template_vars: Mapping[str, str] = {
'num_links': str(len(links)),
'date_updated': datetime.now().strftime('%Y-%m-%d'),
'time_updated': datetime.now().strftime('%Y-%m-%d %H:%M'),
'footer_info': FOOTER_INFO,
'version': VERSION,
'git_sha': GIT_SHA,
'rows': '\n'.join(link_rows),
'status': 'finished' if finished else 'running',
}
template_html = Template(index_html).substitute(**template_vars)
atomic_write(template_html, os.path.join(out_dir, 'index.html'))
@enforce_types
def patch_links_index(link: Link, out_dir: str=OUTPUT_DIR) -> None:
"""hack to in-place update one row's info in the generated index html"""
title = link.title or link.latest_outputs()['title']
title = link.title or link.latest_outputs(status='succeeded')['title']
successful = link.num_outputs
# Patch JSON index
json_file_links = parse_json_links_index(out_dir)
# Patch JSON main index
json_file_links = parse_json_main_index(out_dir)
patched_links = []
for saved_link in json_file_links:
if saved_link.url == link.url:
@@ -355,11 +269,12 @@ def patch_links_index(link: Link, out_dir: str=OUTPUT_DIR) -> None:
else:
patched_links.append(saved_link)
write_json_links_index(patched_links, out_dir=out_dir)
write_json_main_index(patched_links, out_dir=out_dir)
# Patch HTML index
# Patch HTML main index
html_path = os.path.join(out_dir, 'index.html')
html = open(html_path, 'r').read().split('\n')
with open(html_path, 'r') as f:
html = f.read().split('\n')
for idx, line in enumerate(html):
if title and ('<span data-title-for="{}"'.format(link.url) in line):
html[idx] = '<span>{}</span>'.format(title)
@@ -370,76 +285,31 @@ def patch_links_index(link: Link, out_dir: str=OUTPUT_DIR) -> None:
atomic_write('\n'.join(html), html_path)
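
Presumably patch_main_index is called after a single link finishes archiving, so only its row changes without regenerating the whole index; a minimal sketch, assuming that call site:

# Hypothetical: refresh just this link's row in index.json and index.html
patch_main_index(link, out_dir=OUTPUT_DIR)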
### Individual link index
### Link Details Index
@enforce_types
def write_link_index(link: Link, link_dir: Optional[str]=None) -> None:
link_dir = link_dir or link.link_dir
def write_link_details(link: Link, out_dir: Optional[str]=None) -> None:
out_dir = out_dir or link.link_dir
write_json_link_index(link, link_dir)
write_html_link_index(link, link_dir)
write_json_link_details(link, out_dir=out_dir)
write_html_link_details(link, out_dir=out_dir)
@enforce_types
def write_json_link_index(link: Link, link_dir: Optional[str]=None) -> None:
"""write a json file with some info about the link"""
link_dir = link_dir or link.link_dir
path = os.path.join(link_dir, 'index.json')
atomic_write(link._asdict(), path)
@enforce_types
def parse_json_link_index(link_dir: str) -> Optional[Link]:
"""load the json link index from a given directory"""
existing_index = os.path.join(link_dir, 'index.json')
if os.path.exists(existing_index):
with open(existing_index, 'r', encoding='utf-8') as f:
link_json = json.load(f)
return Link.from_json(link_json)
return None
@enforce_types
def load_json_link_index(link: Link, link_dir: Optional[str]=None) -> Link:
def load_link_details(link: Link, out_dir: Optional[str]=None) -> Link:
"""check for an existing link archive in the given directory,
and load+merge it into the given link dict
"""
link_dir = link_dir or link.link_dir
existing_link = parse_json_link_index(link_dir)
out_dir = out_dir or link.link_dir
existing_link = parse_json_link_details(out_dir)
if existing_link:
return merge_links(existing_link, link)
return link
@enforce_types
def write_html_link_index(link: Link, link_dir: Optional[str]=None) -> None:
link_dir = link_dir or link.link_dir
with open(os.path.join(TEMPLATES_DIR, 'link_index.html'), 'r', encoding='utf-8') as f:
link_html = f.read()
path = os.path.join(link_dir, 'index.html')
template_vars: Mapping[str, str] = {
**derived_link_info(link),
'title': (
link.title
or (link.base_url if link.is_archived else TITLE_LOADING_MSG)
),
'url_str': htmlencode(urldecode(link.base_url)),
'archive_url': urlencode(
wget_output_path(link)
or (link.domain if link.is_archived else 'about:blank')
),
'extension': link.extension or 'html',
'tags': link.tags or 'untagged',
'status': 'archived' if link.is_archived else 'not yet archived',
'status_color': 'success' if link.is_archived else 'danger',
'oldest_archive_date': ts_to_date(link.oldest_archive_date),
}
html_index = Template(link_html).substitute(**template_vars)
atomic_write(html_index, path)
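
Finally, a minimal sketch of the renamed per-link details API (write_link_details / load_link_details); the surrounding archiving loop is an assumption:

# Hypothetical archiving loop using the new details API:
for link in links_after_timestamp(all_links, resume=None):
    link = load_link_details(link)   # merge any index.json already saved for this link
    # ...run archive methods, appending ArchiveResults to link.history...
    write_link_details(link)         # rewrite this link's index.json and index.html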