Mirror of https://github.com/ArchiveBox/ArchiveBox.git (synced 2025-05-13 06:34:25 -04:00)

Commit 76abc58135 (parent 0a44779b21)
switch to strict type hints with NamedTuples instead of dicts

8 changed files with 201 additions and 98 deletions
archivebox/archive.py

@@ -12,6 +12,9 @@ Usage & Documentation:
 import os
 import sys

+from typing import List
+
+from schema import Link
 from links import links_after_timestamp
 from index import write_links_index, load_links_index
 from archive_methods import archive_link
@@ -50,7 +53,7 @@ def print_help():
     print(" ./archive 15109948213.123\n")


-def main(*args):
+def main(*args) -> List[Link]:
     if set(args).intersection(('-h', '--help', 'help')) or len(args) > 2:
         print_help()
         raise SystemExit(0)
@@ -95,10 +98,10 @@ def main(*args):
         import_path = save_remote_source(import_path)

     ### Run the main archive update process
-    update_archive_data(import_path=import_path, resume=resume)
+    return update_archive_data(import_path=import_path, resume=resume)


-def update_archive_data(import_path=None, resume=None):
+def update_archive_data(import_path: str=None, resume: float=None) -> List[Link]:
     """The main ArchiveBox entrancepoint. Everything starts here."""

     # Step 1: Load list of links from the existing index
@@ -111,14 +114,14 @@ def update_archive_data(import_path=None, resume=None):
     # Step 3: Run the archive methods for each link
     links = new_links if ONLY_NEW else all_links
     log_archiving_started(len(links), resume)
-    idx, link = 0, 0
+    idx, link = 0, {'timestamp': 0}
     try:
         for idx, link in enumerate(links_after_timestamp(links, resume)):
             link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
             archive_link(link_dir, link)

     except KeyboardInterrupt:
-        log_archiving_paused(len(links), idx, link and link['timestamp'])
+        log_archiving_paused(len(links), idx, link['timestamp'])
         raise SystemExit(0)

     except:
@@ -130,7 +133,7 @@ def update_archive_data(import_path=None, resume=None):
     # Step 4: Re-write links index with updated titles, icons, and resources
     all_links, _ = load_links_index(out_dir=OUTPUT_DIR)
     write_links_index(out_dir=OUTPUT_DIR, links=all_links, finished=True)
-
+    return all_links

 if __name__ == '__main__':
     main(*sys.argv)
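Since main() and update_archive_data() now return List[Link], a caller driving the archiver programmatically can capture the updated link list directly instead of re-reading index.json afterwards. A minimal sketch (the import-path argument below is hypothetical):

from archive import main

# argv-style call, mirroring main(*sys.argv): program name first, then an optional import path
links = main('./archive', 'output/sources/bookmarks.html')
print('archive now contains {} links'.format(len(links)))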
archivebox/archive_methods.py

@@ -1,10 +1,10 @@
 import os
-import json

-from typing import Union, Dict, List, Tuple, NamedTuple
+from typing import Dict, List, Tuple
 from collections import defaultdict
 from datetime import datetime

+from schema import Link, ArchiveResult, ArchiveError
 from index import (
     write_link_index,
     patch_links_index,
@@ -102,7 +102,7 @@ def archive_link(link_dir: str, link: Link, page=None) -> Link:
             link['history'][method_name].append(result._asdict())

             stats[result.status] += 1
-            log_archive_method_finished(result._asdict())
+            log_archive_method_finished(result)
         else:
             stats['skipped'] += 1

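The notable change here is that log_archive_method_finished() now receives the ArchiveResult NamedTuple itself instead of result._asdict(), so downstream code reads fields as attributes. A small sketch of the difference (all field values below are made up for illustration):

from datetime import datetime
from schema import ArchiveResult

result = ArchiveResult(
    cmd=['wget', '--mirror', 'https://example.com'],   # hypothetical command
    pwd='output/archive/1554232356',                   # hypothetical working dir
    cmd_version='1.20.1',
    output='example.com/index.html',
    status='succeeded',
    start_ts=datetime.now(),
    end_ts=datetime.now(),
    duration=1,
)

result.status        # attribute access: 'succeeded'
result._asdict()     # still available where a plain dict is needed, e.g. link['history']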
archivebox/index.py

@@ -11,6 +11,7 @@ except ImportError:
     print('[X] Missing "distutils" python package. To install it, run:')
     print(' pip install distutils')

+from schema import Link, ArchiveIndex
 from config import (
     OUTPUT_DIR,
     TEMPLATES_DIR,
@@ -25,7 +26,7 @@ from util import (
     check_links_structure,
     wget_output_path,
     latest_output,
-    Link,
+    ExtendedEncoder,
 )
 from parse import parse_links
 from links import validate_links
@@ -56,6 +57,7 @@ def write_links_index(out_dir: str, links: List[Link], finished: bool=False) -> None:
     write_html_links_index(out_dir, links, finished=finished)
     log_indexing_finished(out_dir, 'index.html')

+
 def load_links_index(out_dir: str=OUTPUT_DIR, import_path: str=None) -> Tuple[List[Link], List[Link]]:
     """parse and load existing index with any new links from import_path merged in"""

@@ -82,6 +84,7 @@ def load_links_index(out_dir: str=OUTPUT_DIR, import_path: str=None) -> Tuple[List[Link], List[Link]]:

     return all_links, new_links

+
 def write_json_links_index(out_dir: str, links: List[Link]) -> None:
     """write the json link index to a given path"""

@@ -89,20 +92,24 @@ def write_json_links_index(out_dir: str, links: List[Link]) -> None:

     path = os.path.join(out_dir, 'index.json')

-    index_json = {
-        'info': 'ArchiveBox Index',
-        'help': 'https://github.com/pirate/ArchiveBox',
-        'version': GIT_SHA,
-        'num_links': len(links),
-        'updated': str(datetime.now().timestamp()),
-        'links': links,
-    }
+    index_json = ArchiveIndex(
+        info='ArchiveBox Index',
+        source='https://github.com/pirate/ArchiveBox',
+        docs='https://github.com/pirate/ArchiveBox/wiki',
+        version=GIT_SHA,
+        num_links=len(links),
+        updated=str(datetime.now().timestamp()),
+        links=links,
+    )
+
+    assert isinstance(index_json._asdict(), dict)

     with open(path, 'w', encoding='utf-8') as f:
-        json.dump(index_json, f, indent=4, default=str)
+        json.dump(index_json._asdict(), f, indent=4, cls=ExtendedEncoder)

     chmod_file(path)

+
 def parse_json_links_index(out_dir: str=OUTPUT_DIR) -> List[Link]:
     """parse a archive index json file and return the list of links"""
     index_path = os.path.join(out_dir, 'index.json')
@@ -114,6 +121,7 @@ def parse_json_links_index(out_dir: str=OUTPUT_DIR) -> List[Link]:

     return []

+
 def write_html_links_index(out_dir: str, links: List[Link], finished: bool=False) -> None:
     """write the html link index to a given path"""

@@ -208,6 +216,7 @@ def write_link_index(out_dir: str, link: Link) -> None:
     write_json_link_index(out_dir, link)
     write_html_link_index(out_dir, link)

+
 def write_json_link_index(out_dir: str, link: Link) -> None:
     """write a json file with some info about the link"""

@@ -215,10 +224,11 @@ def write_json_link_index(out_dir: str, link: Link) -> None:
     path = os.path.join(out_dir, 'index.json')

     with open(path, 'w', encoding='utf-8') as f:
-        json.dump(link, f, indent=4, default=str)
+        json.dump(link, f, indent=4, cls=ExtendedEncoder)

     chmod_file(path)

+
 def parse_json_link_index(out_dir: str) -> dict:
     """load the json link index from a given directory"""
     existing_index = os.path.join(out_dir, 'index.json')
@@ -229,6 +239,7 @@ def parse_json_link_index(out_dir: str) -> dict:
         return link_json
     return {}

+
 def load_json_link_index(out_dir: str, link: Link) -> Link:
     """check for an existing link archive in the given directory,
        and load+merge it into the given link dict
@@ -244,6 +255,7 @@ def load_json_link_index(out_dir: str, link: Link) -> Link:
     check_link_structure(link)
     return link

+
 def write_html_link_index(out_dir: str, link: Link) -> None:
     check_link_structure(link)
     with open(os.path.join(TEMPLATES_DIR, 'link_index.html'), 'r', encoding='utf-8') as f:
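write_json_links_index() now builds an ArchiveIndex NamedTuple and serializes it through ExtendedEncoder rather than dumping a hand-built dict with default=str. A rough sketch of that write path, with a placeholder in place of GIT_SHA:

import json
from datetime import datetime
from schema import ArchiveIndex
from util import ExtendedEncoder

index = ArchiveIndex(
    info='ArchiveBox Index',
    source='https://github.com/pirate/ArchiveBox',
    docs='https://github.com/pirate/ArchiveBox/wiki',
    version='0000000',                            # placeholder for GIT_SHA
    num_links=0,
    updated=str(datetime.now().timestamp()),
    links=[],
)

# NamedTuples are not JSON-serializable directly; ._asdict() plus ExtendedEncoder
# handles the top level and any nested datetimes/exceptions inside the links list.
print(json.dumps(index._asdict(), indent=4, cls=ExtendedEncoder))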
archivebox/links.py

@@ -19,17 +19,19 @@ Link {
 }
 """

-from html import unescape
+from typing import List, Iterable
 from collections import OrderedDict

+from schema import Link
 from util import (
     merge_links,
     check_link_structure,
     check_links_structure,
+    htmldecode,
 )


-def validate_links(links):
+def validate_links(links: Iterable[Link]) -> List[Link]:
     check_links_structure(links)
     links = archivable_links(links)  # remove chrome://, about:, mailto: etc.
     links = uniquefied_links(links)  # merge/dedupe duplicate timestamps & urls
@@ -40,13 +42,13 @@ def validate_links(links):
         raise SystemExit(1)

     for link in links:
-        link['title'] = unescape(link['title'].strip()) if link['title'] else None
+        link['title'] = htmldecode(link['title'].strip()) if link['title'] else None
         check_link_structure(link)

     return list(links)


-def archivable_links(links):
+def archivable_links(links: Iterable[Link]) -> Iterable[Link]:
     """remove chrome://, about:// or other schemed links that cant be archived"""
     return (
         link
@@ -55,12 +57,12 @@ def archivable_links(links):
     )


-def uniquefied_links(sorted_links):
+def uniquefied_links(sorted_links: Iterable[Link]) -> Iterable[Link]:
     """
     ensures that all non-duplicate links have monotonically increasing timestamps
     """

-    unique_urls = OrderedDict()
+    unique_urls: OrderedDict[str, Link] = OrderedDict()

     lower = lambda url: url.lower().strip()
     without_www = lambda url: url.replace('://www.', '://', 1)
@@ -73,7 +75,7 @@ def uniquefied_links(sorted_links):
             link = merge_links(unique_urls[fuzzy_url], link)
         unique_urls[fuzzy_url] = link

-    unique_timestamps = OrderedDict()
+    unique_timestamps: OrderedDict[str, Link] = OrderedDict()
     for link in unique_urls.values():
         link['timestamp'] = lowest_uniq_timestamp(unique_timestamps, link['timestamp'])
         unique_timestamps[link['timestamp']] = link
@@ -81,12 +83,12 @@ def uniquefied_links(sorted_links):
     return unique_timestamps.values()


-def sorted_links(links):
+def sorted_links(links: Iterable[Link]) -> Iterable[Link]:
     sort_func = lambda link: (link['timestamp'].split('.', 1)[0], link['url'])
     return sorted(links, key=sort_func, reverse=True)


-def links_after_timestamp(links, timestamp=None):
+def links_after_timestamp(links: Iterable[Link], timestamp: str=None) -> Iterable[Link]:
     if not timestamp:
         yield from links
         return
@@ -99,7 +101,7 @@ def links_after_timestamp(links, timestamp=None):
             print('Resume value and all timestamp values must be valid numbers.')


-def lowest_uniq_timestamp(used_timestamps, timestamp):
+def lowest_uniq_timestamp(used_timestamps: OrderedDict, timestamp: str) -> str:
     """resolve duplicate timestamps by appending a decimal 1234, 1234 -> 1234.1, 1234.2"""

     timestamp = timestamp.split('.')[0]
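The typed signatures make the dedupe pipeline's contract explicit: Iterable[Link] in, Iterable[Link] or List[Link] out, with lowest_uniq_timestamp() keeping timestamps unique by appending a decimal suffix (1234, 1234 -> 1234.1, 1234.2). A self-contained sketch of that documented behavior (not the project's implementation):

from collections import OrderedDict

def demo_lowest_uniq_timestamp(used_timestamps: OrderedDict, timestamp: str) -> str:
    # duplicate timestamps get a .1, .2, ... suffix so every key stays unique
    timestamp = timestamp.split('.')[0]
    nonce = 0
    candidate = timestamp
    while candidate in used_timestamps:
        nonce += 1
        candidate = '{}.{}'.format(timestamp, nonce)
    return candidate

used: OrderedDict = OrderedDict()
for ts in ('1234', '1234', '1234'):
    uniq = demo_lowest_uniq_timestamp(used, ts)
    used[uniq] = {'timestamp': uniq}

print(list(used))   # ['1234', '1234.1', '1234.2']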
archivebox/logs.py

@@ -1,43 +1,44 @@
 import sys
 from datetime import datetime

+from schema import Link, ArchiveResult, RuntimeStats
 from config import ANSI, REPO_DIR, OUTPUT_DIR


 # globals are bad, mmkay
-_LAST_RUN_STATS = {
-    'skipped': 0,
-    'succeeded': 0,
-    'failed': 0,
-
-    'parsing_start_ts': 0,
-    'parsing_end_ts': 0,
-
-    'indexing_start_ts': 0,
-    'indexing_end_ts': 0,
-
-    'archiving_start_ts': 0,
-    'archiving_end_ts': 0,
-
-    'links': {},
-}
-
-def pretty_path(path):
+_LAST_RUN_STATS = RuntimeStats(
+    skipped=0,
+    succeeded=0,
+    failed=0,
+
+    parse_start_ts=0,
+    parse_end_ts=0,
+
+    index_start_ts=0,
+    index_end_ts=0,
+
+    archiving_start_ts=0,
+    archiving_end_ts=0,
+)
+
+def pretty_path(path: str) -> str:
     """convert paths like .../ArchiveBox/archivebox/../output/abc into output/abc"""
     return path.replace(REPO_DIR + '/', '')


 ### Parsing Stage

-def log_parsing_started(source_file):
+def log_parsing_started(source_file: str):
     start_ts = datetime.now()
-    _LAST_RUN_STATS['parse_start_ts'] = start_ts
+    _LAST_RUN_STATS.parse_start_ts = start_ts
     print('{green}[*] [{}] Parsing new links from output/sources/{}...{reset}'.format(
         start_ts.strftime('%Y-%m-%d %H:%M:%S'),
         source_file.rsplit('/', 1)[-1],
         **ANSI,
     ))

-def log_parsing_finished(num_new_links, parser_name):
+def log_parsing_finished(num_new_links: int, parser_name: str):
+    end_ts = datetime.now()
+    _LAST_RUN_STATS.parse_end_ts = end_ts
     print(' > Adding {} new links to index (parsed import as {})'.format(
         num_new_links,
         parser_name,
@@ -48,26 +49,26 @@ def log_parsing_finished(num_new_links, parser_name):

 def log_indexing_process_started():
     start_ts = datetime.now()
-    _LAST_RUN_STATS['index_start_ts'] = start_ts
+    _LAST_RUN_STATS.index_start_ts = start_ts
     print('{green}[*] [{}] Saving main index files...{reset}'.format(
         start_ts.strftime('%Y-%m-%d %H:%M:%S'),
         **ANSI,
     ))

-def log_indexing_started(out_dir, out_file):
+def log_indexing_started(out_dir: str, out_file: str):
     sys.stdout.write(' > {}/{}'.format(pretty_path(out_dir), out_file))

-def log_indexing_finished(out_dir, out_file):
+def log_indexing_finished(out_dir: str, out_file: str):
     end_ts = datetime.now()
-    _LAST_RUN_STATS['index_end_ts'] = end_ts
+    _LAST_RUN_STATS.index_end_ts = end_ts
     print('\r √ {}/{}'.format(pretty_path(out_dir), out_file))


 ### Archiving Stage

-def log_archiving_started(num_links, resume):
+def log_archiving_started(num_links: int, resume: float):
     start_ts = datetime.now()
-    _LAST_RUN_STATS['start_ts'] = start_ts
+    _LAST_RUN_STATS.archiving_start_ts = start_ts
     if resume:
         print('{green}[▶] [{}] Resuming archive updating for {} pages starting from {}...{reset}'.format(
             start_ts.strftime('%Y-%m-%d %H:%M:%S'),
@@ -82,9 +83,9 @@ def log_archiving_started(num_links, resume):
             **ANSI,
         ))

-def log_archiving_paused(num_links, idx, timestamp):
+def log_archiving_paused(num_links: int, idx: int, timestamp: str):
     end_ts = datetime.now()
-    _LAST_RUN_STATS['end_ts'] = end_ts
+    _LAST_RUN_STATS.archiving_end_ts = end_ts
     print()
     print('\n{lightyellow}[X] [{now}] Downloading paused on link {timestamp} ({idx}/{total}){reset}'.format(
         **ANSI,
@@ -100,10 +101,10 @@ def log_archiving_paused(num_links, idx, timestamp):
         timestamp,
     ))

-def log_archiving_finished(num_links):
+def log_archiving_finished(num_links: int):
     end_ts = datetime.now()
-    _LAST_RUN_STATS['end_ts'] = end_ts
-    seconds = end_ts.timestamp() - _LAST_RUN_STATS['start_ts'].timestamp()
+    _LAST_RUN_STATS.archiving_end_ts = end_ts
+    seconds = end_ts.timestamp() - _LAST_RUN_STATS.archiving_start_ts.timestamp()
     if seconds > 60:
         duration = '{0:.2f} min'.format(seconds / 60, 2)
     else:
@@ -116,13 +117,13 @@ def log_archiving_finished(num_links):
         duration,
         ANSI['reset'],
     ))
-    print(' - {} links skipped'.format(_LAST_RUN_STATS['skipped']))
-    print(' - {} links updated'.format(_LAST_RUN_STATS['succeeded']))
-    print(' - {} links had errors'.format(_LAST_RUN_STATS['failed']))
+    print(' - {} links skipped'.format(_LAST_RUN_STATS.skipped))
+    print(' - {} links updated'.format(_LAST_RUN_STATS.succeeded))
+    print(' - {} links had errors'.format(_LAST_RUN_STATS.failed))
     print(' To view your archive, open: {}/index.html'.format(OUTPUT_DIR.replace(REPO_DIR + '/', '')))


-def log_link_archiving_started(link_dir, link, is_new):
+def log_link_archiving_started(link_dir: str, link: Link, is_new: bool):
     # [*] [2019-03-22 13:46:45] "Log Structured Merge Trees - ben stopford"
     # http://www.benstopford.com/2015/02/14/log-structured-merge-trees/
     # > output/archive/1478739709
@@ -140,40 +141,34 @@ def log_link_archiving_started(link_dir, link, is_new):
         pretty_path(link_dir),
     ))

-def log_link_archiving_finished(link_dir, link, is_new, stats):
+def log_link_archiving_finished(link_dir: str, link: Link, is_new: bool, stats: dict):
     total = sum(stats.values())

     if stats['failed'] > 0 :
-        _LAST_RUN_STATS['failed'] += 1
+        _LAST_RUN_STATS.failed += 1
     elif stats['skipped'] == total:
-        _LAST_RUN_STATS['skipped'] += 1
+        _LAST_RUN_STATS.skipped += 1
     else:
-        _LAST_RUN_STATS['succeeded'] += 1
+        _LAST_RUN_STATS.succeeded += 1


-def log_archive_method_started(method):
+def log_archive_method_started(method: str):
     print(' > {}'.format(method))

-def log_archive_method_finished(result):
+def log_archive_method_finished(result: ArchiveResult):
     """quote the argument with whitespace in a command so the user can
        copy-paste the outputted string directly to run the cmd
     """
-    required_keys = ('cmd', 'pwd', 'output', 'status', 'start_ts', 'end_ts')
-    assert (
-        isinstance(result, dict)
-        and all(key in result for key in required_keys)
-        and ('output' in result)
-    ), 'Archive method did not return a valid result.'
-
     # Prettify CMD string and make it safe to copy-paste by quoting arguments
     quoted_cmd = ' '.join(
         '"{}"'.format(arg) if ' ' in arg else arg
-        for arg in result['cmd']
+        for arg in result.cmd
     )

-    if result['status'] == 'failed':
+    if result.status == 'failed':
         # Prettify error output hints string and limit to five lines
-        hints = getattr(result['output'], 'hints', None) or ()
+        hints = getattr(result.output, 'hints', None) or ()
         if hints:
             hints = hints if isinstance(hints, (list, tuple)) else hints.split('\n')
             hints = (
@@ -185,13 +180,13 @@ def log_archive_method_finished(result):
             output_lines = [
                 '{}Failed:{} {}{}'.format(
                     ANSI['red'],
-                    result['output'].__class__.__name__.replace('ArchiveError', ''),
-                    result['output'],
+                    result.output.__class__.__name__.replace('ArchiveError', ''),
+                    result.output,
                     ANSI['reset']
                 ),
                 *hints,
                 '{}Run to see full output:{}'.format(ANSI['lightred'], ANSI['reset']),
-                ' cd {};'.format(result['pwd']),
+                *((' cd {};'.format(result.pwd),) if result.pwd else ()),
                 ' {}'.format(quoted_cmd),
             ]
             print('\n'.join(
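_LAST_RUN_STATS switches from a plain dict to the RuntimeStats recordclass defined in schema.py, so the counters become typed, mutable attributes that a checker like mypy can verify. A minimal sketch of the new access pattern:

from schema import RuntimeStats

stats = RuntimeStats(
    skipped=0, succeeded=0, failed=0,
    parse_start_ts=0, parse_end_ts=0,
    index_start_ts=0, index_end_ts=0,
    archiving_start_ts=0, archiving_end_ts=0,
)

stats.succeeded += 1   # same pattern as _LAST_RUN_STATS.succeeded += 1 above
# a misspelled counter like stats.succeded += 1 raises AttributeError on the read,
# instead of silently creating a new dict key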
archivebox/parse.py

@@ -20,6 +20,7 @@ Link: {
 import re
 import json

+from typing import Tuple, List, IO, Iterable
 from datetime import datetime
 import xml.etree.ElementTree as etree

@@ -29,10 +30,11 @@ from util import (
     URL_REGEX,
     check_url_parsing_invariants,
     TimedProgress,
+    Link,
 )


-def parse_links(source_file):
+def parse_links(source_file: str) -> Tuple[List[Link], str]:
     """parse a list of URLs with their metadata from an
        RSS feed, bookmarks export, or text file
     """
@@ -74,7 +76,7 @@ def parse_links(source_file):

 ### Import Parser Functions

-def parse_pocket_html_export(html_file):
+def parse_pocket_html_export(html_file: IO[str]) -> Iterable[Link]:
     """Parse Pocket-format bookmarks export files (produced by getpocket.com/export/)"""

     html_file.seek(0)
@@ -98,7 +100,7 @@ def parse_pocket_html_export(html_file):
         }


-def parse_json_export(json_file):
+def parse_json_export(json_file: IO[str]) -> Iterable[Link]:
     """Parse JSON-format bookmarks export files (produced by pinboard.in/export/, or wallabag)"""

     json_file.seek(0)
@@ -150,7 +152,7 @@ def parse_json_export(json_file):
         }


-def parse_rss_export(rss_file):
+def parse_rss_export(rss_file: IO[str]) -> Iterable[Link]:
     """Parse RSS XML-format files into links"""

     rss_file.seek(0)
@@ -187,7 +189,7 @@ def parse_rss_export(rss_file):
         }


-def parse_shaarli_rss_export(rss_file):
+def parse_shaarli_rss_export(rss_file: IO[str]) -> Iterable[Link]:
     """Parse Shaarli-specific RSS XML-format files into links"""

     rss_file.seek(0)
@@ -224,7 +226,7 @@ def parse_shaarli_rss_export(rss_file):
         }


-def parse_netscape_html_export(html_file):
+def parse_netscape_html_export(html_file: IO[str]) -> Iterable[Link]:
     """Parse netscape-format bookmarks export files (produced by all browsers)"""

     html_file.seek(0)
@@ -247,7 +249,7 @@ def parse_netscape_html_export(html_file):
         }


-def parse_pinboard_rss_export(rss_file):
+def parse_pinboard_rss_export(rss_file: IO[str]) -> Iterable[Link]:
     """Parse Pinboard RSS feed files into links"""

     rss_file.seek(0)
@@ -278,7 +280,7 @@ def parse_pinboard_rss_export(rss_file):
         }


-def parse_medium_rss_export(rss_file):
+def parse_medium_rss_export(rss_file: IO[str]) -> Iterable[Link]:
     """Parse Medium RSS feed files into links"""

     rss_file.seek(0)
@@ -299,7 +301,7 @@ def parse_medium_rss_export(rss_file):
         }


-def parse_plain_text_export(text_file):
+def parse_plain_text_export(text_file: IO[str]) -> Iterable[Link]:
     """Parse raw links from each line in a text file"""

     text_file.seek(0)
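Every parser now advertises the same shape: it accepts an open text file (IO[str]) and yields Link dicts. A minimal sketch of a parser with that signature (not one of the parsers in this file), using the field names from LinkDict in schema.py:

from datetime import datetime
from typing import IO, Iterable, Dict, Any

Link = Dict[str, Any]   # mirrors the alias defined in schema.py

def parse_plain_urls_sketch(text_file: IO[str]) -> Iterable[Link]:
    """yield one Link dict per http(s) URL found on its own line"""
    text_file.seek(0)
    for line in text_file:
        url = line.strip()
        if url.startswith('http://') or url.startswith('https://'):
            yield {
                'url': url,
                'timestamp': str(datetime.now().timestamp()),
                'title': None,
                'tags': '',
                'sources': [getattr(text_file, 'name', 'stdin')],
            }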
archivebox/schema.py (new file, 55 lines)

@@ -0,0 +1,55 @@
+from datetime import datetime
+
+from typing import List, Dict, Any, Optional, Union, NamedTuple
+from recordclass import RecordClass
+
+Link = Dict[str, Any]
+
+class ArchiveIndex(NamedTuple):
+    info: str
+    version: str
+    source: str
+    docs: str
+    num_links: int
+    updated: str
+    links: List[Link]
+
+class ArchiveResult(NamedTuple):
+    cmd: List[str]
+    pwd: Optional[str]
+    cmd_version: Optional[str]
+    output: Union[str, Exception, None]
+    status: str
+    start_ts: datetime
+    end_ts: datetime
+    duration: int
+
+
+class ArchiveError(Exception):
+    def __init__(self, message, hints=None):
+        super().__init__(message)
+        self.hints = hints
+
+
+class LinkDict(NamedTuple):
+    timestamp: str
+    url: str
+    title: Optional[str]
+    tags: str
+    sources: List[str]
+    history: Dict[str, ArchiveResult]
+
+
+class RuntimeStats(RecordClass):
+    skipped: int
+    succeeded: int
+    failed: int
+
+    parse_start_ts: datetime
+    parse_end_ts: datetime
+
+    index_start_ts: datetime
+    index_end_ts: datetime
+
+    archiving_start_ts: datetime
+    archiving_end_ts: datetime
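schema.py is the new home for the shared shapes: typed NamedTuples for immutable records (ArchiveIndex, ArchiveResult, LinkDict), a recordclass for the mutable run counters, and a Link alias that stays a plain Dict[str, Any] for now. A short sketch of what the NamedTuples give over raw dicts:

from schema import LinkDict

link = LinkDict(
    timestamp='1554232356',             # hypothetical values throughout
    url='https://example.com',
    title=None,
    tags='',
    sources=['output/sources/bookmarks.html'],
    history={},
)

link.url                        # typed attribute access instead of link['url']
link._asdict()                  # convert back to a plain dict where one is still expected
link._replace(title='Example')  # returns a modified copy; NamedTuples stay immutable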
archivebox/util.py

@@ -3,11 +3,13 @@ import re
 import sys
 import time

-from typing import List, Dict, Any, Optional, Union
+from json import JSONEncoder
+
+from typing import List, Dict, Optional, Iterable

 from urllib.request import Request, urlopen
-from urllib.parse import urlparse, quote
-from decimal import Decimal
+from urllib.parse import urlparse, quote, unquote
+from html import escape, unescape
 from datetime import datetime
 from multiprocessing import Process
 from subprocess import (
@@ -19,6 +21,7 @@ from subprocess import (
     CalledProcessError,
 )

+from schema import Link
 from config import (
     ANSI,
     TERM_WIDTH,
@@ -38,7 +41,8 @@ from logs import pretty_path

 ### Parsing Helpers

-# Url Parsing: https://docs.python.org/3/library/urllib.parse.html#url-parsing
+# All of these are (str) -> str
+# shortcuts to: https://docs.python.org/3/library/urllib.parse.html#url-parsing
 scheme = lambda url: urlparse(url).scheme
 without_scheme = lambda url: urlparse(url)._replace(scheme='').geturl().strip('//')
 without_query = lambda url: urlparse(url)._replace(query='').geturl().strip('//')
@@ -54,6 +58,9 @@ base_url = lambda url: without_scheme(url)  # uniq base url used to dedupe links

 short_ts = lambda ts: ts.split('.')[0]
 urlencode = lambda s: quote(s, encoding='utf-8', errors='replace')
+urldecode = lambda s: unquote(s)
+htmlencode = lambda s: escape(s, quote=True)
+htmldecode = lambda s: unescape(s)

 URL_REGEX = re.compile(
     r'http[s]?://'  # start matching from allowed schemes
@@ -89,7 +96,7 @@ STATICFILE_EXTENSIONS = {
     # html, htm, shtml, xhtml, xml, aspx, php, cgi
 }

-Link = Dict[str, Any]
+

 ### Checks & Tests

@@ -105,7 +112,7 @@ def check_link_structure(link: Link) -> None:
             assert isinstance(key, str)
             assert isinstance(val, list), 'history must be a Dict[str, List], got: {}'.format(link['history'])

-def check_links_structure(links: List[Link]) -> None:
+def check_links_structure(links: Iterable[Link]) -> None:
     """basic sanity check invariants to make sure the data is valid"""
     assert isinstance(links, list)
     if links:
@@ -334,7 +341,7 @@ def derived_link_info(link: Link) -> dict:

     url = link['url']

-    to_date_str = lambda ts: datetime.fromtimestamp(Decimal(ts)).strftime('%Y-%m-%d %H:%M')
+    to_date_str = lambda ts: datetime.fromtimestamp(float(ts)).strftime('%Y-%m-%d %H:%M')

     extended_info = {
         **link,
@@ -582,3 +589,30 @@ def chrome_args(**options) -> List[str]:
         cmd_args.append('--user-data-dir={}'.format(options['CHROME_USER_DATA_DIR']))

     return cmd_args
+
+
+class ExtendedEncoder(JSONEncoder):
+    """
+    Extended json serializer that supports serializing several model
+    fields and objects
+    """
+
+    def default(self, obj):
+        cls_name = obj.__class__.__name__
+
+        if hasattr(obj, '_asdict'):
+            return obj._asdict()
+
+        elif isinstance(obj, bytes):
+            return obj.decode()
+
+        elif isinstance(obj, datetime):
+            return obj.isoformat()
+
+        elif isinstance(obj, Exception):
+            return '{}: {}'.format(obj.__class__.__name__, obj)
+
+        elif cls_name in ('dict_items', 'dict_keys', 'dict_values'):
+            return tuple(obj)
+
+        return JSONEncoder.default(self, obj)
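ExtendedEncoder replaces the old default=str fallback when writing JSON: anything with an _asdict() method (the new NamedTuples), bytes, datetimes, and exceptions now serialize to sensible values instead of their str() repr. A quick sketch of it in use:

import json
from datetime import datetime
from util import ExtendedEncoder
from schema import ArchiveError

payload = {
    'fetched_at': datetime(2019, 4, 2, 12, 30, 0),
    'error': ArchiveError('Failed to fetch', hints=['check your network']),
    'raw': b'<html></html>',
}

# datetime -> ISO string, Exception -> "ArchiveError: Failed to fetch", bytes -> decoded str
print(json.dumps(payload, indent=4, cls=ExtendedEncoder))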