Mirror of https://github.com/ArchiveBox/ArchiveBox.git (synced 2025-05-13 14:44:29 -04:00)

commit 25a107df43 (parent 346811fb78)
switch to dataclasses, working Link type hints everywhere

10 changed files with 504 additions and 363 deletions
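The core of the commit is replacing the loosely-typed `Link = Dict[str, Any]` alias with a frozen dataclass, so every `link['url']` lookup becomes `link.url` and in-place mutation is replaced by rebuilding the object. A condensed sketch of that pattern, with field names taken from the schema diff below (the defaults and example values here are illustrative, not the exact definitions from the commit):

    from dataclasses import dataclass, asdict, field
    from datetime import datetime
    from typing import Any, Dict, List, Optional

    @dataclass(frozen=True)
    class Link:
        timestamp: str
        url: str
        title: Optional[str] = None
        tags: Optional[str] = None
        sources: List[str] = field(default_factory=list)
        history: Dict[str, List[Any]] = field(default_factory=dict)
        updated: Optional[datetime] = None

        def _asdict(self) -> Dict[str, Any]:
            return asdict(self)

    # old style: link['title']  ->  new style: link.title
    link = Link(timestamp='1554400000.0', url='https://example.com')

    # a frozen dataclass cannot be mutated, so "updating" a field means
    # constructing a new instance from the dict form, as archive_link() does:
    link = Link(**{**link._asdict(), 'updated': datetime.now()})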
@@ -12,14 +12,13 @@ Usage & Documentation:
 import os
 import sys
 
-from typing import List
+from typing import List, Optional
 
 from schema import Link
 from links import links_after_timestamp
 from index import write_links_index, load_links_index
 from archive_methods import archive_link
 from config import (
-    ARCHIVE_DIR,
     ONLY_NEW,
     OUTPUT_DIR,
     GIT_SHA,
@@ -109,19 +108,19 @@ def update_archive_data(import_path: str=None, resume: float=None) -> List[Link]
     all_links, new_links = load_links_index(out_dir=OUTPUT_DIR, import_path=import_path)
 
     # Step 2: Write updated index with deduped old and new links back to disk
-    write_links_index(out_dir=OUTPUT_DIR, links=all_links)
+    write_links_index(out_dir=OUTPUT_DIR, links=list(all_links))
 
     # Step 3: Run the archive methods for each link
     links = new_links if ONLY_NEW else all_links
     log_archiving_started(len(links), resume)
-    idx, link = 0, {'timestamp': 0}
+    idx: int = 0
+    link: Optional[Link] = None
     try:
         for idx, link in enumerate(links_after_timestamp(links, resume)):
-            link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
-            archive_link(link_dir, link)
+            archive_link(link)
 
     except KeyboardInterrupt:
-        log_archiving_paused(len(links), idx, link['timestamp'])
+        log_archiving_paused(len(links), idx, link.timestamp if link else '0')
         raise SystemExit(0)
 
     except:
@@ -132,7 +131,7 @@ def update_archive_data(import_path: str=None, resume: float=None) -> List[Link]
 
     # Step 4: Re-write links index with updated titles, icons, and resources
     all_links, _ = load_links_index(out_dir=OUTPUT_DIR)
-    write_links_index(out_dir=OUTPUT_DIR, links=all_links, finished=True)
+    write_links_index(out_dir=OUTPUT_DIR, links=list(all_links), finished=True)
     return all_links
 
 if __name__ == '__main__':
@@ -52,7 +52,6 @@ from util import (
     chmod_file,
     wget_output_path,
     chrome_args,
-    check_link_structure,
     run, PIPE, DEVNULL,
     Link,
 )
@@ -64,9 +63,7 @@ from logs import (
 )
 
 
-def archive_link(link_dir: str, link: Link, page=None) -> Link:
+def archive_link(link: Link, page=None) -> Link:
     """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""
 
     ARCHIVE_METHODS = (
@@ -82,24 +79,24 @@ def archive_link(link_dir: str, link: Link, page=None) -> Link:
     )
 
     try:
-        is_new = not os.path.exists(link_dir)
+        is_new = not os.path.exists(link.link_dir)
         if is_new:
-            os.makedirs(link_dir)
+            os.makedirs(link.link_dir)
 
-        link = load_json_link_index(link_dir, link)
-        log_link_archiving_started(link_dir, link, is_new)
+        link = load_json_link_index(link.link_dir, link)
+        log_link_archiving_started(link.link_dir, link, is_new)
         stats = {'skipped': 0, 'succeeded': 0, 'failed': 0}
 
         for method_name, should_run, method_function in ARCHIVE_METHODS:
-            if method_name not in link['history']:
-                link['history'][method_name] = []
+            if method_name not in link.history:
+                link.history[method_name] = []
 
-            if should_run(link_dir, link):
+            if should_run(link.link_dir, link):
                 log_archive_method_started(method_name)
 
-                result = method_function(link_dir, link)
+                result = method_function(link.link_dir, link)
 
-                link['history'][method_name].append(result._asdict())
+                link.history[method_name].append(result)
 
                 stats[result.status] += 1
                 log_archive_method_finished(result)
@@ -108,14 +105,22 @@ def archive_link(link_dir: str, link: Link, page=None) -> Link:
 
         # print(' ', stats)
 
-        write_link_index(link_dir, link)
+        link = Link(**{
+            **link._asdict(),
+            'updated': datetime.now(),
+        })
+
+        write_link_index(link.link_dir, link)
         patch_links_index(link)
-        log_link_archiving_finished(link_dir, link, is_new, stats)
+        log_link_archiving_finished(link.link_dir, link, is_new, stats)
 
+    except KeyboardInterrupt:
+        raise
+
     except Exception as err:
         print(' ! Failed to archive link: {}: {}'.format(err.__class__.__name__, err))
         raise
 
     return link
 
 
@@ -123,10 +128,10 @@ def archive_link(link_dir: str, link: Link, page=None) -> Link:
 
 def should_fetch_title(link_dir: str, link: Link) -> bool:
     # if link already has valid title, skip it
-    if link['title'] and not link['title'].lower().startswith('http'):
+    if link.title and not link.title.lower().startswith('http'):
         return False
 
-    if is_static_file(link['url']):
+    if is_static_file(link.url):
         return False
 
     return FETCH_TITLE
@@ -137,7 +142,7 @@ def fetch_title(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResul
     output = None
     cmd = [
         CURL_BINARY,
-        link['url'],
+        link.url,
         '|',
         'grep',
         '<title>',
@@ -145,7 +150,7 @@
     status = 'succeeded'
     timer = TimedProgress(timeout, prefix=' ')
     try:
-        output = fetch_page_title(link['url'], timeout=timeout, progress=False)
+        output = fetch_page_title(link.url, timeout=timeout, progress=False)
         if not output:
             raise ArchiveError('Unable to detect page title')
     except Exception as err:
@@ -180,7 +185,7 @@ def fetch_favicon(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveRes
         '--location',
         '--output', output,
         *(() if CHECK_SSL_VALIDITY else ('--insecure',)),
-        'https://www.google.com/s2/favicons?domain={}'.format(domain(link['url'])),
+        'https://www.google.com/s2/favicons?domain={}'.format(domain(link.url)),
     ]
     status = 'succeeded'
     timer = TimedProgress(timeout, prefix=' ')
@@ -240,7 +245,7 @@ def fetch_wget(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResult
         *(('--user-agent={}'.format(WGET_USER_AGENT),) if WGET_USER_AGENT else ()),
         *(('--load-cookies', COOKIES_FILE) if COOKIES_FILE else ()),
         *((() if CHECK_SSL_VALIDITY else ('--no-check-certificate', '--no-hsts'))),
-        link['url'],
+        link.url,
     ]
     status = 'succeeded'
     timer = TimedProgress(timeout, prefix=' ')
@@ -290,7 +295,7 @@ def fetch_wget(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResult
     )
 
 def should_fetch_pdf(link_dir: str, link: Link) -> bool:
-    if is_static_file(link['url']):
+    if is_static_file(link.url):
         return False
 
     if os.path.exists(os.path.join(link_dir, 'output.pdf')):
@@ -306,7 +311,7 @@ def fetch_pdf(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResult:
     cmd = [
         *chrome_args(TIMEOUT=timeout),
         '--print-to-pdf',
-        link['url'],
+        link.url,
     ]
     status = 'succeeded'
     timer = TimedProgress(timeout, prefix=' ')
@@ -334,7 +339,7 @@ def fetch_pdf(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResult:
     )
 
 def should_fetch_screenshot(link_dir: str, link: Link) -> bool:
-    if is_static_file(link['url']):
+    if is_static_file(link.url):
         return False
 
     if os.path.exists(os.path.join(link_dir, 'screenshot.png')):
@@ -349,7 +354,7 @@ def fetch_screenshot(link_dir: str, link: Link, timeout: int=TIMEOUT) -> Archive
     cmd = [
         *chrome_args(TIMEOUT=timeout),
         '--screenshot',
-        link['url'],
+        link.url,
     ]
     status = 'succeeded'
     timer = TimedProgress(timeout, prefix=' ')
@@ -377,7 +382,7 @@ def fetch_screenshot(link_dir: str, link: Link, timeout: int=TIMEOUT) -> Archive
     )
 
 def should_fetch_dom(link_dir: str, link: Link) -> bool:
-    if is_static_file(link['url']):
+    if is_static_file(link.url):
         return False
 
     if os.path.exists(os.path.join(link_dir, 'output.html')):
@@ -393,7 +398,7 @@ def fetch_dom(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResult:
     cmd = [
         *chrome_args(TIMEOUT=timeout),
         '--dump-dom',
-        link['url']
+        link.url
     ]
     status = 'succeeded'
     timer = TimedProgress(timeout, prefix=' ')
@@ -422,15 +427,15 @@ def fetch_dom(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResult:
     )
 
 def should_fetch_git(link_dir: str, link: Link) -> bool:
-    if is_static_file(link['url']):
+    if is_static_file(link.url):
         return False
 
     if os.path.exists(os.path.join(link_dir, 'git')):
         return False
 
     is_clonable_url = (
-        (domain(link['url']) in GIT_DOMAINS)
-        or (extension(link['url']) == 'git')
+        (domain(link.url) in GIT_DOMAINS)
+        or (extension(link.url) == 'git')
     )
     if not is_clonable_url:
         return False
@@ -450,7 +455,7 @@ def fetch_git(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResult:
         '--mirror',
         '--recursive',
         *(() if CHECK_SSL_VALIDITY else ('-c', 'http.sslVerify=false')),
-        without_query(without_fragment(link['url'])),
+        without_query(without_fragment(link.url)),
     ]
     status = 'succeeded'
     timer = TimedProgress(timeout, prefix=' ')
@@ -481,7 +486,7 @@ def fetch_git(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResult:
 
 
 def should_fetch_media(link_dir: str, link: Link) -> bool:
-    if is_static_file(link['url']):
+    if is_static_file(link.url):
         return False
 
     if os.path.exists(os.path.join(link_dir, 'media')):
@@ -515,7 +520,7 @@ def fetch_media(link_dir: str, link: Link, timeout: int=MEDIA_TIMEOUT) -> Archiv
         '--embed-thumbnail',
         '--add-metadata',
         *(() if CHECK_SSL_VALIDITY else ('--no-check-certificate',)),
-        link['url'],
+        link.url,
     ]
     status = 'succeeded'
     timer = TimedProgress(timeout, prefix=' ')
@@ -553,7 +558,7 @@ def fetch_media(link_dir: str, link: Link, timeout: int=MEDIA_TIMEOUT) -> Archiv
 
 
 def should_fetch_archive_dot_org(link_dir: str, link: Link) -> bool:
-    if is_static_file(link['url']):
+    if is_static_file(link.url):
         return False
 
     if os.path.exists(os.path.join(link_dir, 'archive.org.txt')):
@@ -567,7 +572,7 @@ def archive_dot_org(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveR
 
     output = 'archive.org.txt'
     archive_org_url = None
-    submit_url = 'https://web.archive.org/save/{}'.format(link['url'])
+    submit_url = 'https://web.archive.org/save/{}'.format(link.url)
     cmd = [
         CURL_BINARY,
         '--location',
@@ -586,7 +591,7 @@ def archive_dot_org(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveR
             archive_org_url = 'https://web.archive.org{}'.format(content_location[0])
         elif len(errors) == 1 and 'RobotAccessControlException' in errors[0]:
             archive_org_url = None
-            # raise ArchiveError('Archive.org denied by {}/robots.txt'.format(domain(link['url'])))
+            # raise ArchiveError('Archive.org denied by {}/robots.txt'.format(domain(link.url)))
         elif errors:
             raise ArchiveError(', '.join(errors))
         else:
@@ -1,5 +1,4 @@
 import os
-import re
 import sys
 import shutil
 
@@ -77,7 +76,7 @@ if COOKIES_FILE:
     COOKIES_FILE = os.path.abspath(COOKIES_FILE)
 
 # ******************************************************************************
-# ************************ Environment & Dependencies **************************
+# ***************************** Helper Functions *******************************
 # ******************************************************************************
 
 def check_version(binary: str) -> str:
@@ -95,6 +94,7 @@ def check_version(binary: str) -> str:
         print('{red}[X] Unable to find a working version of {cmd}, is it installed and in your $PATH?'.format(cmd=binary, **ANSI))
         raise SystemExit(1)
 
 
 def find_chrome_binary() -> Optional[str]:
     """find any installed chrome binaries in the default locations"""
     # Precedence: Chromium, Chrome, Beta, Canary, Unstable, Dev
@@ -119,6 +119,7 @@ def find_chrome_binary() -> Optional[str]:
     print('{red}[X] Unable to find a working version of Chrome/Chromium, is it installed and in your $PATH?'.format(**ANSI))
     raise SystemExit(1)
 
 
 def find_chrome_data_dir() -> Optional[str]:
     """find any installed chrome user data directories in the default locations"""
     # Precedence: Chromium, Chrome, Beta, Canary, Unstable, Dev
@@ -142,6 +143,7 @@ def find_chrome_data_dir() -> Optional[str]:
             return full_path
     return None
 
 
 def get_git_version() -> str:
     """get the git commit hash of the python code folder (aka code version)"""
     try:
@@ -151,6 +153,10 @@ def get_git_version() -> str:
         return 'unknown'
 
 
+# ******************************************************************************
+# ************************ Environment & Dependencies **************************
+# ******************************************************************************
+
 try:
     GIT_SHA = get_git_version()
 
@@ -188,19 +194,33 @@ try:
         print(' Alternatively, run this script with:')
         print(' env PYTHONIOENCODING=UTF-8 ./archive.py export.html')
 
     ### Make sure curl is installed
     USE_CURL = FETCH_FAVICON or SUBMIT_ARCHIVE_DOT_ORG
-    CURL_VERSION = USE_CURL and check_version(CURL_BINARY)
+    CURL_VERSION = None
+    if USE_CURL:
+        CURL_VERSION = check_version(CURL_BINARY)
 
     ### Make sure wget is installed and calculate version
     USE_WGET = FETCH_WGET or FETCH_WARC
-    WGET_VERSION = USE_WGET and check_version(WGET_BINARY)
+    WGET_VERSION = None
+    if USE_WGET:
+        WGET_VERSION = check_version(WGET_BINARY)
 
     WGET_USER_AGENT = WGET_USER_AGENT.format(
         GIT_SHA=GIT_SHA[:9],
         WGET_VERSION=WGET_VERSION or '',
     )
 
+    ### Make sure git is installed
+    GIT_VERSION = None
+    if FETCH_GIT:
+        GIT_VERSION = check_version(GIT_BINARY)
+
+    ### Make sure youtube-dl is installed
+    YOUTUBEDL_VERSION = None
+    if FETCH_MEDIA:
+        check_version(YOUTUBEDL_BINARY)
+
     ### Make sure chrome is installed and calculate version
     USE_CHROME = FETCH_PDF or FETCH_SCREENSHOT or FETCH_DOM
     CHROME_VERSION = None
@@ -214,13 +234,6 @@ try:
         CHROME_USER_DATA_DIR = find_chrome_data_dir()
         # print('[i] Using Chrome data dir: {}'.format(os.path.abspath(CHROME_USER_DATA_DIR)))
 
-    ### Make sure git is installed
-    GIT_VERSION = FETCH_GIT and check_version(GIT_BINARY)
-
-    ### Make sure youtube-dl is installed
-    YOUTUBEDL_VERSION = FETCH_MEDIA and check_version(YOUTUBEDL_BINARY)
-
-    ### Chrome housekeeping options
     CHROME_OPTIONS = {
        'TIMEOUT': TIMEOUT,
        'RESOLUTION': RESOLUTION,
@@ -236,7 +249,6 @@ try:
        # 'ignoreHTTPSErrors': not CHECK_SSL_VALIDITY,
        # # 'executablePath': CHROME_BINARY,
        # }
 
 except KeyboardInterrupt:
     raise SystemExit(1)
 
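A side note on the dependency-check rewrite above: `X_VERSION = USE_X and check_version(...)` leaves the variable as `False` when the flag is off, which type-checks poorly; the new form makes every `*_VERSION` an `Optional[str]`. A minimal sketch of the pattern (the stub `check_version` here is a placeholder, not the real probe):

    from typing import Optional

    FETCH_GIT = True

    def check_version(binary: str) -> str:
        return '2.39.0'  # stand-in for running `binary --version`

    # old: GIT_VERSION = FETCH_GIT and check_version('git')   # str or False
    # new: always Optional[str]
    GIT_VERSION: Optional[str] = None
    if FETCH_GIT:
        GIT_VERSION = check_version('git')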
@@ -1,9 +1,10 @@
 import os
 import json
 
+from itertools import chain
 from datetime import datetime
 from string import Template
-from typing import List, Tuple
+from typing import List, Tuple, Iterator, Optional
 
 try:
     from distutils.dir_util import copy_tree
@@ -11,7 +12,7 @@ except ImportError:
     print('[X] Missing "distutils" python package. To install it, run:')
     print(' pip install distutils')
 
-from schema import Link, ArchiveIndex
+from schema import Link, ArchiveIndex, ArchiveResult
 from config import (
     OUTPUT_DIR,
     TEMPLATES_DIR,
@@ -22,11 +23,10 @@ from util import (
     chmod_file,
     urlencode,
     derived_link_info,
+    wget_output_path,
+    ExtendedEncoder,
     check_link_structure,
     check_links_structure,
-    wget_output_path,
-    latest_output,
-    ExtendedEncoder,
 )
 from parse import parse_links
 from links import validate_links
@@ -47,7 +47,6 @@ def write_links_index(out_dir: str, links: List[Link], finished: bool=False) ->
     """create index.html file for a given list of links"""
 
     log_indexing_process_started()
-    check_links_structure(links)
 
     log_indexing_started(out_dir, 'index.json')
     write_json_links_index(out_dir, links)
@@ -63,20 +62,17 @@ def load_links_index(out_dir: str=OUTPUT_DIR, import_path: str=None) -> Tuple[Li
 
     existing_links: List[Link] = []
     if out_dir:
-        existing_links = parse_json_links_index(out_dir)
-        check_links_structure(existing_links)
+        existing_links = list(parse_json_links_index(out_dir))
 
     new_links: List[Link] = []
     if import_path:
         # parse and validate the import file
         log_parsing_started(import_path)
         raw_links, parser_name = parse_links(import_path)
-        new_links = validate_links(raw_links)
-        check_links_structure(new_links)
+        new_links = list(validate_links(raw_links))
 
     # merge existing links in out_dir and new links
-    all_links = validate_links(existing_links + new_links)
-    check_links_structure(all_links)
+    all_links = list(validate_links(existing_links + new_links))
     num_new_links = len(all_links) - len(existing_links)
 
     if import_path and parser_name:
@@ -88,7 +84,15 @@ def load_links_index(out_dir: str=OUTPUT_DIR, import_path: str=None) -> Tuple[Li
 def write_json_links_index(out_dir: str, links: List[Link]) -> None:
     """write the json link index to a given path"""
 
-    check_links_structure(links)
+    assert isinstance(links, List), 'Links must be a list, not a generator.'
+    assert isinstance(links[0].history, dict)
+    assert isinstance(links[0].sources, list)
+
+    if links[0].history.get('title'):
+        assert isinstance(links[0].history['title'][0], ArchiveResult)
+
+    if links[0].sources:
+        assert isinstance(links[0].sources[0], str)
 
     path = os.path.join(out_dir, 'index.json')
 
@@ -98,7 +102,7 @@ def write_json_links_index(out_dir: str, links: List[Link]) -> None:
         docs='https://github.com/pirate/ArchiveBox/wiki',
         version=GIT_SHA,
         num_links=len(links),
-        updated=str(datetime.now().timestamp()),
+        updated=datetime.now(),
         links=links,
     )
 
@@ -110,23 +114,23 @@ def write_json_links_index(out_dir: str, links: List[Link]) -> None:
     chmod_file(path)
 
 
-def parse_json_links_index(out_dir: str=OUTPUT_DIR) -> List[Link]:
+def parse_json_links_index(out_dir: str=OUTPUT_DIR) -> Iterator[Link]:
     """parse a archive index json file and return the list of links"""
 
     index_path = os.path.join(out_dir, 'index.json')
     if os.path.exists(index_path):
         with open(index_path, 'r', encoding='utf-8') as f:
             links = json.load(f)['links']
             check_links_structure(links)
-            return links
+            for link in links:
+                yield Link(**link)
 
-    return []
+    return ()
 
 
 def write_html_links_index(out_dir: str, links: List[Link], finished: bool=False) -> None:
     """write the html link index to a given path"""
 
-    check_links_structure(links)
-
     path = os.path.join(out_dir, 'index.html')
 
     copy_tree(os.path.join(TEMPLATES_DIR, 'static'), os.path.join(out_dir, 'static'))
@@ -140,24 +144,22 @@ def write_html_links_index(out_dir: str, links: List[Link], finished: bool=False
     with open(os.path.join(TEMPLATES_DIR, 'index_row.html'), 'r', encoding='utf-8') as f:
         link_row_html = f.read()
 
-    full_links_info = (derived_link_info(link) for link in links)
-
     link_rows = '\n'.join(
         Template(link_row_html).substitute(**{
-            **link,
+            **derived_link_info(link),
             'title': (
-                link['title']
-                or (link['base_url'] if link['is_archived'] else TITLE_LOADING_MSG)
+                link.title
+                or (link.base_url if link.is_archived else TITLE_LOADING_MSG)
             ),
             'favicon_url': (
-                os.path.join('archive', link['timestamp'], 'favicon.ico')
+                os.path.join('archive', link.timestamp, 'favicon.ico')
                 # if link['is_archived'] else 'data:image/gif;base64,R0lGODlhAQABAAD/ACwAAAAAAQABAAACADs='
            ),
            'archive_url': urlencode(
                wget_output_path(link) or 'index.html'
            ),
        })
-        for link in full_links_info
+        for link in links
     )
 
     template_vars = {
@@ -180,28 +182,33 @@ def write_html_links_index(out_dir: str, links: List[Link], finished: bool=False
 def patch_links_index(link: Link, out_dir: str=OUTPUT_DIR) -> None:
     """hack to in-place update one row's info in the generated index html"""
 
-    title = link['title'] or latest_output(link)['title']
-    successful = len(tuple(filter(None, latest_output(link).values())))
+    title = link.title or link.latest_outputs()['title']
+    successful = link.num_outputs
 
     # Patch JSON index
     changed = False
     json_file_links = parse_json_links_index(out_dir)
+    patched_links = []
     for saved_link in json_file_links:
-        if saved_link['url'] == link['url']:
-            saved_link['title'] = title
-            saved_link['history'] = link['history']
-            changed = True
-            break
-    if changed:
-        write_json_links_index(out_dir, json_file_links)
+        if saved_link.url == link.url:
+            patched_links.append(Link(**{
+                **saved_link._asdict(),
+                'title': title,
+                'history': link.history,
+                'updated': link.updated,
+            }))
+        else:
+            patched_links.append(saved_link)
+
+    write_json_links_index(out_dir, patched_links)
 
     # Patch HTML index
     html_path = os.path.join(out_dir, 'index.html')
     html = open(html_path, 'r').read().split('\n')
     for idx, line in enumerate(html):
-        if title and ('<span data-title-for="{}"'.format(link['url']) in line):
+        if title and ('<span data-title-for="{}"'.format(link.url) in line):
            html[idx] = '<span>{}</span>'.format(title)
-        elif successful and ('<span data-number-for="{}"'.format(link['url']) in line):
+        elif successful and ('<span data-number-for="{}"'.format(link.url) in line):
            html[idx] = '<span>{}</span>'.format(successful)
            break
 
@@ -212,7 +219,6 @@ def patch_links_index(link: Link, out_dir: str=OUTPUT_DIR) -> None:
 ### Individual link index
 
 def write_link_index(out_dir: str, link: Link) -> None:
-    link['updated'] = str(datetime.now().timestamp())
     write_json_link_index(out_dir, link)
     write_html_link_index(out_dir, link)
 
@@ -220,66 +226,58 @@ def write_link_index(out_dir: str, link: Link) -> None:
 def write_json_link_index(out_dir: str, link: Link) -> None:
     """write a json file with some info about the link"""
 
-    check_link_structure(link)
     path = os.path.join(out_dir, 'index.json')
 
     with open(path, 'w', encoding='utf-8') as f:
-        json.dump(link, f, indent=4, cls=ExtendedEncoder)
+        json.dump(link._asdict(), f, indent=4, cls=ExtendedEncoder)
 
     chmod_file(path)
 
 
-def parse_json_link_index(out_dir: str) -> dict:
+def parse_json_link_index(out_dir: str) -> Optional[Link]:
     """load the json link index from a given directory"""
     existing_index = os.path.join(out_dir, 'index.json')
     if os.path.exists(existing_index):
         with open(existing_index, 'r', encoding='utf-8') as f:
             link_json = json.load(f)
             check_link_structure(link_json)
-            return link_json
-    return {}
+            return Link(**link_json)
+    return None
 
 
 def load_json_link_index(out_dir: str, link: Link) -> Link:
     """check for an existing link archive in the given directory,
     and load+merge it into the given link dict
     """
-    link = {
-        **parse_json_link_index(out_dir),
-        **link,
-    }
-    link.update({
-        'history': link.get('history') or {},
-    })
 
-    check_link_structure(link)
-    return link
+    existing_link = parse_json_link_index(out_dir)
+    existing_link = existing_link._asdict() if existing_link else {}
+    new_link = link._asdict()
+
+    return Link(**{**existing_link, **new_link})
 
 
 def write_html_link_index(out_dir: str, link: Link) -> None:
-    check_link_structure(link)
     with open(os.path.join(TEMPLATES_DIR, 'link_index.html'), 'r', encoding='utf-8') as f:
         link_html = f.read()
 
     path = os.path.join(out_dir, 'index.html')
 
-    link = derived_link_info(link)
-
     with open(path, 'w', encoding='utf-8') as f:
         f.write(Template(link_html).substitute({
-            **link,
+            **derived_link_info(link),
            'title': (
-                link['title']
-                or (link['base_url'] if link['is_archived'] else TITLE_LOADING_MSG)
+                link.title
+                or (link.base_url if link.is_archived else TITLE_LOADING_MSG)
            ),
            'archive_url': urlencode(
                wget_output_path(link)
-                or (link['domain'] if link['is_archived'] else 'about:blank')
+                or (link.domain if link.is_archived else 'about:blank')
            ),
-            'extension': link['extension'] or 'html',
-            'tags': link['tags'].strip() or 'untagged',
-            'status': 'Archived' if link['is_archived'] else 'Not yet archived',
-            'status_color': 'success' if link['is_archived'] else 'danger',
+            'extension': link.extension or 'html',
+            'tags': link.tags or 'untagged',
+            'status': 'Archived' if link.is_archived else 'Not yet archived',
+            'status_color': 'success' if link.is_archived else 'danger',
        }))
 
     chmod_file(path)
 
@@ -11,7 +11,7 @@ Link {
     sources: [str],
     history: {
         pdf: [
-            {start_ts, end_ts, duration, cmd, pwd, status, output},
+            {start_ts, end_ts, cmd, pwd, cmd_version, status, output},
             ...
         ],
         ...
@@ -19,41 +19,36 @@ Link {
 }
 """
 
-from typing import List, Iterable
+from typing import Iterable
 from collections import OrderedDict
 
 from schema import Link
 from util import (
+    scheme,
+    fuzzy_url,
     merge_links,
-    check_link_structure,
-    check_links_structure,
     htmldecode,
+    hashurl,
 )
 
 
-def validate_links(links: Iterable[Link]) -> List[Link]:
-    check_links_structure(links)
+def validate_links(links: Iterable[Link]) -> Iterable[Link]:
     links = archivable_links(links) # remove chrome://, about:, mailto: etc.
-    links = uniquefied_links(links) # merge/dedupe duplicate timestamps & urls
     links = sorted_links(links) # deterministically sort the links based on timstamp, url
+    links = uniquefied_links(links) # merge/dedupe duplicate timestamps & urls
 
     if not links:
         print('[X] No links found :(')
         raise SystemExit(1)
 
-    for link in links:
-        link['title'] = htmldecode(link['title'].strip()) if link['title'] else None
-        check_link_structure(link)
-
-    return list(links)
+    return links
 
 
 def archivable_links(links: Iterable[Link]) -> Iterable[Link]:
     """remove chrome://, about:// or other schemed links that cant be archived"""
     return (
         link
         for link in links
-        if any(link['url'].lower().startswith(s) for s in ('http://', 'https://', 'ftp://'))
+        if scheme(link.url) in ('http', 'https', 'ftp')
     )
 
 
@@ -64,38 +59,37 @@ def uniquefied_links(sorted_links: Iterable[Link]) -> Iterable[Link]:
 
     unique_urls: OrderedDict[str, Link] = OrderedDict()
 
-    lower = lambda url: url.lower().strip()
-    without_www = lambda url: url.replace('://www.', '://', 1)
-    without_trailing_slash = lambda url: url[:-1] if url[-1] == '/' else url.replace('/?', '?')
-
     for link in sorted_links:
-        fuzzy_url = without_www(without_trailing_slash(lower(link['url'])))
-        if fuzzy_url in unique_urls:
+        fuzzy = fuzzy_url(link.url)
+        if fuzzy in unique_urls:
            # merge with any other links that share the same url
-            link = merge_links(unique_urls[fuzzy_url], link)
-        unique_urls[fuzzy_url] = link
+            link = merge_links(unique_urls[fuzzy], link)
+        unique_urls[fuzzy] = link
 
     unique_timestamps: OrderedDict[str, Link] = OrderedDict()
     for link in unique_urls.values():
-        link['timestamp'] = lowest_uniq_timestamp(unique_timestamps, link['timestamp'])
-        unique_timestamps[link['timestamp']] = link
+        new_link = Link(**{
+            **link._asdict(),
+            'timestamp': lowest_uniq_timestamp(unique_timestamps, link.timestamp),
+        })
+        unique_timestamps[new_link.timestamp] = new_link
 
     return unique_timestamps.values()
 
 
 def sorted_links(links: Iterable[Link]) -> Iterable[Link]:
-    sort_func = lambda link: (link['timestamp'].split('.', 1)[0], link['url'])
+    sort_func = lambda link: (link.timestamp.split('.', 1)[0], link.url)
     return sorted(links, key=sort_func, reverse=True)
 
 
-def links_after_timestamp(links: Iterable[Link], timestamp: str=None) -> Iterable[Link]:
-    if not timestamp:
+def links_after_timestamp(links: Iterable[Link], resume: float=None) -> Iterable[Link]:
+    if not resume:
         yield from links
         return
 
     for link in links:
         try:
-            if float(link['timestamp']) <= float(timestamp):
+            if float(link.timestamp) <= resume:
                yield link
        except (ValueError, TypeError):
            print('Resume value and all timestamp values must be valid numbers.')
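With this change, validate_links() and parse_json_links_index() return lazy iterables rather than lists, which is why callers such as load_links_index() now wrap them in list() before calling len() or indexing, and why write_json_links_index() asserts it was given a real list. A minimal illustration of why that materialization matters (the names here are hypothetical, not from the codebase):

    from typing import Iterable, Iterator

    def only_http(urls: Iterable[str]) -> Iterator[str]:
        # generator: nothing is filtered until someone iterates over it
        return (u for u in urls if u.startswith('http'))

    filtered = only_http(['https://a.com', 'ftp://b.com'])
    # len(filtered) would raise TypeError, and a second pass over the
    # generator would yield nothing, so the index code materializes first:
    links = list(filtered)
    assert len(links) == 1 and links[0] == 'https://a.com'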
@@ -1,6 +1,7 @@
 import sys
 from datetime import datetime
 
+from typing import Optional
 from schema import Link, ArchiveResult, RuntimeStats
 from config import ANSI, REPO_DIR, OUTPUT_DIR
 
@@ -66,7 +67,7 @@ def log_indexing_finished(out_dir: str, out_file: str):
 
 ### Archiving Stage
 
-def log_archiving_started(num_links: int, resume: float):
+def log_archiving_started(num_links: int, resume: Optional[float]):
     start_ts = datetime.now()
     _LAST_RUN_STATS.archiving_start_ts = start_ts
     if resume:
@@ -132,10 +133,10 @@ def log_link_archiving_started(link_dir: str, link: Link, is_new: bool):
         symbol_color=ANSI['green' if is_new else 'black'],
         symbol='+' if is_new else '*',
         now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
-        title=link['title'] or link['url'],
+        title=link.title or link.base_url,
         **ANSI,
     ))
-    print(' {blue}{url}{reset}'.format(url=link['url'], **ANSI))
+    print(' {blue}{url}{reset}'.format(url=link.url, **ANSI))
     print(' {} {}'.format(
         '>' if is_new else '√',
         pretty_path(link_dir),
@@ -26,6 +26,7 @@ import xml.etree.ElementTree as etree
 
 from config import TIMEOUT
 from util import (
+    htmldecode,
     str_between,
     URL_REGEX,
     check_url_parsing_invariants,
@@ -91,13 +92,13 @@ def parse_pocket_html_export(html_file: IO[str]) -> Iterable[Link]:
             tags = match.group(3)
             title = match.group(4).replace(' — Readability', '').replace('http://www.readability.com/read?url=', '')
 
-            yield {
-                'url': url,
-                'timestamp': str(time.timestamp()),
-                'title': title or None,
-                'tags': tags or '',
-                'sources': [html_file.name],
-            }
+            yield Link(
+                url=url,
+                timestamp=str(time.timestamp()),
+                title=title or None,
+                tags=tags or '',
+                sources=[html_file.name],
+            )
 
 
 def parse_json_export(json_file: IO[str]) -> Iterable[Link]:
@@ -137,19 +138,19 @@ def parse_json_export(json_file: IO[str]) -> Iterable[Link]:
         # Parse the title
         title = None
         if link.get('title'):
-            title = link['title'].strip() or None
+            title = link['title'].strip()
         elif link.get('description'):
-            title = link['description'].replace(' — Readability', '').strip() or None
+            title = link['description'].replace(' — Readability', '').strip()
         elif link.get('name'):
-            title = link['name'].strip() or None
+            title = link['name'].strip()
 
-        yield {
-            'url': url,
-            'timestamp': ts_str,
-            'title': title,
-            'tags': link.get('tags') or '',
-            'sources': [json_file.name],
-        }
+        yield Link(
+            url=url,
+            timestamp=ts_str,
+            title=htmldecode(title) or None,
+            tags=link.get('tags') or '',
+            sources=[json_file.name],
+        )
 
 
 def parse_rss_export(rss_file: IO[str]) -> Iterable[Link]:
@@ -178,15 +179,15 @@ def parse_rss_export(rss_file: IO[str]) -> Iterable[Link]:
         url = str_between(get_row('link'), '<link>', '</link>')
         ts_str = str_between(get_row('pubDate'), '<pubDate>', '</pubDate>')
         time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %z")
-        title = str_between(get_row('title'), '<![CDATA[', ']]').strip() or None
+        title = str_between(get_row('title'), '<![CDATA[', ']]').strip()
 
-        yield {
-            'url': url,
-            'timestamp': str(time.timestamp()),
-            'title': title,
-            'tags': '',
-            'sources': [rss_file.name],
-        }
+        yield Link(
+            url=url,
+            timestamp=str(time.timestamp()),
+            title=htmldecode(title) or None,
+            tags='',
+            sources=[rss_file.name],
+        )
 
 
 def parse_shaarli_rss_export(rss_file: IO[str]) -> Iterable[Link]:
@@ -217,13 +218,13 @@ def parse_shaarli_rss_export(rss_file: IO[str]) -> Iterable[Link]:
         ts_str = str_between(get_row('published'), '<published>', '</published>')
         time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
 
-        yield {
-            'url': url,
-            'timestamp': str(time.timestamp()),
-            'title': title or None,
-            'tags': '',
-            'sources': [rss_file.name],
-        }
+        yield Link(
+            url=url,
+            timestamp=str(time.timestamp()),
+            title=htmldecode(title) or None,
+            tags='',
+            sources=[rss_file.name],
+        )
 
 
 def parse_netscape_html_export(html_file: IO[str]) -> Iterable[Link]:
@@ -239,14 +240,15 @@ def parse_netscape_html_export(html_file: IO[str]) -> Iterable[Link]:
         if match:
             url = match.group(1)
             time = datetime.fromtimestamp(float(match.group(2)))
+            title = match.group(3).strip()
 
-            yield {
-                'url': url,
-                'timestamp': str(time.timestamp()),
-                'title': match.group(3).strip() or None,
-                'tags': '',
-                'sources': [html_file.name],
-            }
+            yield Link(
+                url=url,
+                timestamp=str(time.timestamp()),
+                title=htmldecode(title) or None,
+                tags='',
+                sources=[html_file.name],
+            )
 
 
 def parse_pinboard_rss_export(rss_file: IO[str]) -> Iterable[Link]:
@@ -271,13 +273,13 @@ def parse_pinboard_rss_export(rss_file: IO[str]) -> Iterable[Link]:
         else:
             time = datetime.now()
 
-        yield {
-            'url': url,
-            'timestamp': str(time.timestamp()),
-            'title': title or None,
-            'tags': tags or '',
-            'sources': [rss_file.name],
-        }
+        yield Link(
+            url=url,
+            timestamp=str(time.timestamp()),
+            title=htmldecode(title) or None,
+            tags=tags or '',
+            sources=[rss_file.name],
+        )
 
 
 def parse_medium_rss_export(rss_file: IO[str]) -> Iterable[Link]:
@@ -292,13 +294,13 @@ def parse_medium_rss_export(rss_file: IO[str]) -> Iterable[Link]:
         ts_str = item.find("pubDate").text
         time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %Z")
 
-        yield {
-            'url': url,
-            'timestamp': str(time.timestamp()),
-            'title': title or None,
-            'tags': '',
-            'sources': [rss_file.name],
-        }
+        yield Link(
+            url=url,
+            timestamp=str(time.timestamp()),
+            title=htmldecode(title) or None,
+            tags='',
+            sources=[rss_file.name],
+        )
 
 
 def parse_plain_text_export(text_file: IO[str]) -> Iterable[Link]:
@@ -308,10 +310,10 @@ def parse_plain_text_export(text_file: IO[str]) -> Iterable[Link]:
     for line in text_file.readlines():
         urls = re.findall(URL_REGEX, line) if line.strip() else ()
         for url in urls:
-            yield {
-                'url': url,
-                'timestamp': str(datetime.now().timestamp()),
-                'title': None,
-                'tags': '',
-                'sources': [text_file.name],
-            }
+            yield Link(
+                url=url,
+                timestamp=str(datetime.now().timestamp()),
+                title=None,
+                tags='',
+                sources=[text_file.name],
+            )
 
@@ -1,11 +1,223 @@
+import os
+
 from datetime import datetime

-from typing import List, Dict, Any, Optional, Union, NamedTuple
-from recordclass import RecordClass
+from typing import List, Dict, Any, Optional, Union

-Link = Dict[str, Any]
+from dataclasses import dataclass, asdict, field

-class ArchiveIndex(NamedTuple):
+
+class ArchiveError(Exception):
+    def __init__(self, message, hints=None):
+        super().__init__(message)
+        self.hints = hints
+
+
+LinkDict = Dict[str, Any]
+
+
+@dataclass(frozen=True)
+class ArchiveResult:
+    cmd: List[str]
+    pwd: Optional[str]
+    cmd_version: Optional[str]
+    output: Union[str, Exception, None]
+    status: str
+    start_ts: datetime
+    end_ts: datetime
+
+    def _asdict(self):
+        return asdict(self)
+
+    @property
+    def duration(self) -> int:
+        return (self.end_ts - self.start_ts).seconds
+
+
+@dataclass(frozen=True)
+class Link:
+    timestamp: str
+    url: str
+    title: Optional[str]
+    tags: Optional[str]
+    sources: List[str]
+    history: Dict[str, List[ArchiveResult]] = field(default_factory=lambda: {})
+    updated: Optional[str] = None
+
+    def __hash__(self):
+        return self.urlhash
+
+    def __eq__(self, other):
+        if not isinstance(other, Link):
+            return NotImplemented
+        return self.urlhash == other.urlhash
+
+    def __gt__(self, other):
+        if not isinstance(other, Link):
+            return NotImplemented
+        if not self.timestamp or not other.timestamp:
+            return
+        return float(self.timestamp) > float(other.timestamp)
+
+    def _asdict(self, extended=False):
+        info = {
+            'url': self.url,
+            'title': self.title or None,
+            'timestamp': self.timestamp,
+            'updated': self.updated or None,
+            'tags': self.tags or None,
+            'sources': self.sources or [],
+            'history': self.history or {},
+        }
+        if extended:
+            info.update({
+                'link_dir': self.link_dir,
+                'archive_path': self.archive_path,
+                'bookmarked_date': self.bookmarked_date,
+                'updated_date': self.updated_date,
+                'domain': self.domain,
+                'path': self.path,
+                'basename': self.basename,
+                'extension': self.extension,
+                'base_url': self.base_url,
+                'is_static': self.is_static,
+                'is_archived': self.is_archived,
+                'num_outputs': self.num_outputs,
+            })
+        return info
+
+    @property
+    def link_dir(self) -> str:
+        from config import ARCHIVE_DIR
+        return os.path.join(ARCHIVE_DIR, self.timestamp)
+
+    @property
+    def archive_path(self) -> str:
+        from config import ARCHIVE_DIR_NAME
+        return '{}/{}'.format(ARCHIVE_DIR_NAME, self.timestamp)
+
+    ### URL Helpers
+    @property
+    def urlhash(self):
+        from util import hashurl
+
+        return hashurl(self.url)
+
+    @property
+    def extension(self) -> str:
+        from util import extension
+        return extension(self.url)
+
+    @property
+    def domain(self) -> str:
+        from util import domain
+        return domain(self.url)
+
+    @property
+    def path(self) -> str:
+        from util import path
+        return path(self.url)
+
+    @property
+    def basename(self) -> str:
+        from util import basename
+        return basename(self.url)
+
+    @property
+    def base_url(self) -> str:
+        from util import base_url
+        return base_url(self.url)
+
+    ### Pretty Printing Helpers
+    @property
+    def bookmarked_date(self) -> Optional[str]:
+        from util import ts_to_date
+        return ts_to_date(self.timestamp) if self.timestamp else None
+
+    @property
+    def updated_date(self) -> Optional[str]:
+        from util import ts_to_date
+        return ts_to_date(self.updated) if self.updated else None
+
+    ### Archive Status Helpers
+    @property
+    def num_outputs(self) -> int:
+        return len(tuple(filter(None, self.latest_outputs().values())))
+
+    @property
+    def is_static(self) -> bool:
+        from util import is_static_file
+        return is_static_file(self.url)
+
+    @property
+    def is_archived(self) -> bool:
+        from config import ARCHIVE_DIR
+        from util import domain
+
+        return os.path.exists(os.path.join(
+            ARCHIVE_DIR,
+            self.timestamp,
+            domain(self.url),
+        ))
+
+    def latest_outputs(self, status: str=None) -> Dict[str, Optional[str]]:
+        """get the latest output that each archive method produced for link"""
+
+        latest = {
+            'title': None,
+            'favicon': None,
+            'wget': None,
+            'warc': None,
+            'pdf': None,
+            'screenshot': None,
+            'dom': None,
+            'git': None,
+            'media': None,
+            'archive_org': None,
+        }
+        for archive_method in latest.keys():
+            # get most recent successful result in history for each archive method
+            history = self.history.get(archive_method) or []
+            history = filter(lambda result: result.output, reversed(history))
+            if status is not None:
+                history = filter(lambda result: result.status == status, history)
+
+            history = list(history)
+            if history:
+                latest[archive_method] = history[0].output
+
+        return latest
+
+    def canonical_outputs(self) -> Dict[str, Optional[str]]:
+        from util import wget_output_path
+        canonical = {
+            'index_url': 'index.html',
+            'favicon_url': 'favicon.ico',
+            'google_favicon_url': 'https://www.google.com/s2/favicons?domain={}'.format(self.domain),
+            'archive_url': wget_output_path(self),
+            'warc_url': 'warc',
+            'pdf_url': 'output.pdf',
+            'screenshot_url': 'screenshot.png',
+            'dom_url': 'output.html',
+            'archive_org_url': 'https://web.archive.org/web/{}'.format(self.base_url),
+            'git_url': 'git',
+            'media_url': 'media',
+        }
+        if self.is_static:
+            # static binary files like PDF and images are handled slightly differently.
+            # they're just downloaded once and aren't archived separately multiple times,
+            # so the wget, screenshot, & pdf urls should all point to the same file
+
+            static_url = wget_output_path(self)
+            canonical.update({
+                'title': self.basename,
+                'archive_url': static_url,
+                'pdf_url': static_url,
+                'screenshot_url': static_url,
+                'dom_url': static_url,
+            })
+        return canonical
+
+
+@dataclass(frozen=True)
+class ArchiveIndex:
     info: str
     version: str
     source: str
@@ -14,33 +226,11 @@ class ArchiveIndex(NamedTuple):
     updated: str
     links: List[Link]

-class ArchiveResult(NamedTuple):
-    cmd: List[str]
-    pwd: Optional[str]
-    cmd_version: Optional[str]
-    output: Union[str, Exception, None]
-    status: str
-    start_ts: datetime
-    end_ts: datetime
-    duration: int
-
-class ArchiveError(Exception):
-    def __init__(self, message, hints=None):
-        super().__init__(message)
-        self.hints = hints
-
-class LinkDict(NamedTuple):
-    timestamp: str
-    url: str
-    title: Optional[str]
-    tags: str
-    sources: List[str]
-    history: Dict[str, ArchiveResult]
-
-class RuntimeStats(RecordClass):
+    def _asdict(self):
+        return asdict(self)
+
+
+@dataclass
+class RuntimeStats:
     skipped: int
     succeeded: int
     failed: int
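A minimal usage sketch of the new frozen dataclasses (illustrative only, not part of the commit; it assumes schema.py imports on its own and sticks to fields and methods that don't reach into config or util, and all the example values are made up):

from datetime import datetime
from schema import ArchiveResult, Link

# one run of one archive method; duration is now derived instead of stored
result = ArchiveResult(
    cmd=['wget', '--mirror', 'https://example.com'],
    pwd='output/archive/1556742100',
    cmd_version='1.20',
    output='example.com/index.html',
    status='succeeded',
    start_ts=datetime(2019, 5, 1, 12, 0, 0),
    end_ts=datetime(2019, 5, 1, 12, 0, 42),
)
assert result.duration == 42

link = Link(
    timestamp='1556742100.0',
    url='https://example.com',
    title='Example Domain',
    tags='docs',
    sources=['bookmarks.html'],
    history={'wget': [result]},
)
assert link.latest_outputs()['wget'] == 'example.com/index.html'
print(link._asdict())   # plain dict of the core fields, like the old LinkDict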
@@ -1,14 +1,14 @@
 <tr>
     <td title="$timestamp">$bookmarked_date</td>
     <td style="text-align:left">
-        <a href="$link_dir/$index_url"><img src="$favicon_url" class="link-favicon" decoding="async"></a>
-        <a href="$link_dir/$archive_url" title="$title">
+        <a href="$archive_path/$index_url"><img src="$favicon_url" class="link-favicon" decoding="async"></a>
+        <a href="$archive_path/$archive_url" title="$title">
            <span data-title-for="$url" data-archived="$is_archived">$title</span>
            <small>$tags</small>
        </a>
    </td>
    <td>
-        <a href="$link_dir/$index_url">📄
+        <a href="$archive_path/$index_url">📄
        <span data-number-for="$url" title="Fetching any missing files...">$num_outputs <img src="static/spinner.gif" class="files-spinner" decoding="async"/></span>
        </a>
    </td>
@@ -4,9 +4,8 @@ import sys
 import time

 from json import JSONEncoder
-from typing import List, Optional, Iterable
+from typing import List, Dict, Optional, Iterable
+from hashlib import sha256

 from urllib.request import Request, urlopen
 from urllib.parse import urlparse, quote, unquote
 from html import escape, unescape
@@ -21,17 +20,17 @@ from subprocess import (
     CalledProcessError,
 )

-from schema import Link
+from base32_crockford import encode as base32_encode
+
+from schema import Link, LinkDict, ArchiveResult
 from config import (
     ANSI,
     TERM_WIDTH,
     SOURCES_DIR,
-    ARCHIVE_DIR,
     OUTPUT_PERMISSIONS,
     TIMEOUT,
     SHOW_PROGRESS,
     FETCH_TITLE,
-    ARCHIVE_DIR_NAME,
     CHECK_SSL_VALIDITY,
     WGET_USER_AGENT,
     CHROME_OPTIONS,
@@ -43,7 +42,7 @@ from logs import pretty_path

 # All of these are (str) -> str
 # shortcuts to: https://docs.python.org/3/library/urllib.parse.html#url-parsing
-scheme = lambda url: urlparse(url).scheme
+scheme = lambda url: urlparse(url).scheme.lower()
 without_scheme = lambda url: urlparse(url)._replace(scheme='').geturl().strip('//')
 without_query = lambda url: urlparse(url)._replace(query='').geturl().strip('//')
 without_fragment = lambda url: urlparse(url)._replace(fragment='').geturl().strip('//')
@@ -56,11 +55,33 @@ fragment = lambda url: urlparse(url).fragment
 extension = lambda url: basename(url).rsplit('.', 1)[-1].lower() if '.' in basename(url) else ''
 base_url = lambda url: without_scheme(url)  # uniq base url used to dedupe links

-short_ts = lambda ts: ts.split('.')[0]
-urlencode = lambda s: quote(s, encoding='utf-8', errors='replace')
-urldecode = lambda s: unquote(s)
-htmlencode = lambda s: escape(s, quote=True)
-htmldecode = lambda s: unescape(s)
+without_www = lambda url: url.replace('://www.', '://', 1)
+without_trailing_slash = lambda url: url[:-1] if url[-1] == '/' else url.replace('/?', '?')
+fuzzy_url = lambda url: without_trailing_slash(without_www(without_scheme(url.lower())))
+
+short_ts = lambda ts: (
+    str(ts.timestamp()).split('.')[0]
+    if isinstance(ts, datetime) else
+    str(ts).split('.')[0]
+)
+ts_to_date = lambda ts: (
+    ts.strftime('%Y-%m-%d %H:%M')
+    if isinstance(ts, datetime) else
+    datetime.fromtimestamp(float(ts)).strftime('%Y-%m-%d %H:%M')
+)
+ts_to_iso = lambda ts: (
+    ts.isoformat()
+    if isinstance(ts, datetime) else
+    datetime.fromtimestamp(float(ts)).isoformat()
+)
+
+urlencode = lambda s: s and quote(s, encoding='utf-8', errors='replace')
+urldecode = lambda s: s and unquote(s)
+htmlencode = lambda s: s and escape(s, quote=True)
+htmldecode = lambda s: s and unescape(s)
+
+hashurl = lambda url: base32_encode(int(sha256(base_url(url).encode('utf-8')).hexdigest(), 16))[:20]

 URL_REGEX = re.compile(
     r'http[s]?://'  # start matching from allowed schemes
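To make the behaviour of the new polymorphic timestamp helpers and the fuzzy-url normalizer concrete, here is a small standalone sketch (it re-declares the lambdas from the hunk above instead of importing util, so it runs without the rest of the project; the example URLs and timestamps are made up):

from datetime import datetime
from urllib.parse import urlparse

# re-declared from the new util.py helpers above
without_scheme = lambda url: urlparse(url)._replace(scheme='').geturl().strip('//')
without_www = lambda url: url.replace('://www.', '://', 1)
without_trailing_slash = lambda url: url[:-1] if url[-1] == '/' else url.replace('/?', '?')
fuzzy_url = lambda url: without_trailing_slash(without_www(without_scheme(url.lower())))

ts_to_date = lambda ts: (
    ts.strftime('%Y-%m-%d %H:%M')
    if isinstance(ts, datetime) else
    datetime.fromtimestamp(float(ts)).strftime('%Y-%m-%d %H:%M')
)

# scheme, case, and trailing-slash differences collapse to the same fuzzy url
assert fuzzy_url('https://Example.com/Page/') == fuzzy_url('http://example.com/page')

# the ts_* helpers accept either a datetime or a str/float epoch timestamp
print(ts_to_date(datetime(2019, 5, 1, 12, 0)))   # '2019-05-01 12:00'
print(ts_to_date('1556742000.0'))                # same format, parsed from an epoch string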
@@ -80,7 +101,8 @@ STATICFILE_EXTENSIONS = {
     # that can be downloaded as-is, not html pages that need to be rendered
     'gif', 'jpeg', 'jpg', 'png', 'tif', 'tiff', 'wbmp', 'ico', 'jng', 'bmp',
     'svg', 'svgz', 'webp', 'ps', 'eps', 'ai',
-    'mp3', 'mp4', 'm4a', 'mpeg', 'mpg', 'mkv', 'mov', 'webm', 'm4v', 'flv', 'wmv', 'avi', 'ogg', 'ts', 'm3u8'
+    'mp3', 'mp4', 'm4a', 'mpeg', 'mpg', 'mkv', 'mov', 'webm', 'm4v',
+    'flv', 'wmv', 'avi', 'ogg', 'ts', 'm3u8'
     'pdf', 'txt', 'rtf', 'rtfd', 'doc', 'docx', 'ppt', 'pptx', 'xls', 'xlsx',
     'atom', 'rss', 'css', 'js', 'json',
     'dmg', 'iso', 'img',
@@ -100,7 +122,7 @@ STATICFILE_EXTENSIONS = {

 ### Checks & Tests

-def check_link_structure(link: Link) -> None:
+def check_link_structure(link: LinkDict) -> None:
     """basic sanity check invariants to make sure the data is valid"""
     assert isinstance(link, dict)
     assert isinstance(link.get('url'), str)
@@ -112,7 +134,7 @@ def check_link_structure(link: Link) -> None:
             assert isinstance(key, str)
             assert isinstance(val, list), 'history must be a Dict[str, List], got: {}'.format(link['history'])

-def check_links_structure(links: Iterable[Link]) -> None:
+def check_links_structure(links: Iterable[LinkDict]) -> None:
     """basic sanity check invariants to make sure the data is valid"""
     assert isinstance(links, list)
     if links:
@@ -213,7 +235,7 @@ def fetch_page_title(url: str, timeout: int=10, progress: bool=SHOW_PROGRESS) ->
         html = download_url(url, timeout=timeout)

         match = re.search(HTML_TITLE_REGEX, html)
-        return match.group(1).strip() if match else None
+        return htmldecode(match.group(1).strip()) if match else None
     except Exception as err:  # noqa
         # print('[!] Failed to fetch title because of {}: {}'.format(
         #     err.__class__.__name__,
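The behavioural change in this hunk is that fetched titles are now run through htmldecode(); a tiny illustration using the stdlib function it wraps (example title made up):

from html import unescape   # htmldecode is now `s and unescape(s)`

raw_title = 'Q&amp;A: archiving &lt;video&gt; pages'
print(unescape(raw_title))  # 'Q&A: archiving <video> pages'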
@@ -228,8 +250,8 @@ def wget_output_path(link: Link) -> Optional[str]:
     See docs on wget --adjust-extension (-E)
     """

-    if is_static_file(link['url']):
-        return without_scheme(without_fragment(link['url']))
+    if is_static_file(link.url):
+        return without_scheme(without_fragment(link.url))

     # Wget downloads can save in a number of different ways depending on the url:
     #    https://example.com
@@ -262,11 +284,10 @@ def wget_output_path(link: Link) -> Optional[str]:
     # and there's no way to get the computed output path from wget
     # in order to avoid having to reverse-engineer how they calculate it,
     # we just look in the output folder and read the filename wget used from the filesystem
-    link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
-    full_path = without_fragment(without_query(path(link['url']))).strip('/')
+    full_path = without_fragment(without_query(path(link.url))).strip('/')
     search_dir = os.path.join(
-        link_dir,
-        domain(link['url']),
+        link.link_dir,
+        domain(link.url),
         full_path,
     )

@@ -278,13 +299,13 @@ def wget_output_path(link: Link) -> Optional[str]:
             if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", f, re.I | re.M)
         ]
         if html_files:
-            path_from_link_dir = search_dir.split(link_dir)[-1].strip('/')
+            path_from_link_dir = search_dir.split(link.link_dir)[-1].strip('/')
             return os.path.join(path_from_link_dir, html_files[0])

         # Move up one directory level
         search_dir = search_dir.rsplit('/', 1)[0]

-        if search_dir == link_dir:
+        if search_dir == link.link_dir:
             break

     return None
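For context, the loop patched in the last two hunks walks upward from the deepest directory wget could have written to until it finds an html file; a condensed standalone sketch of that idea (hypothetical helper name and paths, not the real function):

import os
import re
from typing import Optional

def find_wget_html(link_dir: str, start_dir: str) -> Optional[str]:
    """Walk from start_dir up toward link_dir and return the first
    .html/.htm file found, relative to link_dir (None if none exists)."""
    search_dir = start_dir
    while search_dir and search_dir != link_dir:
        if os.path.isdir(search_dir):
            html_files = [
                f for f in os.listdir(search_dir)
                if re.search(r'\.[Hh][Tt][Mm][Ll]?$', f)
            ]
            if html_files:
                rel = search_dir.split(link_dir)[-1].strip('/')
                return os.path.join(rel, html_files[0])
        # move up one directory level, like the real loop does
        search_dir = search_dir.rsplit('/', 1)[0]
    return None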
@@ -314,19 +335,20 @@ def merge_links(a: Link, b: Link) -> Link:
     """deterministically merge two links, favoring longer field values over shorter,
     and "cleaner" values over worse ones.
     """
+    a, b = a._asdict(), b._asdict()
     longer = lambda key: (a[key] if len(a[key]) > len(b[key]) else b[key]) if (a[key] and b[key]) else (a[key] or b[key])
     earlier = lambda key: a[key] if a[key] < b[key] else b[key]

     url = longer('url')
     longest_title = longer('title')
     cleanest_title = a['title'] if '://' not in (a['title'] or '') else b['title']
-    return {
-        'url': url,
-        'timestamp': earlier('timestamp'),
-        'title': longest_title if '://' not in (longest_title or '') else cleanest_title,
-        'tags': longer('tags'),
-        'sources': list(set(a.get('sources', []) + b.get('sources', []))),
-    }
+    return Link(
+        url=url,
+        timestamp=earlier('timestamp'),
+        title=longest_title if '://' not in (longest_title or '') else cleanest_title,
+        tags=longer('tags'),
+        sources=list(set(a.get('sources', []) + b.get('sources', []))),
+    )

 def is_static_file(url: str) -> bool:
     """Certain URLs just point to a single static file, and
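A quick illustration of the merge policy (not part of the commit): longer non-empty values win, the earlier timestamp is kept, and sources are unioned. It assumes schema.Link and util.merge_links import cleanly in a configured checkout; the URLs and filenames are made up:

from schema import Link
from util import merge_links

a = Link(timestamp='1556742100.0', url='https://example.com/post',
         title=None, tags='', sources=['bookmarks.html'])
b = Link(timestamp='1556742200.0', url='https://example.com/post',
         title='A Longer, Cleaner Title', tags='blog',
         sources=['pocket_export.html'])

merged = merge_links(a, b)
assert merged.timestamp == '1556742100.0'         # earlier bookmark time wins
assert merged.title == 'A Longer, Cleaner Title'  # longer non-empty title wins
assert set(merged.sources) == {'bookmarks.html', 'pocket_export.html'}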
@@ -339,85 +361,11 @@ def is_static_file(url: str) -> bool:
 def derived_link_info(link: Link) -> dict:
     """extend link info with the archive urls and other derived data"""

-    url = link['url']
-
-    to_date_str = lambda ts: datetime.fromtimestamp(float(ts)).strftime('%Y-%m-%d %H:%M')
-
-    extended_info = {
-        **link,
-        'link_dir': '{}/{}'.format(ARCHIVE_DIR_NAME, link['timestamp']),
-        'bookmarked_date': to_date_str(link['timestamp']),
-        'updated_date': to_date_str(link['updated']) if 'updated' in link else None,
-        'domain': domain(url),
-        'path': path(url),
-        'basename': basename(url),
-        'extension': extension(url),
-        'base_url': base_url(url),
-        'is_static': is_static_file(url),
-        'is_archived': os.path.exists(os.path.join(
-            ARCHIVE_DIR,
-            link['timestamp'],
-            domain(url),
-        )),
-        'num_outputs': len([entry for entry in latest_output(link).values() if entry]),
-    }
-
-    # Archive Method Output URLs
-    extended_info.update({
-        'index_url': 'index.html',
-        'favicon_url': 'favicon.ico',
-        'google_favicon_url': 'https://www.google.com/s2/favicons?domain={domain}'.format(**extended_info),
-        'archive_url': wget_output_path(link),
-        'warc_url': 'warc',
-        'pdf_url': 'output.pdf',
-        'screenshot_url': 'screenshot.png',
-        'dom_url': 'output.html',
-        'archive_org_url': 'https://web.archive.org/web/{base_url}'.format(**extended_info),
-        'git_url': 'git',
-        'media_url': 'media',
-    })
-    # static binary files like PDF and images are handled slightly differently.
-    # they're just downloaded once and aren't archived separately multiple times,
-    # so the wget, screenshot, & pdf urls should all point to the same file
-    if is_static_file(url):
-        extended_info.update({
-            'title': basename(url),
-            'archive_url': base_url(url),
-            'pdf_url': base_url(url),
-            'screenshot_url': base_url(url),
-            'dom_url': base_url(url),
-        })
-
-    return extended_info
-
-
-def latest_output(link: Link, status: str=None) -> Dict[str, Optional[str]]:
-    """get the latest output that each archive method produced for link"""
-
-    latest = {
-        'title': None,
-        'favicon': None,
-        'wget': None,
-        'warc': None,
-        'pdf': None,
-        'screenshot': None,
-        'dom': None,
-        'git': None,
-        'media': None,
-        'archive_org': None,
-    }
-    for archive_method in latest.keys():
-        # get most recent succesful result in history for each archive method
-        history = link.get('history', {}).get(archive_method) or []
-        history = filter(lambda result: result['output'], reversed(history))
-        if status is not None:
-            history = filter(lambda result: result['status'] == status, history)
-
-        history = list(history)
-        if history:
-            latest[archive_method] = history[0]['output']
-
-    return latest
+    info = link._asdict(extended=True)
+    info.update(link.canonical_outputs())
+
+    return info


 ### Python / System Helpers
@@ -466,21 +414,13 @@ class TimedProgress:
             self.p = Process(target=progress_bar, args=(seconds, prefix))
             self.p.start()

-        self.stats = {
-            'start_ts': datetime.now(),
-            'end_ts': None,
-            'duration': None,
-        }
+        self.stats = {'start_ts': datetime.now(), 'end_ts': None}

     def end(self):
         """immediately end progress, clear the progressbar line, and save end_ts"""

         end_ts = datetime.now()
-        self.stats.update({
-            'end_ts': end_ts,
-            'duration': (end_ts - self.stats['start_ts']).seconds,
-        })
+        self.stats['end_ts'] = end_ts

         if SHOW_PROGRESS:
             # protect from double termination
             #if p is None or not hasattr(p, 'kill'):
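With 'duration' dropped from the stats dict, callers can derive it on demand the same way ArchiveResult.duration does; for example:

from datetime import datetime

stats = {'start_ts': datetime(2019, 5, 1, 12, 0, 0), 'end_ts': datetime(2019, 5, 1, 12, 0, 30)}
duration = (stats['end_ts'] - stats['start_ts']).seconds   # 30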