re-arrange and cleanup directory structure

Nick Sweeting 2018-06-10 20:52:15 -04:00
parent 62e33c011b
commit d0f2e693b3
26 changed files with 80 additions and 98 deletions

BIN
archiver/.DS_Store vendored Normal file

Binary file not shown.

0
archiver/__init__.py Normal file

162
archiver/archive.py Executable file

@@ -0,0 +1,162 @@
#!/usr/bin/env python3
# Bookmark Archiver
# Nick Sweeting 2017 | MIT License
# https://github.com/pirate/bookmark-archiver
import os
import sys
from datetime import datetime
from parse import parse_links
from links import validate_links
from archive_methods import archive_links, _RESULTS_TOTALS
from index import (
write_links_index,
write_link_index,
parse_json_links_index,
parse_json_link_index,
)
from config import (
ARCHIVE_PERMISSIONS,
OUTPUT_DIR,
ANSI,
TIMEOUT,
)
from util import (
download_url,
progress,
cleanup_archive,
)
__DESCRIPTION__ = 'Bookmark Archiver: Create a browsable html archive of a list of links.'
__DOCUMENTATION__ = 'https://github.com/pirate/bookmark-archiver'
def print_help():
print(__DESCRIPTION__)
print("Documentation: {}\n".format(__DOCUMENTATION__))
print("Usage:")
print(" ./bin/bookmark-archiver ~/Downloads/bookmarks_export.html\n")
def merge_links(archive_path=OUTPUT_DIR, import_path=None):
"""get new links from file and optionally append them to links in existing archive"""
all_links = []
if import_path:
# parse and validate the import file
raw_links = parse_links(import_path)
all_links = validate_links(raw_links)
# merge existing links in archive_path and new links
existing_links = []
if archive_path:
existing_links = parse_json_links_index(archive_path)
all_links = validate_links(existing_links + all_links)
num_new_links = len(all_links) - len(existing_links)
if num_new_links:
print('[{green}+{reset}] [{}] Adding {} new links from {} to {}/index.json'.format(
datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
num_new_links,
import_path,
archive_path,
**ANSI,
))
# else:
# print('[*] [{}] No new links added to {}/index.json{}'.format(
# datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
# archive_path,
# ' from {}'.format(import_path) if import_path else '',
# **ANSI,
# ))
return all_links
def update_archive(archive_path, links, source=None, resume=None, append=True):
"""update or create index.html+json given a path to an export file containing new links"""
start_ts = datetime.now().timestamp()
if resume:
print('{green}[▶] [{}] Resuming archive downloading from {}...{reset}'.format(
datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
resume,
**ANSI,
))
else:
print('{green}[▶] [{}] Updating files for {} links in archive...{reset}'.format(
datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
len(links),
**ANSI,
))
# loop over links and archive them
archive_links(archive_path, links, source=source, resume=resume)
# print timing information & summary
end_ts = datetime.now().timestamp()
seconds = end_ts - start_ts
if seconds > 60:
duration = '{0:.2f} min'.format(seconds / 60, 2)
else:
duration = '{0:.2f} sec'.format(seconds, 2)
print('{}[√] [{}] Update of {} links complete ({}){}'.format(
ANSI['green'],
datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
len(links),
duration,
ANSI['reset'],
))
print(' - {} entries skipped'.format(_RESULTS_TOTALS['skipped']))
print(' - {} entries updated'.format(_RESULTS_TOTALS['succeeded']))
print(' - {} errors'.format(_RESULTS_TOTALS['failed']))
if __name__ == '__main__':
argc = len(sys.argv)
if set(sys.argv).intersection(('-h', '--help', 'help')):
print_help()
raise SystemExit(0)
source = sys.argv[1] if argc > 1 else None # path of links file to import
resume = sys.argv[2] if argc > 2 else None # timestamp to resume downloading from
if argc == 1:
source, resume = None, None
elif argc == 2:
if all(d.isdigit() for d in sys.argv[1].split('.')):
# argv[1] is a resume timestamp
source, resume = None, sys.argv[1]
else:
# argv[1] is a path to a file to import
source, resume = sys.argv[1].strip(), None
elif argc == 3:
source, resume = sys.argv[1].strip(), sys.argv[2]
else:
print_help()
raise SystemExit(1)
# See if archive folder already exists
for out_dir in (OUTPUT_DIR, 'bookmarks', 'pocket', 'pinboard', 'html'):
if os.path.exists(out_dir):
break
else:
out_dir = OUTPUT_DIR
# Step 0: Download url to local file (only happens if a URL is specified instead of local path)
if source and any(source.startswith(s) for s in ('http://', 'https://', 'ftp://')):
source = download_url(source)
# Step 1: Parse the links and dedupe them with existing archive
links = merge_links(archive_path=out_dir, import_path=source)
# Step 2: Write new index
write_links_index(out_dir=out_dir, links=links)
# Step 3: Verify folder structure is 1:1 with index
# cleanup_archive(out_dir, links)
# Step 4: Run the archive methods for each link
update_archive(out_dir, links, source=source, resume=resume, append=True)
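
A rough sketch of driving the same pipeline from Python instead of via sys.argv; the output directory and import path here are only illustrative:

    from archive import merge_links, update_archive
    from index import write_links_index

    out_dir = 'output'
    links = merge_links(archive_path=out_dir, import_path='bookmarks_export.html')
    write_links_index(out_dir=out_dir, links=links)
    update_archive(out_dir, links, source='bookmarks_export.html', resume=None, append=True)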

494
archiver/archive_methods.py Normal file

@@ -0,0 +1,494 @@
import os
import sys
from functools import wraps
from collections import defaultdict
from datetime import datetime
from subprocess import run, PIPE, DEVNULL
from peekable import Peekable
from index import wget_output_path, parse_json_link_index, write_link_index
from links import links_after_timestamp
from config import (
CHROME_BINARY,
FETCH_WGET,
FETCH_WGET_REQUISITES,
FETCH_PDF,
FETCH_SCREENSHOT,
FETCH_DOM,
RESOLUTION,
CHECK_SSL_VALIDITY,
SUBMIT_ARCHIVE_DOT_ORG,
FETCH_AUDIO,
FETCH_VIDEO,
FETCH_FAVICON,
WGET_USER_AGENT,
CHROME_USER_DATA_DIR,
TIMEOUT,
ANSI,
)
from util import (
check_dependencies,
progress,
chmod_file,
pretty_path,
)
_RESULTS_TOTALS = { # globals are bad, mmkay
'skipped': 0,
'succeeded': 0,
'failed': 0,
}
def archive_links(archive_path, links, source=None, resume=None):
check_dependencies()
to_archive = Peekable(links_after_timestamp(links, resume))
idx, link = 0, to_archive.peek(0)
try:
for idx, link in enumerate(to_archive):
link_dir = os.path.join(archive_path, 'archive', link['timestamp'])
archive_link(link_dir, link)
except (KeyboardInterrupt, SystemExit, Exception) as e:
print('{lightyellow}[X] [{now}] Downloading paused on link {timestamp} ({idx}/{total}){reset}'.format(
**ANSI,
now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
idx=idx+1,
timestamp=link['timestamp'],
total=len(links),
))
print(' Continue where you left off by running:')
print(' {} {}'.format(
sys.argv[0],
link['timestamp'],
))
if not isinstance(e, KeyboardInterrupt):
raise e
raise SystemExit(1)
def archive_link(link_dir, link, overwrite=True):
"""download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""
update_existing = os.path.exists(link_dir)
if update_existing:
link = {
**parse_json_link_index(link_dir),
**link,
}
else:
os.makedirs(link_dir)
log_link_archive(link_dir, link, update_existing)
if FETCH_WGET:
link = fetch_wget(link_dir, link, overwrite=overwrite)
if FETCH_PDF:
link = fetch_pdf(link_dir, link, overwrite=overwrite)
if FETCH_SCREENSHOT:
link = fetch_screenshot(link_dir, link, overwrite=overwrite)
if FETCH_DOM:
link = fetch_dom(link_dir, link, overwrite=overwrite)
if SUBMIT_ARCHIVE_DOT_ORG:
link = archive_dot_org(link_dir, link, overwrite=overwrite)
# if FETCH_AUDIO:
# link = fetch_audio(link_dir, link, overwrite=overwrite)
# if FETCH_VIDEO:
# link = fetch_video(link_dir, link, overwrite=overwrite)
if FETCH_FAVICON:
link = fetch_favicon(link_dir, link, overwrite=overwrite)
write_link_index(link_dir, link)
# print()
return link
def log_link_archive(link_dir, link, update_existing):
print('[{symbol_color}{symbol}{reset}] [{now}] "{title}"\n {blue}{url}{reset}'.format(
symbol='*' if update_existing else '+',
symbol_color=ANSI['black' if update_existing else 'green'],
now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
**link,
**ANSI,
))
print(' > {}{}'.format(pretty_path(link_dir), '' if update_existing else ' (new)'))
if link['type']:
print(' i {}'.format(link['type']))
def attach_result_to_link(method):
"""
Instead of returning a result={output:'...', status:'success'} object,
attach that result to the links's history & latest fields, then return
the updated link object.
"""
def decorator(fetch_func):
@wraps(fetch_func)
def timed_fetch_func(link_dir, link, overwrite=False, **kwargs):
# initialize methods and history json field on link
link['latest'] = link.get('latest') or {}
link['latest'][method] = link['latest'].get(method) or None
link['history'] = link.get('history') or {}
link['history'][method] = link['history'].get(method) or []
start_ts = datetime.now().timestamp()
# if a valid method output is already present, don't run the fetch function
if link['latest'][method] and not overwrite:
print('{}'.format(method))
result = None
else:
print(' > {}'.format(method))
result = fetch_func(link_dir, link, **kwargs)
end_ts = datetime.now().timestamp()
duration = str(end_ts * 1000 - start_ts * 1000).split('.')[0]
# append a history item recording fail/success
history_entry = {
'timestamp': str(start_ts).split('.')[0],
}
if result is None:
history_entry['status'] = 'skipped'
elif isinstance(result.get('output'), Exception):
history_entry['status'] = 'failed'
history_entry['duration'] = duration
history_entry.update(result or {})
link['history'][method].append(history_entry)
else:
history_entry['status'] = 'succeeded'
history_entry['duration'] = duration
history_entry.update(result or {})
link['history'][method].append(history_entry)
link['latest'][method] = result['output']
_RESULTS_TOTALS[history_entry['status']] += 1
return link
return timed_fetch_func
return decorator
@attach_result_to_link('wget')
def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, timeout=TIMEOUT):
"""download full site using wget"""
domain_dir = os.path.join(link_dir, link['domain'])
existing_file = wget_output_path(link)
if os.path.exists(domain_dir) and existing_file:
return {'output': existing_file, 'status': 'skipped'}
CMD = [
# WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
*'wget -N -E -np -x -H -k -K -S --restrict-file-names=unix'.split(' '),
*(('-p',) if requisites else ()),
*(('--user-agent="{}"'.format(WGET_USER_AGENT),) if WGET_USER_AGENT else ()),
*((() if CHECK_SSL_VALIDITY else ('--no-check-certificate',))),
link['url'],
]
end = progress(timeout, prefix=' ')
try:
result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout + 1) # index.html
end()
output = wget_output_path(link, look_in=domain_dir)
if result.returncode > 0 and result.returncode != 8:
print(' got wget response code {}:'.format(result.returncode))
print('\n'.join(' ' + line for line in (result.stderr or result.stdout).decode().rsplit('\n', 10)[-10:] if line.strip()))
if result.returncode == 4:
raise Exception('Failed to wget download')
except Exception as e:
end()
print(' Run to see full output:', 'cd {}; {}'.format(link_dir, ' '.join(CMD)))
print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
output = e
return {
'cmd': CMD,
'output': output,
}
@attach_result_to_link('pdf')
def fetch_pdf(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DIR):
"""print PDF of site to file using chrome --headless"""
if link['type'] in ('PDF', 'image'):
return {'output': wget_output_path(link)}
if os.path.exists(os.path.join(link_dir, 'output.pdf')):
return {'output': 'output.pdf', 'status': 'skipped'}
CMD = [
*chrome_headless(user_data_dir=user_data_dir),
'--print-to-pdf',
link['url']
]
end = progress(timeout, prefix=' ')
try:
result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout + 1) # output.pdf
end()
if result.returncode:
print(' ', (result.stderr or result.stdout).decode())
raise Exception('Failed to print PDF')
chmod_file('output.pdf', cwd=link_dir)
output = 'output.pdf'
except Exception as e:
end()
print(' Run to see full output:', 'cd {}; {}'.format(link_dir, ' '.join(CMD)))
print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
output = e
return {
'cmd': CMD,
'output': output,
}
@attach_result_to_link('screenshot')
def fetch_screenshot(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DIR, resolution=RESOLUTION):
"""take screenshot of site using chrome --headless"""
if link['type'] in ('PDF', 'image'):
return {'output': wget_output_path(link)}
if os.path.exists(os.path.join(link_dir, 'screenshot.png')):
return {'output': 'screenshot.png', 'status': 'skipped'}
CMD = [
*chrome_headless(user_data_dir=user_data_dir),
'--screenshot',
'--window-size={}'.format(resolution),
link['url']
]
end = progress(timeout, prefix=' ')
try:
result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout + 1) # screenshot.png
end()
if result.returncode:
print(' ', (result.stderr or result.stdout).decode())
raise Exception('Failed to take screenshot')
chmod_file('screenshot.png', cwd=link_dir)
output = 'screenshot.png'
except Exception as e:
end()
print(' Run to see full output:', 'cd {}; {}'.format(link_dir, ' '.join(CMD)))
print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
output = e
return {
'cmd': CMD,
'output': output,
}
@attach_result_to_link('dom')
def fetch_dom(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DIR):
"""print HTML of site to file using chrome --dump-html"""
if link['type'] in ('PDF', 'image'):
return {'output': wget_output_path(link)}
output_path = os.path.join(link_dir, 'output.html')
if os.path.exists(output_path):
return {'output': 'output.html', 'status': 'skipped'}
CMD = [
*chrome_headless(user_data_dir=user_data_dir),
'--dump-dom',
link['url']
]
end = progress(timeout, prefix=' ')
try:
with open(output_path, 'w+') as f:
result = run(CMD, stdout=f, stderr=PIPE, cwd=link_dir, timeout=timeout + 1) # output.html
end()
if result.returncode:
print(' ', (result.stderr).decode())
raise Exception('Failed to fetch DOM')
chmod_file('output.html', cwd=link_dir)
output = 'output.html'
except Exception as e:
end()
print(' Run to see full output:', 'cd {}; {}'.format(link_dir, ' '.join(CMD)))
print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
output = e
return {
'cmd': CMD,
'output': output,
}
@attach_result_to_link('archive_org')
def archive_dot_org(link_dir, link, timeout=TIMEOUT):
"""submit site to archive.org for archiving via their service, save returned archive url"""
path = os.path.join(link_dir, 'archive.org.txt')
if os.path.exists(path):
archive_org_url = open(path, 'r').read().strip()
return {'output': archive_org_url, 'status': 'skipped'}
submit_url = 'https://web.archive.org/save/{}'.format(link['url'].split('?', 1)[0])
success = False
CMD = ['curl', '-I', submit_url]
end = progress(timeout, prefix=' ')
try:
result = run(CMD, stdout=PIPE, stderr=DEVNULL, cwd=link_dir, timeout=timeout + 1) # archive.org.txt
end()
# Parse archive.org response headers
headers = defaultdict(list)
# lowercase all the header names and store in dict
for header in result.stdout.splitlines():
if b':' not in header or not header.strip():
continue
name, val = header.decode().split(':', 1)
headers[name.lower().strip()].append(val.strip())
# Get successful archive url in "content-location" header or any errors
content_location = headers['content-location']
errors = headers['x-archive-wayback-runtime-error']
if content_location:
saved_url = 'https://web.archive.org{}'.format(content_location[0])
success = True
elif len(errors) == 1 and 'RobotAccessControlException' in errors[0]:
output = submit_url
# raise Exception('Archive.org denied by {}/robots.txt'.format(link['domain']))
elif errors:
raise Exception(', '.join(errors))
else:
raise Exception('Failed to find "content-location" URL header in Archive.org response.')
except Exception as e:
end()
print(' Visit url to see output:', ' '.join(CMD))
print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
output = e
if success:
with open(os.path.join(link_dir, 'archive.org.txt'), 'w', encoding='utf-8') as f:
f.write(saved_url)
chmod_file('archive.org.txt', cwd=link_dir)
output = saved_url
return {
'cmd': CMD,
'output': output,
}
@attach_result_to_link('favicon')
def fetch_favicon(link_dir, link, timeout=TIMEOUT):
"""download site favicon from google's favicon api"""
if os.path.exists(os.path.join(link_dir, 'favicon.ico')):
return {'output': 'favicon.ico', 'status': 'skipped'}
CMD = ['curl', 'https://www.google.com/s2/favicons?domain={domain}'.format(**link)]
fout = open('{}/favicon.ico'.format(link_dir), 'w')
end = progress(timeout, prefix=' ')
try:
run(CMD, stdout=fout, stderr=DEVNULL, cwd=link_dir, timeout=timeout + 1) # favicon.ico
fout.close()
end()
chmod_file('favicon.ico', cwd=link_dir)
output = 'favicon.ico'
except Exception as e:
fout.close()
end()
print(' Run to see full output:', ' '.join(CMD))
print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
output = e
return {
'cmd': CMD,
'output': output,
}
# @attach_result_to_link('audio')
# def fetch_audio(link_dir, link, timeout=TIMEOUT):
# """Download audio rip using youtube-dl"""
# if link['type'] not in ('soundcloud',)\
# and 'audio' not in link['tags']:
# return
# path = os.path.join(link_dir, 'audio')
# if not os.path.exists(path) or overwrite:
# print(' - Downloading audio')
# CMD = [
# "youtube-dl -x --audio-format mp3 --audio-quality 0 -o '%(title)s.%(ext)s'",
# link['url'],
# ]
# end = progress(timeout, prefix=' ')
# try:
# result = run(CMD, stdout=DEVNULL, stderr=DEVNULL, cwd=link_dir, timeout=timeout + 1) # audio/audio.mp3
# end()
# if result.returncode:
# print(' ', result.stderr.decode())
# raise Exception('Failed to download audio')
# chmod_file('audio.mp3', cwd=link_dir)
# return 'audio.mp3'
# except Exception as e:
# end()
# print(' Run to see full output:', 'cd {}; {}'.format(link_dir, ' '.join(CMD)))
# print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
# raise
# else:
# print(' √ Skipping audio download')
# @attach_result_to_link('video')
# def fetch_video(link_dir, link, timeout=TIMEOUT):
# """Download video rip using youtube-dl"""
# if link['type'] not in ('youtube', 'youku', 'vimeo')\
# and 'video' not in link['tags']:
# return
# path = os.path.join(link_dir, 'video')
# if not os.path.exists(path) or overwrite:
# print(' - Downloading video')
# CMD = [
# "youtube-dl -x --video-format mp4 --audio-quality 0 -o '%(title)s.%(ext)s'",
# link['url'],
# ]
# end = progress(timeout, prefix=' ')
# try:
# result = run(CMD, stdout=DEVNULL, stderr=DEVNULL, cwd=link_dir, timeout=timeout + 1) # video/movie.mp4
# end()
# if result.returncode:
# print(' ', result.stderr.decode())
# raise Exception('Failed to download video')
# chmod_file('video.mp4', cwd=link_dir)
# return 'video.mp4'
# except Exception as e:
# end()
# print(' Run to see full output:', 'cd {}; {}'.format(link_dir, ' '.join(CMD)))
# print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
# raise
# else:
# print(' √ Skipping video download')
def chrome_headless(binary=CHROME_BINARY, user_data_dir=CHROME_USER_DATA_DIR):
args = [binary, '--headless'] # '--disable-gpu'
default_profile = os.path.expanduser('~/Library/Application Support/Google/Chrome/Default')
if user_data_dir:
args.append('--user-data-dir={}'.format(user_data_dir))
elif os.path.exists(default_profile):
args.append('--user-data-dir={}'.format(default_profile))
return args
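
A minimal sketch of what the attach_result_to_link decorator above records on a link; the method name, fetch function, and paths are made up for illustration:

    from archive_methods import attach_result_to_link

    @attach_result_to_link('example')
    def fetch_example(link_dir, link):
        return {'cmd': ['true'], 'output': 'example.txt'}

    link = fetch_example('/tmp', {'url': 'https://example.com', 'type': None}, overwrite=True)
    # link['latest']['example'] == 'example.txt'
    # link['history']['example'][-1] records the cmd, output, status, and duration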

81
archiver/config.py Normal file

@@ -0,0 +1,81 @@
import os
import sys
import shutil
from subprocess import run, PIPE
# ******************************************************************************
# * TO SET YOUR CONFIGURATION, EDIT THE VALUES BELOW, or use the 'env' command *
# * e.g. *
# * env USE_COLOR=True CHROME_BINARY=google-chrome ./archive.py export.html *
# ******************************************************************************
IS_TTY = sys.stdout.isatty()
USE_COLOR = os.getenv('USE_COLOR', str(IS_TTY) ).lower() == 'true'
SHOW_PROGRESS = os.getenv('SHOW_PROGRESS', str(IS_TTY) ).lower() == 'true'
FETCH_WGET = os.getenv('FETCH_WGET', 'True' ).lower() == 'true'
FETCH_WGET_REQUISITES = os.getenv('FETCH_WGET_REQUISITES', 'True' ).lower() == 'true'
FETCH_AUDIO = os.getenv('FETCH_AUDIO', 'False' ).lower() == 'true'
FETCH_VIDEO = os.getenv('FETCH_VIDEO', 'False' ).lower() == 'true'
FETCH_PDF = os.getenv('FETCH_PDF', 'True' ).lower() == 'true'
FETCH_SCREENSHOT = os.getenv('FETCH_SCREENSHOT', 'True' ).lower() == 'true'
FETCH_DOM = os.getenv('FETCH_DOM', 'True' ).lower() == 'true'
FETCH_FAVICON = os.getenv('FETCH_FAVICON', 'True' ).lower() == 'true'
SUBMIT_ARCHIVE_DOT_ORG = os.getenv('SUBMIT_ARCHIVE_DOT_ORG', 'True' ).lower() == 'true'
RESOLUTION = os.getenv('RESOLUTION', '1440,1200' )
CHECK_SSL_VALIDITY = os.getenv('CHECK_SSL_VALIDITY', 'True' ).lower() == 'true'
ARCHIVE_PERMISSIONS = os.getenv('ARCHIVE_PERMISSIONS', '755' )
CHROME_BINARY = os.getenv('CHROME_BINARY', 'chromium-browser' ) # change to 'google-chrome' if you use Google Chrome instead of Chromium
WGET_BINARY = os.getenv('WGET_BINARY', 'wget' )
WGET_USER_AGENT = os.getenv('WGET_USER_AGENT', None)
CHROME_USER_DATA_DIR = os.getenv('CHROME_USER_DATA_DIR', None)
TIMEOUT = int(os.getenv('TIMEOUT', '60'))
LINK_INDEX_TEMPLATE = os.getenv('LINK_INDEX_TEMPLATE', 'templates/link_index_fancy.html')
INDEX_TEMPLATE = os.getenv('INDEX_TEMPLATE', 'templates/index.html')
INDEX_ROW_TEMPLATE = os.getenv('INDEX_ROW_TEMPLATE', 'templates/index_row.html')
TEMPLATE_STATICFILES = os.getenv('TEMPLATE_STATICFILES', 'templates/static')
FOOTER_INFO = os.getenv('FOOTER_INFO', 'Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests.',)
### Paths
REPO_DIR = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), '..'))
OUTPUT_DIR = os.path.abspath(os.path.join(REPO_DIR, 'output'))
SOURCES_DIR = os.path.abspath(os.path.join(OUTPUT_DIR, 'sources'))
# ******************************************************************************
# ********************** Do not edit below this point **************************
# ******************************************************************************
### Terminal Configuration
os.chdir(os.path.join(REPO_DIR, 'archiver'))
TERM_WIDTH = shutil.get_terminal_size((100, 10)).columns
ANSI = {
'reset': '\033[00;00m',
'lightblue': '\033[01;30m',
'lightyellow': '\033[01;33m',
'lightred': '\033[01;35m',
'red': '\033[01;31m',
'green': '\033[01;32m',
'blue': '\033[01;34m',
'white': '\033[01;37m',
'black': '\033[01;30m',
}
if not USE_COLOR:
# don't show colors if USE_COLOR is False
ANSI = {k: '' for k in ANSI.keys()}
### Confirm Environment Setup
try:
GIT_SHA = run(["git", "rev-list", "-1", "HEAD", "./"], stdout=PIPE, cwd=REPO_DIR).stdout.strip().decode()
except Exception:
GIT_SHA = None
print('[!] Warning, you need git installed for some archiving features to save correct version numbers!')
if sys.stdout.encoding.upper() != 'UTF-8':
print('[X] Your system is running python3 scripts with a bad locale setting: {} (it should be UTF-8).'.format(sys.stdout.encoding))
print(' To fix it, add the line "export PYTHONIOENCODING=utf8" to your ~/.bashrc file (without quotes)')
print('')
print(' Confirm that it\'s fixed by opening a new shell and running:')
print(' python3 -c "import sys; print(sys.stdout.encoding)" # should output UTF-8')
print('')
print(' Alternatively, run this script with:')
print(' env PYTHONIOENCODING=utf8 ./archive.py export.html')
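
The settings above can be overridden per-run from the environment; a quick sketch from Python with illustrative values (the same effect as env FETCH_PDF=False TIMEOUT=120 ./archive.py export.html):

    import os
    os.environ['FETCH_PDF'] = 'False'
    os.environ['TIMEOUT'] = '120'

    import config
    assert config.FETCH_PDF is False
    assert config.TIMEOUT == 120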

154
archiver/index.py Normal file

@@ -0,0 +1,154 @@
import os
import json
from datetime import datetime
from string import Template
from distutils.dir_util import copy_tree
from config import (
INDEX_TEMPLATE,
INDEX_ROW_TEMPLATE,
LINK_INDEX_TEMPLATE,
TEMPLATE_STATICFILES,
ARCHIVE_PERMISSIONS,
ANSI,
GIT_SHA,
FOOTER_INFO,
)
from util import (
chmod_file,
wget_output_path,
derived_link_info,
pretty_path,
)
### Homepage index for all the links
def write_links_index(out_dir, links):
"""create index.html file for a given list of links"""
if not os.path.exists(out_dir):
os.makedirs(out_dir)
write_json_links_index(out_dir, links)
write_html_links_index(out_dir, links)
print('{green}[√] [{}] Updated main index files:{reset}'.format(
datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
**ANSI))
print(' > {}/index.json'.format(pretty_path(out_dir)))
print(' > {}/index.html'.format(pretty_path(out_dir)))
def write_json_links_index(out_dir, links):
"""write the json link index to a given path"""
path = os.path.join(out_dir, 'index.json')
index_json = {
'info': 'Bookmark Archiver Index',
'help': 'https://github.com/pirate/bookmark-archiver',
'version': GIT_SHA,
'num_links': len(links),
'updated': str(datetime.now().timestamp()),
'links': links,
}
with open(path, 'w', encoding='utf-8') as f:
json.dump(index_json, f, indent=4, default=str)
chmod_file(path)
def parse_json_links_index(out_dir):
"""load the index in a given directory and merge it with the given link"""
index_path = os.path.join(out_dir, 'index.json')
if os.path.exists(index_path):
with open(index_path, 'r', encoding='utf-8') as f:
return json.load(f)['links']
return []
def write_html_links_index(out_dir, links):
"""write the html link index to a given path"""
path = os.path.join(out_dir, 'index.html')
copy_tree(TEMPLATE_STATICFILES, os.path.join(out_dir, "static"))
with open(INDEX_TEMPLATE, 'r', encoding='utf-8') as f:
index_html = f.read()
with open(INDEX_ROW_TEMPLATE, 'r', encoding='utf-8') as f:
link_row_html = f.read()
link_rows = '\n'.join(
Template(link_row_html).substitute(**derived_link_info(link))
for link in links
)
template_vars = {
'num_links': len(links),
'date_updated': datetime.now().strftime('%Y-%m-%d'),
'time_updated': datetime.now().strftime('%Y-%m-%d %H:%M'),
'footer_info': FOOTER_INFO,
'git_sha': GIT_SHA,
'short_git_sha': GIT_SHA[:8],
'rows': link_rows,
}
with open(path, 'w', encoding='utf-8') as f:
f.write(Template(index_html).substitute(**template_vars))
chmod_file(path)
### Individual link index
def write_link_index(out_dir, link):
link['updated'] = str(datetime.now().timestamp())
write_json_link_index(out_dir, link)
write_html_link_index(out_dir, link)
def write_json_link_index(out_dir, link):
"""write a json file with some info about the link"""
path = os.path.join(out_dir, 'index.json')
print(' √ index.json')
with open(path, 'w', encoding='utf-8') as f:
json.dump(link, f, indent=4, default=str)
chmod_file(path)
def parse_json_link_index(out_dir):
"""load the json link index from a given directory"""
existing_index = os.path.join(out_dir, 'index.json')
if os.path.exists(existing_index):
with open(existing_index, 'r', encoding='utf-8') as f:
return json.load(f)
return {}
def write_html_link_index(out_dir, link):
with open(LINK_INDEX_TEMPLATE, 'r', encoding='utf-8') as f:
link_html = f.read()
path = os.path.join(out_dir, 'index.html')
print(' √ index.html')
with open(path, 'w', encoding='utf-8') as f:
f.write(Template(link_html).substitute({
**link,
**link['latest'],
'type': link['type'] or 'website',
'tags': link['tags'] or 'untagged',
'bookmarked': datetime.fromtimestamp(float(link['timestamp'])).strftime('%Y-%m-%d %H:%M'),
'updated': datetime.fromtimestamp(float(link['updated'])).strftime('%Y-%m-%d %H:%M'),
'bookmarked_ts': link['timestamp'],
'updated_ts': link['updated'],
'archive_org': link['latest'].get('archive_org') or 'https://web.archive.org/save/{}'.format(link['url']),
'wget': link['latest'].get('wget') or wget_output_path(link),
}))
chmod_file(path)
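
A quick sketch of the json index round trip defined above; the link fields and output directory are only illustrative:

    import os
    from index import write_json_links_index, parse_json_links_index

    os.makedirs('output', exist_ok=True)
    links = [{'url': 'https://example.com', 'timestamp': '1528675935.0',
              'title': 'Example', 'tags': '', 'type': None, 'sources': []}]
    write_json_links_index('output', links)
    assert parse_json_links_index('output') == links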

141
archiver/links.py Normal file

@@ -0,0 +1,141 @@
"""
In Bookmark Archiver, a Link represents a single entry that we track in the
json index. All links pass through all archiver functions and the latest,
most up-to-date canonical output for each is stored in "latest".
Link {
timestamp: str, (how we uniquely id links) _ _ _ _ ___
url: str, | \ / \ |\| ' |
base_url: str, |_/ \_/ | | |
domain: str, _ _ _ _ _ _
tags: str, |_) /| |\| | / `
type: str, | /"| | | | \_,
title: str, ,-'"`-.
sources: [str], /// / @ @ \ \\\\
latest: { \ :=| ,._,. |=: /
..., || ,\ \_../ /. ||
pdf: 'output.pdf', ||','`-._))'`.`||
wget: 'example.com/1234/index.html' `-' (/ `-'
},
history: {
...
pdf: [
{timestamp: 15444234325, status: 'skipped', result='output.pdf'},
...
],
wget: [
{timestamp: 11534435345, status: 'succeeded', result='donuts.com/eat/them.html'}
]
},
}
"""
import datetime
from html import unescape
from util import (
domain,
base_url,
str_between,
get_link_type,
merge_links,
wget_output_path,
)
from config import ANSI
def validate_links(links):
links = archivable_links(links) # remove chrome://, about:, mailto: etc.
links = uniquefied_links(links) # merge/dedupe duplicate timestamps & urls
links = sorted_links(links) # deterministically sort the links based on timestamp, url
if not links:
print('[X] No links found :(')
raise SystemExit(1)
for link in links:
link['title'] = unescape(link['title'])
link['latest'] = link.get('latest') or {}
if not link['latest'].get('wget'):
link['latest']['wget'] = wget_output_path(link)
if not link['latest'].get('pdf'):
link['latest']['pdf'] = None
if not link['latest'].get('screenshot'):
link['latest']['screenshot'] = None
if not link['latest'].get('dom'):
link['latest']['dom'] = None
return list(links)
def archivable_links(links):
"""remove chrome://, about:// or other schemed links that cant be archived"""
return (
link
for link in links
if any(link['url'].startswith(s) for s in ('http://', 'https://', 'ftp://'))
)
def uniquefied_links(sorted_links):
"""
ensures that all non-duplicate links have monotonically increasing timestamps
"""
unique_urls = {}
lower = lambda url: url.lower().strip()
without_www = lambda url: url.replace('://www.', '://', 1)
without_trailing_slash = lambda url: url[:-1] if url[-1] == '/' else url.replace('/?', '?')
for link in sorted_links:
fuzzy_url = without_www(without_trailing_slash(lower(link['url'])))
if fuzzy_url in unique_urls:
# merge with any other links that share the same url
link = merge_links(unique_urls[fuzzy_url], link)
unique_urls[fuzzy_url] = link
unique_timestamps = {}
for link in unique_urls.values():
link['timestamp'] = lowest_uniq_timestamp(unique_timestamps, link['timestamp'])
unique_timestamps[link['timestamp']] = link
return unique_timestamps.values()
def sorted_links(links):
sort_func = lambda link: (link['timestamp'], link['url'])
return sorted(links, key=sort_func, reverse=True)
def links_after_timestamp(links, timestamp=None):
if not timestamp:
yield from links
return
for link in links:
try:
if float(link['timestamp']) <= float(timestamp):
yield link
except (ValueError, TypeError):
print('Resume value and all timestamp values must be valid numbers.')
def lowest_uniq_timestamp(used_timestamps, timestamp):
"""resolve duplicate timestamps by appending a decimal 1234, 1234 -> 1234.1, 1234.2"""
timestamp = timestamp.split('.')[0]
nonce = 0
# first try 152323423 before 152323423.0
if timestamp not in used_timestamps:
return timestamp
new_timestamp = '{}.{}'.format(timestamp, nonce)
while new_timestamp in used_timestamps:
nonce += 1
new_timestamp = '{}.{}'.format(timestamp, nonce)
return new_timestamp
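
The duplicate-timestamp handling above resolves collisions by appending a nonce; for example:

    from links import lowest_uniq_timestamp

    used = {}
    for ts in ('1528675935', '1528675935', '1528675935'):
        used[lowest_uniq_timestamp(used, ts)] = {}

    # used now has the keys '1528675935', '1528675935.0', and '1528675935.1'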

231
archiver/parse.py Normal file

@@ -0,0 +1,231 @@
"""
Everything related to parsing links from bookmark services.
For a list of supported services, see the README.md.
For examples of supported files see examples/.
Parsed link schema: {
'url': 'https://example.com/example/?abc=123&xyc=345#lmnop',
'domain': 'example.com',
'base_url': 'example.com/example/',
'timestamp': '15442123124234',
'tags': 'abc,def',
'title': 'Example.com Page Title',
'sources': ['ril_export.html', 'downloads/getpocket.com.txt'],
}
"""
import re
import json
import xml.etree.ElementTree as etree
from datetime import datetime
from util import (
domain,
base_url,
str_between,
get_link_type,
)
def get_parsers(file):
"""return all parsers that work on a given file, defaults to all of them"""
return {
'pocket': parse_pocket_export,
'pinboard': parse_json_export,
'bookmarks': parse_bookmarks_export,
'rss': parse_rss_export,
'pinboard_rss': parse_pinboard_rss_feed,
'medium_rss': parse_medium_rss_feed,
}
def parse_links(path):
"""parse a list of links dictionaries from a bookmark export file"""
links = []
with open(path, 'r', encoding='utf-8') as file:
for parser_func in get_parsers(file).values():
# try all parsers until one works
try:
links += list(parser_func(file))
if links:
break
except (ValueError, TypeError, IndexError, AttributeError, etree.ParseError):
# parser not supported on this file
pass
return links
def parse_pocket_export(html_file):
"""Parse Pocket-format bookmarks export files (produced by getpocket.com/export/)"""
html_file.seek(0)
pattern = re.compile("^\\s*<li><a href=\"(.+)\" time_added=\"(\\d+)\" tags=\"(.*)\">(.+)</a></li>", re.UNICODE)
for line in html_file:
# example line
# <li><a href="http://example.com/ time_added="1478739709" tags="tag1,tag2">example title</a></li>
match = pattern.search(line)
if match:
fixed_url = match.group(1).replace('http://www.readability.com/read?url=', '') # remove old readability prefixes to get original url
time = datetime.fromtimestamp(float(match.group(2)))
info = {
'url': fixed_url,
'domain': domain(fixed_url),
'base_url': base_url(fixed_url),
'timestamp': str(time.timestamp()),
'tags': match.group(3),
'title': match.group(4).replace(' — Readability', '').replace('http://www.readability.com/read?url=', '') or base_url(fixed_url),
'sources': [html_file.name],
}
info['type'] = get_link_type(info)
yield info
def parse_json_export(json_file):
"""Parse JSON-format bookmarks export files (produced by pinboard.in/export/)"""
json_file.seek(0)
json_content = json.load(json_file)
for line in json_content:
# example line
# {"href":"http:\/\/www.reddit.com\/r\/example","description":"title here","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android"}]
if line:
erg = line
time = datetime.strptime(erg['time'].split(',', 1)[0], '%Y-%m-%dT%H:%M:%SZ')
info = {
'url': erg['href'],
'domain': domain(erg['href']),
'base_url': base_url(erg['href']),
'timestamp': erg.get('timestamp', str(time.timestamp())),
'tags': erg['tags'],
'title': erg['description'].replace(' — Readability', ''),
'sources': [json_file.name],
}
info['type'] = get_link_type(info)
yield info
def parse_rss_export(rss_file):
"""Parse RSS XML-format files into links"""
rss_file.seek(0)
items = rss_file.read().split('</item>\n<item>')
for item in items:
# example item:
# <item>
# <title><![CDATA[How JavaScript works: inside the V8 engine]]></title>
# <category>Unread</category>
# <link>https://blog.sessionstack.com/how-javascript-works-inside</link>
# <guid>https://blog.sessionstack.com/how-javascript-works-inside</guid>
# <pubDate>Mon, 21 Aug 2017 14:21:58 -0500</pubDate>
# </item>
trailing_removed = item.split('</item>', 1)[0]
leading_removed = trailing_removed.split('<item>', 1)[-1]
rows = leading_removed.split('\n')
def get_row(key):
return [r for r in rows if r.startswith('<{}>'.format(key))][0]
title = str_between(get_row('title'), '<![CDATA[', ']]')
url = str_between(get_row('link'), '<link>', '</link>')
ts_str = str_between(get_row('pubDate'), '<pubDate>', '</pubDate>')
time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %z")
info = {
'url': url,
'domain': domain(url),
'base_url': base_url(url),
'timestamp': str(time.timestamp()),
'tags': '',
'title': title,
'sources': [rss_file.name],
}
info['type'] = get_link_type(info)
yield info
def parse_bookmarks_export(html_file):
"""Parse netscape-format bookmarks export files (produced by all browsers)"""
html_file.seek(0)
pattern = re.compile("<a href=\"(.+?)\" add_date=\"(\\d+)\"[^>]*>(.+)</a>", re.UNICODE | re.IGNORECASE)
for line in html_file:
# example line
# <DT><A HREF="https://example.com/?q=1+2" ADD_DATE="1497562974" LAST_MODIFIED="1497562974" ICON_URI="https://example.com/favicon.ico" ICON="data:image/png;base64,...">example bookmark title</A>
match = pattern.search(line)
if match:
url = match.group(1)
time = datetime.fromtimestamp(float(match.group(2)))
info = {
'url': url,
'domain': domain(url),
'base_url': base_url(url),
'timestamp': str(time.timestamp()),
'tags': "",
'title': match.group(3),
'sources': [html_file.name],
}
info['type'] = get_link_type(info)
yield info
def parse_pinboard_rss_feed(rss_file):
"""Parse Pinboard RSS feed files into links"""
rss_file.seek(0)
root = etree.parse(rss_file).getroot()
items = root.findall("{http://purl.org/rss/1.0/}item")
for item in items:
url = item.find("{http://purl.org/rss/1.0/}link").text
tags = item.find("{http://purl.org/dc/elements/1.1/}subject").text
title = item.find("{http://purl.org/rss/1.0/}title").text
ts_str = item.find("{http://purl.org/dc/elements/1.1/}date").text
# Pinboard includes a colon in its date stamp timezone offsets, which
# Python can't parse. Remove it:
if ":" == ts_str[-3:-2]:
ts_str = ts_str[:-3]+ts_str[-2:]
time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
info = {
'url': url,
'domain': domain(url),
'base_url': base_url(url),
'timestamp': str(time.timestamp()),
'tags': tags,
'title': title,
'sources': [rss_file.name],
}
info['type'] = get_link_type(info)
yield info
def parse_medium_rss_feed(rss_file):
"""Parse Medium RSS feed files into links"""
rss_file.seek(0)
root = etree.parse(rss_file).getroot()
items = root.find("channel").findall("item")
for item in items:
# for child in item:
# print(child.tag, child.text)
url = item.find("link").text
title = item.find("title").text
ts_str = item.find("pubDate").text
time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %Z")
info = {
'url': url,
'domain': domain(url),
'base_url': base_url(url),
'timestamp': str(time.timestamp()),
'tags': "",
'title': title,
'sources': [rss_file.name],
}
info['type'] = get_link_type(info)
yield info
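
Typical use of the parsers above goes through parse_links, which tries each parser in turn; the export file name is just an example:

    from parse import parse_links

    for link in parse_links('pocket_export.html'):
        print(link['timestamp'], link['type'], link['url'])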

113
archiver/peekable.py Normal file

@@ -0,0 +1,113 @@
from sys import maxsize
from itertools import islice
from collections import deque
_marker = object()
class Peekable(object):
"""Peekable version of a normal python generator.
Useful when you don't want to evaluate the entire iterable to look at
a specific item at a given idx.
"""
def __init__(self, iterable):
self._it = iter(iterable)
self._cache = deque()
def __iter__(self):
return self
def __bool__(self):
try:
self.peek()
except StopIteration:
return False
return True
def __nonzero__(self):
# For Python 2 compatibility
return self.__bool__()
def peek(self, default=_marker):
"""Return the item that will be next returned from ``next()``.
Return ``default`` if there are no items left. If ``default`` is not
provided, raise ``StopIteration``.
"""
if not self._cache:
try:
self._cache.append(next(self._it))
except StopIteration:
if default is _marker:
raise
return default
return self._cache[0]
def prepend(self, *items):
"""Stack up items to be the next ones returned from ``next()`` or
``self.peek()``. The items will be returned in
first in, first out order::
>>> p = Peekable([1, 2, 3])
>>> p.prepend(10, 11, 12)
>>> next(p)
10
>>> list(p)
[11, 12, 1, 2, 3]
It is possible, by prepending items, to "resurrect" a peekable that
previously raised ``StopIteration``.
>>> p = Peekable([])
>>> next(p)
Traceback (most recent call last):
...
StopIteration
>>> p.prepend(1)
>>> next(p)
1
>>> next(p)
Traceback (most recent call last):
...
StopIteration
"""
self._cache.extendleft(reversed(items))
def __next__(self):
if self._cache:
return self._cache.popleft()
return next(self._it)
next = __next__ # For Python 2 compatibility
def _get_slice(self, index):
# Normalize the slice's arguments
step = 1 if (index.step is None) else index.step
if step > 0:
start = 0 if (index.start is None) else index.start
stop = maxsize if (index.stop is None) else index.stop
elif step < 0:
start = -1 if (index.start is None) else index.start
stop = (-maxsize - 1) if (index.stop is None) else index.stop
else:
raise ValueError('slice step cannot be zero')
# If either the start or stop index is negative, we'll need to cache
# the rest of the iterable in order to slice from the right side.
if (start < 0) or (stop < 0):
self._cache.extend(self._it)
# Otherwise we'll need to find the rightmost index and cache to that
# point.
else:
n = min(max(start, stop) + 1, maxsize)
cache_len = len(self._cache)
if n >= cache_len:
self._cache.extend(islice(self._it, n - cache_len))
return list(self._cache)[index]
def __getitem__(self, index):
if isinstance(index, slice):
return self._get_slice(index)
cache_len = len(self._cache)
if index < 0:
self._cache.extend(self._it)
elif index >= cache_len:
self._cache.extend(islice(self._it, index + 1 - cache_len))
return self._cache[index]
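
Peekable is what lets archive_links look at the first pending link without consuming it; a small sketch with made-up links:

    from peekable import Peekable

    pending = Peekable(link for link in [{'timestamp': '1'}, {'timestamp': '2'}])
    first = pending.peek(0)    # first pending item, or the default 0 if the iterator is empty
    remaining = list(pending)  # the peeked item is still yielded here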

archiver/templates/index.html Normal file

@@ -0,0 +1,146 @@
<html lang="en">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
<title>Archived Sites</title>
<style>
html, body {
width: 100%;
height: 100%;
font-size: 18px;
font-weight: 200;
text-align: center;
margin: 0px;
padding: 0px;
font-family: "Gill Sans", Helvetica, sans-serif;
}
header {
background-color: #aa1e55;
color: white;
padding: 10px;
padding-top: 0px;
padding-bottom: 15px;
height: 100px;
}
header h1 {
font-size: 38px;
font-weight: 300;
color: black;
padding-top: 14px;
line-height: 1.4;
width: 100%;
}
header h1 small {
color: white;
font-size:0.45em;
margin-left: 10px;
display: block;
}
header h1 small a {
text-decoration: none;
color: orange;
opacity: 0.6;
font-weight: 300;
}
header h1 small a:hover {
opacity: 1;
}
.header-center {
width: 100%;
text-align: center;
}
.header-right {
float: right;
width: 50px;
height: 60px;
text-align: center;
padding: 20px;
margin-left: -100px;
}
table {
padding: 6px;
width: 100%;
}
table thead th {
font-weight: 400;
}
table tr {
height: 35px;
}
tbody tr:nth-child(odd) {
background-color: #ffebeb;
}
table tr td {
white-space: nowrap;
overflow: hidden;
/*padding-bottom: 0.4em;*/
/*padding-top: 0.4em;*/
padding-left: 2px;
text-align: center;
}
table tr td a {
text-decoration: none;
}
table tr td img, table tr td object {
display: inline-block;
margin: auto;
height: 24px;
width: 24px;
padding: 0px;
padding-right: 5px;
vertical-align: middle;
margin-left: 4px;
}
</style>
</head>
<body>
<header>
<div class="header-right">
<a href="?" title="Reload...">
<img src="static/archive.png" style="height: 100%;"/>
</a>
<br/>
<a href="https://pirate.github.io/bookmark-archiver">
Docs
</a>
</div>
<div class="header-center">
<h1>
Archived Sites
<br/>
<small>
<a href="?"> $num_links links</a> &nbsp; &nbsp; Last updated $time_updated<br/>
</small>
</h1>
</div>
</header>
<table style="width:100%;height: 90%; overflow-y: scroll;table-layout: fixed">
<thead>
<tr>
<th style="width: 120px;">Bookmarked On</th>
<th style="width: 2vw;">Status</th>
<th style="width: 43vw;">Saved Articles ($num_links)</th>
<th style="width: 50px">Index</th>
<th style="width: 50px">HTML</th>
<th style="width: 50px">PDF</th>
<th style="width: 60px;font-size:0.8em;">Screenshot</th>
<th style="width: 50px">A.org</th>
<th style="width: 100px;whitespace:nowrap;overflow-x:scroll;display:block">Original URL</th>
</tr>
</thead>
<tbody>$rows</tbody>
</table>
<footer>
<br/>
<center>
<small>
Created using <a href="https://github.com/pirate/bookmark-archiver">Bookmark Archiver</a>
version <a href="https://github.com/pirate/bookmark-archiver/commit/$git_sha">$short_git_sha</a> &nbsp; | &nbsp;
Download index as <a href="index.json">JSON</a>
<br/><br/>
$footer_info
</small>
</center>
<br/>
</footer>
</body>
</html>

archiver/templates/index_row.html Normal file

@@ -0,0 +1,17 @@
<tr>
<td title="Bookmarked timestamp: $timestamp">$date</td>
<td>
<a href="?" title="Refresh status...">
<img src="$favicon_url" onerror="this.src='static/spinner.gif'" class="link-favicon">
</a>
</td>
<td style="text-align: left"><a href="$archive_url" style="font-size:1.4em;text-decoration:none;color:black;" title="$title">
$title <small style="background-color: #eee;border-radius:4px; float:right">$tags</small></a>
</td>
<td><a href="$files_url" title="Files">📂</a></td>
<td><a href="$dom_link" title="HTML">📄</a></td>
<td><a href="$pdf_link" title="PDF">📜</a></td>
<td><a href="$screenshot_link" title="Screenshot">🖼</a></td>
<td><a href="$archive_org_url" title="Archive.org">🏛</a></td>
<td style="text-align: left"><!--🔗 <img src="$google_favicon_url" height="16px">--> <a href="$url">$url</a></td>
</tr>

archiver/templates/link_index.html Normal file

@@ -0,0 +1,63 @@
<html>
<head>
<meta charset="utf-8">
<title>$title</title>
</head>
<body>
<header>
<h1>
<img src="$favicon" height="20px"> $title<br/>
<a href="$url" class="title-url">
<small>$base_url</small>
</a>
</h1>
</header>
<hr/>
<div>
Tags: $tags<br/>
Type: $type<br/>
<br/>
Bookmarked:<br/>
$bookmarked<br/>
Archived:<br/>
$updated<br/>
</div>
<hr/>
<ul>
<li>
<a href="$url"><b>Original</b></a><br/>
$base_url<br/>&nbsp;
</li>
<li>
<a href="$wget"><b>Local Archive</b></a><br/>
archive/$timestamp/$domain<br/>&nbsp;
</li>
<li>
<a href="$pdf" id="pdf-btn"><b>PDF</b></a><br/>
archive/$timestamp/output.pdf<br/>&nbsp;
</li>
<li>
<a href="$screenshot"><b>Screenshot</b></a><br/>
archive/$timestamp/screenshot.png<br/>&nbsp;
</li>
<li>
<a href="$dom"><b>HTML</b></a><br/>
archive/$timestamp/output.html<br/>&nbsp;
</li>
<li>
<a href="$archive_org"><b>Archive.Org</b></a><br/>
web.archive.org/web/$base_url<br/>&nbsp;
</li>
</ul>
<footer>
<hr/>
<a href="index.json">JSON</a> | <a href=".">Files</a>
<hr/>
<a href="./../../index.html" class="nav-icon" title="Archived Sites">
<img src="https://nicksweeting.com/images/archive.png" alt="Archive Icon" height="20px">
Bookmark Archiver: Link Index
</a>
</footer>
</body>
</html>

archiver/templates/link_index_fancy.html Normal file

@@ -0,0 +1,316 @@
<html>
<head>
<meta charset="utf-8">
<title>$title</title>
<style>
html, body {
width: 100%;
height: 100%;
}
body {
background-color: #ddd;
}
header {
width: 100%;
height: 90px;
background-color: #aa1e55;
margin: 0px;
text-align: center;
color: white;
}
header h1 {
padding-top: 5px;
padding-bottom: 5px;
margin: 0px;
font-weight: 200;
font-family: "Gill Sans", Helvetica, sans-serif;
font-size: calc(16px + 1vw);
}
.collapse-icon {
float: left;
color: black;
width: 126px;
font-size: 0.8em;
margin-top: 20px;
margin-right: 0px;
margin-left: -35px;
}
.nav-icon img {
float: right;
display: block;
margin-right: 13px;
color: black;
height: 53px;
margin-top: 7px;
margin-left: 10px;
}
.nav-icon img:hover {
opacity: 0.5;
}
.title-url {
color: black;
display: block;
width: 75%;
white-space: nowrap;
overflow: hidden;
margin: auto;
}
.archive-page-header {
margin-top: 5px;
margin-bottom: 5px;
}
.archive-page-header .alert {
margin-bottom: 0px;
}
h1 small {
opacity: 0.4;
font-size: 0.6em;
}
h1 small:hover {
opacity: 0.8;
}
.card {
box-shadow: 2px 3px 14px 0px rgba(0,0,0,0.02);
}
.card h4 {
font-size: 1.4vw;
}
.card-body {
font-size: 1vw;
padding-top: 1.2vw;
padding-left: 1vw;
padding-right: 1vw;
padding-bottom: 1vw;
line-height: 1.1;
word-wrap: break-word;
max-height: 102px;
overflow: hidden;
}
.card-img-top {
border: 0px;
padding: 0px;
margin: 0px;
overflow: hidden;
opacity: 0.8;
border-top: 1px solid gray;
border-radius: 3px;
border-bottom: 1px solid #ddd;
height: 430px;
width: 400%;
margin-bottom: -330px;
transform: scale(0.25);
transform-origin: 0 0;
}
.full-page-iframe {
border-top: 1px solid #ddd;
width: 100%;
height: 69vh;
margin: 0px;
border: 0px;
border-top: 3px solid #aa1e55;
}
.card.selected-card {
border: 2px solid orange;
box-shadow: 0px -6px 13px 1px rgba(0,0,0,0.05);
}
.iframe-large {
height: 93%;
margin-top: -10px;
}
img.external {
height: 30px;
margin-right: -10px;
padding: 3px;
border-radius: 4px;
vertical-align: middle;
border: 4px solid rgba(0,0,0,0);
}
img.external:hover {
border: 4px solid green;
}
@media(max-width: 1092px) {
iframe {
display: none;
}
}
@media(max-width: 728px) {
.card h4 {
font-size: 5vw;
}
.card-body {
font-size: 4vw;
}
.card {
margin-bottom: 5px;
}
header > h1 > a.collapse-icon, header > h1 > a.nav-icon {
display: none;
}
}
</style>
<script
src="https://code.jquery.com/jquery-3.2.1.slim.min.js"
integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g="
crossorigin="anonymous"></script>
<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0-alpha.6/css/bootstrap.min.css" integrity="sha384-rwoIResjU2yc3z8GV/NPeZWAv56rSmLldC3R/AZzGRnGxQQKnKkoFVhFQhNUwEyJ" crossorigin="anonymous">
<script src="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0-alpha.6/js/bootstrap.min.js" integrity="sha384-vBWWzlZJ8ea9aCX4pEW3rVHjgjt7zpkNpZk+02D9phzyeVkE+jo0ieGizqPLForn" crossorigin="anonymous"></script>
</head>
<body>
<header>
<h1 class="page-title">
<a href="../../index.html" class="nav-icon" title="Go to Main Index...">
<img src="../../static/archive.png" alt="Archive Icon">
</a>
<a href="#" class="collapse-icon" style="text-decoration: none" title="Toggle info panel...">
</a>
<img src="$favicon" height="20px"> $title<br/>
<a href="$url" class="title-url">
<small>$base_url</small>
</a>
</h1>
</header>
<div class="site-header container-fluid">
<div class="row archive-page-header">
<div class="col-lg-4 alert well">
Added: <small title="Timestamp: $bookmarked_ts">$bookmarked</small>
&nbsp; | &nbsp;
Last updated: <small title="Timestamp: $updated_ts">$updated</small>
</div>
<div class="col-lg-4 alert well">
Type:
<span class="badge badge-default">$type</span>
&nbsp; | &nbsp;
Tags:
<span class="badge badge-success">$tags</span>
</div>
<div class="col-lg-4 alert well">
Download:
<a href="index.json" title="JSON summary of archived link.">JSON</a> |
<a href="." title="Webserver-provided index of files directory.">Files</a>
</div>
<hr/>
<div class="col-lg-2">
<div class="card selected-card">
<iframe class="card-img-top" src="$wget" sandbox="allow-same-origin allow-scripts allow-forms"></iframe>
<div class="card-body">
<a href="$wget" style="float:right" title="Open in new tab..." target="_blank" rel="noopener">
<img src="../../static/external.png" class="external"/>
</a>
<a href="$wget" target="preview"><h4 class="card-title">Local Archive</h4></a>
<p class="card-text">archive/$domain</p>
</div>
</div>
</div>
<div class="col-lg-2">
<div class="card">
<iframe class="card-img-top" src="$dom" sandbox="allow-same-origin allow-scripts allow-forms"></iframe>
<div class="card-body">
<a href="$dom" style="float:right" title="Open in new tab..." target="_blank" rel="noopener">
<img src="../../static/external.png" class="external"/>
</a>
<a href="$dom" target="preview"><h4 class="card-title">HTML</h4></a>
<p class="card-text">archive/output.html</p>
</div>
</div>
</div>
<div class="col-lg-2">
<div class="card">
<iframe class="card-img-top" src="$pdf"></iframe>
<div class="card-body">
<a href="$pdf" style="float:right" title="Open in new tab..." target="_blank" rel="noopener">
<img src="../../static/external.png" class="external"/>
</a>
<a href="$pdf" target="preview" id="pdf-btn"><h4 class="card-title">PDF</h4></a>
<p class="card-text">archive/output.pdf</p>
</div>
</div>
</div>
<div class="col-lg-2">
<div class="card">
<iframe class="card-img-top" src="$screenshot" sandbox="allow-same-origin allow-scripts allow-forms"></iframe>
<div class="card-body">
<a href="$screenshot" style="float:right" title="Open in new tab..." target="_blank" rel="noopener">
<img src="../../static/external.png" class="external"/>
</a>
<a href="$screenshot" target="preview"><h4 class="card-title">Screenshot</h4></a>
<p class="card-text">archive/screenshot.png</p>
</div>
</div>
</div>
<div class="col-lg-2">
<div class="card">
<iframe class="card-img-top" src="$url" sandbox="allow-same-origin allow-scripts allow-forms"></iframe>
<div class="card-body">
<a href="$url" style="float:right" title="Open in new tab..." target="_blank" rel="noopener">
<img src="../../static/external.png" class="external"/>
</a>
<a href="$url" target="preview"><h4 class="card-title">Original</h4></a>
<p class="card-text">$domain</p>
</div>
</div>
</div>
<div class="col-lg-2">
<div class="card">
<iframe class="card-img-top" src="$archive_org" sandbox="allow-same-origin allow-scripts allow-forms"></iframe>
<div class="card-body">
<a href="$archive_org" style="float:right" title="Open in new tab..." target="_blank" rel="noopener">
<img src="../../static/external.png" class="external"/>
</a>
<a href="$archive_org" target="preview"><h4 class="card-title">Archive.Org</h4></a>
<p class="card-text">web.archive.org/web/...</p>
</div>
</div>
</div>
</div>
</div>
<iframe sandbox="allow-same-origin allow-scripts allow-forms" class="full-page-iframe" src="$wget" name="preview"></iframe>
</body>
<script>
// show selected file in iframe when preview card is clicked
jQuery('.card').on('click', function(e) {
jQuery('.selected-card').removeClass('selected-card')
jQuery(e.target).closest('.card').addClass('selected-card')
})
jQuery('.card a[target=preview]').on('click', function(e) {
if (e.currentTarget.href.endsWith('.pdf')) {
jQuery('.full-page-iframe')[0].removeAttribute('sandbox')
} else {
jQuery('.full-page-iframe')[0].sandbox = "allow-same-origin allow-scripts allow-forms"
}
return true
})
// un-sandbox iframes showing pdfs (required to display pdf viewer)
jQuery('iframe').map(function() {
if (this.src.endsWith('.pdf')) {
this.removeAttribute('sandbox')
this.src = this.src
}
})
// hide header when collapse icon is clicked
jQuery('.collapse-icon').on('click', function() {
if (jQuery('.collapse-icon').text().includes('▾')) {
jQuery('.collapse-icon').text('▸')
jQuery('.site-header').hide()
jQuery('.full-page-iframe').addClass('iframe-large')
} else {
jQuery('.collapse-icon').text('▾')
jQuery('.site-header').show()
jQuery('.full-page-iframe').removeClass('iframe-large')
}
return true
})
// hide all preview iframes on small screens
if (window.innerWidth < 1091) {
jQuery('.card a[target=preview]').attr('target', '_self')
}
</script>
</html>

Binary file not shown.


Binary file not shown.


Binary file not shown.


481
archiver/util.py Normal file

@@ -0,0 +1,481 @@
import os
import re
import sys
import time
import json
import requests
from datetime import datetime
from subprocess import run, PIPE, DEVNULL
from multiprocessing import Process
from urllib.parse import quote
from config import (
IS_TTY,
ARCHIVE_PERMISSIONS,
REPO_DIR,
SOURCES_DIR,
OUTPUT_DIR,
TIMEOUT,
TERM_WIDTH,
SHOW_PROGRESS,
ANSI,
CHROME_BINARY,
FETCH_WGET,
FETCH_PDF,
FETCH_SCREENSHOT,
FETCH_DOM,
FETCH_FAVICON,
FETCH_AUDIO,
FETCH_VIDEO,
SUBMIT_ARCHIVE_DOT_ORG,
)
# URL helpers
without_scheme = lambda url: url.replace('http://', '').replace('https://', '').replace('ftp://', '')
without_query = lambda url: url.split('?', 1)[0]
without_hash = lambda url: url.split('#', 1)[0]
without_path = lambda url: url.split('/', 1)[0]
domain = lambda url: without_hash(without_query(without_path(without_scheme(url))))
base_url = lambda url: without_scheme(url) # uniq base url used to dedupe links
short_ts = lambda ts: ts.split('.')[0]
def check_dependencies():
"""Check that all necessary dependencies are installed, and have valid versions"""
python_vers = float('{}.{}'.format(sys.version_info.major, sys.version_info.minor))
if python_vers < 3.5:
print('{}[X] Python version is not new enough: {} (3.5 or higher is required){}'.format(ANSI['red'], python_vers, ANSI['reset']))
print(' See https://github.com/pirate/bookmark-archiver#troubleshooting for help upgrading your Python installation.')
raise SystemExit(1)
if FETCH_PDF or FETCH_SCREENSHOT or FETCH_DOM:
if run(['which', CHROME_BINARY], stdout=DEVNULL).returncode:
print('{}[X] Missing dependency: {}{}'.format(ANSI['red'], CHROME_BINARY, ANSI['reset']))
print(' Run ./setup.sh, then confirm it was installed with: {} --version'.format(CHROME_BINARY))
print(' See https://github.com/pirate/bookmark-archiver for help.')
raise SystemExit(1)
# parse chrome --version e.g. Google Chrome 61.0.3114.0 canary / Chromium 59.0.3029.110 built on Ubuntu, running on Ubuntu 16.04
try:
result = run([CHROME_BINARY, '--version'], stdout=PIPE)
version_str = result.stdout.decode('utf-8')
version_lines = re.sub("(Google Chrome|Chromium) (\\d+?)\\.(\\d+?)\\.(\\d+?).*?$", "\\2", version_str).split('\n')
version = [l for l in version_lines if l.isdigit()][-1]
if int(version) < 59:
print(version_lines)
print('{red}[X] Chrome version must be 59 or greater for headless PDF, screenshot, and DOM saving{reset}'.format(**ANSI))
print(' See https://github.com/pirate/bookmark-archiver for help.')
raise SystemExit(1)
except (IndexError, TypeError, OSError):
print('{red}[X] Failed to parse Chrome version, is it installed properly?{reset}'.format(**ANSI))
print(' Run ./setup.sh, then confirm it was installed with: {} --version'.format(CHROME_BINARY))
print(' See https://github.com/pirate/bookmark-archiver for help.')
raise SystemExit(1)
if FETCH_WGET:
if run(['which', 'wget'], stdout=DEVNULL).returncode or run(['wget', '--version'], stdout=DEVNULL).returncode:
print('{red}[X] Missing dependency: wget{reset}'.format(**ANSI))
print(' Run ./setup.sh, then confirm it was installed with: {} --version'.format('wget'))
print(' See https://github.com/pirate/bookmark-archiver for help.')
raise SystemExit(1)
if FETCH_FAVICON or SUBMIT_ARCHIVE_DOT_ORG:
if run(['which', 'curl'], stdout=DEVNULL).returncode or run(['curl', '--version'], stdout=DEVNULL).returncode:
print('{red}[X] Missing dependency: curl{reset}'.format(**ANSI))
print(' Run ./setup.sh, then confirm it was installed with: {} --version'.format('curl'))
print(' See https://github.com/pirate/bookmark-archiver for help.')
raise SystemExit(1)
if FETCH_AUDIO or FETCH_VIDEO:
if run(['which', 'youtube-dl'], stdout=DEVNULL).returncode or run(['youtube-dl', '--version'], stdout=DEVNULL).returncode:
print('{red}[X] Missing dependency: youtube-dl{reset}'.format(**ANSI))
print(' Run ./setup.sh, then confirm it was installed with: {} --version'.format('youtube-dl'))
print(' See https://github.com/pirate/bookmark-archiver for help.')
raise SystemExit(1)
def chmod_file(path, cwd='.', permissions=ARCHIVE_PERMISSIONS, timeout=30):
"""chmod -R <permissions> <cwd>/<path>"""
if not os.path.exists(os.path.join(cwd, path)):
raise Exception('Failed to chmod: {} does not exist (did the previous step fail?)'.format(path))
chmod_result = run(['chmod', '-R', permissions, path], cwd=cwd, stdout=DEVNULL, stderr=PIPE, timeout=timeout)
    if chmod_result.returncode != 0:
print(' ', chmod_result.stderr.decode())
raise Exception('Failed to chmod {}/{}'.format(cwd, path))
def progress(seconds=TIMEOUT, prefix=''):
"""Show a (subprocess-controlled) progress bar with a <seconds> timeout,
returns end() function to instantly finish the progress
"""
if not SHOW_PROGRESS:
return lambda: None
    chunk = '█' if sys.stdout.encoding == 'UTF-8' else '#'
chunks = TERM_WIDTH - len(prefix) - 20 # number of progress chunks to show (aka max bar width)
def progress_bar(seconds=seconds, prefix=prefix):
"""show timer in the form of progress bar, with percentage and seconds remaining"""
try:
for s in range(seconds * chunks):
progress = s / chunks / seconds * 100
bar_width = round(progress/(100/chunks))
# ████████████████████ 0.9% (1/60sec)
sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)'.format(
prefix,
ANSI['green'],
(chunk * bar_width).ljust(chunks),
ANSI['reset'],
round(progress, 1),
round(s/chunks),
seconds,
))
sys.stdout.flush()
time.sleep(1 / chunks)
# ██████████████████████████████████ 100.0% (60/60sec)
sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)\n'.format(
prefix,
ANSI['red'],
chunk * chunks,
ANSI['reset'],
100.0,
seconds,
seconds,
))
sys.stdout.flush()
        except KeyboardInterrupt:
            print()
p = Process(target=progress_bar)
p.start()
def end():
"""immediately finish progress and clear the progressbar line"""
p.terminate()
sys.stdout.write('\r{}{}\r'.format((' ' * TERM_WIDTH), ANSI['reset'])) # clear whole terminal line
sys.stdout.flush()
return end
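# typical usage (as in download_url below):
#   end = progress(TIMEOUT, prefix='    ')
#   ...do the slow work...
#   end()  # stop the bar and clear the line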
def pretty_path(path):
"""convert paths like .../bookmark-archiver/archiver/../output/abc into output/abc"""
return path.replace(REPO_DIR, '')
def download_url(url):
"""download a given url's content into downloads/domain.txt"""
if not os.path.exists(SOURCES_DIR):
os.makedirs(SOURCES_DIR)
ts = str(datetime.now().timestamp()).split('.', 1)[0]
source_path = os.path.join(SOURCES_DIR, '{}-{}.txt'.format(domain(url), ts))
print('[*] [{}] Downloading {} > {}'.format(
datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
url,
pretty_path(source_path),
))
end = progress(TIMEOUT, prefix=' ')
try:
        downloaded_xml = requests.get(url, timeout=TIMEOUT).content.decode()
end()
except Exception as e:
end()
print('[!] Failed to download {}\n'.format(url))
print(' ', e)
raise SystemExit(1)
with open(source_path, 'w', encoding='utf-8') as f:
f.write(downloaded_xml)
return source_path
def str_between(string, start, end=None):
"""(<abc>12345</def>, <abc>, </def>) -> 12345"""
content = string.split(start, 1)[-1]
if end is not None:
content = content.rsplit(end, 1)[0]
return content
def get_link_type(link):
"""Certain types of links need to be handled specially, this figures out when that's the case"""
if link['base_url'].endswith('.pdf'):
return 'PDF'
    elif link['base_url'].rsplit('.', 1)[-1].lower() in ('pdf', 'png', 'jpg', 'jpeg', 'svg', 'bmp', 'gif', 'tiff', 'webp'):
return 'image'
elif 'wikipedia.org' in link['domain']:
return 'wiki'
elif 'youtube.com' in link['domain']:
return 'youtube'
elif 'soundcloud.com' in link['domain']:
return 'soundcloud'
elif 'youku.com' in link['domain']:
return 'youku'
elif 'vimeo.com' in link['domain']:
return 'vimeo'
return None
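# e.g. get_link_type({'base_url': 'example.com/report.pdf', 'domain': 'example.com'}) -> 'PDF'
#      get_link_type({'base_url': 'youtube.com/watch', 'domain': 'youtube.com'})      -> 'youtube'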
def merge_links(a, b):
"""deterministially merge two links, favoring longer field values over shorter,
and "cleaner" values over worse ones.
"""
longer = lambda key: a[key] if len(a[key]) > len(b[key]) else b[key]
earlier = lambda key: a[key] if a[key] < b[key] else b[key]
url = longer('url')
longest_title = longer('title')
cleanest_title = a['title'] if '://' not in a['title'] else b['title']
link = {
'timestamp': earlier('timestamp'),
'url': url,
'domain': domain(url),
'base_url': base_url(url),
'tags': longer('tags'),
'title': longest_title if '://' not in longest_title else cleanest_title,
'sources': list(set(a.get('sources', []) + b.get('sources', []))),
}
link['type'] = get_link_type(link)
return link
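# e.g. merging two entries for the same page keeps the earlier timestamp, the longer tags,
# the longer title (unless it still contains '://', i.e. is just the raw url), and the union of sources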
def find_link(folder, links):
"""for a given archive folder, find the corresponding link object in links"""
url = parse_url(folder)
if url:
for link in links:
if (link['base_url'] in url) or (url in link['url']):
return link
timestamp = folder.split('.')[0]
for link in links:
if link['timestamp'].startswith(timestamp):
if link['domain'] in os.listdir(os.path.join(OUTPUT_DIR, 'archive', folder)):
                return link  # timestamp prefix + domain match: not guaranteed unique, but usually close enough
if link['domain'] in parse_url(folder):
return link
return None
def parse_url(folder):
"""for a given archive folder, figure out what url it's for"""
link_json = os.path.join(OUTPUT_DIR, 'archive', folder, 'index.json')
if os.path.exists(link_json):
with open(link_json, 'r') as f:
try:
                link_json_text = f.read().strip()
                if link_json_text:
                    link = json.loads(link_json_text)
                    return link['base_url']
            except ValueError:
                print('File contains invalid JSON: {}!'.format(link_json))
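    # fall back to the archive.org permalink, which embeds the original url after '/http'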
archive_org_txt = os.path.join(OUTPUT_DIR, 'archive', folder, 'archive.org.txt')
if os.path.exists(archive_org_txt):
with open(archive_org_txt, 'r') as f:
original_link = f.read().strip().split('/http', 1)[-1]
with_scheme = 'http{}'.format(original_link)
return with_scheme
return ''
def manually_merge_folders(source, target):
"""prompt for user input to resolve a conflict between two archive folders"""
if not IS_TTY:
return
fname = lambda path: path.split('/')[-1]
print(' {} and {} have conflicting files, which do you want to keep?'.format(fname(source), fname(target)))
print(' - [enter]: do nothing (keep both)')
print(' - a: prefer files from {}'.format(source))
print(' - b: prefer files from {}'.format(target))
print(' - q: quit and resolve the conflict manually')
try:
answer = input('> ').strip().lower()
except KeyboardInterrupt:
answer = 'q'
assert answer in ('', 'a', 'b', 'q'), 'Invalid choice.'
if answer == 'q':
print('\nJust run Bookmark Archiver again to pick up where you left off.')
raise SystemExit(0)
elif answer == '':
return
files_in_source = set(os.listdir(source))
files_in_target = set(os.listdir(target))
    for file in files_in_source:
        if file in files_in_target:
            to_delete = target if answer == 'a' else source
            run(['rm', '-Rf', os.path.join(to_delete, file)])
        if os.path.exists(os.path.join(source, file)):
            run(['mv', os.path.join(source, file), os.path.join(target, file)])
if not set(os.listdir(source)):
run(['rm', '-Rf', source])
def fix_folder_path(archive_path, link_folder, link):
"""given a folder, merge it to the canonical 'correct' path for the given link object"""
source = os.path.join(archive_path, link_folder)
target = os.path.join(archive_path, link['timestamp'])
url_in_folder = parse_url(source)
if not (url_in_folder in link['base_url']
or link['base_url'] in url_in_folder):
raise ValueError('The link does not match the url for this folder.')
if not os.path.exists(target):
# target doesn't exist so nothing needs merging, simply move A to B
run(['mv', source, target])
else:
# target folder exists, check for conflicting files and attempt manual merge
files_in_source = set(os.listdir(source))
files_in_target = set(os.listdir(target))
conflicting_files = files_in_source & files_in_target
if not conflicting_files:
for file in files_in_source:
run(['mv', os.path.join(source, file), os.path.join(target, file)])
if os.path.exists(source):
files_in_source = set(os.listdir(source))
if files_in_source:
manually_merge_folders(source, target)
else:
run(['rm', '-R', source])
def cleanup_archive(archive_path, links):
"""move any incorrectly named folders to their canonical locations"""
# for each folder that exists, see if we can match it up with a known good link
# if we can, then merge the two folders (TODO: if not, move it to lost & found)
unmatched = []
bad_folders = []
if not os.path.exists(archive_path):
return
for folder in os.listdir(archive_path):
try:
files = os.listdir(os.path.join(archive_path, folder))
except NotADirectoryError:
continue
if files:
link = find_link(folder, links)
if link is None:
unmatched.append(folder)
continue
if folder != link['timestamp']:
bad_folders.append((folder, link))
else:
# delete empty folders
run(['rm', '-R', os.path.join(archive_path, folder)])
if bad_folders and IS_TTY and input('[!] Cleanup archive? y/[n]: ') == 'y':
print('[!] Fixing {} improperly named folders in archive...'.format(len(bad_folders)))
for folder, link in bad_folders:
fix_folder_path(archive_path, folder, link)
elif bad_folders:
        print('[!] Warning! {} folders need to be merged, run Bookmark Archiver again to fix them.'.format(len(bad_folders)))
if unmatched:
print('[!] Warning! {} unrecognized folders in html/archive/'.format(len(unmatched)))
print(' '+ '\n '.join(unmatched))
def wget_output_path(link, look_in=None):
"""calculate the path to the wgetted .html file, since wget may
adjust some paths to be different than the base_url path.
See docs on wget --adjust-extension (-E)
"""
urlencode = lambda s: quote(s, encoding='utf-8', errors='replace')
if link['type'] in ('PDF', 'image'):
return urlencode(link['base_url'])
    # Since the wget algorithm for -E (appending .html) is incredibly complex,
    # instead of trying to emulate it here we just look in the output folder
    # to see what html file wget actually created
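    # e.g. for base_url 'example.com/docs/page?id=1' we look in
    # <OUTPUT_DIR>/archive/<timestamp>/example.com/docs/ for whatever .html file wget wrote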
wget_folder = link['base_url'].rsplit('/', 1)[0].split('/')
    look_in = look_in or os.path.join(OUTPUT_DIR, 'archive', link['timestamp'], *wget_folder)
if look_in and os.path.exists(look_in):
html_files = [
f for f in os.listdir(look_in)
if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", f, re.I | re.M)
]
if html_files:
return urlencode(os.path.join(*wget_folder, html_files[0]))
return None
# If finding the actual output file didn't work, fall back to the buggy
# implementation of the wget .html appending algorithm
# split_url = link['url'].split('#', 1)
# query = ('%3F' + link['url'].split('?', 1)[-1]) if '?' in link['url'] else ''
# if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", split_url[0], re.I | re.M):
# # already ends in .html
# return urlencode(link['base_url'])
# else:
# # .html needs to be appended
# without_scheme = split_url[0].split('://', 1)[-1].split('?', 1)[0]
# if without_scheme.endswith('/'):
# if query:
# return urlencode('#'.join([without_scheme + 'index.html' + query + '.html', *split_url[1:]]))
# return urlencode('#'.join([without_scheme + 'index.html', *split_url[1:]]))
# else:
# if query:
# return urlencode('#'.join([without_scheme + '/index.html' + query + '.html', *split_url[1:]]))
# elif '/' in without_scheme:
# return urlencode('#'.join([without_scheme + '.html', *split_url[1:]]))
# return urlencode(link['base_url'] + '/index.html')
def derived_link_info(link):
"""extend link info with the archive urls and other derived data"""
link_info = {
**link,
'date': datetime.fromtimestamp(float(link['timestamp'])).strftime('%Y-%m-%d %H:%M'),
'google_favicon_url': 'https://www.google.com/s2/favicons?domain={domain}'.format(**link),
'favicon_url': 'archive/{timestamp}/favicon.ico'.format(**link),
'files_url': 'archive/{timestamp}/index.html'.format(**link),
'archive_url': 'archive/{}/{}'.format(link['timestamp'], wget_output_path(link)),
'pdf_link': 'archive/{timestamp}/output.pdf'.format(**link),
'screenshot_link': 'archive/{timestamp}/screenshot.png'.format(**link),
'dom_link': 'archive/{timestamp}/output.html'.format(**link),
'archive_org_url': 'https://web.archive.org/web/{base_url}'.format(**link),
}
# PDF and images are handled slightly differently
# wget, screenshot, & pdf urls all point to the same file
if link['type'] in ('PDF', 'image'):
link_info.update({
'archive_url': 'archive/{timestamp}/{base_url}'.format(**link),
'pdf_link': 'archive/{timestamp}/{base_url}'.format(**link),
'screenshot_link': 'archive/{timestamp}/{base_url}'.format(**link),
'dom_link': 'archive/{timestamp}/{base_url}'.format(**link),
'title': '{title} ({type})'.format(**link),
})
return link_info