diff --git a/.gitignore b/.gitignore index 12d26357..790b36d5 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,8 @@ pocket/ bookmarks/ pinboard/ +html/ +downloads/ # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/README.md b/README.md index a0c3f9f2..71359ad4 100644 --- a/README.md +++ b/README.md @@ -174,7 +174,7 @@ My published archive as an example: [sweeting.me/pocket](https://home.sweeting.m If you don't like running random setup scripts off the internet (:+1:), you can follow these manual setup instructions. -**1. Install dependencies:** `chromium >= 59`,` wget >= 1.16`, `python3 >= 3.5` (google-chrome >= v59 also works well) +**1. Install dependencies:** `chromium >= 59`,` wget >= 1.16`, `python3 >= 3.5` (`google-chrome >= v59` works fine as well) If you already have Google Chrome installed, or wish to use that instead of Chromium, follow the [Google Chrome Instructions](#google-chrome-instructions). diff --git a/archive.py b/archive.py index 9e85a645..0c71e7dc 100755 --- a/archive.py +++ b/archive.py @@ -3,74 +3,79 @@ # Nick Sweeting 2017 | MIT License # https://github.com/pirate/bookmark-archiver -import os import sys from datetime import datetime +from links import validate_links from parse import parse_export -from index import dump_index -from fetch import dump_website +from archive_methods import archive_links, _RESULTS_TOTALS +from index import ( + write_links_index, + write_link_index, + parse_json_links_index, + parse_json_link_index, +) from config import ( ARCHIVE_PERMISSIONS, - ARCHIVE_DIR, + HTML_FOLDER, + ARCHIVE_FOLDER, ANSI, + TIMEOUT, +) +from util import ( + download_url, check_dependencies, + progress, ) DESCRIPTION = 'Bookmark Archiver: Create a browsable html archive of a list of links.' __DOCUMENTATION__ = 'https://github.com/pirate/bookmark-archiver' -def create_archive(export_file, service=None, resume=None): + +def update_archive(export_path, resume=None, append=True): """update or create index.html and download archive of all links""" - print('[*] [{}] Starting archive from {} export file.'.format( + start_ts = datetime.now().timestamp() + + # parse an validate the export file + new_links = validate_links(parse_export(export_path)) + + # load existing links if archive folder is present + if append: + existing_links = parse_json_links_index(HTML_FOLDER) + links = validate_links(existing_links + new_links) + else: + existing_links = [] + + # merge existing links and new links + num_new_links = len(links) - len(existing_links) + print('[*] [{}] Adding {} new links from {} to index'.format( datetime.now().strftime('%Y-%m-%d %H:%M:%S'), - export_file, + num_new_links, + export_path, )) - with open(export_file, 'r', encoding='utf-8') as f: - links, service = parse_export(f, service=service) + # write link index html & json + write_links_index(HTML_FOLDER, links) - if resume: - try: - links = [ - link - for link in links - if float(link['timestamp']) >= float(resume) - ] - except TypeError: - print('Resume value and all timestamp values must be valid numbers.') + # loop over links and archive them + archive_links(ARCHIVE_FOLDER, links, export_path, resume=resume) - if not links or not service: - print('[X] No links found in {}, is it a {} export file?'.format(export_file, service)) - raise SystemExit(1) - - if not os.path.exists(os.path.join(ARCHIVE_DIR, service)): - os.makedirs(os.path.join(ARCHIVE_DIR, service)) - - if not os.path.exists(os.path.join(ARCHIVE_DIR, service, 'archive')): - os.makedirs(os.path.join(ARCHIVE_DIR, service, 
'archive')) - - dump_index(links, service) - check_dependencies() - try: - for link in links: - dump_website(link, service) - except (KeyboardInterrupt, SystemExit, Exception) as e: - print('{red}[X] Archive creation stopped.{reset}'.format(**ANSI)) - print(' Continue where you left off by running:') - print(' ./archive.py {} {} {}'.format( - export_file, - service, - link['timestamp'], - )) - if not isinstance(e, KeyboardInterrupt): - raise e - raise SystemExit(1) - - print('{}[√] [{}] Archive update complete.{}'.format(ANSI['green'], datetime.now().strftime('%Y-%m-%d %H:%M:%S'), ANSI['reset'])) + # print timing information & summary + end_ts = datetime.now().timestamp() + seconds = round(end_ts - start_ts, 1) + duration = '{} min'.format(seconds / 60) if seconds > 60 else '{} sec'.format(seconds) + print('{}[√] [{}] Archive update complete ({}){}'.format( + ANSI['green'], + datetime.now().strftime('%Y-%m-%d %H:%M:%S'), + duration, + ANSI['reset'], + )) + print(' - {} skipped'.format(_RESULTS_TOTALS['skipped'])) + print(' - {} updates'.format(_RESULTS_TOTALS['succeded'])) + print(' - {} errors'.format(_RESULTS_TOTALS['failed'])) if __name__ == '__main__': @@ -85,8 +90,10 @@ if __name__ == '__main__': print("") raise SystemExit(0) - export_file = sys.argv[1] # path to export file - export_type = sys.argv[2] if argc > 2 else None # select export_type for file format select - resume_from = sys.argv[3] if argc > 3 else None # timestamp to resume dowloading from + export_path = sys.argv[1] # path to export file + resume_from = sys.argv[2] if argc > 2 else None # timestamp to resume dowloading from - create_archive(export_file, service=export_type, resume=resume_from) + if any(export_path.startswith(s) for s in ('http://', 'https://', 'ftp://')): + export_path = download_url(export_path) + + update_archive(export_path, resume=resume_from) diff --git a/archive_methods.py b/archive_methods.py new file mode 100644 index 00000000..adec0634 --- /dev/null +++ b/archive_methods.py @@ -0,0 +1,410 @@ +import os + +from functools import wraps +from datetime import datetime +from subprocess import run, PIPE, DEVNULL + +from index import html_appended_url, parse_json_link_index, write_link_index +from links import links_after_timestamp +from config import ( + ARCHIVE_PERMISSIONS, + ARCHIVE_DIR, + CHROME_BINARY, + FETCH_WGET, + FETCH_WGET_REQUISITES, + FETCH_PDF, + FETCH_SCREENSHOT, + RESOLUTION, + SUBMIT_ARCHIVE_DOT_ORG, + FETCH_AUDIO, + FETCH_VIDEO, + FETCH_FAVICON, + WGET_USER_AGENT, + TIMEOUT, + ANSI, +) +from util import ( + check_dependencies, + progress, + chmod_file, +) + +_RESULTS_TOTALS = { + 'skipped': 0, + 'succeded': 0, + 'failed': 0, +} + +def attach_result_to_link(method): + def decorator(fetch_func): + @wraps(fetch_func) + def timed_fetch_func(out_dir, link, overwrite=False, **kwargs): + # initialize methods and history json field on link + link['methods'] = link.get('methods') or {} + link['methods'][method] = link['methods'].get(method) or None + link['history'] = link.get('history') or {} + link['history'][method] = link['history'].get(method) or [] + + start_ts = datetime.now().timestamp() + + # if a valid method output is already present, dont run the fetch function + if link['methods'][method] and not overwrite: + print(' √ Skipping: {}'.format(method)) + result = None + else: + print(' - Fetching: {}'.format(method)) + result = fetch_func(out_dir, link, **kwargs) + + end_ts = datetime.now().timestamp() + duration = str(end_ts * 1000 - start_ts * 1000).split('.')[0] + + # append a 
history item recording fail/success + history_entry = { + 'timestamp': str(start_ts).split('.')[0], + } + if result is None: + history_entry['status'] = 'skipped' + elif isinstance(result.get('output'), Exception): + history_entry['status'] = 'failed' + history_entry['duration'] = duration + history_entry.update(result or {}) + link['history'][method].append(history_entry) + else: + history_entry['status'] = 'succeded' + history_entry['duration'] = duration + history_entry.update(result or {}) + link['history'][method].append(history_entry) + link['methods'][method] = result['output'] + + _RESULTS_TOTALS[history_entry['status']] += 1 + + return link + return timed_fetch_func + return decorator + + +@attach_result_to_link('wget') +def fetch_wget(out_dir, link, requisites=FETCH_WGET_REQUISITES, timeout=TIMEOUT): + """download full site using wget""" + + if os.path.exists(os.path.join(out_dir, link['domain'])): + return {'output': html_appended_url(link), 'status': 'skipped'} + + CMD = [ + *'wget --timestamping --adjust-extension --no-parent'.split(' '), # Docs: https://www.gnu.org/software/wget/manual/wget.html + *(('--page-requisites', '--convert-links') if FETCH_WGET_REQUISITES else ()), + *(('--user-agent="{}"'.format(WGET_USER_AGENT),) if WGET_USER_AGENT else ()), + link['url'], + ] + end = progress(timeout, prefix=' ') + try: + result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout + 1) # index.html + end() + output = html_appended_url(link) + if result.returncode > 0: + print(' got wget response code {}:'.format(result.returncode)) + print('\n'.join(' ' + line for line in result.stderr.decode().rsplit('\n', 10)[-10:] if line.strip())) + # raise Exception('Failed to wget download') + chmod_file(link['domain'], cwd=out_dir) + except Exception as e: + end() + print(' Run to see full output:', 'cd {}; {}'.format(out_dir, ' '.join(CMD))) + print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset'])) + output = e + + return { + 'cmd': CMD, + 'output': output, + } + + +@attach_result_to_link('pdf') +def fetch_pdf(out_dir, link, timeout=TIMEOUT): + """print PDF of site to file using chrome --headless""" + + if link['type'] in ('PDF', 'image'): + return {'output': html_appended_url(link)} + + if os.path.exists(os.path.join(out_dir, 'output.pdf')): + return {'output': 'output.pdf', 'status': 'skipped'} + + CMD = [ + CHROME_BINARY, + *'--headless --disable-gpu --print-to-pdf'.split(' '), + link['url'] + ] + end = progress(timeout, prefix=' ') + try: + result = run(CMD, stdout=DEVNULL, stderr=PIPE, cwd=out_dir, timeout=timeout + 1) # output.pdf + end() + if result.returncode: + print(' ', result.stderr.decode()) + raise Exception('Failed to print PDF') + chmod_file('output.pdf', cwd=out_dir) + output = 'output.pdf' + except Exception as e: + end() + print(' Run to see full output:', 'cd {}; {}'.format(out_dir, ' '.join(CMD))) + print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset'])) + output = e + + return { + 'cmd': CMD, + 'output': output, + } + + +@attach_result_to_link('screenshot') +def fetch_screenshot(out_dir, link, timeout=TIMEOUT, resolution=RESOLUTION): + """take screenshot of site using chrome --headless""" + + if link['type'] in ('PDF', 'image'): + return {'output': html_appended_url(link)} + + if os.path.exists(os.path.join(out_dir, 'screenshot.png')): + return {'output': 'screenshot.png', 'status': 'skipped'} + + CMD = [ + CHROME_BINARY, + *'--headless --disable-gpu --screenshot'.split(' '), + 
'--window-size={}'.format(resolution), + link['url'] + ] + end = progress(timeout, prefix=' ') + try: + result = run(CMD, stdout=DEVNULL, stderr=DEVNULL, cwd=out_dir, timeout=timeout + 1) # sreenshot.png + end() + if result.returncode: + print(' ', result.stderr.decode()) + raise Exception('Failed to take screenshot') + chmod_file('screenshot.png', cwd=out_dir) + output = 'screenshot.png' + except Exception as e: + end() + print(' Run to see full output:', 'cd {}; {}'.format(out_dir, ' '.join(CMD))) + print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset'])) + output = e + + return { + 'cmd': CMD, + 'output': output, + } + + +@attach_result_to_link('archive_org') +def archive_dot_org(out_dir, link, timeout=TIMEOUT): + """submit site to archive.org for archiving via their service, save returned archive url""" + + path = os.path.join(out_dir, 'archive.org.txt') + if os.path.exists(path): + archive_org_url = open(path, 'r').read().strip() + return {'output': archive_org_url, 'status': 'skipped'} + + submit_url = 'https://web.archive.org/save/{}'.format(link['url'].split('?', 1)[0]) + + success = False + CMD = ['curl', '-I', submit_url] + end = progress(timeout, prefix=' ') + try: + result = run(CMD, stdout=PIPE, stderr=DEVNULL, cwd=out_dir, timeout=timeout + 1) # archive.org.txt + end() + + # Parse archive.org response headers + headers = result.stdout.splitlines() + content_location = [h for h in headers if b'Content-Location: ' in h] + errors = [h for h in headers if b'X-Archive-Wayback-Runtime-Error: ' in h] + + if content_location: + archive_path = content_location[0].split(b'Content-Location: ', 1)[-1].decode('utf-8') + saved_url = 'https://web.archive.org{}'.format(archive_path) + success = True + + elif len(errors) == 1 and b'RobotAccessControlException' in errors[0]: + output = submit_url + # raise Exception('Archive.org denied by {}/robots.txt'.format(link['domain'])) + elif errors: + raise Exception(', '.join(e.decode() for e in errors)) + else: + raise Exception('Failed to find "Content-Location" URL header in Archive.org response.') + except Exception as e: + end() + print(' Visit url to see output:', ' '.join(CMD)) + print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset'])) + output = e + + if success: + with open(os.path.join(out_dir, 'archive.org.txt'), 'w', encoding='utf-8') as f: + f.write(saved_url) + chmod_file('archive.org.txt', cwd=out_dir) + output = saved_url + + return { + 'cmd': CMD, + 'output': output, + } + +@attach_result_to_link('favicon') +def fetch_favicon(out_dir, link, timeout=TIMEOUT): + """download site favicon from google's favicon api""" + + if os.path.exists(os.path.join(out_dir, 'favicon.ico')): + return {'output': 'favicon.ico', 'status': 'skipped'} + + CMD = ['curl', 'https://www.google.com/s2/favicons?domain={domain}'.format(**link)] + fout = open('{}/favicon.ico'.format(out_dir), 'w') + end = progress(timeout, prefix=' ') + try: + run(CMD, stdout=fout, stderr=DEVNULL, cwd=out_dir, timeout=timeout + 1) # favicon.ico + fout.close() + end() + chmod_file('favicon.ico', cwd=out_dir) + output = 'favicon.ico' + except Exception as e: + fout.close() + end() + print(' Run to see full output:', ' '.join(CMD)) + print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset'])) + output = e + + return { + 'cmd': CMD, + 'output': output, + } + +# @attach_result_to_link('audio') +# def fetch_audio(out_dir, link, timeout=TIMEOUT): +# """Download audio rip using youtube-dl""" + +# if 
link['type'] not in ('soundcloud',)\ +# and 'audio' not in link['tags']: +# return + +# path = os.path.join(out_dir, 'audio') + +# if not os.path.exists(path) or overwrite: +# print(' - Downloading audio') +# CMD = [ +# "youtube-dl -x --audio-format mp3 --audio-quality 0 -o '%(title)s.%(ext)s'", +# link['url'], +# ] +# end = progress(timeout, prefix=' ') +# try: +# result = run(CMD, stdout=DEVNULL, stderr=DEVNULL, cwd=out_dir, timeout=timeout + 1) # audio/audio.mp3 +# end() +# if result.returncode: +# print(' ', result.stderr.decode()) +# raise Exception('Failed to download audio') +# chmod_file('audio.mp3', cwd=out_dir) +# return 'audio.mp3' +# except Exception as e: +# end() +# print(' Run to see full output:', 'cd {}; {}'.format(out_dir, ' '.join(CMD))) +# print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset'])) +# raise +# else: +# print(' √ Skipping audio download') + +# @attach_result_to_link('video') +# def fetch_video(out_dir, link, timeout=TIMEOUT): +# """Download video rip using youtube-dl""" + +# if link['type'] not in ('youtube', 'youku', 'vimeo')\ +# and 'video' not in link['tags']: +# return + +# path = os.path.join(out_dir, 'video') + +# if not os.path.exists(path) or overwrite: +# print(' - Downloading video') +# CMD = [ +# "youtube-dl -x --video-format mp4 --audio-quality 0 -o '%(title)s.%(ext)s'", +# link['url'], +# ] +# end = progress(timeout, prefix=' ') +# try: +# result = run(CMD, stdout=DEVNULL, stderr=DEVNULL, cwd=out_dir, timeout=timeout + 1) # video/movie.mp4 +# end() +# if result.returncode: +# print(' ', result.stderr.decode()) +# raise Exception('Failed to download video') +# chmod_file('video.mp4', cwd=out_dir) +# return 'video.mp4' +# except Exception as e: +# end() +# print(' Run to see full output:', 'cd {}; {}'.format(out_dir, ' '.join(CMD))) +# print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset'])) +# raise +# else: +# print(' √ Skipping video download') + + +def archive_links(out_dir, links, export_path, resume=None): + check_dependencies() + + to_archive = links_after_timestamp(links, resume) + try: + for idx, link in enumerate(to_archive): + out_dir = os.path.join(out_dir, link['timestamp']) + archive_link(out_dir, link) + + except (KeyboardInterrupt, SystemExit, Exception) as e: + print('{red}[X] Archive update stopped on #{idx} out of {total} links{reset}'.format( + **ANSI, + idx=idx, + total=len(list(to_archive)), + )) + print(' Continue where you left off by running:') + print(' ./archive.py {} {}'.format( + export_path, + link['timestamp'], + )) + if not isinstance(e, KeyboardInterrupt): + raise e + raise SystemExit(1) + + +def archive_link(out_dir, link, overwrite=False, permissions=ARCHIVE_PERMISSIONS): + """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp""" + + link = {**parse_json_link_index(out_dir), **link} + log_link_archive(out_dir, link) + + if FETCH_WGET: + link = fetch_wget(out_dir, link, overwrite=overwrite) + + if FETCH_PDF: + link = fetch_pdf(out_dir, link, overwrite=overwrite) + + if FETCH_SCREENSHOT: + link = fetch_screenshot(out_dir, link, overwrite=overwrite) + + if SUBMIT_ARCHIVE_DOT_ORG: + link = archive_dot_org(out_dir, link, overwrite=overwrite) + + # if FETCH_AUDIO: + # link = fetch_audio(out_dir, link, overwrite=overwrite) + + # if FETCH_VIDEO: + # link = fetch_video(out_dir, link, overwrite=overwrite) + + if FETCH_FAVICON: + link = fetch_favicon(out_dir, link, overwrite=overwrite) + + write_link_index(out_dir, link) + + 
return link + +def log_link_archive(out_dir, link): + update_existing = os.path.exists(out_dir) + if not update_existing: + os.makedirs(out_dir) + run(['chmod', ARCHIVE_PERMISSIONS, out_dir], timeout=5) + + print('[{symbol_color}{symbol}{reset}] [{timestamp}] "{title}": {blue}{base_url}{reset}'.format( + symbol='*' if update_existing else '+', + symbol_color=ANSI['black' if update_existing else 'green'], + **link, + **ANSI, + )) + if link['type']: + print(' i Type: {}'.format(link['type'])) diff --git a/config.py b/config.py index c5721f3b..bca95d4b 100644 --- a/config.py +++ b/config.py @@ -1,10 +1,8 @@ import os import sys -import time import shutil -from subprocess import run, PIPE, DEVNULL -from multiprocessing import Process +from subprocess import run, PIPE # os.getenv('VARIABLE', 'DEFAULT') gets the value of environment # variable "VARIABLE" and if it is not set, sets it to 'DEFAULT' @@ -12,8 +10,10 @@ from multiprocessing import Process # for boolean values, check to see if the string is 'true', and # if so, the python variable will be True -IS_TTY = sys.stdout.isatty() +# ******************************************************************************* +# *** TO SET YOUR PREFERENCES, EDIT THE VALUES HERE, or use the 'env' command *** +IS_TTY = sys.stdout.isatty() USE_COLOR = os.getenv('USE_COLOR', str(IS_TTY) ).lower() == 'true' SHOW_PROGRESS = os.getenv('SHOW_PROGRESS', str(IS_TTY) ).lower() == 'true' FETCH_WGET = os.getenv('FETCH_WGET', 'True' ).lower() == 'true' @@ -31,9 +31,12 @@ CHROME_BINARY = os.getenv('CHROME_BINARY', 'chromium-browser' WGET_BINARY = os.getenv('WGET_BINARY', 'wget' ) WGET_USER_AGENT = os.getenv('WGET_USER_AGENT', None) TIMEOUT = int(os.getenv('TIMEOUT', '60')) +LINK_INDEX_TEMPLATE = os.getenv('LINK_INDEX_TEMPLATE', 'templates/link_index.html') INDEX_TEMPLATE = os.getenv('INDEX_TEMPLATE', 'templates/index.html') INDEX_ROW_TEMPLATE = os.getenv('INDEX_ROW_TEMPLATE', 'templates/index_row.html') +# ******************************************************************************* + TERM_WIDTH = shutil.get_terminal_size((100, 10)).columns ANSI = { 'reset': '\033[00;00m', @@ -50,6 +53,17 @@ if not USE_COLOR: # dont show colors if USE_COLOR is False ANSI = {k: '' for k in ANSI.keys()} + +ROOT_FOLDER = os.path.dirname(os.path.abspath(__file__)) +HTML_FOLDER = os.path.join(ARCHIVE_DIR, 'html') +ARCHIVE_FOLDER = os.path.join(HTML_FOLDER, 'archive') +try: + GIT_SHA = run(["git", "rev-list", "-1", "HEAD", "./"], stdout=PIPE, cwd=ROOT_FOLDER).stdout.strip().decode() +except Exception: + GIT_SHA = None + print('[!] 
Warning, you need git installed for some archiving features to save correct version numbers!') + + if sys.stdout.encoding.upper() != 'UTF-8': print('[X] Your system is running python3 scripts with a bad locale setting: {} (it should be UTF-8).'.format(sys.stdout.encoding)) print(' To fix it, add the line "export PYTHONIOENCODING=utf8" to your ~/.bashrc file (without quotes)') @@ -59,128 +73,3 @@ if sys.stdout.encoding.upper() != 'UTF-8': print('') print(' Alternatively, run this script with:') print(' env PYTHONIOENCODING=utf8 ./archive.py export.html') - -### Util Functions - -def check_dependencies(): - """Check that all necessary dependencies are installed, and have valid versions""" - - print('[*] Checking Dependencies:') - - python_vers = float('{}.{}'.format(sys.version_info.major, sys.version_info.minor)) - if python_vers < 3.5: - print('{}[X] Python version is not new enough: {} (>3.5 is required){}'.format(ANSI['red'], python_vers, ANSI['reset'])) - print(' See https://github.com/pirate/bookmark-archiver#troubleshooting for help upgrading your Python installation.') - raise SystemExit(1) - - if FETCH_PDF or FETCH_SCREENSHOT: - if run(['which', CHROME_BINARY]).returncode: - print('{}[X] Missing dependency: {}{}'.format(ANSI['red'], CHROME_BINARY, ANSI['reset'])) - print(' Run ./setup.sh, then confirm it was installed with: {} --version'.format(CHROME_BINARY)) - print(' See https://github.com/pirate/bookmark-archiver for help.') - raise SystemExit(1) - - # parse chrome --version e.g. Google Chrome 61.0.3114.0 canary / Chromium 59.0.3029.110 built on Ubuntu, running on Ubuntu 16.04 - try: - result = run([CHROME_BINARY, '--version'], stdout=PIPE) - version = result.stdout.decode('utf-8').replace('Google Chrome ', '').replace('Chromium ', '').split(' ', 1)[0].split('.', 1)[0] # TODO: regex might be better - if int(version) < 59: - print('{red}[X] Chrome version must be 59 or greater for headless PDF and screenshot saving{reset}'.format(**ANSI)) - print(' See https://github.com/pirate/bookmark-archiver for help.') - raise SystemExit(1) - except (TypeError, OSError): - print('{red}[X] Failed to parse Chrome version, is it installed properly?{reset}'.format(**ANSI)) - print(' Run ./setup.sh, then confirm it was installed with: {} --version'.format(CHROME_BINARY)) - print(' See https://github.com/pirate/bookmark-archiver for help.') - raise SystemExit(1) - - if FETCH_WGET: - if run(['which', 'wget']).returncode or run(['wget', '--version'], stdout=DEVNULL).returncode: - print('{red}[X] Missing dependency: wget{reset}'.format(**ANSI)) - print(' Run ./setup.sh, then confirm it was installed with: {} --version'.format('wget')) - print(' See https://github.com/pirate/bookmark-archiver for help.') - raise SystemExit(1) - - if FETCH_FAVICON or SUBMIT_ARCHIVE_DOT_ORG: - if run(['which', 'curl']).returncode or run(['curl', '--version'], stdout=DEVNULL).returncode: - print('{red}[X] Missing dependency: curl{reset}'.format(**ANSI)) - print(' Run ./setup.sh, then confirm it was installed with: {} --version'.format('curl')) - print(' See https://github.com/pirate/bookmark-archiver for help.') - raise SystemExit(1) - - if FETCH_AUDIO or FETCH_VIDEO: - if run(['which', 'youtube-dl']).returncode or run(['youtube-dl', '--version'], stdout=DEVNULL).returncode: - print('{red}[X] Missing dependency: youtube-dl{reset}'.format(**ANSI)) - print(' Run ./setup.sh, then confirm it was installed with: {} --version'.format('youtube-dl')) - print(' See https://github.com/pirate/bookmark-archiver for help.') - raise 
SystemExit(1) - - -def chmod_file(path, cwd='.', permissions=ARCHIVE_PERMISSIONS, timeout=30): - """chmod -R /""" - - if not os.path.exists(os.path.join(cwd, path)): - raise Exception('Failed to chmod: {} does not exist (did the previous step fail?)'.format(path)) - - chmod_result = run(['chmod', '-R', permissions, path], cwd=cwd, stdout=DEVNULL, stderr=PIPE, timeout=timeout) - if chmod_result.returncode == 1: - print(' ', chmod_result.stderr.decode()) - raise Exception('Failed to chmod {}/{}'.format(cwd, path)) - - -def progress(seconds=TIMEOUT, prefix=''): - """Show a (subprocess-controlled) progress bar with a timeout, - returns end() function to instantly finish the progress - """ - - if not SHOW_PROGRESS: - return lambda: None - - chunk = '█' if sys.stdout.encoding == 'UTF-8' else '#' - chunks = TERM_WIDTH - len(prefix) - 20 # number of progress chunks to show (aka max bar width) - - def progress_bar(seconds=seconds, prefix=prefix): - """show timer in the form of progress bar, with percentage and seconds remaining""" - try: - for s in range(seconds * chunks): - progress = s / chunks / seconds * 100 - bar_width = round(progress/(100/chunks)) - - # ████████████████████ 0.9% (1/60sec) - sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)'.format( - prefix, - ANSI['green'], - (chunk * bar_width).ljust(chunks), - ANSI['reset'], - round(progress, 1), - round(s/chunks), - seconds, - )) - sys.stdout.flush() - time.sleep(1 / chunks) - - # ██████████████████████████████████ 100.0% (60/60sec) - sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)\n'.format( - prefix, - ANSI['red'], - chunk * chunks, - ANSI['reset'], - 100.0, - seconds, - seconds, - )) - sys.stdout.flush() - except KeyboardInterrupt: - print() - pass - - p = Process(target=progress_bar) - p.start() - - def end(): - """immediately finish progress and clear the progressbar line""" - p.terminate() - sys.stdout.write('\r{}{}\r'.format((' ' * TERM_WIDTH), ANSI['reset'])) # clear whole terminal line - sys.stdout.flush() - - return end diff --git a/fetch.py b/fetch.py deleted file mode 100644 index 1b944aa2..00000000 --- a/fetch.py +++ /dev/null @@ -1,299 +0,0 @@ -import os -import json - -from datetime import datetime -from subprocess import run, PIPE, DEVNULL - -from parse import derived_link_info -from config import ( - ARCHIVE_PERMISSIONS, - ARCHIVE_DIR, - CHROME_BINARY, - FETCH_WGET, - FETCH_WGET_REQUISITES, - FETCH_PDF, - FETCH_SCREENSHOT, - RESOLUTION, - SUBMIT_ARCHIVE_DOT_ORG, - FETCH_AUDIO, - FETCH_VIDEO, - FETCH_FAVICON, - WGET_USER_AGENT, - TIMEOUT, - ANSI, - progress, - chmod_file, -) - - -def fetch_wget(out_dir, link, overwrite=False, requisites=True, timeout=TIMEOUT): - """download full site using wget""" - - if not os.path.exists(os.path.join(out_dir, link['domain'])) or overwrite: - print(' - Downloading full site') - CMD = [ - *'wget --timestamping --adjust-extension --no-parent'.split(' '), # Docs: https://www.gnu.org/software/wget/manual/wget.html - *(('--page-requisites', '--convert-links') if requisites else ()), - *(('--user-agent="{}"'.format(WGET_USER_AGENT),) if WGET_USER_AGENT else ()), - link['url'], - ] - end = progress(timeout, prefix=' ') - try: - result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout + 1) # index.html - end() - if result.returncode > 0: - print(' wget output:') - print('\n'.join(' ' + line for line in result.stderr.decode().rsplit('\n', 10)[-10:] if line.strip())) - raise Exception('Failed to wget download') - chmod_file(link['domain'], cwd=out_dir) - except Exception as e: 
- end() - print(' Run to see full output:', 'cd {}; {}'.format(out_dir, ' '.join(CMD))) - print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset'])) - else: - print(' √ Skipping site download') - -def fetch_pdf(out_dir, link, overwrite=False, timeout=TIMEOUT, chrome_binary=CHROME_BINARY): - """print PDF of site to file using chrome --headless""" - - path = os.path.join(out_dir, 'output.pdf') - - if (not os.path.exists(path) or overwrite) and link['type'] not in ('PDF', 'image'): - print(' - Printing PDF') - CMD = [ - chrome_binary, - *'--headless --disable-gpu --print-to-pdf'.split(' '), - link['url'] - ] - end = progress(timeout, prefix=' ') - try: - result = run(CMD, stdout=DEVNULL, stderr=PIPE, cwd=out_dir, timeout=timeout + 1) # output.pdf - end() - if result.returncode: - print(' ', result.stderr.decode()) - raise Exception('Failed to print PDF') - chmod_file('output.pdf', cwd=out_dir) - except Exception as e: - end() - print(' Run to see full output:', 'cd {}; {}'.format(out_dir, ' '.join(CMD))) - print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset'])) - else: - print(' √ Skipping PDF print') - -def fetch_screenshot(out_dir, link, overwrite=False, timeout=TIMEOUT, chrome_binary=CHROME_BINARY, resolution=RESOLUTION): - """take screenshot of site using chrome --headless""" - - path = os.path.join(out_dir, 'screenshot.png') - - if (not os.path.exists(path) or overwrite) and link['type'] not in ('PDF', 'image'): - print(' - Snapping Screenshot') - CMD = [ - chrome_binary, - *'--headless --disable-gpu --screenshot'.split(' '), - '--window-size={}'.format(resolution), - link['url'] - ] - end = progress(timeout, prefix=' ') - try: - result = run(CMD, stdout=DEVNULL, stderr=DEVNULL, cwd=out_dir, timeout=timeout + 1) # sreenshot.png - end() - if result.returncode: - print(' ', result.stderr.decode()) - raise Exception('Failed to take screenshot') - chmod_file('screenshot.png', cwd=out_dir) - except Exception as e: - end() - print(' Run to see full output:', 'cd {}; {}'.format(out_dir, ' '.join(CMD))) - print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset'])) - else: - print(' √ Skipping screenshot') - -def archive_dot_org(out_dir, link, overwrite=False, timeout=TIMEOUT): - """submit site to archive.org for archiving via their service, save returned archive url""" - - path = os.path.join(out_dir, 'archive.org.txt') - - if not os.path.exists(path) or overwrite: - print(' - Submitting to archive.org') - submit_url = 'https://web.archive.org/save/{}'.format(link['url'].split('?', 1)[0]) - - success = False - CMD = ['curl', '-I', submit_url] - end = progress(timeout, prefix=' ') - try: - result = run(CMD, stdout=PIPE, stderr=DEVNULL, cwd=out_dir, timeout=timeout + 1) # archive.org.txt - end() - - # Parse archive.org response headers - headers = result.stdout.splitlines() - content_location = [h for h in headers if b'Content-Location: ' in h] - errors = [h for h in headers if b'X-Archive-Wayback-Runtime-Error: ' in h] - - if content_location: - archive_path = content_location[0].split(b'Content-Location: ', 1)[-1].decode('utf-8') - saved_url = 'https://web.archive.org{}'.format(archive_path) - success = True - - elif len(errors) == 1 and b'RobotAccessControlException' in errors[0]: - raise Exception('Archive.org denied by {}/robots.txt'.format(link['domain'])) - elif errors: - raise Exception(', '.join(e.decode() for e in errors)) - else: - raise Exception('Failed to find "Content-Location" URL header in 
Archive.org response.') - except Exception as e: - end() - print(' Visit url to see output:', ' '.join(CMD)) - print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset'])) - - if success: - with open(os.path.join(out_dir, 'archive.org.txt'), 'w', encoding='utf-8') as f: - f.write(saved_url) - chmod_file('archive.org.txt', cwd=out_dir) - - else: - print(' √ Skipping archive.org') - -def fetch_favicon(out_dir, link, overwrite=False, timeout=TIMEOUT): - """download site favicon from google's favicon api""" - - path = os.path.join(out_dir, 'favicon.ico') - - if not os.path.exists(path) or overwrite: - print(' - Fetching Favicon') - CMD = ['curl', 'https://www.google.com/s2/favicons?domain={domain}'.format(**link)] - fout = open('{}/favicon.ico'.format(out_dir), 'w') - end = progress(timeout, prefix=' ') - try: - run(CMD, stdout=fout, stderr=DEVNULL, cwd=out_dir, timeout=timeout + 1) # favicon.ico - end() - chmod_file('favicon.ico', cwd=out_dir) - except Exception as e: - end() - print(' Run to see full output:', ' '.join(CMD)) - print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset'])) - fout.close() - else: - print(' √ Skipping favicon') - -def fetch_audio(out_dir, link, overwrite=False, timeout=TIMEOUT): - """Download audio rip using youtube-dl""" - - if link['type'] not in ('soundcloud',): - return - - path = os.path.join(out_dir, 'audio') - - if not os.path.exists(path) or overwrite: - print(' - Downloading audio') - CMD = [ - "youtube-dl -x --audio-format mp3 --audio-quality 0 -o '%(title)s.%(ext)s'", - link['url'], - ] - end = progress(timeout, prefix=' ') - try: - result = run(CMD, stdout=DEVNULL, stderr=DEVNULL, cwd=out_dir, timeout=timeout + 1) # audio/audio.mp3 - end() - if result.returncode: - print(' ', result.stderr.decode()) - raise Exception('Failed to download audio') - chmod_file('audio', cwd=out_dir) - except Exception as e: - end() - print(' Run to see full output:', 'cd {}; {}'.format(out_dir, ' '.join(CMD))) - print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset'])) - else: - print(' √ Skipping audio download') - -def fetch_video(out_dir, link, overwrite=False, timeout=TIMEOUT): - """Download video rip using youtube-dl""" - - if link['type'] not in ('youtube', 'youku', 'vimeo'): - return - - path = os.path.join(out_dir, 'video') - - if not os.path.exists(path) or overwrite: - print(' - Downloading video') - CMD = [ - "youtube-dl -x --video-format mp4 --audio-quality 0 -o '%(title)s.%(ext)s'", - link['url'], - ] - end = progress(timeout, prefix=' ') - try: - result = run(CMD, stdout=DEVNULL, stderr=DEVNULL, cwd=out_dir, timeout=timeout + 1) # video/movie.mp4 - end() - if result.returncode: - print(' ', result.stderr.decode()) - raise Exception('Failed to download video') - chmod_file('video', cwd=out_dir) - except Exception as e: - end() - print(' Run to see full output:', 'cd {}; {}'.format(out_dir, ' '.join(CMD))) - print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset'])) - else: - print(' √ Skipping video download') - -def dump_link_info(out_dir, link, overwrite=False): - """write a json file with some info about the link""" - - info_file_path = os.path.join(out_dir, 'link.json') - - if (not os.path.exists(info_file_path) or overwrite): - print(' - Creating link info file') - try: - link_json = derived_link_info(link) - link_json['archived_timstamp'] = str(datetime.now().timestamp()).split('.')[0] - - with open(info_file_path, 'w', encoding='utf-8') as 
link_file: - link_file.write(json.dumps( - link_json, - indent=4, - default=str) + '\n') - - chmod_file('link.json', cwd=out_dir) - except Exception as e: - print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset'])) - else: - print(' √ Skipping link info file') - - -def dump_website(link, service, overwrite=False, permissions=ARCHIVE_PERMISSIONS): - """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp""" - - print('[{green}+{reset}] [{timestamp} ({time})] "{title}": {blue}{base_url}{reset}'.format(**link, **ANSI)) - - out_dir = os.path.join(ARCHIVE_DIR, service, 'archive', link['timestamp']) - if not os.path.exists(out_dir): - os.makedirs(out_dir) - - run(['chmod', permissions, out_dir], timeout=5) - - if link['type']: - print(' i Type: {}'.format(link['type'])) - - if not (link['url'].startswith('http') or link['url'].startswith('ftp')): - print(' {}X Skipping: invalid link.{}', ANSI['red'], ANSI['yellow']) - return - - if FETCH_WGET: - fetch_wget(out_dir, link, overwrite=overwrite, requisites=FETCH_WGET_REQUISITES) - - if FETCH_PDF: - fetch_pdf(out_dir, link, overwrite=overwrite, chrome_binary=CHROME_BINARY) - - if FETCH_SCREENSHOT: - fetch_screenshot(out_dir, link, overwrite=overwrite, chrome_binary=CHROME_BINARY, resolution=RESOLUTION) - - if SUBMIT_ARCHIVE_DOT_ORG: - archive_dot_org(out_dir, link, overwrite=overwrite) - - if FETCH_AUDIO: - fetch_audio(out_dir, link, overwrite=overwrite) - - if FETCH_VIDEO: - fetch_video(out_dir, link, overwrite=overwrite) - - if FETCH_FAVICON: - fetch_favicon(out_dir, link, overwrite=overwrite) - - dump_link_info(out_dir, link, overwrite=overwrite) diff --git a/index.py b/index.py index 1a8dda33..29bfabb1 100644 --- a/index.py +++ b/index.py @@ -1,47 +1,196 @@ import os +import re +import json + from datetime import datetime from string import Template -from parse import derived_link_info from config import ( INDEX_TEMPLATE, INDEX_ROW_TEMPLATE, + LINK_INDEX_TEMPLATE, ARCHIVE_PERMISSIONS, ARCHIVE_DIR, ANSI, - chmod_file, + GIT_SHA, ) +from util import chmod_file -def dump_index(links, service): - """create index.html file for a given list of links and service""" +### Homepage index for all the links + +def parse_json_links_index(out_dir): + """load the index in a given directory and merge it with the given link""" + index_path = os.path.join(out_dir, 'index.json') + if os.path.exists(index_path): + with open(index_path, 'r', encoding='utf-8') as f: + return json.load(f)['links'] + + return [] + +def write_links_index(out_dir, links): + """create index.html file for a given list of links""" + + if not os.path.exists(out_dir): + os.makedirs(out_dir) + + print('[i] [{}] Updating {}{}{} links in archive index...'.format( + datetime.now().strftime('%Y-%m-%d %H:%M:%S'), + ANSI['green'], + len(links), + ANSI['reset'], + )) + + write_json_links_index(out_dir, links) + write_html_links_index(out_dir, links) + + chmod_file(out_dir, permissions=ARCHIVE_PERMISSIONS) + +def write_json_links_index(out_dir, links): + """write the json link index to a given path""" + + path = os.path.join(out_dir, 'index.json') + + index_json = { + 'info': 'Bookmark Archiver Index', + 'help': 'https://github.com/pirate/bookmark-archiver', + 'version': GIT_SHA, + 'num_links': len(links), + 'updated': str(datetime.now().timestamp()), + 'links': links, + } + + with open(path, 'w', encoding='utf-8') as f: + json.dump(index_json, f, indent=4, default=str) + + chmod_file(path) + +def write_html_links_index(out_dir, links): + 
"""write the html link index to a given path""" + + path = os.path.join(out_dir, 'index.html') with open(INDEX_TEMPLATE, 'r', encoding='utf-8') as f: index_html = f.read() - # TODO: refactor this out into index_template.html with open(INDEX_ROW_TEMPLATE, 'r', encoding='utf-8') as f: - link_html = f.read() + link_row_html = f.read() - article_rows = '\n'.join( - Template(link_html).substitute(**derived_link_info(link)) for link in links + link_rows = '\n'.join( + Template(link_row_html).substitute(**derived_link_info(link)) + for link in links ) template_vars = { 'num_links': len(links), 'date_updated': datetime.now().strftime('%Y-%m-%d'), 'time_updated': datetime.now().strftime('%Y-%m-%d %H:%M'), - 'rows': article_rows, + 'rows': link_rows, } - with open(os.path.join(ARCHIVE_DIR, service, 'index.html'), 'w', encoding='utf-8') as f: + with open(path, 'w', encoding='utf-8') as f: f.write(Template(index_html).substitute(**template_vars)) - chmod_file(os.path.join(ARCHIVE_DIR, service), permissions=ARCHIVE_PERMISSIONS) - print('[+] [{}] Created archive index with {}{}{} links.'.format( - datetime.now().strftime('%Y-%m-%d %H:%M:%S'), - ANSI['green'], - len(links), - ANSI['reset'], - )) +### Individual link index + +def parse_json_link_index(out_dir): + """load the index in a given directory and merge it with the given link""" + existing_index = os.path.join(out_dir, 'index.json') + if os.path.exists(existing_index): + with open(existing_index, 'r', encoding='utf-8') as f: + return json.load(f) + return {} + +def write_link_index(out_dir, link): + link['updated'] = str(datetime.now().timestamp()) + write_json_link_index(out_dir, link) + write_html_link_index(out_dir, link) + +def write_json_link_index(out_dir, link): + """write a json file with some info about the link""" + + path = os.path.join(out_dir, 'index.json') + + with open(path, 'w', encoding='utf-8') as f: + json.dump(link, f, indent=4, default=str) + + chmod_file(path) + +def write_html_link_index(out_dir, link): + with open(LINK_INDEX_TEMPLATE, 'r', encoding='utf-8') as f: + link_html = f.read() + + path = os.path.join(out_dir, 'index.html') + + with open(path, 'w', encoding='utf-8') as f: + f.write(Template(link_html).substitute({ + **link, + **link['methods'], + 'type': link['type'] or 'website', + 'tags': link['tags'] or '', + 'bookmarked': datetime.fromtimestamp(float(link['timestamp'])).strftime('%Y-%m-%d %H:%M'), + 'updated': datetime.fromtimestamp(float(link['updated'])).strftime('%Y-%m-%d %H:%M'), + 'archive_org': link['methods']['archive_org'] or 'https://web.archive.org/save/{}'.format(link['url']), + 'wget': link['methods']['wget'] or link['domain'], + })) + + chmod_file(path) + + + +def html_appended_url(link): + """calculate the path to the wgetted .html file, since wget may + adjust some paths to be different than the base_url path. + + See docs on wget --adjust-extension.""" + + if link['type'] in ('PDF', 'image'): + return link['base_url'] + + split_url = link['url'].split('#', 1) + query = ('%3F' + link['url'].split('?', 1)[-1]) if '?' 
in link['url'] else '' + + if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", split_url[0], re.I | re.M): + # already ends in .html + return link['base_url'] + else: + # .html needs to be appended + without_scheme = split_url[0].split('://', 1)[-1].split('?', 1)[0] + if without_scheme.endswith('/'): + if query: + return '#'.join([without_scheme + 'index.html' + query + '.html', *split_url[1:]]) + return '#'.join([without_scheme + 'index.html', *split_url[1:]]) + else: + if query: + return '#'.join([without_scheme + '/index.html' + query + '.html', *split_url[1:]]) + elif '/' in without_scheme: + return '#'.join([without_scheme + '.html', *split_url[1:]]) + return link['base_url'] + '/index.html' + + +def derived_link_info(link): + """extend link info with the archive urls and other derived data""" + + link_info = { + **link, + 'date': datetime.fromtimestamp(float(link['timestamp'])).strftime('%Y-%m-%d %H:%M'), + 'google_favicon_url': 'https://www.google.com/s2/favicons?domain={domain}'.format(**link), + 'favicon_url': 'archive/{timestamp}/favicon.ico'.format(**link), + 'files_url': 'archive/{timestamp}/'.format(**link), + 'archive_url': 'archive/{}/{}'.format(link['timestamp'], html_appended_url(link)), + 'pdf_link': 'archive/{timestamp}/output.pdf'.format(**link), + 'screenshot_link': 'archive/{timestamp}/screenshot.png'.format(**link), + 'archive_org_url': 'https://web.archive.org/web/{base_url}'.format(**link), + } + + # PDF and images are handled slightly differently + # wget, screenshot, & pdf urls all point to the same file + if link['type'] in ('PDF', 'image'): + link_info.update({ + 'archive_url': 'archive/{timestamp}/{base_url}'.format(**link), + 'pdf_link': 'archive/{timestamp}/{base_url}'.format(**link), + 'screenshot_link': 'archive/{timestamp}/{base_url}'.format(**link), + 'title': '{title} ({type})'.format(**link), + }) + return link_info diff --git a/links.py b/links.py new file mode 100644 index 00000000..b0adbdc9 --- /dev/null +++ b/links.py @@ -0,0 +1,112 @@ +from util import ( + domain, + base_url, + get_str_between, + get_link_type, +) + + +def validate_links(links): + links = valid_links(links) # remove chrome://, about:, mailto: etc. 
+ links = uniquefied_links(links) # fix duplicate timestamps, returns sorted list + links = sorted_links(links) # deterministically sort the links + + if not links: + print('[X] No links found :(') + raise SystemExit(1) + + return list(links) + +def sorted_links(links): + return sorted( + links, + key=lambda link: (link['timestamp'], link['url']), + reverse=True, + ) + +def merge_links(link1, link2): + longer = lambda a, b, key: a[key] if len(a[key]) > len(b[key]) else b[key] + earlier = lambda a, b, key: a[key] if a[key] < b[key] else b[key] + + url = longer(link1, link2, 'url') + earliest_ts = earlier(link1, link2, 'timestamp') + longest_title = longer(link1, link2, 'title') + cleanest_title = link1['title'] if '://' not in link1['title'] else link2['title'] + link = { + 'url': url, + 'domain': domain(url), + 'base_url': base_url(url), + 'timestamp': earliest_ts, + 'tags': longer(link1, link2, 'tags'), + 'title': longest_title if '://' not in longest_title else cleanest_title, + 'sources': list(set(link1['sources'] + link2['sources'])), + } + link['type'] = get_link_type(link) + return link + +def uniquefied_links(sorted_links): + """ + ensures that all non-duplicate links have monotonically increasing timestamps + """ + + seen_urls = {} + seen_timestamps = set() + + lower = lambda url: url.lower().strip() + without_www = lambda url: url.replace('://www.', '://', 1) + without_trailing_slash = lambda url: url[:-1] if url[-1] == '/' else url.replace('/?', '?') + + for link in sorted_links: + url = without_www(without_trailing_slash(lower(link['url']))) + if url in seen_urls: + # merge with any other links that share the same url + link = merge_links(seen_urls[url], link) + elif link['timestamp'] in seen_timestamps: + # add with incremented timestamp if earlier link exist with same timestamp + link['timestamp'] = next_uniq_timestamp(seen_timestamps, link['timestamp']) + + seen_urls[url] = link + seen_timestamps.add(link['timestamp']) + + return seen_urls.values() + +def valid_links(links): + """remove chrome://, about:// or other schemed links that cant be archived""" + return ( + link + for link in links + if any(link['url'].startswith(s) for s in ('http://', 'https://', 'ftp://')) + ) + +def links_after_timestamp(links, timestamp=None): + if not timestamp: + yield from links + return + + print('[.] [{}] Resuming...'.format(timestamp)) + for link in links: + try: + if float(link['timestamp']) <= float(timestamp): + yield link + except (ValueError, TypeError): + print('Resume value and all timestamp values must be valid numbers.') + +def next_uniq_timestamp(used_timestamps, timestamp): + """resolve duplicate timestamps by appending a decimal 1234, 1234 -> 1234.1, 1234.2""" + + if timestamp not in used_timestamps: + return timestamp + + if '.' 
in timestamp: + timestamp, nonce = timestamp.split('.') + nonce = int(nonce) + else: + nonce = 1 + + new_timestamp = '{}.{}'.format(timestamp, nonce) + + while new_timestamp in used_timestamps: + nonce += 1 + new_timestamp = '{}.{}'.format(timestamp, nonce) + + return new_timestamp diff --git a/parse.py b/parse.py index 823ed944..c1d23a5b 100644 --- a/parse.py +++ b/parse.py @@ -1,56 +1,38 @@ import re -import time import json - from datetime import datetime +from util import ( + domain, + base_url, + get_str_between, + get_link_type, +) -def parse_export(file, service=None): + +def parse_export(path): """parse a list of links dictionaries from a bookmark export file""" + + links = [] + with open(path, 'r', encoding='utf-8') as file: + for service, parser_func in get_parsers().items(): + # otherwise try all parsers until one works + try: + links += list(parser_func(file)) + if links: + break + except Exception as e: + pass - # if specific service was passed via command line - if service == "pocket": - links = parse_pocket_export(file) - elif service == "pinboard": - links = parse_json_export(file) - elif service == "bookmarks": - links = parse_bookmarks_export(file) - else: - # otherwise try all parsers until one works - try: - links = list(parse_json_export(file)) - service = 'pinboard' - except Exception: - links = list(parse_pocket_export(file)) - if links: - service = 'pocket' - else: - links = list(parse_bookmarks_export(file)) - service = 'bookmarks' + return links - links = valid_links(links) # remove chrome://, about:, mailto: etc. - links = uniquefied_links(links) # fix duplicate timestamps, returns sorted list - return links, service - - -def get_link_type(link): - """Certain types of links need to be handled specially, this figures out when that's the case""" - - if link['base_url'].endswith('.pdf'): - return 'PDF' - elif link['base_url'].rsplit('.', 1) in ('pdf', 'png', 'jpg', 'jpeg', 'svg', 'bmp', 'gif', 'tiff', 'webp'): - return 'image' - elif 'wikipedia.org' in link['domain']: - return 'wiki' - elif 'youtube.com' in link['domain']: - return 'youtube' - elif 'soundcloud.com' in link['domain']: - return 'soundcloud' - elif 'youku.com' in link['domain']: - return 'youku' - elif 'vimeo.com' in link['domain']: - return 'vimeo' - return None +def get_parsers(): + return { + 'pocket': parse_pocket_export, + 'pinboard': parse_json_export, + 'bookmarks': parse_bookmarks_export, + 'rss': parse_rss_export, + } def parse_pocket_export(html_file): """Parse Pocket-format bookmarks export files (produced by getpocket.com/export/)""" @@ -61,15 +43,15 @@ def parse_pocket_export(html_file): match = pattern.search(line) if match: fixed_url = match.group(1).replace('http://www.readability.com/read?url=', '') # remove old readability prefixes to get original url - without_scheme = fixed_url.replace('http://', '').replace('https://', '') + time = datetime.fromtimestamp(float(match.group(2))) info = { 'url': fixed_url, - 'domain': without_scheme.split('/', 1)[0], # without pathname - 'base_url': without_scheme.split('?', 1)[0], # without query args - 'time': datetime.fromtimestamp(int(match.group(2))).strftime('%Y-%m-%d %H:%M'), - 'timestamp': match.group(2), + 'domain': domain(fixed_url), + 'base_url': base_url(fixed_url), + 'timestamp': str(time.timestamp()), 'tags': match.group(3), - 'title': match.group(4).replace(' — Readability', '').replace('http://www.readability.com/read?url=', '') or without_scheme, + 'title': match.group(4).replace(' — Readability', 
'').replace('http://www.readability.com/read?url=', '') or base_url(fixed_url), + 'sources': [html_file.name], } info['type'] = get_link_type(info) yield info @@ -82,18 +64,59 @@ def parse_json_export(json_file): for line in json_content: if line: erg = line + time = datetime.strptime(erg['time'].split(',', 1)[0], '%Y-%m-%dT%H:%M:%SZ') info = { 'url': erg['href'], - 'domain': erg['href'].replace('http://', '').replace('https://', '').split('/', 1)[0], - 'base_url': erg['href'].replace('https://', '').replace('http://', '').split('?', 1)[0], - 'time': datetime.fromtimestamp(int(time.mktime(time.strptime(erg['time'].split(',', 1)[0], '%Y-%m-%dT%H:%M:%SZ')))), - 'timestamp': str(int(time.mktime(time.strptime(erg['time'].split(',', 1)[0], '%Y-%m-%dT%H:%M:%SZ')))), + 'domain': domain(erg['href']), + 'base_url': base_url(erg['href']), + 'timestamp': str(time.timestamp()), 'tags': erg['tags'], 'title': erg['description'].replace(' — Readability', ''), + 'sources': [json_file.name], } info['type'] = get_link_type(info) yield info +def parse_rss_export(rss_file): + """Parse RSS XML-format files into links""" + + rss_file.seek(0) + items = rss_file.read().split('\n') + for item in items: + # example item: + # + # <![CDATA[How JavaScript works: inside the V8 engine]]> + # Unread + # https://blog.sessionstack.com/how-javascript-works-inside + # https://blog.sessionstack.com/how-javascript-works-inside + # Mon, 21 Aug 2017 14:21:58 -0500 + # + + trailing_removed = item.split('', 1)[0] + leading_removed = trailing_removed.split('', 1)[-1] + rows = leading_removed.split('\n') + + row = lambda key: [r for r in rows if r.startswith('<{}>'.format(key))][0] + + title = get_str_between(row('title'), '', '') + ts_str = get_str_between(row('pubDate'), '', '') + time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %z") + + info = { + 'url': url, + 'domain': domain(url), + 'base_url': base_url(url), + 'timestamp': str(time.timestamp()), + 'tags': '', + 'title': title, + 'sources': [rss_file.name], + } + + info['type'] = get_link_type(info) + # import ipdb; ipdb.set_trace() + yield info + def parse_bookmarks_export(html_file): """Parse netscape-format bookmarks export files (produced by all browsers)""" @@ -103,118 +126,17 @@ def parse_bookmarks_export(html_file): match = pattern.search(line) if match: url = match.group(1) - secs = match.group(2) - dt = datetime.fromtimestamp(int(secs)) + time = datetime.fromtimestamp(float(match.group(2))) info = { 'url': url, - 'domain': url.replace('http://', '').replace('https://', '').split('/', 1)[0], - 'base_url': url.replace('https://', '').replace('http://', '').split('?', 1)[0], - 'time': dt, - 'timestamp': secs, + 'domain': domain(url), + 'base_url': base_url(url), + 'timestamp': str(time.timestamp()), 'tags': "", 'title': match.group(3), + 'sources': [html_file.name], } info['type'] = get_link_type(info) yield info - - -def next_uniq_timestamp(used_timestamps, timestamp): - """resolve duplicate timestamps by appending a decimal 1234, 1234 -> 1234.1, 1234.2""" - - if timestamp not in used_timestamps: - return timestamp - - if '.' 
in timestamp: - timestamp, nonce = timestamp.split('.') - nonce = int(nonce) - else: - nonce = 1 - - new_timestamp = '{}.{}'.format(timestamp, nonce) - - while new_timestamp in used_timestamps: - nonce += 1 - new_timestamp = '{}.{}'.format(timestamp, nonce) - - return new_timestamp - -def uniquefied_links(links): - """uniqueify link timestamps by de-duping using url, returns links sorted most recent -> oldest - - needed because firefox will produce exports where many links share the same timestamp, this func - ensures that all non-duplicate links have monotonically increasing timestamps - """ - - links = list(reversed(sorted(links, key=lambda l: (l['timestamp'], l['url'])))) - seen_timestamps = {} - - for link in links: - t = link['timestamp'] - if t in seen_timestamps: - if link['url'] == seen_timestamps[t]['url']: - # don't create new unique timestamp if link is the same - continue - else: - # resolve duplicate timstamp by appending a decimal - link['timestamp'] = next_uniq_timestamp(seen_timestamps, link['timestamp']) - seen_timestamps[link['timestamp']] = link - - return links - -def valid_links(links): - """remove chrome://, about:// or other schemed links that cant be archived""" - return (link for link in links if link['url'].startswith('http') or link['url'].startswith('ftp')) - - -def html_appended_url(link): - """calculate the path to the wgetted .html file, since wget may - adjust some paths to be different than the base_url path. - - See docs on wget --adjust-extension.""" - - split_url = link['url'].split('#', 1) - query = ('%3F' + link['url'].split('?', 1)[-1]) if '?' in link['url'] else '' - - if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", split_url[0], re.I | re.M): - # already ends in .html - return link['base_url'] - else: - # .html needs to be appended - without_scheme = split_url[0].split('://', 1)[-1].split('?', 1)[0] - if without_scheme.endswith('/'): - if query: - return '#'.join([without_scheme + 'index.html' + query + '.html', *split_url[1:]]) - return '#'.join([without_scheme + 'index.html', *split_url[1:]]) - else: - if query: - return '#'.join([without_scheme + 'index.html' + query + '.html', *split_url[1:]]) - return '#'.join([without_scheme + '.html', *split_url[1:]]) - - -def derived_link_info(link): - """extend link info with the archive urls and other derived data""" - - link_info = { - **link, - 'date': str(link['time'])[:-3], - 'google_favicon_url': 'https://www.google.com/s2/favicons?domain={domain}'.format(**link), - 'favicon_url': 'archive/{timestamp}/favicon.ico'.format(**link), - 'files_url': 'archive/{timestamp}/'.format(**link), - 'archive_url': 'archive/{}/{}'.format(link['timestamp'], html_appended_url(link)), - 'pdf_link': 'archive/{timestamp}/output.pdf'.format(**link), - 'screenshot_link': 'archive/{timestamp}/screenshot.png'.format(**link), - 'archive_org_url': 'https://web.archive.org/web/{base_url}'.format(**link), - } - - # PDF and images are handled slightly differently - # wget, screenshot, & pdf urls all point to the same file - if link['type'] in ('PDF', 'image'): - link_info.update({ - 'archive_url': 'archive/{timestamp}/{base_url}'.format(**link), - 'pdf_link': 'archive/{timestamp}/{base_url}'.format(**link), - 'screenshot_link': 'archive/{timestamp}/{base_url}'.format(**link), - 'title': '{title} ({type})'.format(**link), - }) - return link_info diff --git a/templates/index_row.html b/templates/index_row.html index b33a6dee..8ea4a4e9 100644 --- a/templates/index_row.html +++ b/templates/index_row.html @@ -1,10 +1,10 @@ - $date + $date 
$title $tags - 📂 + 📂 📄 🖼 🏛 diff --git a/templates/link_index.html b/templates/link_index.html new file mode 100644 index 00000000..9f6a5d10 --- /dev/null +++ b/templates/link_index.html @@ -0,0 +1,258 @@ + + + + $title + + + + + + +
+    [-]
+    Archive Icon  $title
+    $base_url
+ + + + + + diff --git a/util.py b/util.py new file mode 100644 index 00000000..19966a10 --- /dev/null +++ b/util.py @@ -0,0 +1,216 @@ +import os +import sys +import time +import requests + +from datetime import datetime +from subprocess import run, PIPE, DEVNULL +from multiprocessing import Process + +from config import ( + ARCHIVE_PERMISSIONS, + ARCHIVE_DIR, + TIMEOUT, + TERM_WIDTH, + SHOW_PROGRESS, + ANSI, + CHROME_BINARY, + FETCH_WGET, + FETCH_PDF, + FETCH_SCREENSHOT, + FETCH_FAVICON, + FETCH_AUDIO, + FETCH_VIDEO, + SUBMIT_ARCHIVE_DOT_ORG, +) + +def check_dependencies(): + """Check that all necessary dependencies are installed, and have valid versions""" + + print('[*] Checking Dependencies:') + + python_vers = float('{}.{}'.format(sys.version_info.major, sys.version_info.minor)) + if python_vers < 3.5: + print('{}[X] Python version is not new enough: {} (>3.5 is required){}'.format(ANSI['red'], python_vers, ANSI['reset'])) + print(' See https://github.com/pirate/bookmark-archiver#troubleshooting for help upgrading your Python installation.') + raise SystemExit(1) + + if FETCH_PDF or FETCH_SCREENSHOT: + if run(['which', CHROME_BINARY]).returncode: + print('{}[X] Missing dependency: {}{}'.format(ANSI['red'], CHROME_BINARY, ANSI['reset'])) + print(' Run ./setup.sh, then confirm it was installed with: {} --version'.format(CHROME_BINARY)) + print(' See https://github.com/pirate/bookmark-archiver for help.') + raise SystemExit(1) + + # parse chrome --version e.g. Google Chrome 61.0.3114.0 canary / Chromium 59.0.3029.110 built on Ubuntu, running on Ubuntu 16.04 + try: + result = run([CHROME_BINARY, '--version'], stdout=PIPE) + version = result.stdout.decode('utf-8').replace('Google Chrome ', '').replace('Chromium ', '').split(' ', 1)[0].split('.', 1)[0] # TODO: regex might be better + if int(version) < 59: + print('{red}[X] Chrome version must be 59 or greater for headless PDF and screenshot saving{reset}'.format(**ANSI)) + print(' See https://github.com/pirate/bookmark-archiver for help.') + raise SystemExit(1) + except (TypeError, OSError): + print('{red}[X] Failed to parse Chrome version, is it installed properly?{reset}'.format(**ANSI)) + print(' Run ./setup.sh, then confirm it was installed with: {} --version'.format(CHROME_BINARY)) + print(' See https://github.com/pirate/bookmark-archiver for help.') + raise SystemExit(1) + + if FETCH_WGET: + if run(['which', 'wget']).returncode or run(['wget', '--version'], stdout=DEVNULL).returncode: + print('{red}[X] Missing dependency: wget{reset}'.format(**ANSI)) + print(' Run ./setup.sh, then confirm it was installed with: {} --version'.format('wget')) + print(' See https://github.com/pirate/bookmark-archiver for help.') + raise SystemExit(1) + + if FETCH_FAVICON or SUBMIT_ARCHIVE_DOT_ORG: + if run(['which', 'curl']).returncode or run(['curl', '--version'], stdout=DEVNULL).returncode: + print('{red}[X] Missing dependency: curl{reset}'.format(**ANSI)) + print(' Run ./setup.sh, then confirm it was installed with: {} --version'.format('curl')) + print(' See https://github.com/pirate/bookmark-archiver for help.') + raise SystemExit(1) + + if FETCH_AUDIO or FETCH_VIDEO: + if run(['which', 'youtube-dl']).returncode or run(['youtube-dl', '--version'], stdout=DEVNULL).returncode: + print('{red}[X] Missing dependency: youtube-dl{reset}'.format(**ANSI)) + print(' Run ./setup.sh, then confirm it was installed with: {} --version'.format('youtube-dl')) + print(' See https://github.com/pirate/bookmark-archiver for help.') + raise SystemExit(1) + + +def 
+def chmod_file(path, cwd='.', permissions=ARCHIVE_PERMISSIONS, timeout=30):
+    """chmod -R <permissions> <cwd>/<path>"""
+
+    if not os.path.exists(os.path.join(cwd, path)):
+        raise Exception('Failed to chmod: {} does not exist (did the previous step fail?)'.format(path))
+
+    chmod_result = run(['chmod', '-R', permissions, path], cwd=cwd, stdout=DEVNULL, stderr=PIPE, timeout=timeout)
+    if chmod_result.returncode == 1:
+        print('    ', chmod_result.stderr.decode())
+        raise Exception('Failed to chmod {}/{}'.format(cwd, path))
+
+
+def progress(seconds=TIMEOUT, prefix=''):
+    """Show a (subprocess-controlled) progress bar with a timeout,
+       returns end() function to instantly finish the progress
+    """
+
+    if not SHOW_PROGRESS:
+        return lambda: None
+
+    chunk = '█' if sys.stdout.encoding == 'UTF-8' else '#'
+    chunks = TERM_WIDTH - len(prefix) - 20  # number of progress chunks to show (aka max bar width)
+
+    def progress_bar(seconds=seconds, prefix=prefix):
+        """show timer in the form of progress bar, with percentage and seconds remaining"""
+        try:
+            for s in range(seconds * chunks):
+                progress = s / chunks / seconds * 100
+                bar_width = round(progress / (100 / chunks))
+
+                # ████████████████████           0.9% (1/60sec)
+                sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)'.format(
+                    prefix,
+                    ANSI['green'],
+                    (chunk * bar_width).ljust(chunks),
+                    ANSI['reset'],
+                    round(progress, 1),
+                    round(s / chunks),
+                    seconds,
+                ))
+                sys.stdout.flush()
+                time.sleep(1 / chunks)
+
+            # ██████████████████████████████████ 100.0% (60/60sec)
+            sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)\n'.format(
+                prefix,
+                ANSI['red'],
+                chunk * chunks,
+                ANSI['reset'],
+                100.0,
+                seconds,
+                seconds,
+            ))
+            sys.stdout.flush()
+        except KeyboardInterrupt:
+            print()
+            pass
+
+    p = Process(target=progress_bar)
+    p.start()
+
+    def end():
+        """immediately finish progress and clear the progressbar line"""
+        p.terminate()
+        sys.stdout.write('\r{}{}\r'.format((' ' * TERM_WIDTH), ANSI['reset']))  # clear whole terminal line
+        sys.stdout.flush()
+
+    return end
+
+
+def download_url(url):
+    """download the contents of url to downloads/<domain>.txt and return the local path"""
+
+    if not os.path.exists(os.path.join(ARCHIVE_DIR, 'downloads')):
+        os.makedirs(os.path.join(ARCHIVE_DIR, 'downloads'))
+
+    url_domain = url.split('/', 3)[2]
+    output_path = os.path.join(ARCHIVE_DIR, 'downloads', '{}.txt'.format(url_domain))
+
+    print('[*] [{}] Downloading {} > {}'.format(
+        datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
+        url,
+        output_path,
+    ))
+    end = progress(TIMEOUT, prefix='      ')
+    try:
+        downloaded_xml = requests.get(url).content.decode()
+        end()
+    except Exception as e:
+        end()
+        print('[!] Failed to download {}\n'.format(url))
+        print('    ', e)
+        raise SystemExit(1)
+
+    with open(output_path, 'w', encoding='utf-8') as f:
+        f.write(downloaded_xml)
+    return output_path
+
+
+def get_str_between(string, start, end=None):
+    """(<abc>12345</def>, <abc>, </def>) -> 12345"""
+
+    content = string.split(start, 1)[-1]
+    if end is not None:
+        content = content.rsplit(end, 1)[0]
+
+    return content
+
+
+def get_link_type(link):
+    """Certain types of links need to be handled specially, this figures out when that's the case"""
+
+    if link['base_url'].endswith('.pdf'):
+        return 'PDF'
+    elif link['base_url'].rsplit('.', 1)[-1].lower() in ('pdf', 'png', 'jpg', 'jpeg', 'svg', 'bmp', 'gif', 'tiff', 'webp'):
+        return 'image'
+    elif 'wikipedia.org' in link['domain']:
+        return 'wiki'
+    elif 'youtube.com' in link['domain']:
+        return 'youtube'
+    elif 'soundcloud.com' in link['domain']:
+        return 'soundcloud'
+    elif 'youku.com' in link['domain']:
+        return 'youku'
+    elif 'vimeo.com' in link['domain']:
+        return 'vimeo'
+    return None
+
+
+# URL helpers
+without_scheme = lambda url: url.replace('http://', '').replace('https://', '').replace('ftp://', '')
+without_query = lambda url: url.split('?', 1)[0]
+without_hash = lambda url: url.split('#', 1)[0]
+without_path = lambda url: url.split('/', 1)[0]
+domain = lambda url: without_hash(without_query(without_path(without_scheme(url))))
+base_url = lambda url: without_query(without_scheme(url))
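
A quick usage sketch of how the new URL helpers in util.py compose (illustrative only, not part of the commit; the example URL is made up):

    # assumes util.py above is importable from the repo root
    from util import without_scheme, domain, base_url

    url = 'https://example.com/some/page?utm_source=rss#section'

    without_scheme(url)  # 'example.com/some/page?utm_source=rss#section'
    domain(url)          # 'example.com'  (scheme, path, query, and hash stripped)
    base_url(url)        # 'example.com/some/page'  (scheme and query stripped)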