diff --git a/archivebox/archive.py b/archivebox/archive.py index e13d83c9..5c0d195d 100755 --- a/archivebox/archive.py +++ b/archivebox/archive.py @@ -22,7 +22,6 @@ from config import ( GIT_SHA, ) from util import ( - check_dependencies, save_remote_source, save_stdin_source, ) @@ -33,7 +32,7 @@ from logs import ( ) __AUTHOR__ = 'Nick Sweeting ' -__VERSION__ = GIT_SHA +__VERSION__ = GIT_SHA[:9] __DESCRIPTION__ = 'ArchiveBox: The self-hosted internet archive.' __DOCUMENTATION__ = 'https://github.com/pirate/ArchiveBox/wiki' @@ -42,11 +41,13 @@ def print_help(): print('ArchiveBox: The self-hosted internet archive.\n') print("Documentation:") print(" https://github.com/pirate/ArchiveBox/wiki\n") - print("Usage:") - print(" echo 'https://examplecom' | ./bin/archivebox\n") - print(" ./bin/archivebox ~/Downloads/bookmarks_export.html\n") - print(" ./bin/archivebox https://example.com/feed.rss\n") - print(" ./bin/archivebox 15109948213.123\n") + print("UI Usage:") + print(" Open output/index.html to view your archive.\n") + print("CLI Usage:") + print(" echo 'https://example.com' | ./archive\n") + print(" ./archive ~/Downloads/bookmarks_export.html\n") + print(" ./archive https://example.com/feed.rss\n") + print(" ./archive 15109948213.123\n") def main(*args): @@ -54,6 +55,10 @@ def main(*args): print_help() raise SystemExit(0) + if set(args).intersection(('--version', 'version')): + print('ArchiveBox version {}'.format(__VERSION__)) + raise SystemExit(0) + ### Handle CLI arguments # ./archive bookmarks.html # ./archive 1523422111.234 @@ -95,7 +100,6 @@ def main(*args): def update_archive_data(import_path=None, resume=None): """The main ArchiveBox entrancepoint. Everything starts here.""" - check_dependencies() # Step 1: Load list of links from the existing index # merge in and dedupe new links from import_path diff --git a/archivebox/archive_methods.py b/archivebox/archive_methods.py index 283365a4..9ef0dc83 100644 --- a/archivebox/archive_methods.py +++ b/archivebox/archive_methods.py @@ -297,7 +297,7 @@ def fetch_pdf(link_dir, link, timeout=TIMEOUT): output = 'output.pdf' cmd = [ - *chrome_args(timeout=timeout), + *chrome_args(TIMEOUT=timeout), '--print-to-pdf', link['url'], ] @@ -339,7 +339,7 @@ def fetch_screenshot(link_dir, link, timeout=TIMEOUT): output = 'screenshot.png' cmd = [ - *chrome_args(timeout=timeout), + *chrome_args(TIMEOUT=timeout), '--screenshot', link['url'], ] @@ -382,7 +382,7 @@ def fetch_dom(link_dir, link, timeout=TIMEOUT): output = 'output.html' output_path = os.path.join(link_dir, output) cmd = [ - *chrome_args(timeout=timeout), + *chrome_args(TIMEOUT=timeout), '--dump-dom', link['url'] ] diff --git a/archivebox/config.py b/archivebox/config.py index 749a0e26..fabaf1df 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -1,8 +1,9 @@ import os +import re import sys import shutil -from subprocess import run, PIPE +from subprocess import run, PIPE, DEVNULL # ****************************************************************************** # Documentation: https://github.com/pirate/ArchiveBox/wiki/Configuration @@ -68,7 +69,6 @@ SOURCES_DIR = os.path.join(OUTPUT_DIR, SOURCES_DIR_NAME) PYTHON_PATH = os.path.join(REPO_DIR, 'archivebox') TEMPLATES_DIR = os.path.join(PYTHON_PATH, 'templates') - CHROME_SANDBOX = os.getenv('CHROME_SANDBOX', 'True').lower() == 'true' USE_CHROME = FETCH_PDF or FETCH_SCREENSHOT or FETCH_DOM USE_WGET = FETCH_WGET or FETCH_WGET_REQUISITES or FETCH_WARC @@ -137,44 +137,115 @@ if not USE_COLOR: # dont show colors if USE_COLOR is False ANSI = {k: '' for k in ANSI.keys()} + ### Confirm Environment Setup -GIT_SHA = 'unknown' -try: - GIT_SHA = run([GIT_BINARY, 'rev-list', '-1', 'HEAD', './'], stdout=PIPE, cwd=REPO_DIR).stdout.strip().decode() -except Exception: - print('[!] Warning: unable to determine git version, is git installed and in your $PATH?') - -CHROME_VERSION = 'unknown' -try: - chrome_vers_str = run([CHROME_BINARY, "--version"], stdout=PIPE, cwd=REPO_DIR).stdout.strip().decode() - CHROME_VERSION = [v for v in chrome_vers_str.strip().split(' ') if v.replace('.', '').isdigit()][0] -except Exception: - if USE_CHROME: - print('[!] Warning: unable to determine chrome version, is chrome installed and in your $PATH?') - -WGET_VERSION = 'unknown' -try: - wget_vers_str = run([WGET_BINARY, "--version"], stdout=PIPE, cwd=REPO_DIR).stdout.strip().decode() - WGET_VERSION = wget_vers_str.split('\n')[0].split(' ')[2] -except Exception: - if USE_WGET: - print('[!] Warning: unable to determine wget version, is wget installed and in your $PATH?') - -WGET_USER_AGENT = WGET_USER_AGENT.format(GIT_SHA=GIT_SHA[:9], WGET_VERSION=WGET_VERSION) try: - COOKIES_FILE = os.path.abspath(COOKIES_FILE) if COOKIES_FILE else None -except Exception: - print('[!] Warning: unable to get full path to COOKIES_FILE, are you sure you specified it correctly?') - raise + ### Check Python environment + python_vers = float('{}.{}'.format(sys.version_info.major, sys.version_info.minor)) + if python_vers < 3.5: + print('{}[X] Python version is not new enough: {} (>3.5 is required){}'.format(ANSI['red'], python_vers, ANSI['reset'])) + print(' See https://github.com/pirate/ArchiveBox/wiki/Troubleshooting#python for help upgrading your Python installation.') + raise SystemExit(1) -if sys.stdout.encoding.upper() not in ('UTF-8', 'UTF8'): - print('[X] Your system is running python3 scripts with a bad locale setting: {} (it should be UTF-8).'.format(sys.stdout.encoding)) - print(' To fix it, add the line "export PYTHONIOENCODING=UTF-8" to your ~/.bashrc file (without quotes)') - print('') - print(' Confirm that it\'s fixed by opening a new shell and running:') - print(' python3 -c "import sys; print(sys.stdout.encoding)" # should output UTF-8') - print('') - print(' Alternatively, run this script with:') - print(' env PYTHONIOENCODING=UTF-8 ./archive.py export.html') + if sys.stdout.encoding.upper() not in ('UTF-8', 'UTF8'): + print('[X] Your system is running python3 scripts with a bad locale setting: {} (it should be UTF-8).'.format(sys.stdout.encoding)) + print(' To fix it, add the line "export PYTHONIOENCODING=UTF-8" to your ~/.bashrc file (without quotes)') + print('') + print(' Confirm that it\'s fixed by opening a new shell and running:') + print(' python3 -c "import sys; print(sys.stdout.encoding)" # should output UTF-8') + print('') + print(' Alternatively, run this script with:') + print(' env PYTHONIOENCODING=UTF-8 ./archive.py export.html') + ### Get code version by parsing git log + GIT_SHA = 'unknown' + try: + GIT_SHA = run([GIT_BINARY, 'rev-list', '-1', 'HEAD', './'], stdout=PIPE, cwd=REPO_DIR).stdout.strip().decode() + except Exception: + print('[!] Warning: unable to determine git version, is git installed and in your $PATH?') + + ### Get absolute path for cookies file + try: + COOKIES_FILE = os.path.abspath(COOKIES_FILE) if COOKIES_FILE else None + except Exception: + print('[!] Warning: unable to get full path to COOKIES_FILE, are you sure you specified it correctly?') + raise + + ### Make sure curl is installed + if FETCH_FAVICON or SUBMIT_ARCHIVE_DOT_ORG: + if run(['which', CURL_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([CURL_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode: + print('{red}[X] Missing dependency: curl{reset}'.format(**ANSI)) + print(' Install it, then confirm it works with: {} --version'.format(CURL_BINARY)) + print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.') + raise SystemExit(1) + + ### Make sure wget is installed and calculate version + if FETCH_WGET or FETCH_WARC: + if run(['which', WGET_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([WGET_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode: + print('{red}[X] Missing dependency: wget{reset}'.format(**ANSI)) + print(' Install it, then confirm it works with: {} --version'.format(WGET_BINARY)) + print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.') + raise SystemExit(1) + + WGET_VERSION = 'unknown' + try: + wget_vers_str = run([WGET_BINARY, "--version"], stdout=PIPE, cwd=REPO_DIR).stdout.strip().decode() + WGET_VERSION = wget_vers_str.split('\n')[0].split(' ')[2] + except Exception: + if USE_WGET: + print('[!] Warning: unable to determine wget version, is wget installed and in your $PATH?') + + WGET_USER_AGENT = WGET_USER_AGENT.format(GIT_SHA=GIT_SHA[:9], WGET_VERSION=WGET_VERSION) + + ### Make sure chrome is installed and calculate version + if FETCH_PDF or FETCH_SCREENSHOT or FETCH_DOM: + if run(['which', CHROME_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode: + print('{}[X] Missing dependency: {}{}'.format(ANSI['red'], CHROME_BINARY, ANSI['reset'])) + print(' Install it, then confirm it works with: {} --version'.format(CHROME_BINARY)) + print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.') + raise SystemExit(1) + + # parse chrome --version e.g. Google Chrome 61.0.3114.0 canary / Chromium 59.0.3029.110 built on Ubuntu, running on Ubuntu 16.04 + try: + result = run([CHROME_BINARY, '--version'], stdout=PIPE) + version_str = result.stdout.decode('utf-8') + version_lines = re.sub("(Google Chrome|Chromium) (\\d+?)\\.(\\d+?)\\.(\\d+?).*?$", "\\2", version_str).split('\n') + version = [l for l in version_lines if l.isdigit()][-1] + if int(version) < 59: + print(version_lines) + print('{red}[X] Chrome version must be 59 or greater for headless PDF, screenshot, and DOM saving{reset}'.format(**ANSI)) + print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.') + raise SystemExit(1) + except (IndexError, TypeError, OSError): + print('{red}[X] Failed to parse Chrome version, is it installed properly?{reset}'.format(**ANSI)) + print(' Install it, then confirm it works with: {} --version'.format(CHROME_BINARY)) + print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.') + raise SystemExit(1) + + CHROME_VERSION = 'unknown' + try: + chrome_vers_str = run([CHROME_BINARY, "--version"], stdout=PIPE, cwd=REPO_DIR).stdout.strip().decode() + CHROME_VERSION = [v for v in chrome_vers_str.strip().split(' ') if v.replace('.', '').isdigit()][0] + except Exception: + if USE_CHROME: + print('[!] Warning: unable to determine chrome version, is chrome installed and in your $PATH?') + + ### Make sure git is installed + if FETCH_GIT: + if run(['which', GIT_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([GIT_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode: + print('{red}[X] Missing dependency: git{reset}'.format(**ANSI)) + print(' Install it, then confirm it works with: {} --version'.format(GIT_BINARY)) + print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.') + raise SystemExit(1) + + ### Make sure youtube-dl is installed + if FETCH_MEDIA: + if run(['which', YOUTUBEDL_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([YOUTUBEDL_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode: + print('{red}[X] Missing dependency: youtube-dl{reset}'.format(**ANSI)) + print(' Install it, then confirm it was installed with: {} --version'.format(YOUTUBEDL_BINARY)) + print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.') + raise SystemExit(1) + +except KeyboardInterrupt: + raise SystemExit(1) diff --git a/archivebox/util.py b/archivebox/util.py index 9dc47540..57fc173d 100644 --- a/archivebox/util.py +++ b/archivebox/util.py @@ -25,11 +25,6 @@ from config import ( OUTPUT_PERMISSIONS, TIMEOUT, SHOW_PROGRESS, - CURL_BINARY, - WGET_BINARY, - CHROME_BINARY, - GIT_BINARY, - YOUTUBEDL_BINARY, FETCH_TITLE, FETCH_FAVICON, FETCH_WGET, @@ -124,70 +119,6 @@ def check_links_structure(links): if links: check_link_structure(links[0]) -def check_dependencies(): - """Check that all necessary dependencies are installed, and have valid versions""" - - try: - python_vers = float('{}.{}'.format(sys.version_info.major, sys.version_info.minor)) - if python_vers < 3.5: - print('{}[X] Python version is not new enough: {} (>3.5 is required){}'.format(ANSI['red'], python_vers, ANSI['reset'])) - print(' See https://github.com/pirate/ArchiveBox/wiki/Troubleshooting#python for help upgrading your Python installation.') - raise SystemExit(1) - - if FETCH_FAVICON or SUBMIT_ARCHIVE_DOT_ORG: - if run(['which', CURL_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([CURL_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode: - print('{red}[X] Missing dependency: curl{reset}'.format(**ANSI)) - print(' Install it, then confirm it works with: {} --version'.format(CURL_BINARY)) - print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.') - raise SystemExit(1) - - if FETCH_WGET or FETCH_WARC: - if run(['which', WGET_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([WGET_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode: - print('{red}[X] Missing dependency: wget{reset}'.format(**ANSI)) - print(' Install it, then confirm it works with: {} --version'.format(WGET_BINARY)) - print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.') - raise SystemExit(1) - - if FETCH_PDF or FETCH_SCREENSHOT or FETCH_DOM: - if run(['which', CHROME_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode: - print('{}[X] Missing dependency: {}{}'.format(ANSI['red'], CHROME_BINARY, ANSI['reset'])) - print(' Install it, then confirm it works with: {} --version'.format(CHROME_BINARY)) - print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.') - raise SystemExit(1) - - # parse chrome --version e.g. Google Chrome 61.0.3114.0 canary / Chromium 59.0.3029.110 built on Ubuntu, running on Ubuntu 16.04 - try: - result = run([CHROME_BINARY, '--version'], stdout=PIPE) - version_str = result.stdout.decode('utf-8') - version_lines = re.sub("(Google Chrome|Chromium) (\\d+?)\\.(\\d+?)\\.(\\d+?).*?$", "\\2", version_str).split('\n') - version = [l for l in version_lines if l.isdigit()][-1] - if int(version) < 59: - print(version_lines) - print('{red}[X] Chrome version must be 59 or greater for headless PDF, screenshot, and DOM saving{reset}'.format(**ANSI)) - print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.') - raise SystemExit(1) - except (IndexError, TypeError, OSError): - print('{red}[X] Failed to parse Chrome version, is it installed properly?{reset}'.format(**ANSI)) - print(' Install it, then confirm it works with: {} --version'.format(CHROME_BINARY)) - print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.') - raise SystemExit(1) - - if FETCH_GIT: - if run(['which', GIT_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([GIT_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode: - print('{red}[X] Missing dependency: git{reset}'.format(**ANSI)) - print(' Install it, then confirm it works with: {} --version'.format(GIT_BINARY)) - print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.') - raise SystemExit(1) - - if FETCH_MEDIA: - if run(['which', YOUTUBEDL_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([YOUTUBEDL_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode: - print('{red}[X] Missing dependency: youtube-dl{reset}'.format(**ANSI)) - print(' Install it, then confirm it was installed with: {} --version'.format(YOUTUBEDL_BINARY)) - print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.') - raise SystemExit(1) - except (KeyboardInterrupt, Exception): - raise SystemExit(1) - def check_url_parsing_invariants(): """Check that plain text regex URL parsing works as expected""" @@ -284,7 +215,7 @@ def fetch_page_title(url, timeout=10, progress=SHOW_PROGRESS): match = re.search(HTML_TITLE_REGEX, html) return match.group(1).strip() if match else None - except Exception as err: + except Exception as err: # noqa # print('[!] Failed to fetch title because of {}: {}'.format( # err.__class__.__name__, # err, @@ -560,10 +491,13 @@ def progress_bar(seconds, prefix): pass class TimedProgress: + """Show a progress bar and measure elapsed time until .end() is called""" + def __init__(self, seconds, prefix=''): if SHOW_PROGRESS: self.p = Process(target=progress_bar, args=(seconds, prefix)) self.p.start() + self.stats = { 'start_ts': datetime.now(), 'end_ts': None, @@ -571,7 +505,7 @@ class TimedProgress: } def end(self): - """immediately finish progress and clear the progressbar line""" + """immediately end progress, clear the progressbar line, and save end_ts""" end_ts = datetime.now() self.stats.update({ @@ -591,6 +525,8 @@ class TimedProgress: sys.stdout.flush() def download_url(url, timeout=TIMEOUT): + """Download the contents of a remote url and return the text""" + req = Request(url, headers={'User-Agent': WGET_USER_AGENT}) if CHECK_SSL_VALIDITY: @@ -615,34 +551,31 @@ def chmod_file(path, cwd='.', permissions=OUTPUT_PERMISSIONS, timeout=30): raise Exception('Failed to chmod {}/{}'.format(cwd, path)) -def chrome_args(binary=CHROME_BINARY, user_data_dir=CHROME_USER_DATA_DIR, - headless=CHROME_HEADLESS, sandbox=CHROME_SANDBOX, - check_ssl_validity=CHECK_SSL_VALIDITY, user_agent=CHROME_USER_AGENT, - resolution=RESOLUTION, timeout=TIMEOUT): +def chrome_args(**options): """helper to build up a chrome shell command with arguments""" - cmd_args = [binary] + cmd_args = [options['CHROME_BINARY']] - if headless: + if options['HEADLESS']: cmd_args += ('--headless',) - if not sandbox: + if not options['CHROME_SANDBOX']: # dont use GPU or sandbox when running inside docker container cmd_args += ('--no-sandbox', '--disable-gpu') - if not check_ssl_validity: + if not options['CHECK_SSL_VALIDITY']: cmd_args += ('--disable-web-security', '--ignore-certificate-errors') - if user_agent: - cmd_args += ('--user-agent={}'.format(user_agent),) + if options['CHROME_USER_AGENT']: + cmd_args += ('--user-agent={}'.format(options['CHROME_USER_AGENT']),) - if resolution: - cmd_args += ('--window-size={}'.format(RESOLUTION),) + if options['RESOLUTION']: + cmd_args += ('--window-size={}'.format(options['RESOLUTION']),) - if timeout: - cmd_args += ('--timeout={}'.format((timeout) * 1000),) + if options['TIMEOUT']: + cmd_args += ('--timeout={}'.format((options['TIMEOUT']) * 1000),) - if user_data_dir: - cmd_args.append('--user-data-dir={}'.format(user_data_dir)) + if options['CHROME_USER_DATA_DIR']: + cmd_args.append('--user-data-dir={}'.format(options['CHROME_USER_DATA_DIR'])) return cmd_args