From 52871f42e71e6fe399010611c104283b8c921492 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 26 Mar 2019 05:31:27 -0400 Subject: [PATCH] cleaner config with type hints --- archivebox/config.py | 282 ++++++++++++++++++++----------------------- 1 file changed, 130 insertions(+), 152 deletions(-) diff --git a/archivebox/config.py b/archivebox/config.py index 7865e976..13d64c3a 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -3,8 +3,12 @@ import re import sys import shutil +from typing import Optional from subprocess import run, PIPE, DEVNULL + +OUTPUT_DIR: str + # ****************************************************************************** # Documentation: https://github.com/pirate/ArchiveBox/wiki/Configuration # Use the 'env' command to pass config options to ArchiveBox. e.g.: @@ -14,9 +18,11 @@ from subprocess import run, PIPE, DEVNULL IS_TTY = sys.stdout.isatty() USE_COLOR = os.getenv('USE_COLOR', str(IS_TTY) ).lower() == 'true' SHOW_PROGRESS = os.getenv('SHOW_PROGRESS', str(IS_TTY) ).lower() == 'true' + +OUTPUT_DIR = os.getenv('OUTPUT_DIR', '') ONLY_NEW = os.getenv('ONLY_NEW', 'False' ).lower() == 'true' -MEDIA_TIMEOUT = int(os.getenv('MEDIA_TIMEOUT', '3600')) TIMEOUT = int(os.getenv('TIMEOUT', '60')) +MEDIA_TIMEOUT = int(os.getenv('MEDIA_TIMEOUT', '3600')) OUTPUT_PERMISSIONS = os.getenv('OUTPUT_PERMISSIONS', '755' ) FOOTER_INFO = os.getenv('FOOTER_INFO', 'Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests.',) @@ -47,20 +53,18 @@ WGET_BINARY = os.getenv('WGET_BINARY', 'wget') YOUTUBEDL_BINARY = os.getenv('YOUTUBEDL_BINARY', 'youtube-dl') CHROME_BINARY = os.getenv('CHROME_BINARY', None) -try: - OUTPUT_DIR = os.path.abspath(os.getenv('OUTPUT_DIR')) -except Exception: - OUTPUT_DIR = None - +CHROME_SANDBOX = os.getenv('CHROME_SANDBOX', 'True').lower() == 'true' # ****************************************************************************** -# **************************** Derived Settings ******************************** +# *************************** Directory Settings ******************************* # ****************************************************************************** REPO_DIR = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), '..')) -if not OUTPUT_DIR: +if OUTPUT_DIR: + OUTPUT_DIR = os.path.abspath(OUTPUT_DIR) +else: OUTPUT_DIR = os.path.join(REPO_DIR, 'output') - + ARCHIVE_DIR_NAME = 'archive' SOURCES_DIR_NAME = 'sources' ARCHIVE_DIR = os.path.join(OUTPUT_DIR, ARCHIVE_DIR_NAME) @@ -69,13 +73,87 @@ SOURCES_DIR = os.path.join(OUTPUT_DIR, SOURCES_DIR_NAME) PYTHON_PATH = os.path.join(REPO_DIR, 'archivebox') TEMPLATES_DIR = os.path.join(PYTHON_PATH, 'templates') -CHROME_SANDBOX = os.getenv('CHROME_SANDBOX', 'True').lower() == 'true' -USE_CHROME = FETCH_PDF or FETCH_SCREENSHOT or FETCH_DOM -USE_WGET = FETCH_WGET or FETCH_WGET_REQUISITES or FETCH_WARC +if COOKIES_FILE: + COOKIES_FILE = os.path.abspath(COOKIES_FILE) + +# ****************************************************************************** +# ************************ Environment & Dependencies ************************** +# ****************************************************************************** + +def check_version(binary: str) -> str: + """check the presence and return valid version line of a specified binary""" + if run(['which', binary], stdout=DEVNULL, stderr=DEVNULL).returncode: + print('{red}[X] Missing dependency: wget{reset}'.format(**ANSI)) + print(' Install it, then confirm it works with: {} --version'.format(binary)) + print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.') + raise SystemExit(1) + + try: + version_str = run([binary, "--version"], stdout=PIPE, cwd=REPO_DIR).stdout.strip().decode() + return version_str.split('\n')[0].strip() + except Exception: + print('{red}[X] Unable to find a working version of {cmd}, is it installed and in your $PATH?'.format(cmd=binary, **ANSI)) + raise SystemExit(1) + +def find_chrome_binary() -> Optional[str]: + """find any installed chrome binaries in the default locations""" + # Precedence: Chromium, Chrome, Beta, Canary, Unstable, Dev + default_executable_paths = ( + 'chromium-browser', + 'chromium', + '/Applications/Chromium.app/Contents/MacOS/Chromium', + 'google-chrome', + '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome', + 'google-chrome-stable', + 'google-chrome-beta', + 'google-chrome-canary', + '/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary', + 'google-chrome-unstable', + 'google-chrome-dev', + ) + for name in default_executable_paths: + full_path_exists = shutil.which(name) + if full_path_exists: + return name + + print('{red}[X] Unable to find a working version of Chrome/Chromium, is it installed and in your $PATH?'.format(**ANSI)) + raise SystemExit(1) + +def find_chrome_data_dir() -> Optional[str]: + """find any installed chrome user data directories in the default locations""" + # Precedence: Chromium, Chrome, Beta, Canary, Unstable, Dev + default_profile_paths = ( + '~/.config/chromium', + '~/Library/Application Support/Chromium', + '~/AppData/Local/Chromium/User Data', + '~/.config/google-chrome', + '~/Library/Application Support/Google/Chrome', + '~/AppData/Local/Google/Chrome/User Data', + '~/.config/google-chrome-stable', + '~/.config/google-chrome-beta', + '~/Library/Application Support/Google/Chrome Canary', + '~/AppData/Local/Google/Chrome SxS/User Data', + '~/.config/google-chrome-unstable', + '~/.config/google-chrome-dev', + ) + for path in default_profile_paths: + full_path = os.path.expanduser(path) + if os.path.exists(full_path): + return full_path + return None + +def get_git_version() -> str: + """get the git commit hash of the python code folder (aka code version)""" + try: + return run([GIT_BINARY, 'rev-list', '-1', 'HEAD', './'], stdout=PIPE, cwd=REPO_DIR).stdout.strip().decode() + except Exception: + print('[!] Warning: unable to determine git version, is git installed and in your $PATH?') + return 'unknown' -########################### Environment & Dependencies ######################### try: + GIT_SHA = get_git_version() + ### Terminal Configuration TERM_WIDTH = lambda: shutil.get_terminal_size((100, 10)).columns ANSI = { @@ -93,66 +171,6 @@ try: # dont show colors if USE_COLOR is False ANSI = {k: '' for k in ANSI.keys()} - - if not CHROME_BINARY: - # Precedence: Chromium, Chrome, Beta, Canary, Unstable, Dev - default_executable_paths = ( - 'chromium-browser', - 'chromium', - '/Applications/Chromium.app/Contents/MacOS/Chromium', - 'google-chrome', - '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome', - 'google-chrome-stable', - 'google-chrome-beta', - 'google-chrome-canary', - '/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary', - 'google-chrome-unstable', - 'google-chrome-dev', - ) - for name in default_executable_paths: - full_path_exists = shutil.which(name) - if full_path_exists: - CHROME_BINARY = name - break - else: - CHROME_BINARY = 'chromium-browser' - # print('[i] Using Chrome binary: {}'.format(shutil.which(CHROME_BINARY) or CHROME_BINARY)) - - if CHROME_USER_DATA_DIR is None: - # Precedence: Chromium, Chrome, Beta, Canary, Unstable, Dev - default_profile_paths = ( - '~/.config/chromium', - '~/Library/Application Support/Chromium', - '~/AppData/Local/Chromium/User Data', - '~/.config/google-chrome', - '~/Library/Application Support/Google/Chrome', - '~/AppData/Local/Google/Chrome/User Data', - '~/.config/google-chrome-stable', - '~/.config/google-chrome-beta', - '~/Library/Application Support/Google/Chrome Canary', - '~/AppData/Local/Google/Chrome SxS/User Data', - '~/.config/google-chrome-unstable', - '~/.config/google-chrome-dev', - ) - for path in default_profile_paths: - full_path = os.path.expanduser(path) - if os.path.exists(full_path): - CHROME_USER_DATA_DIR = full_path - break - # print('[i] Using Chrome data dir: {}'.format(os.path.abspath(CHROME_USER_DATA_DIR))) - - CHROME_OPTIONS = { - 'TIMEOUT': TIMEOUT, - 'RESOLUTION': RESOLUTION, - 'CHECK_SSL_VALIDITY': CHECK_SSL_VALIDITY, - 'CHROME_BINARY': CHROME_BINARY, - 'CHROME_HEADLESS': CHROME_HEADLESS, - 'CHROME_SANDBOX': CHROME_SANDBOX, - 'CHROME_USER_AGENT': CHROME_USER_AGENT, - 'CHROME_USER_DATA_DIR': CHROME_USER_DATA_DIR, - } - - ### Check Python environment python_vers = float('{}.{}'.format(sys.version_info.major, sys.version_info.minor)) if python_vers < 3.5: @@ -170,98 +188,58 @@ try: print(' Alternatively, run this script with:') print(' env PYTHONIOENCODING=UTF-8 ./archive.py export.html') - ### Get code version by parsing git log - GIT_SHA = 'unknown' - try: - GIT_SHA = run([GIT_BINARY, 'rev-list', '-1', 'HEAD', './'], stdout=PIPE, cwd=REPO_DIR).stdout.strip().decode() - except Exception: - print('[!] Warning: unable to determine git version, is git installed and in your $PATH?') - - ### Get absolute path for cookies file - try: - COOKIES_FILE = os.path.abspath(COOKIES_FILE) if COOKIES_FILE else None - except Exception: - print('[!] Warning: unable to get full path to COOKIES_FILE, are you sure you specified it correctly?') - raise ### Make sure curl is installed - if FETCH_FAVICON or SUBMIT_ARCHIVE_DOT_ORG: - if run(['which', CURL_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([CURL_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode: - print('{red}[X] Missing dependency: curl{reset}'.format(**ANSI)) - print(' Install it, then confirm it works with: {} --version'.format(CURL_BINARY)) - print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.') - raise SystemExit(1) + USE_CURL = FETCH_FAVICON or SUBMIT_ARCHIVE_DOT_ORG + CURL_VERSION = USE_CURL and check_version(CURL_BINARY) ### Make sure wget is installed and calculate version - if FETCH_WGET or FETCH_WARC: - if run(['which', WGET_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([WGET_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode: - print('{red}[X] Missing dependency: wget{reset}'.format(**ANSI)) - print(' Install it, then confirm it works with: {} --version'.format(WGET_BINARY)) - print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.') - raise SystemExit(1) - - WGET_VERSION = 'unknown' - try: - wget_vers_str = run([WGET_BINARY, "--version"], stdout=PIPE, cwd=REPO_DIR).stdout.strip().decode() - WGET_VERSION = wget_vers_str.split('\n')[0].split(' ')[2] - except Exception: - if USE_WGET: - print('[!] Warning: unable to determine wget version, is wget installed and in your $PATH?') - - WGET_USER_AGENT = WGET_USER_AGENT.format(GIT_SHA=GIT_SHA[:9], WGET_VERSION=WGET_VERSION) - + USE_WGET = FETCH_WGET or FETCH_WARC + WGET_VERSION = USE_WGET and check_version(WGET_BINARY) + WGET_USER_AGENT = WGET_USER_AGENT.format( + GIT_SHA=GIT_SHA[:9], + WGET_VERSION=WGET_VERSION or '', + ) + ### Make sure chrome is installed and calculate version - if FETCH_PDF or FETCH_SCREENSHOT or FETCH_DOM: - if run(['which', CHROME_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode: - print('{}[X] Missing dependency: {}{}'.format(ANSI['red'], CHROME_BINARY, ANSI['reset'])) - print(' Install it, then confirm it works with: {} --version'.format(CHROME_BINARY)) - print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.') - raise SystemExit(1) + USE_CHROME = FETCH_PDF or FETCH_SCREENSHOT or FETCH_DOM + CHROME_VERSION = None + if USE_CHROME: + if CHROME_BINARY is None: + CHROME_BINARY = find_chrome_binary() + CHROME_VERSION = check_version(CHROME_BINARY) + # print('[i] Using Chrome binary: {}'.format(shutil.which(CHROME_BINARY) or CHROME_BINARY)) - # parse chrome --version e.g. Google Chrome 61.0.3114.0 canary / Chromium 59.0.3029.110 built on Ubuntu, running on Ubuntu 16.04 - try: - result = run([CHROME_BINARY, '--version'], stdout=PIPE) - version_str = result.stdout.decode('utf-8') - version_lines = re.sub("(Google Chrome|Chromium) (\\d+?)\\.(\\d+?)\\.(\\d+?).*?$", "\\2", version_str).split('\n') - version = [l for l in version_lines if l.isdigit()][-1] - if int(version) < 59: - print(version_lines) - print('{red}[X] Chrome version must be 59 or greater for headless PDF, screenshot, and DOM saving{reset}'.format(**ANSI)) - print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.') - raise SystemExit(1) - except (IndexError, TypeError, OSError): - print('{red}[X] Failed to parse Chrome version, is it installed properly?{reset}'.format(**ANSI)) - print(' Install it, then confirm it works with: {} --version'.format(CHROME_BINARY)) - print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.') - raise SystemExit(1) - - CHROME_VERSION = 'unknown' - try: - chrome_vers_str = run([CHROME_BINARY, "--version"], stdout=PIPE, cwd=REPO_DIR).stdout.strip().decode() - CHROME_VERSION = [v for v in chrome_vers_str.strip().split(' ') if v.replace('.', '').isdigit()][0] - except Exception: - if USE_CHROME: - print('[!] Warning: unable to determine chrome version, is chrome installed and in your $PATH?') + if CHROME_USER_DATA_DIR is None: + CHROME_USER_DATA_DIR = find_chrome_data_dir() + # print('[i] Using Chrome data dir: {}'.format(os.path.abspath(CHROME_USER_DATA_DIR))) ### Make sure git is installed - if FETCH_GIT: - if run(['which', GIT_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([GIT_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode: - print('{red}[X] Missing dependency: git{reset}'.format(**ANSI)) - print(' Install it, then confirm it works with: {} --version'.format(GIT_BINARY)) - print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.') - raise SystemExit(1) + GIT_VERSION = FETCH_GIT and check_version(GIT_BINARY) ### Make sure youtube-dl is installed - if FETCH_MEDIA: - if run(['which', YOUTUBEDL_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([YOUTUBEDL_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode: - print('{red}[X] Missing dependency: youtube-dl{reset}'.format(**ANSI)) - print(' Install it, then confirm it was installed with: {} --version'.format(YOUTUBEDL_BINARY)) - print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.') - raise SystemExit(1) + YOUTUBEDL_VERSION = FETCH_MEDIA and check_version(YOUTUBEDL_BINARY) + + ### Chrome housekeeping options + CHROME_OPTIONS = { + 'TIMEOUT': TIMEOUT, + 'RESOLUTION': RESOLUTION, + 'CHECK_SSL_VALIDITY': CHECK_SSL_VALIDITY, + 'CHROME_BINARY': CHROME_BINARY, + 'CHROME_HEADLESS': CHROME_HEADLESS, + 'CHROME_SANDBOX': CHROME_SANDBOX, + 'CHROME_USER_AGENT': CHROME_USER_AGENT, + 'CHROME_USER_DATA_DIR': CHROME_USER_DATA_DIR, + } + # PYPPETEER_ARGS = { + # 'headless': CHROME_HEADLESS, + # 'ignoreHTTPSErrors': not CHECK_SSL_VALIDITY, + # # 'executablePath': CHROME_BINARY, + # } except KeyboardInterrupt: raise SystemExit(1) except: - print('[X] There was an error during the startup procedure, your archive data is unaffected.') + print('[X] There was an error while reading configuration. Your archive data is unaffected.') raise