diff --git a/archivebox/cli/archivebox_init.py b/archivebox/cli/archivebox_init.py index 153ff712..8955adaf 100755 --- a/archivebox/cli/archivebox_init.py +++ b/archivebox/cli/archivebox_init.py @@ -9,12 +9,14 @@ import sys import argparse from ..legacy.util import reject_stdin +from ..legacy.index import write_links_index from ..legacy.config import ( OUTPUT_DIR, SOURCES_DIR, ARCHIVE_DIR, DATABASE_DIR, ANSI, + stderr, ) @@ -28,16 +30,16 @@ def init(output_dir: str=OUTPUT_DIR): if not is_empty: if existing_index: - print('[√] You already have an archive setup up in this folder. To add new links, you can run:') - print(' archivebox add https://example.com') - print() - print('[i] Fore more usage and examples, run "archivebox help" or visit:') - print(' https://github.com/pirate/ArchiveBox/wiki/Usage') + stderr('[√] You already have an archive setup up in this folder. To add new links, you can run:') + stderr(' archivebox add https://example.com') + stderr() + stderr('[i] Fore more usage and examples, run "archivebox help" or visit:') + stderr(' https://github.com/pirate/ArchiveBox/wiki/Usage') # TODO: import old archivebox version's archive data folder raise SystemExit(1) else: - print( + stderr( ("{red}[X] This folder already has files in it. You must run init inside a completely empty directory.{reset}" "\n\n" " {lightred}Hint:{reset} To import a data folder created by an older version of ArchiveBox, \n" @@ -48,14 +50,17 @@ def init(output_dir: str=OUTPUT_DIR): raise SystemExit(1) - print('{green}[+] Initializing new archive directory: {}{reset}'.format(output_dir, **ANSI)) + stderr('{green}[+] Initializing new archive directory: {}{reset}'.format(output_dir, **ANSI)) os.makedirs(SOURCES_DIR) - print(f' > {SOURCES_DIR}') + stderr(f' > {SOURCES_DIR}') os.makedirs(ARCHIVE_DIR) - print(f' > {ARCHIVE_DIR}') + stderr(f' > {ARCHIVE_DIR}') os.makedirs(DATABASE_DIR) - print(f' > {DATABASE_DIR}') - print('{green}[√] Done.{reset}'.format(**ANSI)) + stderr(f' > {DATABASE_DIR}') + + write_links_index([], out_dir=OUTPUT_DIR, finished=True) + + stderr('{green}[√] Done.{reset}'.format(**ANSI)) def main(args=None): diff --git a/archivebox/cli/archivebox_version.py b/archivebox/cli/archivebox_version.py index d5eb7954..e8f1815b 100755 --- a/archivebox/cli/archivebox_version.py +++ b/archivebox/cli/archivebox_version.py @@ -4,42 +4,18 @@ __package__ = 'archivebox.cli' __command__ = 'archivebox version' __description__ = 'Print the ArchiveBox version and dependency information' +import os +import re import sys -import shutil import argparse from ..legacy.util import reject_stdin from ..legacy.config import ( + ANSI, VERSION, - - REPO_DIR, - PYTHON_DIR, - LEGACY_DIR, - TEMPLATES_DIR, - OUTPUT_DIR, - SOURCES_DIR, - ARCHIVE_DIR, - DATABASE_DIR, - - USE_CURL, - USE_WGET, - USE_CHROME, - FETCH_GIT, - FETCH_MEDIA, - - DJANGO_BINARY, - CURL_BINARY, - GIT_BINARY, - WGET_BINARY, - YOUTUBEDL_BINARY, - CHROME_BINARY, - - DJANGO_VERSION, - CURL_VERSION, - GIT_VERSION, - WGET_VERSION, - YOUTUBEDL_VERSION, - CHROME_VERSION, + FOLDERS, + DEPENDENCIES, + check_dependencies, ) @@ -51,51 +27,84 @@ def main(args=None): description=__description__, add_help=True, ) - parser.parse_args(args) + parser.add_argument( + '--quiet', '-q', + action='store_true', + help='Only print ArchiveBox version number and nothing else.', + ) + command = parser.parse_args(args) reject_stdin(__command__) - print('ArchiveBox v{}'.format(VERSION)) - print() - print('[i] Folder locations:') - print(' REPO_DIR: ', REPO_DIR) - print(' PYTHON_DIR: ', PYTHON_DIR) - print(' LEGACY_DIR: ', LEGACY_DIR) - print(' TEMPLATES_DIR: ', TEMPLATES_DIR) - print() - print(' OUTPUT_DIR: ', OUTPUT_DIR) - print(' SOURCES_DIR: ', SOURCES_DIR) - print(' ARCHIVE_DIR: ', ARCHIVE_DIR) - print(' DATABASE_DIR: ', DATABASE_DIR) - print() + if command.quiet: + print(VERSION) + else: + print('ArchiveBox v{}'.format(VERSION)) + print() + + print('{white}[i] Dependency versions:{reset}'.format(**ANSI)) + for name, dependency in DEPENDENCIES.items(): + print_dependency_version(name, dependency) + print() + print('{white}[i] Folder locations:{reset}'.format(**ANSI)) + for name, folder in FOLDERS.items(): + print_folder_status(name, folder) + + print() + check_dependencies() + + +def print_folder_status(name, folder): + if folder['enabled']: + if folder['is_valid']: + color, symbol, note = 'green', '√', 'valid' + else: + color, symbol, note, num_files = 'red', 'X', 'invalid', '?' + else: + color, symbol, note, num_files = 'lightyellow', '-', 'disabled', '-' + + if folder['path']: + if os.path.exists(folder['path']): + num_files = ( + f'{len(os.listdir(folder["path"]))} files' + if os.path.isdir(folder['path']) else + 'exists' + ) + else: + num_files = '?' + print( - '[√] Django:'.ljust(14), - 'python3 {} --version\n'.format(DJANGO_BINARY), - ' '*13, DJANGO_VERSION, '\n', + ANSI[color], + symbol, + ANSI['reset'], + name.ljust(24), + (folder["path"] or '').ljust(70), + num_files.ljust(14), + ANSI[color], + note, + ANSI['reset'], ) + + +def print_dependency_version(name, dependency): + if dependency['enabled']: + if dependency['is_valid']: + color, symbol, note = 'green', '√', 'valid' + version = 'v' + re.search(r'[\d\.]+', dependency['version'])[0] + else: + color, symbol, note, version = 'red', 'X', 'invalid', '?' + else: + color, symbol, note, version = 'lightyellow', '-', 'disabled', '-' + print( - '[{}] CURL:'.format('√' if USE_CURL else 'X').ljust(14), - '{} --version\n'.format(shutil.which(CURL_BINARY)), - ' '*13, CURL_VERSION, '\n', - ) - print( - '[{}] GIT:'.format('√' if FETCH_GIT else 'X').ljust(14), - '{} --version\n'.format(shutil.which(GIT_BINARY)), - ' '*13, GIT_VERSION, '\n', - ) - print( - '[{}] WGET:'.format('√' if USE_WGET else 'X').ljust(14), - '{} --version\n'.format(shutil.which(WGET_BINARY)), - ' '*13, WGET_VERSION, '\n', - ) - print( - '[{}] YOUTUBEDL:'.format('√' if FETCH_MEDIA else 'X').ljust(14), - '{} --version\n'.format(shutil.which(YOUTUBEDL_BINARY)), - ' '*13, YOUTUBEDL_VERSION, '\n', - ) - print( - '[{}] CHROME:'.format('√' if USE_CHROME else 'X').ljust(14), - '{} --version\n'.format(shutil.which(CHROME_BINARY)), - ' '*13, CHROME_VERSION, '\n', + ANSI[color], + symbol, + ANSI['reset'], + name.ljust(24), + (dependency["path"] or '').ljust(70), + version.ljust(14), + ANSI[color], + note, + ANSI['reset'], ) diff --git a/archivebox/legacy/config.py b/archivebox/legacy/config.py index d270c561..db8aadf3 100644 --- a/archivebox/legacy/config.py +++ b/archivebox/legacy/config.py @@ -109,45 +109,57 @@ URL_BLACKLIST_PTN = re.compile(URL_BLACKLIST, re.IGNORECASE) if URL_BLACKLIST el VERSION = open(os.path.join(REPO_DIR, 'VERSION'), 'r').read().strip() GIT_SHA = VERSION.split('+')[-1] or 'unknown' +HAS_INVALID_DEPENDENCIES = False +HAS_INVALID_DB = not os.path.exists(os.path.join(OUTPUT_DIR, 'index.json')) + +def stderr(*args): + sys.stderr.write(' '.join(str(a) for a in args) + '\n') ### Check Python environment python_vers = float('{}.{}'.format(sys.version_info.major, sys.version_info.minor)) if python_vers < 3.5: - print('{}[X] Python version is not new enough: {} (>3.5 is required){}'.format(ANSI['red'], python_vers, ANSI['reset'])) - print(' See https://github.com/pirate/ArchiveBox/wiki/Troubleshooting#python for help upgrading your Python installation.') + stderr('{}[X] Python version is not new enough: {} (>3.5 is required){}'.format(ANSI['red'], python_vers, ANSI['reset'])) + stderr(' See https://github.com/pirate/ArchiveBox/wiki/Troubleshooting#python for help upgrading your Python installation.') raise SystemExit(1) if sys.stdout.encoding.upper() not in ('UTF-8', 'UTF8'): - print('[X] Your system is running python3 scripts with a bad locale setting: {} (it should be UTF-8).'.format(sys.stdout.encoding)) - print(' To fix it, add the line "export PYTHONIOENCODING=UTF-8" to your ~/.bashrc file (without quotes)') - print('') - print(' Confirm that it\'s fixed by opening a new shell and running:') - print(' python3 -c "import sys; print(sys.stdout.encoding)" # should output UTF-8') - print('') - print(' Alternatively, run this script with:') - print(' env PYTHONIOENCODING=UTF-8 ./archive.py export.html') + stderr('[X] Your system is running python3 scripts with a bad locale setting: {} (it should be UTF-8).'.format(sys.stdout.encoding)) + stderr(' To fix it, add the line "export PYTHONIOENCODING=UTF-8" to your ~/.bashrc file (without quotes)') + stderr('') + stderr(' Confirm that it\'s fixed by opening a new shell and running:') + stderr(' python3 -c "import sys; print(sys.stdout.encoding)" # should output UTF-8') + stderr('') + stderr(' Alternatively, run this script with:') + stderr(' env PYTHONIOENCODING=UTF-8 ./archive.py export.html') # ****************************************************************************** # ***************************** Helper Functions ******************************* # ****************************************************************************** -def bin_version(binary: str) -> str: +def bin_version(binary: str) -> Optional[str]: """check the presence and return valid version line of a specified binary""" - if not shutil.which(binary): - print('{red}[X] Missing dependency: wget{reset}'.format(**ANSI)) - print(' Install it, then confirm it works with: {} --version'.format(binary)) - print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.') - raise SystemExit(1) - + global HAS_INVALID_DEPENDENCIES + binary = os.path.expanduser(binary) try: + if not shutil.which(binary): + raise Exception + version_str = run([binary, "--version"], stdout=PIPE, cwd=REPO_DIR).stdout.strip().decode() - return version_str.split('\n')[0].strip() + # take first 3 columns of first line of version info + return ' '.join(version_str.split('\n')[0].strip().split()[:3]) except Exception: - print('{red}[X] Unable to find a working version of {cmd}, is it installed and in your $PATH?'.format(cmd=binary, **ANSI)) - raise SystemExit(1) + HAS_INVALID_DEPENDENCIES = True + stderr('{red}[X] Unable to find working version of dependency: {}{reset}'.format(binary, **ANSI)) + stderr(' Make sure it\'s installed, then confirm it\'s working by running:') + stderr(' {} --version'.format(binary)) + stderr() + stderr(' If you don\'t want to install it, you can disable it via config. See here for more info:') + stderr(' https://github.com/pirate/ArchiveBox/wiki/Install') + stderr() + return None -def find_chrome_binary() -> str: +def find_chrome_binary() -> Optional[str]: """find any installed chrome binaries in the default locations""" # Precedence: Chromium, Chrome, Beta, Canary, Unstable, Dev # make sure data dir finding precedence order always matches binary finding order @@ -169,8 +181,9 @@ def find_chrome_binary() -> str: if full_path_exists: return name - print('{red}[X] Unable to find a working version of Chrome/Chromium, is it installed and in your $PATH?'.format(**ANSI)) - raise SystemExit(1) + stderr('{red}[X] Unable to find a working version of Chrome/Chromium, is it installed and in your $PATH?'.format(**ANSI)) + stderr() + return None def find_chrome_data_dir() -> Optional[str]: @@ -251,14 +264,122 @@ try: if not CHROME_BINARY: CHROME_BINARY = find_chrome_binary() or 'chromium-browser' CHROME_VERSION = None + if USE_CHROME: if CHROME_BINARY: CHROME_VERSION = bin_version(CHROME_BINARY) - # print('[i] Using Chrome binary: {}'.format(shutil.which(CHROME_BINARY) or CHROME_BINARY)) + # stderr('[i] Using Chrome binary: {}'.format(shutil.which(CHROME_BINARY) or CHROME_BINARY)) if CHROME_USER_DATA_DIR is None: CHROME_USER_DATA_DIR = find_chrome_data_dir() - # print('[i] Using Chrome data dir: {}'.format(os.path.abspath(CHROME_USER_DATA_DIR))) + elif CHROME_USER_DATA_DIR == '': + CHROME_USER_DATA_DIR = None + else: + if not os.path.exists(os.path.join(CHROME_USER_DATA_DIR, 'Default')): + stderr('{red}[X] Could not find profile "Default" in CHROME_USER_DATA_DIR:{reset} {}'.format(CHROME_USER_DATA_DIR, **ANSI)) + stderr(' Make sure you set it to a Chrome user data directory containing a Default profile folder.') + stderr(' For more info see:') + stderr(' https://github.com/pirate/ArchiveBox/wiki/Configuration#CHROME_USER_DATA_DIR') + if 'Default' in CHROME_USER_DATA_DIR: + stderr() + stderr(' Try removing /Default from the end e.g.:') + stderr(' CHROME_USER_DATA_DIR="{}"'.format(CHROME_USER_DATA_DIR.split('/Default')[0])) + raise SystemExit(1) + # stderr('[i] Using Chrome data dir: {}'.format(os.path.abspath(CHROME_USER_DATA_DIR))) + + + ### Summary Lookup Dicts + FOLDERS = { + 'REPO_DIR': { + 'path': os.path.abspath(REPO_DIR), + 'enabled': True, + 'is_valid': os.path.exists(os.path.join(REPO_DIR, '.github')), + }, + 'PYTHON_DIR': { + 'path': os.path.abspath(PYTHON_DIR), + 'enabled': True, + 'is_valid': os.path.exists(os.path.join(PYTHON_DIR, '__main__.py')), + }, + 'LEGACY_DIR': { + 'path': os.path.abspath(LEGACY_DIR), + 'enabled': True, + 'is_valid': os.path.exists(os.path.join(LEGACY_DIR, 'util.py')), + }, + 'TEMPLATES_DIR': { + 'path': os.path.abspath(TEMPLATES_DIR), + 'enabled': True, + 'is_valid': os.path.exists(os.path.join(TEMPLATES_DIR, 'static')), + }, + 'OUTPUT_DIR': { + 'path': os.path.abspath(OUTPUT_DIR), + 'enabled': True, + 'is_valid': os.path.exists(os.path.join(OUTPUT_DIR, 'index.json')), + }, + 'SOURCES_DIR': { + 'path': os.path.abspath(SOURCES_DIR), + 'enabled': True, + 'is_valid': os.path.exists(SOURCES_DIR), + }, + 'ARCHIVE_DIR': { + 'path': os.path.abspath(ARCHIVE_DIR), + 'enabled': True, + 'is_valid': os.path.exists(ARCHIVE_DIR), + }, + 'DATABASE_DIR': { + 'path': os.path.abspath(DATABASE_DIR), + 'enabled': True, + 'is_valid': os.path.exists(os.path.join(DATABASE_DIR, DATABASE_FILE)), + }, + 'CHROME_USER_DATA_DIR': { + 'path': CHROME_USER_DATA_DIR and os.path.abspath(CHROME_USER_DATA_DIR), + 'enabled': USE_CHROME and CHROME_USER_DATA_DIR, + 'is_valid': os.path.exists(os.path.join(CHROME_USER_DATA_DIR, 'Default')) if CHROME_USER_DATA_DIR else False, + }, + 'COOKIES_FILE': { + 'path': COOKIES_FILE and os.path.abspath(COOKIES_FILE), + 'enabled': USE_WGET and COOKIES_FILE, + 'is_valid': COOKIES_FILE and os.path.exists(COOKIES_FILE), + }, + } + + DEPENDENCIES = { + 'DJANGO_BINARY': { + 'path': DJANGO_BINARY, + 'version': DJANGO_VERSION, + 'enabled': True, + 'is_valid': bool(DJANGO_VERSION), + }, + 'CURL_BINARY': { + 'path': CURL_BINARY and shutil.which(CURL_BINARY), + 'version': CURL_VERSION, + 'enabled': USE_CURL, + 'is_valid': bool(CURL_VERSION), + }, + 'WGET_BINARY': { + 'path': WGET_BINARY and shutil.which(WGET_BINARY), + 'version': WGET_VERSION, + 'enabled': USE_WGET, + 'is_valid': bool(WGET_VERSION), + }, + 'GIT_BINARY': { + 'path': GIT_BINARY and shutil.which(GIT_BINARY), + 'version': GIT_VERSION, + 'enabled': FETCH_GIT, + 'is_valid': bool(GIT_VERSION), + }, + 'YOUTUBEDL_BINARY': { + 'path': YOUTUBEDL_BINARY and shutil.which(YOUTUBEDL_BINARY), + 'version': YOUTUBEDL_VERSION, + 'enabled': FETCH_MEDIA, + 'is_valid': bool(YOUTUBEDL_VERSION), + }, + 'CHROME_BINARY': { + 'path': CHROME_BINARY and shutil.which(CHROME_BINARY), + 'version': CHROME_VERSION, + 'enabled': USE_CHROME, + 'is_valid': bool(CHROME_VERSION), + }, + } CHROME_OPTIONS = { 'TIMEOUT': TIMEOUT, @@ -270,14 +391,39 @@ try: 'CHROME_USER_AGENT': CHROME_USER_AGENT, 'CHROME_USER_DATA_DIR': CHROME_USER_DATA_DIR, } + # PYPPETEER_ARGS = { # 'headless': CHROME_HEADLESS, # 'ignoreHTTPSErrors': not CHECK_SSL_VALIDITY, # # 'executablePath': CHROME_BINARY, # } + except KeyboardInterrupt: raise SystemExit(1) -except: - print('[X] There was an error while reading configuration. Your archive data is unaffected.') +except Exception as e: + stderr() + stderr('{red}[X] Error during configuration: {} {}{reset}'.format(e.__class__.__name__, e, **ANSI)) + stderr(' Your archive data is unaffected.') + stderr(' Check your config or environemnt variables for mistakes and try again.') + stderr(' For more info see:') + stderr(' https://github.com/pirate/ArchiveBox/wiki/Configuration') + stderr() raise + + +def check_dependencies() -> None: + if HAS_INVALID_DEPENDENCIES: + stderr('{red}[X] Missing some required dependencies.{reset}'.format(**ANSI)) + raise SystemExit(1) + + if HAS_INVALID_DB: + stderr('{red}[X] No archive data found in:{reset} {}'.format(OUTPUT_DIR, **ANSI)) + stderr(' Are you running archivebox in the right folder?') + stderr(' cd path/to/your/archive') + stderr(' archivebox [command]') + stderr() + stderr(' To create a new archive folder, run:') + stderr(' mkdir new_archive_dir && cd new_archive_dir') + stderr(' archivebox init') + raise SystemExit(1) diff --git a/archivebox/legacy/main.py b/archivebox/legacy/main.py index 12680f5b..7597945a 100644 --- a/archivebox/legacy/main.py +++ b/archivebox/legacy/main.py @@ -14,6 +14,7 @@ from .archive_methods import archive_link from .config import ( ONLY_NEW, OUTPUT_DIR, + check_dependencies, ) from .logs import ( log_archiving_started, @@ -26,6 +27,8 @@ from .logs import ( def update_archive_data(import_path: Optional[str]=None, resume: Optional[float]=None, only_new: bool=False) -> List[Link]: """The main ArchiveBox entrancepoint. Everything starts here.""" + check_dependencies() + # Step 1: Load list of links from the existing index # merge in and dedupe new links from import_path all_links, new_links = load_links_index(out_dir=OUTPUT_DIR, import_path=import_path)