better dependency checking system and guards

This commit is contained in:
Nick Sweeting 2019-04-11 03:42:35 -04:00
parent fafe6e75c5
commit 718e25c973
4 changed files with 271 additions and 108 deletions

View file

@ -9,12 +9,14 @@ import sys
import argparse
from ..legacy.util import reject_stdin
from ..legacy.index import write_links_index
from ..legacy.config import (
OUTPUT_DIR,
SOURCES_DIR,
ARCHIVE_DIR,
DATABASE_DIR,
ANSI,
stderr,
)
@ -28,16 +30,16 @@ def init(output_dir: str=OUTPUT_DIR):
if not is_empty:
if existing_index:
print('[√] You already have an archive setup up in this folder. To add new links, you can run:')
print(' archivebox add https://example.com')
print()
print('[i] Fore more usage and examples, run "archivebox help" or visit:')
print(' https://github.com/pirate/ArchiveBox/wiki/Usage')
stderr('[√] You already have an archive setup up in this folder. To add new links, you can run:')
stderr(' archivebox add https://example.com')
stderr()
stderr('[i] Fore more usage and examples, run "archivebox help" or visit:')
stderr(' https://github.com/pirate/ArchiveBox/wiki/Usage')
# TODO: import old archivebox version's archive data folder
raise SystemExit(1)
else:
print(
stderr(
("{red}[X] This folder already has files in it. You must run init inside a completely empty directory.{reset}"
"\n\n"
" {lightred}Hint:{reset} To import a data folder created by an older version of ArchiveBox, \n"
@ -48,14 +50,17 @@ def init(output_dir: str=OUTPUT_DIR):
raise SystemExit(1)
print('{green}[+] Initializing new archive directory: {}{reset}'.format(output_dir, **ANSI))
stderr('{green}[+] Initializing new archive directory: {}{reset}'.format(output_dir, **ANSI))
os.makedirs(SOURCES_DIR)
print(f' > {SOURCES_DIR}')
stderr(f' > {SOURCES_DIR}')
os.makedirs(ARCHIVE_DIR)
print(f' > {ARCHIVE_DIR}')
stderr(f' > {ARCHIVE_DIR}')
os.makedirs(DATABASE_DIR)
print(f' > {DATABASE_DIR}')
print('{green}[√] Done.{reset}'.format(**ANSI))
stderr(f' > {DATABASE_DIR}')
write_links_index([], out_dir=OUTPUT_DIR, finished=True)
stderr('{green}[√] Done.{reset}'.format(**ANSI))
def main(args=None):

View file

@ -4,42 +4,18 @@ __package__ = 'archivebox.cli'
__command__ = 'archivebox version'
__description__ = 'Print the ArchiveBox version and dependency information'
import os
import re
import sys
import shutil
import argparse
from ..legacy.util import reject_stdin
from ..legacy.config import (
ANSI,
VERSION,
REPO_DIR,
PYTHON_DIR,
LEGACY_DIR,
TEMPLATES_DIR,
OUTPUT_DIR,
SOURCES_DIR,
ARCHIVE_DIR,
DATABASE_DIR,
USE_CURL,
USE_WGET,
USE_CHROME,
FETCH_GIT,
FETCH_MEDIA,
DJANGO_BINARY,
CURL_BINARY,
GIT_BINARY,
WGET_BINARY,
YOUTUBEDL_BINARY,
CHROME_BINARY,
DJANGO_VERSION,
CURL_VERSION,
GIT_VERSION,
WGET_VERSION,
YOUTUBEDL_VERSION,
CHROME_VERSION,
FOLDERS,
DEPENDENCIES,
check_dependencies,
)
@ -51,51 +27,84 @@ def main(args=None):
description=__description__,
add_help=True,
)
parser.parse_args(args)
parser.add_argument(
'--quiet', '-q',
action='store_true',
help='Only print ArchiveBox version number and nothing else.',
)
command = parser.parse_args(args)
reject_stdin(__command__)
print('ArchiveBox v{}'.format(VERSION))
print()
print('[i] Folder locations:')
print(' REPO_DIR: ', REPO_DIR)
print(' PYTHON_DIR: ', PYTHON_DIR)
print(' LEGACY_DIR: ', LEGACY_DIR)
print(' TEMPLATES_DIR: ', TEMPLATES_DIR)
print()
print(' OUTPUT_DIR: ', OUTPUT_DIR)
print(' SOURCES_DIR: ', SOURCES_DIR)
print(' ARCHIVE_DIR: ', ARCHIVE_DIR)
print(' DATABASE_DIR: ', DATABASE_DIR)
print()
if command.quiet:
print(VERSION)
else:
print('ArchiveBox v{}'.format(VERSION))
print()
print('{white}[i] Dependency versions:{reset}'.format(**ANSI))
for name, dependency in DEPENDENCIES.items():
print_dependency_version(name, dependency)
print()
print('{white}[i] Folder locations:{reset}'.format(**ANSI))
for name, folder in FOLDERS.items():
print_folder_status(name, folder)
print()
check_dependencies()
def print_folder_status(name, folder):
    """Print a one-line status row for a single folder entry.

    Args:
        name: label printed in the first column of the row.
        folder: dict providing the keys 'path', 'enabled' and 'is_valid'.
    """
    if folder['enabled']:
        if folder['is_valid']:
            # NOTE(review): the symbol for the valid case is an empty string here,
            # while other messages in this codebase use a check mark - confirm.
            color, symbol, note = 'green', '', 'valid'
        else:
            color, symbol, note = 'red', 'X', 'invalid'
        # Default for enabled folders, so num_files is always bound even when
        # the entry has no path.
        num_files = '?'
    else:
        color, symbol, note, num_files = 'lightyellow', '-', 'disabled', '-'

    path = folder['path']
    if path:
        if os.path.isdir(path):
            num_files = f'{len(os.listdir(path))} files'
        elif os.path.exists(path):
            # Exists but is not a directory (e.g. a plain file).
            num_files = 'exists'
        else:
            num_files = '?'

    print(
        ANSI[color],
        symbol,
        ANSI['reset'],
        name.ljust(24),
        (folder["path"] or '').ljust(70),
        num_files.ljust(14),
        ANSI[color],
        note,
        ANSI['reset'],
    )
def print_dependency_version(name, dependency):
    """Print a one-line status row for a single dependency entry.

    Args:
        name: label printed in the first column of the row.
        dependency: dict providing the keys 'path', 'version', 'enabled'
            and 'is_valid'.
    """
    if dependency['enabled']:
        if dependency['is_valid']:
            # NOTE(review): the symbol for the valid case is an empty string here,
            # while other messages in this codebase use a check mark - confirm.
            color, symbol, note = 'green', '', 'valid'
            # First run of digits/dots in the version text; fall back to '?'
            # instead of subscripting None when nothing matches.
            match = re.search(r'[\d\.]+', dependency['version'])
            version = 'v' + match[0] if match else '?'
        else:
            color, symbol, note, version = 'red', 'X', 'invalid', '?'
    else:
        color, symbol, note, version = 'lightyellow', '-', 'disabled', '-'

    print(
        ANSI[color],
        symbol,
        ANSI['reset'],
        name.ljust(24),
        (dependency["path"] or '').ljust(70),
        version.ljust(14),
        ANSI[color],
        note,
        ANSI['reset'],
    )

View file

@ -109,45 +109,57 @@ URL_BLACKLIST_PTN = re.compile(URL_BLACKLIST, re.IGNORECASE) if URL_BLACKLIST el
# Read the version string from the VERSION file in REPO_DIR. The context
# manager closes the file handle as soon as the text has been read.
with open(os.path.join(REPO_DIR, 'VERSION'), 'r') as version_file:
    VERSION = version_file.read().strip()

# Text after the last '+' in VERSION, or 'unknown' when that text is empty.
# NOTE(review): a VERSION containing no '+' yields the whole version string
# here rather than 'unknown' - confirm that this is intended.
GIT_SHA = VERSION.split('+')[-1] or 'unknown'

# Set to True by bin_version() when a binary cannot be run; read by check_dependencies().
HAS_INVALID_DEPENDENCIES = False
# True when OUTPUT_DIR has no index.json at import time; read by check_dependencies().
HAS_INVALID_DB = not os.path.exists(os.path.join(OUTPUT_DIR, 'index.json'))
def stderr(*args):
    """Write the given values to standard error as a single line.

    Every argument is converted with ``str()``, the pieces are joined with
    one space, and a trailing newline is appended. Calling it with no
    arguments writes an empty line.
    """
    sys.stderr.write(' '.join(str(a) for a in args) + '\n')
### Check Python environment
# Kept as a float so the module-level name stays available, but it must not be
# used for comparisons: float('3.10') == 3.1.
python_vers = float('{}.{}'.format(sys.version_info.major, sys.version_info.minor))

# Compare (major, minor) as a tuple so that 3.10 and later are not read as 3.1.
if sys.version_info[:2] < (3, 5):
    stderr('{}[X] Python version is not new enough: {} (>3.5 is required){}'.format(
        ANSI['red'],
        '{}.{}'.format(sys.version_info.major, sys.version_info.minor),
        ANSI['reset'],
    ))
    stderr(' See https://github.com/pirate/ArchiveBox/wiki/Troubleshooting#python for help upgrading your Python installation.')
    raise SystemExit(1)

# sys.stdout.encoding may be None (e.g. when stdout has been replaced), so fall
# back to an empty string instead of calling .upper() on None.
if (sys.stdout.encoding or '').upper() not in ('UTF-8', 'UTF8'):
    stderr('[X] Your system is running python3 scripts with a bad locale setting: {} (it should be UTF-8).'.format(sys.stdout.encoding))
    stderr(' To fix it, add the line "export PYTHONIOENCODING=UTF-8" to your ~/.bashrc file (without quotes)')
    stderr('')
    stderr(' Confirm that it\'s fixed by opening a new shell and running:')
    stderr(' python3 -c "import sys; print(sys.stdout.encoding)" # should output UTF-8')
    stderr('')
    stderr(' Alternatively, run this script with:')
    stderr(' env PYTHONIOENCODING=UTF-8 ./archive.py export.html')
# ******************************************************************************
# ***************************** Helper Functions *******************************
# ******************************************************************************
def bin_version(binary: str) -> Optional[str]:
    """Check the presence of a binary and return a short version string for it.

    Runs ``<binary> --version`` and returns the first three whitespace-separated
    columns of the first line of its output.

    Args:
        binary: name or path of the executable; a leading ``~`` is expanded.

    Returns:
        The shortened version line, or None when the binary cannot be found
        or run. In that case HAS_INVALID_DEPENDENCIES is set to True and a
        hint is written via stderr().
    """
    global HAS_INVALID_DEPENDENCIES

    binary = os.path.expanduser(binary)
    try:
        if shutil.which(binary):
            version_str = run([binary, "--version"], stdout=PIPE, cwd=REPO_DIR).stdout.strip().decode()
            # take first 3 columns of first line of version info
            return ' '.join(version_str.split('\n')[0].strip().split()[:3])
    except Exception:
        # Best-effort probe: any failure to run the binary or decode its output
        # is reported below as an unusable dependency instead of propagating.
        pass

    HAS_INVALID_DEPENDENCIES = True
    stderr('{red}[X] Unable to find working version of dependency: {}{reset}'.format(binary, **ANSI))
    stderr(' Make sure it\'s installed, then confirm it\'s working by running:')
    stderr(' {} --version'.format(binary))
    stderr()
    stderr(' If you don\'t want to install it, you can disable it via config. See here for more info:')
    stderr(' https://github.com/pirate/ArchiveBox/wiki/Install')
    stderr()
    return None
def find_chrome_binary() -> str:
def find_chrome_binary() -> Optional[str]:
"""find any installed chrome binaries in the default locations"""
# Precedence: Chromium, Chrome, Beta, Canary, Unstable, Dev
# make sure data dir finding precedence order always matches binary finding order
@ -169,8 +181,9 @@ def find_chrome_binary() -> str:
if full_path_exists:
return name
print('{red}[X] Unable to find a working version of Chrome/Chromium, is it installed and in your $PATH?'.format(**ANSI))
raise SystemExit(1)
stderr('{red}[X] Unable to find a working version of Chrome/Chromium, is it installed and in your $PATH?'.format(**ANSI))
stderr()
return None
def find_chrome_data_dir() -> Optional[str]:
@ -251,14 +264,122 @@ try:
if not CHROME_BINARY:
CHROME_BINARY = find_chrome_binary() or 'chromium-browser'
CHROME_VERSION = None
if USE_CHROME:
if CHROME_BINARY:
CHROME_VERSION = bin_version(CHROME_BINARY)
# print('[i] Using Chrome binary: {}'.format(shutil.which(CHROME_BINARY) or CHROME_BINARY))
# stderr('[i] Using Chrome binary: {}'.format(shutil.which(CHROME_BINARY) or CHROME_BINARY))
if CHROME_USER_DATA_DIR is None:
CHROME_USER_DATA_DIR = find_chrome_data_dir()
# print('[i] Using Chrome data dir: {}'.format(os.path.abspath(CHROME_USER_DATA_DIR)))
elif CHROME_USER_DATA_DIR == '':
CHROME_USER_DATA_DIR = None
else:
if not os.path.exists(os.path.join(CHROME_USER_DATA_DIR, 'Default')):
stderr('{red}[X] Could not find profile "Default" in CHROME_USER_DATA_DIR:{reset} {}'.format(CHROME_USER_DATA_DIR, **ANSI))
stderr(' Make sure you set it to a Chrome user data directory containing a Default profile folder.')
stderr(' For more info see:')
stderr(' https://github.com/pirate/ArchiveBox/wiki/Configuration#CHROME_USER_DATA_DIR')
if 'Default' in CHROME_USER_DATA_DIR:
stderr()
stderr(' Try removing /Default from the end e.g.:')
stderr(' CHROME_USER_DATA_DIR="{}"'.format(CHROME_USER_DATA_DIR.split('/Default')[0]))
raise SystemExit(1)
# stderr('[i] Using Chrome data dir: {}'.format(os.path.abspath(CHROME_USER_DATA_DIR)))
### Summary Lookup Dicts
FOLDERS = {
'REPO_DIR': {
'path': os.path.abspath(REPO_DIR),
'enabled': True,
'is_valid': os.path.exists(os.path.join(REPO_DIR, '.github')),
},
'PYTHON_DIR': {
'path': os.path.abspath(PYTHON_DIR),
'enabled': True,
'is_valid': os.path.exists(os.path.join(PYTHON_DIR, '__main__.py')),
},
'LEGACY_DIR': {
'path': os.path.abspath(LEGACY_DIR),
'enabled': True,
'is_valid': os.path.exists(os.path.join(LEGACY_DIR, 'util.py')),
},
'TEMPLATES_DIR': {
'path': os.path.abspath(TEMPLATES_DIR),
'enabled': True,
'is_valid': os.path.exists(os.path.join(TEMPLATES_DIR, 'static')),
},
'OUTPUT_DIR': {
'path': os.path.abspath(OUTPUT_DIR),
'enabled': True,
'is_valid': os.path.exists(os.path.join(OUTPUT_DIR, 'index.json')),
},
'SOURCES_DIR': {
'path': os.path.abspath(SOURCES_DIR),
'enabled': True,
'is_valid': os.path.exists(SOURCES_DIR),
},
'ARCHIVE_DIR': {
'path': os.path.abspath(ARCHIVE_DIR),
'enabled': True,
'is_valid': os.path.exists(ARCHIVE_DIR),
},
'DATABASE_DIR': {
'path': os.path.abspath(DATABASE_DIR),
'enabled': True,
'is_valid': os.path.exists(os.path.join(DATABASE_DIR, DATABASE_FILE)),
},
'CHROME_USER_DATA_DIR': {
'path': CHROME_USER_DATA_DIR and os.path.abspath(CHROME_USER_DATA_DIR),
'enabled': USE_CHROME and CHROME_USER_DATA_DIR,
'is_valid': os.path.exists(os.path.join(CHROME_USER_DATA_DIR, 'Default')) if CHROME_USER_DATA_DIR else False,
},
'COOKIES_FILE': {
'path': COOKIES_FILE and os.path.abspath(COOKIES_FILE),
'enabled': USE_WGET and COOKIES_FILE,
'is_valid': COOKIES_FILE and os.path.exists(COOKIES_FILE),
},
}
DEPENDENCIES = {
'DJANGO_BINARY': {
'path': DJANGO_BINARY,
'version': DJANGO_VERSION,
'enabled': True,
'is_valid': bool(DJANGO_VERSION),
},
'CURL_BINARY': {
'path': CURL_BINARY and shutil.which(CURL_BINARY),
'version': CURL_VERSION,
'enabled': USE_CURL,
'is_valid': bool(CURL_VERSION),
},
'WGET_BINARY': {
'path': WGET_BINARY and shutil.which(WGET_BINARY),
'version': WGET_VERSION,
'enabled': USE_WGET,
'is_valid': bool(WGET_VERSION),
},
'GIT_BINARY': {
'path': GIT_BINARY and shutil.which(GIT_BINARY),
'version': GIT_VERSION,
'enabled': FETCH_GIT,
'is_valid': bool(GIT_VERSION),
},
'YOUTUBEDL_BINARY': {
'path': YOUTUBEDL_BINARY and shutil.which(YOUTUBEDL_BINARY),
'version': YOUTUBEDL_VERSION,
'enabled': FETCH_MEDIA,
'is_valid': bool(YOUTUBEDL_VERSION),
},
'CHROME_BINARY': {
'path': CHROME_BINARY and shutil.which(CHROME_BINARY),
'version': CHROME_VERSION,
'enabled': USE_CHROME,
'is_valid': bool(CHROME_VERSION),
},
}
CHROME_OPTIONS = {
'TIMEOUT': TIMEOUT,
@ -270,14 +391,39 @@ try:
'CHROME_USER_AGENT': CHROME_USER_AGENT,
'CHROME_USER_DATA_DIR': CHROME_USER_DATA_DIR,
}
# PYPPETEER_ARGS = {
# 'headless': CHROME_HEADLESS,
# 'ignoreHTTPSErrors': not CHECK_SSL_VALIDITY,
# # 'executablePath': CHROME_BINARY,
# }
except KeyboardInterrupt:
raise SystemExit(1)
except:
print('[X] There was an error while reading configuration. Your archive data is unaffected.')
except Exception as e:
stderr()
stderr('{red}[X] Error during configuration: {} {}{reset}'.format(e.__class__.__name__, e, **ANSI))
stderr(' Your archive data is unaffected.')
stderr(' Check your config or environemnt variables for mistakes and try again.')
stderr(' For more info see:')
stderr(' https://github.com/pirate/ArchiveBox/wiki/Configuration')
stderr()
raise
def check_dependencies() -> None:
    """Abort if a dependency or the archive index was flagged as invalid.

    Reads the module-level flags HAS_INVALID_DEPENDENCIES and HAS_INVALID_DB.

    Raises:
        SystemExit: with status 1, after writing an explanation via stderr(),
            when either flag is set.
    """
    if HAS_INVALID_DEPENDENCIES:
        stderr('{red}[X] Missing some required dependencies.{reset}'.format(**ANSI))
        raise SystemExit(1)

    if HAS_INVALID_DB:
        stderr('{red}[X] No archive data found in:{reset} {}'.format(OUTPUT_DIR, **ANSI))
        stderr(' Are you running archivebox in the right folder?')
        stderr(' cd path/to/your/archive')
        stderr(' archivebox [command]')
        stderr()
        stderr(' To create a new archive folder, run:')
        stderr(' mkdir new_archive_dir && cd new_archive_dir')
        stderr(' archivebox init')
        raise SystemExit(1)

View file

@ -14,6 +14,7 @@ from .archive_methods import archive_link
from .config import (
ONLY_NEW,
OUTPUT_DIR,
check_dependencies,
)
from .logs import (
log_archiving_started,
@ -26,6 +27,8 @@ from .logs import (
def update_archive_data(import_path: Optional[str]=None, resume: Optional[float]=None, only_new: bool=False) -> List[Link]:
"""The main ArchiveBox entrancepoint. Everything starts here."""
check_dependencies()
# Step 1: Load list of links from the existing index
# merge in and dedupe new links from import_path
all_links, new_links = load_links_index(out_dir=OUTPUT_DIR, import_path=import_path)