rename archive toggles from FETCH_ to SAVE_ for clarity

This commit is contained in:
Nick Sweeting 2019-04-24 04:09:25 -04:00
parent 0f2497a2a6
commit 5ef5415f3f
7 changed files with 547 additions and 388 deletions

View file

@ -9,64 +9,83 @@ import getpass
import shutil
from hashlib import md5
from typing import Optional
from typing import Any, Optional, Dict, Tuple
from subprocess import run, PIPE, DEVNULL
CONFIG_TYPE = Dict[str, Any]
# ******************************************************************************
# Documentation: https://github.com/pirate/ArchiveBox/wiki/Configuration
# Use the 'env' command to pass config options to ArchiveBox. e.g.:
# env USE_COLOR=True CHROME_BINARY=chromium archivebox add < example.html
# ******************************************************************************
IS_TTY = sys.stdout.isatty()
USE_COLOR = os.getenv('USE_COLOR', str(IS_TTY) ).lower() == 'true'
SHOW_PROGRESS = os.getenv('SHOW_PROGRESS', str(IS_TTY) ).lower() == 'true'
################################# User Config ##################################
OUTPUT_DIR = os.getenv('OUTPUT_DIR', '')
ONLY_NEW = os.getenv('ONLY_NEW', 'False' ).lower() == 'true'
TIMEOUT = int(os.getenv('TIMEOUT', '60'))
MEDIA_TIMEOUT = int(os.getenv('MEDIA_TIMEOUT', '3600'))
OUTPUT_PERMISSIONS = os.getenv('OUTPUT_PERMISSIONS', '755' )
FOOTER_INFO = os.getenv('FOOTER_INFO', 'Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests.',)
URL_BLACKLIST = os.getenv('URL_BLACKLIST', None)
# Terminal/shell display options.
# Each entry maps an option name to the keyword arguments that load_config()
# forwards to get_config_val(): 'type' is the expected type of the environment
# variable and 'default' is used when the variable is unset. A callable default
# is invoked with the config dict loaded so far, so IS_TTY has to be listed
# before USE_COLOR and SHOW_PROGRESS, which fall back to its value.
SHELL_CONFIG_DEFAULTS = {
'IS_TTY': {'type': bool, 'default': lambda _: sys.stdout.isatty()},
'USE_COLOR': {'type': bool, 'default': lambda c: c['IS_TTY']},
'SHOW_PROGRESS': {'type': bool, 'default': lambda c: c['IS_TTY']},
}
FETCH_WGET = os.getenv('FETCH_WGET', 'True' ).lower() == 'true'
FETCH_WGET_REQUISITES = os.getenv('FETCH_WGET_REQUISITES', 'True' ).lower() == 'true'
FETCH_PDF = os.getenv('FETCH_PDF', 'True' ).lower() == 'true'
FETCH_SCREENSHOT = os.getenv('FETCH_SCREENSHOT', 'True' ).lower() == 'true'
FETCH_DOM = os.getenv('FETCH_DOM', 'True' ).lower() == 'true'
FETCH_WARC = os.getenv('FETCH_WARC', 'True' ).lower() == 'true'
FETCH_GIT = os.getenv('FETCH_GIT', 'True' ).lower() == 'true'
FETCH_MEDIA = os.getenv('FETCH_MEDIA', 'True' ).lower() == 'true'
FETCH_FAVICON = os.getenv('FETCH_FAVICON', 'True' ).lower() == 'true'
FETCH_TITLE = os.getenv('FETCH_TITLE', 'True' ).lower() == 'true'
SUBMIT_ARCHIVE_DOT_ORG = os.getenv('SUBMIT_ARCHIVE_DOT_ORG', 'True' ).lower() == 'true'
# General archiving options, each overridable through an environment variable
# of the same name. 'type' is the type the variable is parsed as and 'default'
# is the value used when it is unset.
ARCHIVE_CONFIG_DEFAULTS = {
# None here means "not configured"; DERIVED_CONFIG_DEFAULTS then falls back to
# the current working directory.
'OUTPUT_DIR': {'type': str, 'default': None},
'ONLY_NEW': {'type': bool, 'default': False},
# TIMEOUT and MEDIA_TIMEOUT are expressed in seconds.
'TIMEOUT': {'type': int, 'default': 60},
'MEDIA_TIMEOUT': {'type': int, 'default': 3600},
'OUTPUT_PERMISSIONS': {'type': str, 'default': '755'},
'FOOTER_INFO': {'type': str, 'default': 'Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests.'},
# Regex source string; compiled (case-insensitively) into URL_BLACKLIST_PTN
# by DERIVED_CONFIG_DEFAULTS when set.
'URL_BLACKLIST': {'type': str, 'default': None},
}
CHECK_SSL_VALIDITY = os.getenv('CHECK_SSL_VALIDITY', 'True' ).lower() == 'true'
RESOLUTION = os.getenv('RESOLUTION', '1440,2000' )
GIT_DOMAINS = os.getenv('GIT_DOMAINS', 'github.com,bitbucket.org,gitlab.com').split(',')
WGET_USER_AGENT = os.getenv('WGET_USER_AGENT', 'ArchiveBox/{VERSION} (+https://github.com/pirate/ArchiveBox/) wget/{WGET_VERSION}')
COOKIES_FILE = os.getenv('COOKIES_FILE', None)
CHROME_USER_DATA_DIR = os.getenv('CHROME_USER_DATA_DIR', None)
CHROME_HEADLESS = os.getenv('CHROME_HEADLESS', 'True' ).lower() == 'true'
CHROME_USER_AGENT = os.getenv('CHROME_USER_AGENT', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36')
CHROME_SANDBOX = os.getenv('CHROME_SANDBOX', 'True' ).lower() == 'true'
# On/off switches for the individual archive methods; all are enabled by default.
# 'aliases' lists the previous environment variable names (FETCH_* and
# SUBMIT_ARCHIVE_DOT_ORG). get_config_val() checks the canonical SAVE_* name
# first and then each alias, so existing configurations keep working.
ARCHIVE_METHOD_TOGGLES_DEFAULTS = {
'SAVE_TITLE': {'type': bool, 'default': True, 'aliases': ('FETCH_TITLE',)},
'SAVE_FAVICON': {'type': bool, 'default': True, 'aliases': ('FETCH_FAVICON',)},
'SAVE_WGET': {'type': bool, 'default': True, 'aliases': ('FETCH_WGET',)},
'SAVE_WGET_REQUISITES': {'type': bool, 'default': True, 'aliases': ('FETCH_WGET_REQUISITES',)},
'SAVE_PDF': {'type': bool, 'default': True, 'aliases': ('FETCH_PDF',)},
'SAVE_SCREENSHOT': {'type': bool, 'default': True, 'aliases': ('FETCH_SCREENSHOT',)},
'SAVE_DOM': {'type': bool, 'default': True, 'aliases': ('FETCH_DOM',)},
'SAVE_WARC': {'type': bool, 'default': True, 'aliases': ('FETCH_WARC',)},
'SAVE_GIT': {'type': bool, 'default': True, 'aliases': ('FETCH_GIT',)},
'SAVE_MEDIA': {'type': bool, 'default': True, 'aliases': ('FETCH_MEDIA',)},
'SAVE_ARCHIVE_DOT_ORG': {'type': bool, 'default': True, 'aliases': ('SUBMIT_ARCHIVE_DOT_ORG',)},
}
USE_CURL = os.getenv('USE_CURL', 'True' ).lower() == 'true'
USE_WGET = os.getenv('USE_WGET', 'True' ).lower() == 'true'
USE_CHROME = os.getenv('USE_CHROME', 'True' ).lower() == 'true'
ARCHIVE_METHOD_OPTIONS_DEFAULTS = {
'RESOLUTION': {'type': str, 'default': '1440,2000'},
'GIT_DOMAINS': {'type': str, 'default': 'github.com,bitbucket.org,gitlab.com'},
'CHECK_SSL_VALIDITY': {'type': bool, 'default': True},
CURL_BINARY = os.getenv('CURL_BINARY', 'curl')
GIT_BINARY = os.getenv('GIT_BINARY', 'git')
WGET_BINARY = os.getenv('WGET_BINARY', 'wget')
YOUTUBEDL_BINARY = os.getenv('YOUTUBEDL_BINARY', 'youtube-dl')
CHROME_BINARY = os.getenv('CHROME_BINARY', None)
'WGET_USER_AGENT': {'type': str, 'default': 'ArchiveBox/{VERSION} (+https://github.com/pirate/ArchiveBox/) wget/{WGET_VERSION}'},
'CHROME_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'},
# ******************************************************************************
'COOKIES_FILE': {'type': str, 'default': None},
'CHROME_USER_DATA_DIR': {'type': str, 'default': None},
### Terminal Configuration
TERM_WIDTH = lambda: shutil.get_terminal_size((100, 10)).columns
ANSI = {
'CHROME_HEADLESS': {'type': bool, 'default': True},
'CHROME_SANDBOX': {'type': bool, 'default': True},
}
# External tool configuration: USE_* switches decide whether a dependency may
# be used at all, and *_BINARY holds the command name or path used to run it.
DEPENDENCY_CONFIG_DEFAULTS = {
'USE_CURL': {'type': bool, 'default': True},
'USE_WGET': {'type': bool, 'default': True},
'USE_GIT': {'type': bool, 'default': True},
'USE_CHROME': {'type': bool, 'default': True},
'USE_YOUTUBEDL': {'type': bool, 'default': True},
'CURL_BINARY': {'type': str, 'default': 'curl'},
'GIT_BINARY': {'type': str, 'default': 'git'},
'WGET_BINARY': {'type': str, 'default': 'wget'},
'YOUTUBEDL_BINARY': {'type': str, 'default': 'youtube-dl'},
# No fixed default: when unset, DERIVED_CONFIG_DEFAULTS resolves it with
# find_chrome_binary().
'CHROME_BINARY': {'type': str, 'default': None},
}
############################## Derived Config ##############################
# Constants
DEFAULT_CLI_COLORS = {
'reset': '\033[00;00m',
'lightblue': '\033[01;30m',
'lightyellow': '\033[01;33m',
@ -77,102 +96,178 @@ ANSI = {
'white': '\033[01;37m',
'black': '\033[01;30m',
}
if not USE_COLOR:
# dont show colors if USE_COLOR is False
ANSI = {k: '' for k in ANSI.keys()}
ANSI = {k: '' for k in DEFAULT_CLI_COLORS.keys()}
def stderr(*args):
sys.stderr.write(' '.join(str(a) for a in args) + '\n')
USER = getpass.getuser() or os.getlogin()
ARCHIVEBOX_BINARY = sys.argv[0]
VERSION_FILENAME = 'VERSION'
PYTHON_DIR_NAME = 'archivebox'
LEGACY_DIR_NAME = 'legacy'
TEMPLATES_DIR_NAME = 'templates'
REPO_DIR = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', '..'))
if OUTPUT_DIR:
OUTPUT_DIR = os.path.abspath(os.path.expanduser(OUTPUT_DIR))
else:
OUTPUT_DIR = os.path.abspath(os.curdir)
SQL_INDEX_FILENAME = 'index.sqlite3'
JSON_INDEX_FILENAME = 'index.json'
HTML_INDEX_FILENAME = 'index.html'
ARCHIVE_DIR_NAME = 'archive'
SOURCES_DIR_NAME = 'sources'
LOGS_DIR_NAME = 'logs'
ARCHIVE_DIR = os.path.join(OUTPUT_DIR, ARCHIVE_DIR_NAME)
SOURCES_DIR = os.path.join(OUTPUT_DIR, SOURCES_DIR_NAME)
LOGS_DIR = os.path.join(OUTPUT_DIR, LOGS_DIR_NAME)
STATIC_DIR_NAME = 'static'
SQL_INDEX_FILENAME = 'index.sqlite3'
JSON_INDEX_FILENAME = 'index.json'
HTML_INDEX_FILENAME = 'index.html'
ROBOTS_TXT_FILENAME = 'robots.txt'
FAVICON_FILENAME = 'favicon.ico'
PYTHON_DIR = os.path.join(REPO_DIR, 'archivebox')
LEGACY_DIR = os.path.join(PYTHON_DIR, 'legacy')
TEMPLATES_DIR = os.path.join(LEGACY_DIR, 'templates')
if COOKIES_FILE:
COOKIES_FILE = os.path.abspath(os.path.expanduser(COOKIES_FILE))
if CHROME_USER_DATA_DIR:
CHROME_USER_DATA_DIR = os.path.abspath(os.path.expanduser(CHROME_USER_DATA_DIR))
DERIVED_CONFIG_DEFAULTS = {
'TERM_WIDTH': {'default': lambda c: lambda: shutil.get_terminal_size((100, 10)).columns},
'USER': {'default': lambda c: getpass.getuser() or os.getlogin()},
'ANSI': {'default': lambda c: DEFAULT_CLI_COLORS if c['USE_COLOR'] else {k: '' for k in DEFAULT_CLI_COLORS.keys()}},
'REPO_DIR': {'default': lambda c: os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', '..'))},
'PYTHON_DIR': {'default': lambda c: os.path.join(c['REPO_DIR'], PYTHON_DIR_NAME)},
'LEGACY_DIR': {'default': lambda c: os.path.join(c['PYTHON_DIR'], LEGACY_DIR_NAME)},
'TEMPLATES_DIR': {'default': lambda c: os.path.join(c['LEGACY_DIR'], TEMPLATES_DIR_NAME)},
'OUTPUT_DIR': {'default': lambda c: os.path.abspath(os.path.expanduser(c['OUTPUT_DIR'])) if c['OUTPUT_DIR'] else os.path.abspath(os.curdir)},
'ARCHIVE_DIR': {'default': lambda c: os.path.join(c['OUTPUT_DIR'], ARCHIVE_DIR_NAME)},
'SOURCES_DIR': {'default': lambda c: os.path.join(c['OUTPUT_DIR'], SOURCES_DIR_NAME)},
'LOGS_DIR': {'default': lambda c: os.path.join(c['OUTPUT_DIR'], LOGS_DIR_NAME)},
'COOKIES_FILE': {'default': lambda c: c['COOKIES_FILE'] and os.path.abspath(os.path.expanduser(c['COOKIES_FILE']))},
'CHROME_USER_DATA_DIR': {'default': lambda c: c['CHROME_USER_DATA_DIR'] and os.path.abspath(os.path.expanduser(c['CHROME_USER_DATA_DIR']))},
'URL_BLACKLIST_PTN': {'default': lambda c: c['URL_BLACKLIST'] and re.compile(c['URL_BLACKLIST'], re.IGNORECASE)},
URL_BLACKLIST_PTN = re.compile(URL_BLACKLIST, re.IGNORECASE) if URL_BLACKLIST else None
'ARCHIVEBOX_BINARY': {'default': lambda c: sys.argv[0]},
'VERSION': {'default': lambda c: open(os.path.join(c['REPO_DIR'], VERSION_FILENAME), 'r').read().strip()},
'GIT_SHA': {'default': lambda c: c['VERSION'].split('+')[-1] or 'unknown'},
########################### Environment & Dependencies #########################
'PYTHON_BINARY': {'default': lambda c: sys.executable},
'PYTHON_ENCODING': {'default': lambda c: sys.stdout.encoding.upper()},
'PYTHON_VERSION': {'default': lambda c: '{}.{}'.format(sys.version_info.major, sys.version_info.minor)},
VERSION = open(os.path.join(REPO_DIR, 'VERSION'), 'r').read().strip()
GIT_SHA = VERSION.split('+')[-1] or 'unknown'
HAS_INVALID_DEPENDENCIES = False
'DJANGO_BINARY': {'default': lambda c: django.__file__.replace('__init__.py', 'bin/django-admin.py')},
'DJANGO_VERSION': {'default': lambda c: '{}.{}.{} {} ({})'.format(*django.VERSION)},
### Check system environment
if USER == 'root':
stderr('{red}[!] ArchiveBox should never be run as root!{reset}'.format(**ANSI))
stderr(' For more information, see the security overview documentation:')
stderr(' https://github.com/pirate/ArchiveBox/wiki/Security-Overview#do-not-run-as-root')
raise SystemExit(1)
'USE_CURL': {'default': lambda c: c['USE_CURL'] and (c['SAVE_FAVICON'] or c['SAVE_ARCHIVE_DOT_ORG'])},
'CURL_VERSION': {'default': lambda c: bin_version(c['CURL_BINARY']) if c['USE_CURL'] else None},
'SAVE_FAVICON': {'default': lambda c: c['USE_CURL']},
'SAVE_ARCHIVE_DOT_ORG': {'default': lambda c: c['USE_CURL']},
### Check Python environment
PYTHON_BINARY = sys.executable
PYTHON_VERSION = '{}.{}'.format(sys.version_info.major, sys.version_info.minor)
if float(PYTHON_VERSION) < 3.6:
stderr('{}[X] Python version is not new enough: {} (>3.6 is required){}'.format(ANSI['red'], PYTHON_VERSION, ANSI['reset']))
stderr(' See https://github.com/pirate/ArchiveBox/wiki/Troubleshooting#python for help upgrading your Python installation.')
raise SystemExit(1)
'USE_WGET': {'default': lambda c: c['USE_WGET'] and (c['SAVE_WGET'] or c['SAVE_WARC'])},
'WGET_VERSION': {'default': lambda c: bin_version(c['WGET_BINARY']) if c['USE_WGET'] else None},
'WGET_AUTO_COMPRESSION': {'default': lambda c: wget_supports_compression(c) if c['USE_WGET'] else False},
'WGET_USER_AGENT': {'default': lambda c: c['WGET_USER_AGENT'].format(**c)},
'SAVE_WGET': {'default': lambda c: c['USE_WGET']},
'SAVE_WARC': {'default': lambda c: c['USE_WGET']},
if sys.stdout.encoding.upper() not in ('UTF-8', 'UTF8'):
stderr('[X] Your system is running python3 scripts with a bad locale setting: {} (it should be UTF-8).'.format(sys.stdout.encoding))
stderr(' To fix it, add the line "export PYTHONIOENCODING=UTF-8" to your ~/.bashrc file (without quotes)')
stderr(' Or if you\'re using ubuntu/debian, run "dpkg-reconfigure locales"')
stderr('')
stderr(' Confirm that it\'s fixed by opening a new shell and running:')
stderr(' python3 -c "import sys; print(sys.stdout.encoding)" # should output UTF-8')
stderr('')
stderr(' Alternatively, run this script with:')
stderr(' env PYTHONIOENCODING=UTF-8 ./archive.py export.html')
raise SystemExit(1)
'USE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
'GIT_VERSION': {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None},
'SAVE_GIT': {'default': lambda c: c['USE_GIT']},
# ******************************************************************************
# ***************************** Helper Functions *******************************
# ******************************************************************************
'USE_YOUTUBEDL': {'default': lambda c: c['USE_YOUTUBEDL'] and c['SAVE_MEDIA']},
'YOUTUBEDL_VERSION': {'default': lambda c: bin_version(c['YOUTUBEDL_BINARY']) if c['USE_YOUTUBEDL'] else None},
'SAVE_MEDIA': {'default': lambda c: c['USE_YOUTUBEDL']},
'USE_CHROME': {'default': lambda c: c['USE_CHROME'] and (c['SAVE_PDF'] or c['SAVE_SCREENSHOT'] or c['SAVE_DOM'])},
'CHROME_BINARY': {'default': lambda c: c['CHROME_BINARY'] if c['CHROME_BINARY'] else find_chrome_binary()},
'CHROME_VERSION': {'default': lambda c: bin_version(c['CHROME_BINARY']) if c['USE_CHROME'] else None},
'CHROME_USER_DATA_DIR': {'default': lambda c: find_chrome_data_dir() if c['CHROME_USER_DATA_DIR'] is None else (c['CHROME_USER_DATA_DIR'] or None)},
'SAVE_PDF': {'default': lambda c: c['USE_CHROME']},
'SAVE_SCREENSHOT': {'default': lambda c: c['USE_CHROME']},
'SAVE_DOM': {'default': lambda c: c['USE_CHROME']},
'DEPENDENCIES': {'default': lambda c: get_dependency_info(c)},
'CODE_LOCATIONS': {'default': lambda c: get_code_locations(c)},
'CONFIG_LOCATIONS': {'default': lambda c: get_config_locations(c)},
'DATA_LOCATIONS': {'default': lambda c: get_data_locations(c)},
'CHROME_OPTIONS': {'default': lambda c: get_chrome_info(c)},
}
################################### Helpers ####################################
def get_config_val(key: str, default: Any=None, type=None, aliases: Optional[Tuple[str, ...]]=None, config: CONFIG_TYPE=None) -> Any:
    """Resolve a single config option from the environment.

    The canonical name ``key`` is looked up first, followed by each name in
    ``aliases``; the first variable holding a non-empty value is used.  If no
    variable has a non-empty value, the value of the last name checked is
    kept (``None`` when unset, possibly an empty string).

    When ``type`` is None or the retained value is None, ``default`` is
    returned; a callable ``default`` is called with ``config`` and its result
    is returned instead.  Otherwise the raw string is parsed as ``type``
    (``bool``, ``str`` or ``int``).

    Raises:
        ValueError: the environment value cannot be parsed as ``type``.
        Exception: ``type`` is not one of ``str``, ``bool`` or ``int``.
    """
    env_key, raw = key, None
    for env_key in [key] + list(aliases or ()):
        raw = os.getenv(env_key)
        if raw:
            break

    # nothing usable in the environment (or untyped option): use the default
    if type is None or raw is None:
        return default(config) if hasattr(default, '__call__') else default

    lowered = raw.lower()

    if type is bool:
        if lowered in ('true', 'yes', '1'):
            return True
        if lowered in ('false', 'no', '0'):
            return False
        raise ValueError(f'Invalid configuration option {env_key}={raw} (expected a boolean: True/False)')

    if type is str:
        # boolean-looking values are rejected for string options
        if lowered in ('true', 'false', 'yes', 'no', '1', '0'):
            raise ValueError(f'Invalid configuration option {env_key}={raw} (expected a string)')
        return raw.strip()

    if type is int:
        if raw.isdigit():
            return int(raw)
        raise ValueError(f'Invalid configuration option {env_key}={raw} (expected an integer)')

    raise Exception('Config values can only be str, bool, or int')
def load_config(defaults: dict, config: Optional[CONFIG_TYPE]=None) -> CONFIG_TYPE:
    """Resolve every option in ``defaults`` and return the resulting config.

    ``defaults`` maps option names to the keyword arguments understood by
    ``get_config_val``.  Options are resolved in iteration order on top of a
    shallow copy of ``config`` (the argument itself is not modified), and each
    lookup receives the values resolved so far.

    Any failure while resolving an option is reported on stderr and ends the
    process with ``SystemExit(1)``; so does a KeyboardInterrupt.
    """
    loaded = dict(config or {})

    for name, spec in defaults.items():
        try:
            loaded[name] = get_config_val(name, **spec, config=loaded)
        except KeyboardInterrupt:
            raise SystemExit(1)
        except Exception as e:
            stderr()
            stderr(f'[X] Error while loading configuration value: {name}', color='red', config=loaded)
            # remaining help text, one stderr() call per output line
            for line in (
                f' {e.__class__.__name__}: {e}',
                '',
                ' Check your config for mistakes and try again (your archive data is unaffected).',
                '',
                ' For config documentation and examples see:',
                ' https://github.com/pirate/ArchiveBox/wiki/Configuration',
                '',
            ):
                stderr(line)
            raise SystemExit(1)

    return loaded
def stderr(*args, color: Optional[str]=None, config: Optional[CONFIG_TYPE]=None) -> None:
    """Write the space-joined ``args`` to sys.stderr, followed by a newline.

    When ``color`` is given, the text is wrapped in that color's escape code
    and the 'reset' code.  The codes come from DEFAULT_CLI_COLORS if ``config``
    has a truthy 'USE_COLOR' entry, otherwise from the module-level ANSI table.
    """
    palette = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI

    prefix = palette[color] if color else ''
    text = ' '.join(map(str, args))
    suffix = palette['reset'] if color else ''

    sys.stderr.write(''.join((prefix, text, suffix, '\n')))
def bin_version(binary: str) -> Optional[str]:
"""check the presence and return valid version line of a specified binary"""
global HAS_INVALID_DEPENDENCIES
binary = os.path.expanduser(binary)
try:
if not shutil.which(binary):
raise Exception
version_str = run([binary, "--version"], stdout=PIPE, cwd=REPO_DIR).stdout.strip().decode()
version_str = run([binary, "--version"], stdout=PIPE).stdout.strip().decode()
# take first 3 columns of first line of version info
return ' '.join(version_str.split('\n')[0].strip().split()[:3])
except Exception:
HAS_INVALID_DEPENDENCIES = True
stderr('{red}[X] Unable to find working version of dependency: {}{reset}'.format(binary, **ANSI))
stderr(' Make sure it\'s installed, then confirm it\'s working by running:')
stderr(' {} --version'.format(binary))
stderr()
stderr(' If you don\'t want to install it, you can disable it via config. See here for more info:')
stderr(' https://github.com/pirate/ArchiveBox/wiki/Install')
stderr()
# stderr(f'[X] Unable to find working version of dependency: {binary}', color='red')
# stderr(' Make sure it\'s installed, then confirm it\'s working by running:')
# stderr(f' {binary} --version')
# stderr()
# stderr(' If you don\'t want to install it, you can disable it via config. See here for more info:')
# stderr(' https://github.com/pirate/ArchiveBox/wiki/Install')
# stderr()
return None
def bin_hash(binary: str) -> Optional[str]:
@ -187,7 +282,6 @@ def bin_hash(binary: str) -> Optional[str]:
return f'md5:{file_hash.hexdigest()}'
def find_chrome_binary() -> Optional[str]:
"""find any installed chrome binaries in the default locations"""
# Precedence: Chromium, Chrome, Beta, Canary, Unstable, Dev
@ -210,11 +304,10 @@ def find_chrome_binary() -> Optional[str]:
if full_path_exists:
return name
stderr('{red}[X] Unable to find a working version of Chrome/Chromium, is it installed and in your $PATH?'.format(**ANSI))
stderr('[X] Unable to find a working version of Chrome/Chromium, is it installed and in your $PATH?', color='red')
stderr()
return None
def find_chrome_data_dir() -> Optional[str]:
"""find any installed chrome user data directories in the default locations"""
# Precedence: Chromium, Chrome, Beta, Canary, Unstable, Dev
@ -239,244 +332,293 @@ def find_chrome_data_dir() -> Optional[str]:
return full_path
return None
def wget_supports_compression(config):
    """Return True if the configured wget accepts ``--compression=auto``.

    Runs ``<WGET_BINARY> --compression=auto --help`` with output discarded and
    treats a zero exit status as support for the flag.

    Returns False (instead of raising) when the binary cannot be started at
    all, e.g. it is not installed or not executable, so that a missing wget
    does not abort config loading.
    """
    cmd = [
        config['WGET_BINARY'],
        "--compression=auto",
        "--help",
    ]
    try:
        return run(cmd, stdout=DEVNULL, stderr=DEVNULL).returncode == 0
    except OSError:
        # FileNotFoundError / PermissionError etc.: no usable wget binary
        return False
def setup_django(out_dir: str=OUTPUT_DIR, check_db=False):
    """Initialise Django for use from within this process.

    Appends PYTHON_DIR to ``sys.path``, fills in the OUTPUT_DIR and
    DJANGO_SETTINGS_MODULE environment variables unless they are already set,
    and calls ``django.setup()``.

    With ``check_db=True`` it additionally asserts that the SQL index file
    exists inside ``out_dir``.
    """
    import django

    sys.path.append(PYTHON_DIR)

    # setdefault: values already present in the environment take precedence
    env_defaults = (
        ('OUTPUT_DIR', out_dir),
        ('DJANGO_SETTINGS_MODULE', 'core.settings'),
    )
    for env_name, env_value in env_defaults:
        os.environ.setdefault(env_name, env_value)

    django.setup()

    if not check_db:
        return

    db_path = os.path.join(out_dir, SQL_INDEX_FILENAME)
    assert os.path.exists(db_path), (
        f'No database file {SQL_INDEX_FILENAME} found in OUTPUT_DIR: {out_dir}')
# ******************************************************************************
# ************************ Environment & Dependencies **************************
# ******************************************************************************
try:
### Get Django version
DJANGO_BINARY = django.__file__.replace('__init__.py', 'bin/django-admin.py')
DJANGO_VERSION = '{}.{}.{} {} ({})'.format(*django.VERSION)
### Make sure curl is installed
if USE_CURL:
USE_CURL = FETCH_FAVICON or SUBMIT_ARCHIVE_DOT_ORG
else:
FETCH_FAVICON = SUBMIT_ARCHIVE_DOT_ORG = False
CURL_VERSION = None
if USE_CURL:
CURL_VERSION = bin_version(CURL_BINARY)
### Make sure wget is installed and calculate version
if USE_WGET:
USE_WGET = FETCH_WGET or FETCH_WARC
else:
FETCH_WGET = FETCH_WARC = False
WGET_VERSION = None
WGET_AUTO_COMPRESSION = False
if USE_WGET:
WGET_VERSION = bin_version(WGET_BINARY)
WGET_AUTO_COMPRESSION = not run([WGET_BINARY, "--compression=auto", "--help"], stdout=DEVNULL, stderr=DEVNULL).returncode
WGET_USER_AGENT = WGET_USER_AGENT.format(
VERSION=VERSION,
WGET_VERSION=WGET_VERSION or '',
)
### Make sure git is installed
GIT_VERSION = None
if FETCH_GIT:
GIT_VERSION = bin_version(GIT_BINARY)
### Make sure youtube-dl is installed
YOUTUBEDL_VERSION = None
if FETCH_MEDIA:
YOUTUBEDL_VERSION = bin_version(YOUTUBEDL_BINARY)
### Make sure chrome is installed and calculate version
if USE_CHROME:
USE_CHROME = FETCH_PDF or FETCH_SCREENSHOT or FETCH_DOM
else:
FETCH_PDF = FETCH_SCREENSHOT = FETCH_DOM = False
if not CHROME_BINARY:
CHROME_BINARY = find_chrome_binary() or 'chromium-browser'
CHROME_VERSION = None
if USE_CHROME:
if CHROME_BINARY:
CHROME_VERSION = bin_version(CHROME_BINARY)
# stderr('[i] Using Chrome binary: {}'.format(shutil.which(CHROME_BINARY) or CHROME_BINARY))
if CHROME_USER_DATA_DIR is None:
CHROME_USER_DATA_DIR = find_chrome_data_dir()
elif CHROME_USER_DATA_DIR == '':
CHROME_USER_DATA_DIR = None
else:
if not os.path.exists(os.path.join(CHROME_USER_DATA_DIR, 'Default')):
stderr('{red}[X] Could not find profile "Default" in CHROME_USER_DATA_DIR:{reset} {}'.format(CHROME_USER_DATA_DIR, **ANSI))
stderr(' Make sure you set it to a Chrome user data directory containing a Default profile folder.')
stderr(' For more info see:')
stderr(' https://github.com/pirate/ArchiveBox/wiki/Configuration#CHROME_USER_DATA_DIR')
if 'Default' in CHROME_USER_DATA_DIR:
stderr()
stderr(' Try removing /Default from the end e.g.:')
stderr(' CHROME_USER_DATA_DIR="{}"'.format(CHROME_USER_DATA_DIR.split('/Default')[0]))
raise SystemExit(1)
# stderr('[i] Using Chrome data dir: {}'.format(os.path.abspath(CHROME_USER_DATA_DIR)))
### Summary Lookup Dicts
FOLDERS = {
def get_code_locations(config: CONFIG_TYPE) -> Dict[str, CONFIG_TYPE]:
return {
'REPO_DIR': {
'path': os.path.abspath(REPO_DIR),
'path': os.path.abspath(config['REPO_DIR']),
'enabled': True,
'is_valid': os.path.exists(os.path.join(REPO_DIR, '.github')),
'is_valid': os.path.exists(os.path.join(config['REPO_DIR'], '.github')),
},
'PYTHON_DIR': {
'path': os.path.abspath(PYTHON_DIR),
'path': os.path.abspath(config['PYTHON_DIR']),
'enabled': True,
'is_valid': os.path.exists(os.path.join(PYTHON_DIR, '__main__.py')),
'is_valid': os.path.exists(os.path.join(config['PYTHON_DIR'], '__main__.py')),
},
'LEGACY_DIR': {
'path': os.path.abspath(LEGACY_DIR),
'path': os.path.abspath(config['LEGACY_DIR']),
'enabled': True,
'is_valid': os.path.exists(os.path.join(LEGACY_DIR, 'util.py')),
'is_valid': os.path.exists(os.path.join(config['LEGACY_DIR'], 'util.py')),
},
'TEMPLATES_DIR': {
'path': os.path.abspath(TEMPLATES_DIR),
'path': os.path.abspath(config['TEMPLATES_DIR']),
'enabled': True,
'is_valid': os.path.exists(os.path.join(TEMPLATES_DIR, 'static')),
},
'OUTPUT_DIR': {
'path': os.path.abspath(OUTPUT_DIR),
'enabled': True,
'is_valid': os.path.exists(os.path.join(OUTPUT_DIR, 'index.json')),
},
'SOURCES_DIR': {
'path': os.path.abspath(SOURCES_DIR),
'enabled': True,
'is_valid': os.path.exists(SOURCES_DIR),
},
'LOGS_DIR': {
'path': os.path.abspath(LOGS_DIR),
'enabled': True,
'is_valid': os.path.exists(LOGS_DIR),
},
'ARCHIVE_DIR': {
'path': os.path.abspath(ARCHIVE_DIR),
'enabled': True,
'is_valid': os.path.exists(ARCHIVE_DIR),
'is_valid': os.path.exists(os.path.join(config['TEMPLATES_DIR'], 'static')),
},
}
def get_config_locations(config: CONFIG_TYPE) -> Dict[str, CONFIG_TYPE]:
return {
'CHROME_USER_DATA_DIR': {
'path': CHROME_USER_DATA_DIR and os.path.abspath(CHROME_USER_DATA_DIR),
'enabled': USE_CHROME and CHROME_USER_DATA_DIR,
'is_valid': os.path.exists(os.path.join(CHROME_USER_DATA_DIR, 'Default')) if CHROME_USER_DATA_DIR else False,
'path': config['CHROME_USER_DATA_DIR'] and os.path.abspath(config['CHROME_USER_DATA_DIR']),
'enabled': config['USE_CHROME'] and config['CHROME_USER_DATA_DIR'],
'is_valid': os.path.exists(os.path.join(config['CHROME_USER_DATA_DIR'], 'Default')) if config['CHROME_USER_DATA_DIR'] else False,
},
'COOKIES_FILE': {
'path': COOKIES_FILE and os.path.abspath(COOKIES_FILE),
'enabled': USE_WGET and COOKIES_FILE,
'is_valid': COOKIES_FILE and os.path.exists(COOKIES_FILE),
'path': config['COOKIES_FILE'] and os.path.abspath(config['COOKIES_FILE']),
'enabled': config['USE_WGET'] and config['COOKIES_FILE'],
'is_valid': config['COOKIES_FILE'] and os.path.exists(config['COOKIES_FILE']),
},
}
DEPENDENCIES = {
'PYTHON_BINARY': {
'path': PYTHON_BINARY,
'version': PYTHON_VERSION,
'hash': bin_hash(PYTHON_BINARY),
def get_data_locations(config: CONFIG_TYPE) -> Dict[str, CONFIG_TYPE]:
return {
'OUTPUT_DIR': {
'path': os.path.abspath(config['OUTPUT_DIR']),
'enabled': True,
'is_valid': bool(DJANGO_VERSION),
'is_valid': os.path.exists(os.path.join(config['OUTPUT_DIR'], JSON_INDEX_FILENAME)),
},
'SOURCES_DIR': {
'path': os.path.abspath(config['SOURCES_DIR']),
'enabled': True,
'is_valid': os.path.exists(config['SOURCES_DIR']),
},
'LOGS_DIR': {
'path': os.path.abspath(config['LOGS_DIR']),
'enabled': True,
'is_valid': os.path.exists(config['LOGS_DIR']),
},
'ARCHIVE_DIR': {
'path': os.path.abspath(config['ARCHIVE_DIR']),
'enabled': True,
'is_valid': os.path.exists(config['ARCHIVE_DIR']),
},
}
def get_dependency_info(config: CONFIG_TYPE) -> Dict[str, CONFIG_TYPE]:
return {
'PYTHON_BINARY': {
'path': config['PYTHON_BINARY'],
'version': config['PYTHON_VERSION'],
'hash': bin_hash(config['PYTHON_BINARY']),
'enabled': True,
'is_valid': bool(config['DJANGO_VERSION']),
},
'DJANGO_BINARY': {
'path': DJANGO_BINARY,
'version': DJANGO_VERSION,
'hash': bin_hash(DJANGO_BINARY),
'path': config['DJANGO_BINARY'],
'version': config['DJANGO_VERSION'],
'hash': bin_hash(config['DJANGO_BINARY']),
'enabled': True,
'is_valid': bool(DJANGO_VERSION),
'is_valid': bool(config['DJANGO_VERSION']),
},
'CURL_BINARY': {
'path': CURL_BINARY and shutil.which(CURL_BINARY),
'version': CURL_VERSION,
'hash': bin_hash(PYTHON_BINARY),
'enabled': USE_CURL,
'is_valid': bool(CURL_VERSION),
'path': (config['CURL_BINARY'] and shutil.which(config['CURL_BINARY'])) or config['CURL_BINARY'],
'version': config['CURL_VERSION'],
'hash': bin_hash(config['PYTHON_BINARY']),
'enabled': config['USE_CURL'],
'is_valid': bool(config['CURL_VERSION']),
},
'WGET_BINARY': {
'path': WGET_BINARY and shutil.which(WGET_BINARY),
'version': WGET_VERSION,
'hash': bin_hash(WGET_BINARY),
'enabled': USE_WGET,
'is_valid': bool(WGET_VERSION),
'path': (config['WGET_BINARY'] and shutil.which(config['WGET_BINARY'])) or config['WGET_BINARY'],
'version': config['WGET_VERSION'],
'hash': bin_hash(config['WGET_BINARY']),
'enabled': config['USE_WGET'],
'is_valid': bool(config['WGET_VERSION']),
},
'GIT_BINARY': {
'path': GIT_BINARY and shutil.which(GIT_BINARY),
'version': GIT_VERSION,
'hash': bin_hash(GIT_BINARY),
'enabled': FETCH_GIT,
'is_valid': bool(GIT_VERSION),
'path': (config['GIT_BINARY'] and shutil.which(config['GIT_BINARY'])) or config['GIT_BINARY'],
'version': config['GIT_VERSION'],
'hash': bin_hash(config['GIT_BINARY']),
'enabled': config['USE_GIT'],
'is_valid': bool(config['GIT_VERSION']),
},
'YOUTUBEDL_BINARY': {
'path': YOUTUBEDL_BINARY and shutil.which(YOUTUBEDL_BINARY),
'version': YOUTUBEDL_VERSION,
'hash': bin_hash(YOUTUBEDL_BINARY),
'enabled': FETCH_MEDIA,
'is_valid': bool(YOUTUBEDL_VERSION),
'path': (config['YOUTUBEDL_BINARY'] and shutil.which(config['YOUTUBEDL_BINARY'])) or config['YOUTUBEDL_BINARY'],
'version': config['YOUTUBEDL_VERSION'],
'hash': bin_hash(config['YOUTUBEDL_BINARY']),
'enabled': config['USE_YOUTUBEDL'],
'is_valid': bool(config['YOUTUBEDL_VERSION']),
},
'CHROME_BINARY': {
'path': CHROME_BINARY and shutil.which(CHROME_BINARY),
'version': CHROME_VERSION,
'hash': bin_hash(CHROME_BINARY),
'enabled': USE_CHROME,
'is_valid': bool(CHROME_VERSION),
'path': (config['CHROME_BINARY'] and shutil.which(config['CHROME_BINARY'])) or config['CHROME_BINARY'],
'version': config['CHROME_VERSION'],
'hash': bin_hash(config['CHROME_BINARY']),
'enabled': config['USE_CHROME'],
'is_valid': bool(config['CHROME_VERSION']),
},
}
CHROME_OPTIONS = {
'TIMEOUT': TIMEOUT,
'RESOLUTION': RESOLUTION,
'CHECK_SSL_VALIDITY': CHECK_SSL_VALIDITY,
'CHROME_BINARY': CHROME_BINARY,
'CHROME_HEADLESS': CHROME_HEADLESS,
'CHROME_SANDBOX': CHROME_SANDBOX,
'CHROME_USER_AGENT': CHROME_USER_AGENT,
'CHROME_USER_DATA_DIR': CHROME_USER_DATA_DIR,
def get_chrome_info(config: CONFIG_TYPE) -> Dict[str, CONFIG_TYPE]:
    """Return the subset of ``config`` that describes how Chrome is run.

    Raises KeyError if any of the copied options is missing from ``config``.
    """
    chrome_option_names = (
        'TIMEOUT',
        'RESOLUTION',
        'CHECK_SSL_VALIDITY',
        'CHROME_BINARY',
        'CHROME_HEADLESS',
        'CHROME_SANDBOX',
        'CHROME_USER_AGENT',
        'CHROME_USER_DATA_DIR',
    )
    return {name: config[name] for name in chrome_option_names}
# PYPPETEER_ARGS = {
# 'headless': CHROME_HEADLESS,
# 'ignoreHTTPSErrors': not CHECK_SSL_VALIDITY,
# # 'executablePath': CHROME_BINARY,
# }
except KeyboardInterrupt:
raise SystemExit(1)
except Exception as e:
stderr()
stderr('{red}[X] Error during configuration: {} {}{reset}'.format(e.__class__.__name__, e, **ANSI))
stderr(' Your archive data is unaffected.')
stderr(' Check your config or environemnt variables for mistakes and try again.')
stderr(' For more info see:')
stderr(' https://github.com/pirate/ArchiveBox/wiki/Configuration')
stderr()
raise
################################## Load Config #################################
# Load the option groups one after another, passing the accumulated dict back
# in each time so that later groups -- in particular the callable defaults in
# DERIVED_CONFIG_DEFAULTS -- can read values resolved by earlier groups.
CONFIG = load_config(SHELL_CONFIG_DEFAULTS)
CONFIG = load_config(ARCHIVE_CONFIG_DEFAULTS, CONFIG)
CONFIG = load_config(ARCHIVE_METHOD_TOGGLES_DEFAULTS, CONFIG)
CONFIG = load_config(ARCHIVE_METHOD_OPTIONS_DEFAULTS, CONFIG)
CONFIG = load_config(DEPENDENCY_CONFIG_DEFAULTS, CONFIG)
CONFIG = load_config(DERIVED_CONFIG_DEFAULTS, CONFIG)
# Also publish every resolved option as a module-level name (e.g. OUTPUT_DIR).
globals().update(CONFIG)
def check_dependencies() -> None:
if HAS_INVALID_DEPENDENCIES:
stderr('{red}[X] Missing some required dependencies.{reset}'.format(**ANSI))
############################## Importable Checkers #############################
def check_system_config(config: CONFIG_TYPE=CONFIG) -> None:
    """Validate the runtime environment described by ``config``.

    Checks, in order, that the process is not running as root, that the Python
    version is at least 3.6, that stdout uses a UTF-8 encoding, and that a
    configured CHROME_USER_DATA_DIR contains a "Default" profile folder.

    Each failed check prints an explanation to stderr and raises
    ``SystemExit(1)``.
    """
    ### Check system environment
    if config['USER'] == 'root':
        stderr('[!] ArchiveBox should never be run as root!', color='red')
        stderr(' For more information, see the security overview documentation:')
        stderr(' https://github.com/pirate/ArchiveBox/wiki/Security-Overview#do-not-run-as-root')
        raise SystemExit(1)

    ### Check Python environment
    # PYTHON_VERSION is a "major.minor" string. Compare it component-wise as
    # integers: parsing it as a float would read "3.10" as 3.1 and wrongly
    # reject Python 3.10 and newer.
    python_version = tuple(int(part) for part in str(config['PYTHON_VERSION']).split('.')[:2])
    if python_version < (3, 6):
        stderr(f'[X] Python version is not new enough: {config["PYTHON_VERSION"]} (>=3.6 is required)', color='red')
        stderr(' See https://github.com/pirate/ArchiveBox/wiki/Troubleshooting#python for help upgrading your Python installation.')
        raise SystemExit(1)

    if config['PYTHON_ENCODING'] not in ('UTF-8', 'UTF8'):
        stderr(f'[X] Your system is running python3 scripts with a bad locale setting: {config["PYTHON_ENCODING"]} (it should be UTF-8).', color='red')
        stderr(' To fix it, add the line "export PYTHONIOENCODING=UTF-8" to your ~/.bashrc file (without quotes)')
        stderr(' Or if you\'re using ubuntu/debian, run "dpkg-reconfigure locales"')
        stderr('')
        stderr(' Confirm that it\'s fixed by opening a new shell and running:')
        stderr(' python3 -c "import sys; print(sys.stdout.encoding)" # should output UTF-8')
        raise SystemExit(1)

    ### Check Chrome profile directory (only when one is configured)
    if config['CHROME_USER_DATA_DIR'] and not os.path.exists(os.path.join(config['CHROME_USER_DATA_DIR'], 'Default')):
        stderr('[X] Could not find profile "Default" in CHROME_USER_DATA_DIR.', color='red')
        stderr(f' {config["CHROME_USER_DATA_DIR"]}')
        stderr(' Make sure you set it to a Chrome user data directory containing a Default profile folder.')
        stderr(' For more info see:')
        stderr(' https://github.com/pirate/ArchiveBox/wiki/Configuration#CHROME_USER_DATA_DIR')
        # common mistake: pointing at the profile folder itself instead of its parent
        if 'Default' in config['CHROME_USER_DATA_DIR']:
            stderr()
            stderr(' Try removing /Default from the end e.g.:')
            stderr(' CHROME_USER_DATA_DIR="{}"'.format(config['CHROME_USER_DATA_DIR'].split('/Default')[0]))
        raise SystemExit(1)
def check_dependencies(config: CONFIG_TYPE=CONFIG, show_help: bool=True) -> None:
    """Abort on broken dependencies, then warn about timeouts that are too low.

    Args:
        config: Config mapping. Reads 'DEPENDENCIES' (name -> info dict with
            'path', 'version', 'enabled' and 'is_valid' keys), 'TIMEOUT',
            'USE_CHROME', 'USE_YOUTUBEDL' and 'MEDIA_TIMEOUT'.
        show_help: When true, also print a hint pointing at
            `archivebox --version` before exiting.

    Raises:
        SystemExit: With code 1 if any enabled dependency is not valid.
    """
    # One summary line per dependency that is enabled but failed validation.
    invalid = [
        '{}: {} ({})'.format(name, info['path'] or 'unable to find binary', info['version'] or 'unable to detect version')
        for name, info in config['DEPENDENCIES'].items()
        if info['enabled'] and not info['is_valid']
    ]
    if invalid:
        stderr('[X] Missing some required dependencies.', color='red')
        stderr()
        stderr(' {}'.format('\n '.join(invalid)))
        if show_help:
            stderr()
            stderr(' To get more info on dependency status run:')
            stderr(' archivebox --version')
        raise SystemExit(1)

    # The timeout checks below only warn; they never abort.
    timeout = config['TIMEOUT']
    if timeout < 5:
        timeout_reason = ' You must allow *at least* 5 seconds for indexing and archive methods to run successfully.'
    elif config['USE_CHROME'] and timeout < 15:
        timeout_reason = ' Chrome will fail to archive all sites if set to less than ~15 seconds.'
    else:
        timeout_reason = None

    if timeout_reason is not None:
        # Both low-TIMEOUT cases share everything except the explanation line.
        stderr()
        stderr(f'[!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={timeout} seconds)', color='red')
        stderr(timeout_reason)
        stderr(' (Setting it to somewhere between 30 and 300 seconds is recommended)')
        stderr()
        stderr(' If you want to make ArchiveBox run faster, disable specific archive methods instead:')
        stderr(' https://github.com/pirate/ArchiveBox/wiki/Configuration#archive-method-toggles')

    # Checked independently of TIMEOUT: media downloads have their own limit.
    if config['USE_YOUTUBEDL'] and config['MEDIA_TIMEOUT'] < 20:
        stderr()
        stderr(f'[!] Warning: MEDIA_TIMEOUT is set too low! (currently set to MEDIA_TIMEOUT={config["MEDIA_TIMEOUT"]} seconds)', color='red')
        stderr(' Youtube-dl will fail to archive all media if set to less than ~20 seconds.')
        stderr(' (Setting it somewhere over 60 seconds is recommended)')
        stderr()
        stderr(' If you want to disable media archiving entirely, set SAVE_MEDIA=False instead:')
        stderr(' https://github.com/pirate/ArchiveBox/wiki/Configuration#save_media')
def check_data_folder(out_dir: Optional[str]=None, config: CONFIG_TYPE=CONFIG) -> None:
    """Exit unless `out_dir` holds an up-to-date ArchiveBox collection.

    Args:
        out_dir: Folder to inspect; falls back to config['OUTPUT_DIR'] when
            empty or None.
        config: Config mapping; only 'OUTPUT_DIR' is read here.

    Raises:
        SystemExit: With code 1 if the JSON index file is missing, or if the
            SQL index file is missing or has unapplied migrations.
    """
    out_dir = out_dir or config['OUTPUT_DIR']

    json_index_exists = os.path.exists(os.path.join(out_dir, JSON_INDEX_FILENAME))
    if not json_index_exists:
        stderr('[X] No archive index was found in current directory.', color='red')
        stderr(f' {out_dir}')
        stderr()
        stderr(' Are you running archivebox in the right folder?')
        stderr(' cd path/to/your/archive/folder')
        stderr(' archivebox [command]')
        stderr()
        stderr(' To create a new archive collection or import existing data in this folder, run:')
        stderr(' archivebox init')
        raise SystemExit(1)

    sql_index_exists = os.path.exists(os.path.join(out_dir, SQL_INDEX_FILENAME))

    # NOTE(review): imported inside the function rather than at module top,
    # presumably to avoid an import cycle or loading it too early; confirm.
    from .storage.sql import list_migrations

    # list_migrations() yields (status, name) pairs; a falsy status is
    # treated here as "not applied yet".
    pending_migrations = [name for status, name in list_migrations() if not status]

    if (not sql_index_exists) or pending_migrations:
        if sql_index_exists:
            pending_operation = f'apply the {len(pending_migrations)} pending migrations'
        else:
            pending_operation = 'generate the new SQL main index'

        stderr('[X] This collection was created with an older version of ArchiveBox and must be upgraded first.', color='lightyellow')
        stderr(f' {out_dir}')
        stderr()
        stderr(f' To upgrade it to the latest version and {pending_operation} run:')
        stderr(' archivebox init')
        raise SystemExit(1)
def setup_django(out_dir: Optional[str]=None, check_db: bool=False, config: CONFIG_TYPE=CONFIG) -> None:
    """Configure and initialise Django for the given archive folder.

    Args:
        out_dir: Archive folder to use; falls back to config['OUTPUT_DIR']
            when empty or None.
        check_db: When true, assert that the SQL index file exists in that
            folder after Django has been set up.
        config: Config mapping; reads 'PYTHON_DIR' and 'OUTPUT_DIR'.

    Raises:
        AssertionError: If `check_db` is true and the SQL index file is
            missing.
    """
    import django

    # Resolve the effective folder once so the env var, the existence check
    # and the error message all refer to the same path.
    output_dir = out_dir or config['OUTPUT_DIR']

    sys.path.append(config['PYTHON_DIR'])
    # setdefault leaves any value already present in the environment untouched.
    os.environ.setdefault('OUTPUT_DIR', output_dir)
    os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings')
    django.setup()

    if check_db:
        sql_index_path = os.path.join(output_dir, SQL_INDEX_FILENAME)
        assert os.path.exists(sql_index_path), (
            f'No database file {SQL_INDEX_FILENAME} found in OUTPUT_DIR: {output_dir}')
# Runs as a side effect of importing this module, with no arguments.
# NOTE(review): check_system_config is presumably the function whose body ends
# just above check_dependencies; its definition is not visible here, confirm.
check_system_config()
# Public names for `from <module> import *`: the helpers listed below plus
# whatever iterating CONFIG yields. CONFIG is annotated as Dict[str, Any] in
# the signatures above, so `*CONFIG` unpacks its keys.
# NOTE(review): assumes every CONFIG key also exists as a module-level name;
# verify against the part of the file that builds CONFIG.
__all__ = (
'stderr',
'check_data_folder',
'check_dependencies',
'setup_django',
*CONFIG,
)