diff --git a/archivebox/archive_methods.py b/archivebox/archive_methods.py
index b3915e2f..56009cd1 100644
--- a/archivebox/archive_methods.py
+++ b/archivebox/archive_methods.py
@@ -33,6 +33,7 @@ from config import (
     WGET_USER_AGENT,
     CHECK_SSL_VALIDITY,
     COOKIES_FILE,
+    WGET_AUTO_COMPRESSION
 )
 from util import (
     domain,
@@ -224,10 +225,10 @@ def fetch_wget(link_dir, link, timeout=TIMEOUT):
         '--backup-converted',
         '--span-hosts',
         '--no-parent',
-        '--compression=auto',
         '-e', 'robots=off',
         '--restrict-file-names=unix',
         '--timeout={}'.format(timeout),
+        *(('--compression=auto',) if WGET_AUTO_COMPRESSION else ()),
         *(() if FETCH_WARC else ('--timestamping',)),
         *(('--warc-file={}'.format(warc_path),) if FETCH_WARC else ()),
         *(('--page-requisites',) if FETCH_WGET_REQUISITES else ()),
diff --git a/archivebox/config.py b/archivebox/config.py
index d8e01b24..29ed2df2 100644
--- a/archivebox/config.py
+++ b/archivebox/config.py
@@ -72,6 +72,18 @@ TEMPLATES_DIR = os.path.join(PYTHON_PATH, 'templates')
 CHROME_SANDBOX = os.getenv('CHROME_SANDBOX', 'True').lower() == 'true'
 USE_CHROME = FETCH_PDF or FETCH_SCREENSHOT or FETCH_DOM
 USE_WGET = FETCH_WGET or FETCH_WGET_REQUISITES or FETCH_WARC
+# Probe whether the configured wget accepts --compression=auto. A zero exit
+# status is taken to mean the flag is supported. stderr is silenced along with
+# stdout so a wget that rejects the flag prints nothing while config loads.
+WGET_AUTO_COMPRESSION = bool(
+    USE_WGET
+    and WGET_BINARY
+    and not run(
+        [WGET_BINARY, "--compression=auto", "--help"],
+        stdout=DEVNULL,
+        stderr=DEVNULL,
+    ).returncode
+)
 
 ########################### Environment & Dependencies #########################
 