From 24e7a748551f505b7fd10b23de0c2f559494f90e Mon Sep 17 00:00:00 2001 From: Cristian Date: Thu, 15 Oct 2020 08:31:49 -0500 Subject: [PATCH] feat: Add WGET_ARGS to control wget arguments --- archivebox/config/__init__.py | 13 ++++++++++++- archivebox/config/stubs.py | 1 + archivebox/extractors/wget.py | 10 ++-------- 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/archivebox/config/__init__.py b/archivebox/config/__init__.py index f2dae1ef..98023d90 100644 --- a/archivebox/config/__init__.py +++ b/archivebox/config/__init__.py @@ -120,7 +120,17 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = { '--audio-format', 'mp3', '--audio-quality', '320K', '--embed-thumbnail', - '--add-metadata']} + '--add-metadata']}, + + 'WGET_ARGS': {'type': list, 'default': ['--no-verbose', + '--adjust-extension', + '--convert-links', + '--force-directories', + '--backup-converted', + '--span-hosts', + '--no-parent', + '-e', 'robots=off', + ]} }, 'DEPENDENCY_CONFIG': { @@ -276,6 +286,7 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = { 'WGET_USER_AGENT': {'default': lambda c: c['WGET_USER_AGENT'].format(**c)}, 'SAVE_WGET': {'default': lambda c: c['USE_WGET'] and c['SAVE_WGET']}, 'SAVE_WARC': {'default': lambda c: c['USE_WGET'] and c['SAVE_WARC']}, + 'WGET_ARGS': {'default': lambda c: c['WGET_ARGS'] or []}, 'USE_SINGLEFILE': {'default': lambda c: c['USE_SINGLEFILE'] and c['SAVE_SINGLEFILE']}, 'SINGLEFILE_VERSION': {'default': lambda c: bin_version(c['SINGLEFILE_BINARY']) if c['USE_SINGLEFILE'] else None}, diff --git a/archivebox/config/stubs.py b/archivebox/config/stubs.py index 10bf6abc..ae66540e 100644 --- a/archivebox/config/stubs.py +++ b/archivebox/config/stubs.py @@ -95,6 +95,7 @@ class ConfigDict(BaseConfig, total=False): CHROME_BINARY: Optional[str] YOUTUBEDL_ARGS: Optional[str] + WGET_ARGS: Optional[str] ConfigDefaultValueGetter = Callable[[ConfigDict], ConfigValue] diff --git a/archivebox/extractors/wget.py b/archivebox/extractors/wget.py index da88dc5f..331f636b 100644 --- a/archivebox/extractors/wget.py +++ b/archivebox/extractors/wget.py @@ -19,6 +19,7 @@ from ..util import ( urldecode, ) from ..config import ( + WGET_ARGS, TIMEOUT, SAVE_WGET, SAVE_WARC, @@ -59,14 +60,7 @@ def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> cmd = [ WGET_BINARY, # '--server-response', # print headers for better error parsing - '--no-verbose', - '--adjust-extension', - '--convert-links', - '--force-directories', - '--backup-converted', - '--span-hosts', - '--no-parent', - '-e', 'robots=off', + *WGET_ARGS, '--timeout={}'.format(timeout), *(['--restrict-file-names={}'.format(RESTRICT_FILE_NAMES)] if RESTRICT_FILE_NAMES else []), *(['--warc-file={}'.format(str(warc_path))] if SAVE_WARC else []),