From 4e69d2c9e14bbbc4597731fdc349f5461a726b54 Mon Sep 17 00:00:00 2001 From: Ben Muthalaly Date: Wed, 21 Feb 2024 15:13:06 -0600 Subject: [PATCH] Add `EXTRA_*_ARGS` for wget, curl, and singlefile --- archivebox/config.py | 8 +++++++- archivebox/extractors/archive_org.py | 13 ++++++++++--- archivebox/extractors/favicon.py | 18 ++++++++++++++---- archivebox/extractors/headers.py | 14 ++++++++++---- archivebox/extractors/singlefile.py | 25 +++++++++---------------- archivebox/extractors/title.py | 13 ++++++++++--- archivebox/extractors/wget.py | 15 +++++++++++---- archivebox/util.py | 17 +++++++++++++++++ 8 files changed, 88 insertions(+), 35 deletions(-) diff --git a/archivebox/config.py b/archivebox/config.py index 1edd2eeb..ebb939a4 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -187,12 +187,15 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = { '--no-parent', '-e', 'robots=off', ]}, + 'WGET_EXTRA_ARGS': {'type': list, 'default': None}, 'CURL_ARGS': {'type': list, 'default': ['--silent', '--location', '--compressed' ]}, + 'CURL_EXTRA_ARGS': {'type': list, 'default': None}, 'GIT_ARGS': {'type': list, 'default': ['--recursive']}, - 'SINGLEFILE_ARGS': {'type': list, 'default' : None}, + 'SINGLEFILE_ARGS': {'type': list, 'default': None}, + 'SINGLEFILE_EXTRA_ARGS': {'type': list, 'default': None}, 'FAVICON_PROVIDER': {'type': str, 'default': 'https://www.google.com/s2/favicons?domain={}'}, }, @@ -530,6 +533,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = { 'CURL_VERSION': {'default': lambda c: bin_version(c['CURL_BINARY']) if c['USE_CURL'] else None}, 'CURL_USER_AGENT': {'default': lambda c: c['CURL_USER_AGENT'].format(**c)}, 'CURL_ARGS': {'default': lambda c: c['CURL_ARGS'] or []}, + 'CURL_EXTRA_ARGS': {'default': lambda c: c['CURL_EXTRA_ARGS'] or []}, 'SAVE_FAVICON': {'default': lambda c: c['USE_CURL'] and c['SAVE_FAVICON']}, 'SAVE_ARCHIVE_DOT_ORG': {'default': lambda c: c['USE_CURL'] and c['SAVE_ARCHIVE_DOT_ORG']}, @@ -540,12 +544,14 @@ 
DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = { 'SAVE_WGET': {'default': lambda c: c['USE_WGET'] and c['SAVE_WGET']}, 'SAVE_WARC': {'default': lambda c: c['USE_WGET'] and c['SAVE_WARC']}, 'WGET_ARGS': {'default': lambda c: c['WGET_ARGS'] or []}, + 'WGET_EXTRA_ARGS': {'default': lambda c: c['WGET_EXTRA_ARGS'] or []}, 'RIPGREP_VERSION': {'default': lambda c: bin_version(c['RIPGREP_BINARY']) if c['USE_RIPGREP'] else None}, 'USE_SINGLEFILE': {'default': lambda c: c['USE_SINGLEFILE'] and c['SAVE_SINGLEFILE']}, 'SINGLEFILE_VERSION': {'default': lambda c: bin_version(c['SINGLEFILE_BINARY']) if c['USE_SINGLEFILE'] else None}, 'SINGLEFILE_ARGS': {'default': lambda c: c['SINGLEFILE_ARGS'] or []}, + 'SINGLEFILE_EXTRA_ARGS': {'default': lambda c: c['SINGLEFILE_EXTRA_ARGS'] or []}, 'USE_READABILITY': {'default': lambda c: c['USE_READABILITY'] and c['SAVE_READABILITY']}, 'READABILITY_VERSION': {'default': lambda c: bin_version(c['READABILITY_BINARY']) if c['USE_READABILITY'] else None}, diff --git a/archivebox/extractors/archive_org.py b/archivebox/extractors/archive_org.py index a0883113..93730f26 100644 --- a/archivebox/extractors/archive_org.py +++ b/archivebox/extractors/archive_org.py @@ -10,10 +10,12 @@ from ..system import run, chmod_file from ..util import ( enforce_types, is_static_file, + dedupe, ) from ..config import ( TIMEOUT, CURL_ARGS, + CURL_EXTRA_ARGS, CHECK_SSL_VALIDITY, SAVE_ARCHIVE_DOT_ORG, CURL_BINARY, @@ -44,13 +46,18 @@ def save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, timeout: int= output: ArchiveOutput = 'archive.org.txt' archive_org_url = None submit_url = 'https://web.archive.org/save/{}'.format(link.url) - cmd = [ - CURL_BINARY, - *CURL_ARGS, + # earlier options take precedence + options = [ '--head', '--max-time', str(timeout), *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []), *([] if CHECK_SSL_VALIDITY else ['--insecure']), + *CURL_EXTRA_ARGS, + *CURL_ARGS, + ] + cmd = [ + CURL_BINARY, + *dedupe(*options), 
submit_url, ] status = 'succeeded' diff --git a/archivebox/extractors/favicon.py b/archivebox/extractors/favicon.py index 5baafc17..3b41f349 100644 --- a/archivebox/extractors/favicon.py +++ b/archivebox/extractors/favicon.py @@ -6,13 +6,18 @@ from typing import Optional from ..index.schema import Link, ArchiveResult, ArchiveOutput from ..system import chmod_file, run -from ..util import enforce_types, domain +from ..util import ( + enforce_types, + domain, + dedupe, +) from ..config import ( TIMEOUT, SAVE_FAVICON, FAVICON_PROVIDER, CURL_BINARY, CURL_ARGS, + CURL_EXTRA_ARGS, CURL_VERSION, CHECK_SSL_VALIDITY, CURL_USER_AGENT, @@ -34,13 +39,18 @@ def save_favicon(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) out_dir = out_dir or link.link_dir output: ArchiveOutput = 'favicon.ico' - cmd = [ - CURL_BINARY, - *CURL_ARGS, + # earlier options take precedence + options = [ '--max-time', str(timeout), '--output', str(output), *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []), *([] if CHECK_SSL_VALIDITY else ['--insecure']), + *CURL_EXTRA_ARGS, + *CURL_ARGS, + ] + cmd = [ + CURL_BINARY, + *dedupe(*options), FAVICON_PROVIDER.format(domain(link.url)), ] status = 'failed' diff --git a/archivebox/extractors/headers.py b/archivebox/extractors/headers.py index 91dcb8e3..3828de93 100644 --- a/archivebox/extractors/headers.py +++ b/archivebox/extractors/headers.py @@ -9,11 +9,13 @@ from ..system import atomic_write from ..util import ( enforce_types, get_headers, + dedupe, ) from ..config import ( TIMEOUT, CURL_BINARY, CURL_ARGS, + CURL_EXTRA_ARGS, CURL_USER_AGENT, CURL_VERSION, CHECK_SSL_VALIDITY, @@ -40,14 +42,18 @@ def save_headers(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) status = 'succeeded' timer = TimedProgress(timeout, prefix=' ') - - cmd = [ - CURL_BINARY, - *CURL_ARGS, + # earlier options take precedence + options = [ '--head', '--max-time', str(timeout), *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if 
CURL_USER_AGENT else []), *([] if CHECK_SSL_VALIDITY else ['--insecure']), + *CURL_EXTRA_ARGS, + *CURL_ARGS, + ] + cmd = [ + CURL_BINARY, + *dedupe(*options), link.url, ] try: diff --git a/archivebox/extractors/singlefile.py b/archivebox/extractors/singlefile.py index e50b3932..b2119119 100644 --- a/archivebox/extractors/singlefile.py +++ b/archivebox/extractors/singlefile.py @@ -11,6 +11,7 @@ from ..util import ( enforce_types, is_static_file, chrome_args, + dedupe, ) from ..config import ( TIMEOUT, @@ -18,6 +19,7 @@ from ..config import ( DEPENDENCIES, SINGLEFILE_VERSION, SINGLEFILE_ARGS, + SINGLEFILE_EXTRA_ARGS, CHROME_BINARY, ) from ..logging_util import TimedProgress @@ -46,11 +48,6 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO # SingleFile CLI Docs: https://github.com/gildas-lormeau/SingleFile/tree/master/cli browser_args = '--browser-args={}'.format(json.dumps(browser_args[1:])) - options = [ - *SINGLEFILE_ARGS, - '--browser-executable-path={}'.format(CHROME_BINARY), - browser_args, - ] # Deduplicate options (single-file doesn't like when you use the same option two times) # @@ -58,19 +55,15 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO # My logic is SINGLEFILE_ARGS is the option that affects the singlefile command with most # specificity, therefore the user sets it with a lot intent, therefore it should take precedence # kind of like the ergonomic principle of lexical scope in programming languages. 
- seen_option_names = [] - def test_seen(argument): - option_name = argument.split("=")[0] - if option_name in seen_option_names: - return False - else: - seen_option_names.append(option_name) - return True - deduped_options = list(filter(test_seen, options)) - + options = [ + '--browser-executable-path={}'.format(CHROME_BINARY), + browser_args, + *SINGLEFILE_EXTRA_ARGS, + *SINGLEFILE_ARGS, + ] cmd = [ DEPENDENCIES['SINGLEFILE_BINARY']['path'], - *deduped_options, + *dedupe(*options), link.url, output, ] diff --git a/archivebox/extractors/title.py b/archivebox/extractors/title.py index 6b0e37f6..b2b65af2 100644 --- a/archivebox/extractors/title.py +++ b/archivebox/extractors/title.py @@ -10,6 +10,7 @@ from ..util import ( enforce_types, download_url, htmldecode, + dedupe, ) from ..config import ( TIMEOUT, @@ -17,6 +18,7 @@ from ..config import ( SAVE_TITLE, CURL_BINARY, CURL_ARGS, + CURL_EXTRA_ARGS, CURL_VERSION, CURL_USER_AGENT, ) @@ -102,12 +104,17 @@ def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) - from core.models import Snapshot output: ArchiveOutput = None - cmd = [ - CURL_BINARY, - *CURL_ARGS, + # earlier options take precedence + options = [ '--max-time', str(timeout), *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []), *([] if CHECK_SSL_VALIDITY else ['--insecure']), + *CURL_EXTRA_ARGS, + *CURL_ARGS, + ] + cmd = [ + CURL_BINARY, + *dedupe(*options), link.url, ] status = 'succeeded' diff --git a/archivebox/extractors/wget.py b/archivebox/extractors/wget.py index f3057271..d50409b6 100644 --- a/archivebox/extractors/wget.py +++ b/archivebox/extractors/wget.py @@ -15,9 +15,11 @@ from ..util import ( path, domain, urldecode, + dedupe, ) from ..config import ( WGET_ARGS, + WGET_EXTRA_ARGS, TIMEOUT, SAVE_WGET, SAVE_WARC, @@ -55,10 +57,8 @@ def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html output: 
ArchiveOutput = None - cmd = [ - WGET_BINARY, - # '--server-response', # print headers for better error parsing - *WGET_ARGS, + # earlier options take precedence + options = [ '--timeout={}'.format(timeout), *(['--restrict-file-names={}'.format(RESTRICT_FILE_NAMES)] if RESTRICT_FILE_NAMES else []), *(['--warc-file={}'.format(str(warc_path))] if SAVE_WARC else []), @@ -68,6 +68,13 @@ def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> *(['--compression=auto'] if WGET_AUTO_COMPRESSION else []), *([] if SAVE_WARC else ['--timestamping']), *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate', '--no-hsts']), + # '--server-response', # print headers for better error parsing + *WGET_EXTRA_ARGS, + *WGET_ARGS, + ] + cmd = [ + WGET_BINARY, + *dedupe(*options), link.url, ] diff --git a/archivebox/util.py b/archivebox/util.py index 5321081c..6b31c86e 100644 --- a/archivebox/util.py +++ b/archivebox/util.py @@ -317,6 +317,23 @@ def ansi_to_html(text): return COLOR_REGEX.sub(single_sub, text) +@enforce_types +def dedupe(*options: str) -> List[str]: + """ + Deduplicates the given options. Options that come earlier in the list clobber + later conflicting options. + """ + seen_option_names = [] + def test_seen(argument): + option_name = argument.split("=")[0] + if option_name in seen_option_names: + return False + else: + seen_option_names.append(option_name) + return True + return list(filter(test_seen, options)) + + class AttributeDict(dict): """Helper to allow accessing dict values via Example.key or Example['key']"""