Mirror of https://github.com/ArchiveBox/ArchiveBox.git (synced 2025-05-13 06:34:25 -04:00)
Flip dedupe precedence order

commit d74ddd42ae (parent 4d9c5a7b4b)
8 changed files with 33 additions and 40 deletions
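At a glance, the commit reverses every extractor's options list (baseline *_ARGS first, user-supplied *_EXTRA_ARGS second, the extractor's own hard-coded flags last) and rewrites the dedupe() helper so that the last occurrence of an option name wins instead of the first. The effective precedence (hard-coded flags over *_EXTRA_ARGS over *_ARGS) stays the same; only the direction of the rule and the order of the lists flip. Below is a minimal sketch of the flipped behavior, using lightly renamed copies of the before/after implementations from the bottom of this diff (the @enforce_types decorator is dropped and the option strings are made up):

from typing import List

def dedupe_old(*options: str) -> List[str]:
    """Copy of the previous helper: the first occurrence of an option name wins."""
    seen_option_names = []
    def test_seen(argument):
        option_name = argument.split("=")[0]
        if option_name in seen_option_names:
            return False
        else:
            seen_option_names.append(option_name)
            return True
    return list(filter(test_seen, options))

def dedupe_new(*options: str) -> List[str]:
    """Copy of the helper after this commit: the last occurrence of an option name wins."""
    deduped = {}
    for option in options:
        deduped[option.split('=')[0]] = option
    return list(deduped.values())

opts = ['--timeout=60', '--quiet', '--timeout=120']   # made-up option strings
print(dedupe_old(*opts))   # ['--timeout=60', '--quiet']
print(dedupe_new(*opts))   # ['--timeout=120', '--quiet']  (later value, kept at the first position)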
@@ -46,14 +46,14 @@ def save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, timeout: int=
     output: ArchiveOutput = 'archive.org.txt'
     archive_org_url = None
     submit_url = 'https://web.archive.org/save/{}'.format(link.url)
-    # earlier options take precedence
+    # later options take precedence
     options = [
+        *CURL_ARGS,
+        *CURL_EXTRA_ARGS,
         '--head',
         '--max-time', str(timeout),
         *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
         *([] if CHECK_SSL_VALIDITY else ['--insecure']),
-        *CURL_EXTRA_ARGS,
-        *CURL_ARGS,
     ]
     cmd = [
         CURL_BINARY,
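The same reshuffle is applied to the favicon, headers, title, media, and wget extractors below. As a sketch of what the reordered list looks like once expanded, here are the options from this hunk with assumed placeholder values for the config variables (CURL_ARGS, CURL_EXTRA_ARGS, and CURL_USER_AGENT here are not ArchiveBox's real defaults); the hunk is cut off before the point where the list is deduplicated into cmd, so that step is not shown:

# Sketch only: how the reordered options list expands, with assumed values.
CURL_ARGS = ['--silent', '--location', '--compressed']   # assumed baseline defaults
CURL_EXTRA_ARGS = ['--retry', '3']                        # assumed user-supplied extras
CURL_USER_AGENT = 'ArchiveBox (example)'                  # assumed
CHECK_SSL_VALIDITY = False
timeout = 60

options = [
    *CURL_ARGS,
    *CURL_EXTRA_ARGS,
    '--head',
    '--max-time', str(timeout),
    *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
    *([] if CHECK_SSL_VALIDITY else ['--insecure']),
]
print(options)
# ['--silent', '--location', '--compressed', '--retry', '3', '--head',
#  '--max-time', '60', '--user-agent', 'ArchiveBox (example)', '--insecure']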
@@ -39,14 +39,14 @@ def save_favicon(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT)

     out_dir = out_dir or link.link_dir
     output: ArchiveOutput = 'favicon.ico'
-    # earlier options take precedence
+    # later options take precedence
     options = [
+        *CURL_ARGS,
+        *CURL_EXTRA_ARGS,
         '--max-time', str(timeout),
         '--output', str(output),
         *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
         *([] if CHECK_SSL_VALIDITY else ['--insecure']),
-        *CURL_EXTRA_ARGS,
-        *CURL_ARGS,
     ]
     cmd = [
         CURL_BINARY,
@@ -42,14 +42,14 @@ def save_headers(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT)

     status = 'succeeded'
     timer = TimedProgress(timeout, prefix=' ')
-    # earlier options take precedence
+    # later options take precedence
     options = [
+        *CURL_ARGS,
+        *CURL_EXTRA_ARGS,
         '--head',
         '--max-time', str(timeout),
         *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
         *([] if CHECK_SSL_VALIDITY else ['--insecure']),
-        *CURL_EXTRA_ARGS,
-        *CURL_ARGS,
     ]
     cmd = [
         CURL_BINARY,
@@ -41,11 +41,12 @@ def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIME
     output: ArchiveOutput = 'media'
     output_path = out_dir / output
     output_path.mkdir(exist_ok=True)
+    # later options take precedence
     options = [
+        *YOUTUBEDL_ARGS,
+        *YOUTUBEDL_EXTRA_ARGS,
         *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate']),
         # TODO: add --cookies-from-browser={CHROME_USER_DATA_DIR}
-        *YOUTUBEDL_EXTRA_ARGS,
-        *YOUTUBEDL_ARGS,
     ]
     cmd = [
         YOUTUBEDL_BINARY,
@@ -48,18 +48,12 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO

     # SingleFile CLI Docs: https://github.com/gildas-lormeau/SingleFile/tree/master/cli
     browser_args = '--browser-args={}'.format(json.dumps(browser_args[1:]))
-
-    # Deduplicate options (single-file doesn't like when you use the same option two times)
-    #
-    # NOTE: Options names that come first clobber conflicting names that come later
-    # My logic is SINGLEFILE_ARGS is the option that affects the singlefile command with most
-    # specificity, therefore the user sets it with a lot intent, therefore it should take precedence
-    # kind of like the ergonomic principle of lexical scope in programming languages.
+    # later options take precedence
     options = [
-        '--browser-executable-path={}'.format(CHROME_BINARY),
-        browser_args,
-        *SINGLEFILE_EXTRA_ARGS,
         *SINGLEFILE_ARGS,
+        *SINGLEFILE_EXTRA_ARGS,
+        browser_args,
+        '--browser-executable-path={}'.format(CHROME_BINARY),
     ]
     cmd = [
         DEPENDENCIES['SINGLEFILE_BINARY']['path'],
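The deleted comment block recorded the original motivation: single-file refuses the same option passed twice, so duplicates have to be collapsed before the command is built. Under the new rule the list is simply reversed, and the hard-coded '--browser-executable-path' still wins over a conflicting value supplied through SINGLEFILE_ARGS, only now from the end of the list. A sketch with made-up values follows (the result described in the final comment refers to the rewritten dedupe() at the bottom of this diff):

# Made-up values for illustration (not ArchiveBox defaults):
SINGLEFILE_ARGS = ['--browser-executable-path=/usr/bin/chromium']   # assumed user config
SINGLEFILE_EXTRA_ARGS = ['--block-videos=false']                     # assumed
CHROME_BINARY = '/usr/bin/google-chrome'                             # assumed
browser_args = '--browser-args=["--headless=new"]'                   # normally built via json.dumps

options = [
    *SINGLEFILE_ARGS,
    *SINGLEFILE_EXTRA_ARGS,
    browser_args,
    '--browser-executable-path={}'.format(CHROME_BINARY),
]
print(options)
# After dedupe(*options), the two '--browser-executable-path' entries collapse
# to the later one (/usr/bin/google-chrome), so single-file never sees the
# same option twice.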
@@ -104,13 +104,13 @@ def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -
     from core.models import Snapshot

     output: ArchiveOutput = None
-    # earlier options take precedence
+    # later options take precedence
     options = [
+        *CURL_ARGS,
+        *CURL_EXTRA_ARGS,
         '--max-time', str(timeout),
         *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
         *([] if CHECK_SSL_VALIDITY else ['--insecure']),
-        *CURL_EXTRA_ARGS,
-        *CURL_ARGS,
     ]
     cmd = [
         CURL_BINARY,
@@ -57,8 +57,10 @@ def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->

     # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
     output: ArchiveOutput = None
-    # earlier options take precedence
+    # later options take precedence
     options = [
+        *WGET_ARGS,
+        *WGET_EXTRA_ARGS,
         '--timeout={}'.format(timeout),
         *(['--restrict-file-names={}'.format(RESTRICT_FILE_NAMES)] if RESTRICT_FILE_NAMES else []),
         *(['--warc-file={}'.format(str(warc_path))] if SAVE_WARC else []),
@@ -69,8 +71,6 @@ def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
         *([] if SAVE_WARC else ['--timestamping']),
         *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate', '--no-hsts']),
         # '--server-response', # print headers for better error parsing
-        *WGET_EXTRA_ARGS,
-        *WGET_ARGS,
     ]
     cmd = [
         WGET_BINARY,
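wget's flags already use the '--name=value' form that dedupe() keys on, so conflicts resolve cleanly. Below is a small runnable sketch with assumed values (WGET_ARGS and WGET_EXTRA_ARGS here are placeholders, not the real defaults), showing that the extractor's own '--timeout' still overrides one supplied through WGET_EXTRA_ARGS, exactly as it did under the old earlier-wins rule; the dedupe() copy mirrors the rewritten helper at the bottom of this diff:

from typing import List

def dedupe(*options: str) -> List[str]:
    # same logic as the rewritten helper at the bottom of this diff
    deduped = {}
    for option in options:
        deduped[option.split('=')[0]] = option
    return list(deduped.values())

WGET_ARGS = ['--no-verbose', '--adjust-extension']   # assumed baseline defaults
WGET_EXTRA_ARGS = ['--timeout=300']                   # assumed user extras
timeout = 60

options = [
    *WGET_ARGS,
    *WGET_EXTRA_ARGS,
    '--timeout={}'.format(timeout),
]
print(dedupe(*options))
# ['--no-verbose', '--adjust-extension', '--timeout=60']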
@@ -240,6 +240,8 @@ def chrome_args(**options) -> List[str]:

     cmd_args = [options['CHROME_BINARY']]

+    cmd_args += CHROME_EXTRA_ARGS
+
     if options['CHROME_HEADLESS']:
         chrome_major_version = int(re.search(r'\s(\d+)\.\d', CHROME_VERSION)[1])
         if chrome_major_version >= 111:
@@ -284,7 +286,6 @@ def chrome_args(**options) -> List[str]:
     if options['CHROME_USER_DATA_DIR']:
         cmd_args.append('--user-data-dir={}'.format(options['CHROME_USER_DATA_DIR']))

-    cmd_args += CHROME_EXTRA_ARGS

     return dedupe(*cmd_args)

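chrome_args() gets the matching treatment: CHROME_EXTRA_ARGS moves from just before the final dedupe() call up to right after the binary, so the hard-coded flags appended further down (headless mode, user data dir, and so on) keep overriding any conflicting user-supplied extras. A rough sketch of the resulting order, with illustrative flag values that are not the function's real output:

# Illustrative only: the rough shape of cmd_args as chrome_args() now builds it.
CHROME_EXTRA_ARGS = ['--window-size=800,600']               # assumed user extras
cmd_args = ['/usr/bin/chromium-browser']                     # assumed CHROME_BINARY

cmd_args += CHROME_EXTRA_ARGS                                # now added right after the binary
cmd_args += ['--headless=new', '--window-size=1440,2000']    # later, hard-coded flags (illustrative)

print(cmd_args)
# return dedupe(*cmd_args) then keeps only the later '--window-size=1440,2000',
# so the conflicting user extra is still overridden, as it was before the flip.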
@@ -324,20 +325,17 @@ def ansi_to_html(text):


 @enforce_types
-def dedupe(*options: List[str]) -> List[str]:
+def dedupe(*options: str) -> List[str]:
     """
-    Deduplicates the given options. Options that come earlier in the list clobber
-    later conflicting options.
+    Deduplicates the given options. Options that come later clobber earlier
+    conflicting options.
     """
-    seen_option_names = []
-    def test_seen(argument):
-        option_name = argument.split("=")[0]
-        if option_name in seen_option_names:
-            return False
-        else:
-            seen_option_names.append(option_name)
-            return True
-    return list(filter(test_seen, options))
+    deduped = {}
+
+    for option in options:
+        deduped[option.split('=')[0]] = option
+
+    return list(deduped.values())


 class AttributeDict(dict):