mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-13 22:54:27 -04:00
Add _EXTRA_ARGS
for various extractors (#1360)
This commit is contained in:
commit
ca2c484a8e
10 changed files with 115 additions and 42 deletions
|
@ -152,6 +152,8 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
|
||||||
'CHROME_TIMEOUT': {'type': int, 'default': 0},
|
'CHROME_TIMEOUT': {'type': int, 'default': 0},
|
||||||
'CHROME_HEADLESS': {'type': bool, 'default': True},
|
'CHROME_HEADLESS': {'type': bool, 'default': True},
|
||||||
'CHROME_SANDBOX': {'type': bool, 'default': lambda c: not c['IN_DOCKER']},
|
'CHROME_SANDBOX': {'type': bool, 'default': lambda c: not c['IN_DOCKER']},
|
||||||
|
'CHROME_EXTRA_ARGS': {'type': list, 'default': None},
|
||||||
|
|
||||||
'YOUTUBEDL_ARGS': {'type': list, 'default': lambda c: [
|
'YOUTUBEDL_ARGS': {'type': list, 'default': lambda c: [
|
||||||
'--restrict-filenames',
|
'--restrict-filenames',
|
||||||
'--trim-filenames', '128',
|
'--trim-filenames', '128',
|
||||||
|
@ -176,6 +178,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
|
||||||
'--add-metadata',
|
'--add-metadata',
|
||||||
'--format=(bv*+ba/b)[filesize<={}][filesize_approx<=?{}]/(bv*+ba/b)'.format(c['MEDIA_MAX_SIZE'], c['MEDIA_MAX_SIZE']),
|
'--format=(bv*+ba/b)[filesize<={}][filesize_approx<=?{}]/(bv*+ba/b)'.format(c['MEDIA_MAX_SIZE'], c['MEDIA_MAX_SIZE']),
|
||||||
]},
|
]},
|
||||||
|
'YOUTUBEDL_EXTRA_ARGS': {'type': list, 'default': None},
|
||||||
|
|
||||||
|
|
||||||
'WGET_ARGS': {'type': list, 'default': ['--no-verbose',
|
'WGET_ARGS': {'type': list, 'default': ['--no-verbose',
|
||||||
|
@ -187,12 +190,17 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
|
||||||
'--no-parent',
|
'--no-parent',
|
||||||
'-e', 'robots=off',
|
'-e', 'robots=off',
|
||||||
]},
|
]},
|
||||||
|
'WGET_EXTRA_ARGS': {'type': list, 'default': None},
|
||||||
'CURL_ARGS': {'type': list, 'default': ['--silent',
|
'CURL_ARGS': {'type': list, 'default': ['--silent',
|
||||||
'--location',
|
'--location',
|
||||||
'--compressed'
|
'--compressed'
|
||||||
]},
|
]},
|
||||||
|
'CURL_EXTRA_ARGS': {'type': list, 'default': None},
|
||||||
'GIT_ARGS': {'type': list, 'default': ['--recursive']},
|
'GIT_ARGS': {'type': list, 'default': ['--recursive']},
|
||||||
'SINGLEFILE_ARGS': {'type': list, 'default' : None},
|
'SINGLEFILE_ARGS': {'type': list, 'default': None},
|
||||||
|
'SINGLEFILE_EXTRA_ARGS': {'type': list, 'default': None},
|
||||||
|
'MERCURY_ARGS': {'type': list, 'default': ['--format=text']},
|
||||||
|
'MERCURY_EXTRA_ARGS': {'type': list, 'default': None},
|
||||||
'FAVICON_PROVIDER': {'type': str, 'default': 'https://www.google.com/s2/favicons?domain={}'},
|
'FAVICON_PROVIDER': {'type': str, 'default': 'https://www.google.com/s2/favicons?domain={}'},
|
||||||
},
|
},
|
||||||
|
|
||||||
|
@ -530,6 +538,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
|
||||||
'CURL_VERSION': {'default': lambda c: bin_version(c['CURL_BINARY']) if c['USE_CURL'] else None},
|
'CURL_VERSION': {'default': lambda c: bin_version(c['CURL_BINARY']) if c['USE_CURL'] else None},
|
||||||
'CURL_USER_AGENT': {'default': lambda c: c['CURL_USER_AGENT'].format(**c)},
|
'CURL_USER_AGENT': {'default': lambda c: c['CURL_USER_AGENT'].format(**c)},
|
||||||
'CURL_ARGS': {'default': lambda c: c['CURL_ARGS'] or []},
|
'CURL_ARGS': {'default': lambda c: c['CURL_ARGS'] or []},
|
||||||
|
'CURL_EXTRA_ARGS': {'default': lambda c: c['CURL_EXTRA_ARGS'] or []},
|
||||||
'SAVE_FAVICON': {'default': lambda c: c['USE_CURL'] and c['SAVE_FAVICON']},
|
'SAVE_FAVICON': {'default': lambda c: c['USE_CURL'] and c['SAVE_FAVICON']},
|
||||||
'SAVE_ARCHIVE_DOT_ORG': {'default': lambda c: c['USE_CURL'] and c['SAVE_ARCHIVE_DOT_ORG']},
|
'SAVE_ARCHIVE_DOT_ORG': {'default': lambda c: c['USE_CURL'] and c['SAVE_ARCHIVE_DOT_ORG']},
|
||||||
|
|
||||||
|
@ -540,18 +549,22 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
|
||||||
'SAVE_WGET': {'default': lambda c: c['USE_WGET'] and c['SAVE_WGET']},
|
'SAVE_WGET': {'default': lambda c: c['USE_WGET'] and c['SAVE_WGET']},
|
||||||
'SAVE_WARC': {'default': lambda c: c['USE_WGET'] and c['SAVE_WARC']},
|
'SAVE_WARC': {'default': lambda c: c['USE_WGET'] and c['SAVE_WARC']},
|
||||||
'WGET_ARGS': {'default': lambda c: c['WGET_ARGS'] or []},
|
'WGET_ARGS': {'default': lambda c: c['WGET_ARGS'] or []},
|
||||||
|
'WGET_EXTRA_ARGS': {'default': lambda c: c['WGET_EXTRA_ARGS'] or []},
|
||||||
|
|
||||||
'RIPGREP_VERSION': {'default': lambda c: bin_version(c['RIPGREP_BINARY']) if c['USE_RIPGREP'] else None},
|
'RIPGREP_VERSION': {'default': lambda c: bin_version(c['RIPGREP_BINARY']) if c['USE_RIPGREP'] else None},
|
||||||
|
|
||||||
'USE_SINGLEFILE': {'default': lambda c: c['USE_SINGLEFILE'] and c['SAVE_SINGLEFILE']},
|
'USE_SINGLEFILE': {'default': lambda c: c['USE_SINGLEFILE'] and c['SAVE_SINGLEFILE']},
|
||||||
'SINGLEFILE_VERSION': {'default': lambda c: bin_version(c['SINGLEFILE_BINARY']) if c['USE_SINGLEFILE'] else None},
|
'SINGLEFILE_VERSION': {'default': lambda c: bin_version(c['SINGLEFILE_BINARY']) if c['USE_SINGLEFILE'] else None},
|
||||||
'SINGLEFILE_ARGS': {'default': lambda c: c['SINGLEFILE_ARGS'] or []},
|
'SINGLEFILE_ARGS': {'default': lambda c: c['SINGLEFILE_ARGS'] or []},
|
||||||
|
'SINGLEFILE_EXTRA_ARGS': {'default': lambda c: c['SINGLEFILE_EXTRA_ARGS'] or []},
|
||||||
|
|
||||||
'USE_READABILITY': {'default': lambda c: c['USE_READABILITY'] and c['SAVE_READABILITY']},
|
'USE_READABILITY': {'default': lambda c: c['USE_READABILITY'] and c['SAVE_READABILITY']},
|
||||||
'READABILITY_VERSION': {'default': lambda c: bin_version(c['READABILITY_BINARY']) if c['USE_READABILITY'] else None},
|
'READABILITY_VERSION': {'default': lambda c: bin_version(c['READABILITY_BINARY']) if c['USE_READABILITY'] else None},
|
||||||
|
|
||||||
'USE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['SAVE_MERCURY']},
|
'USE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['SAVE_MERCURY']},
|
||||||
'MERCURY_VERSION': {'default': lambda c: '1.0.0' if shutil.which(str(bin_path(c['MERCURY_BINARY']))) else None}, # mercury doesnt expose version info until this is merged https://github.com/postlight/parser/pull/750
|
'MERCURY_VERSION': {'default': lambda c: '1.0.0' if shutil.which(str(bin_path(c['MERCURY_BINARY']))) else None}, # mercury doesnt expose version info until this is merged https://github.com/postlight/parser/pull/750
|
||||||
|
'MERCURY_ARGS': {'default': lambda c: c['MERCURY_ARGS'] or []},
|
||||||
|
'MERCURY_EXTRA_ARGS': {'default': lambda c: c['MERCURY_EXTRA_ARGS'] or []},
|
||||||
|
|
||||||
'USE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
|
'USE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
|
||||||
'GIT_VERSION': {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None},
|
'GIT_VERSION': {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None},
|
||||||
|
@ -561,6 +574,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
|
||||||
'YOUTUBEDL_VERSION': {'default': lambda c: bin_version(c['YOUTUBEDL_BINARY']) if c['USE_YOUTUBEDL'] else None},
|
'YOUTUBEDL_VERSION': {'default': lambda c: bin_version(c['YOUTUBEDL_BINARY']) if c['USE_YOUTUBEDL'] else None},
|
||||||
'SAVE_MEDIA': {'default': lambda c: c['USE_YOUTUBEDL'] and c['SAVE_MEDIA']},
|
'SAVE_MEDIA': {'default': lambda c: c['USE_YOUTUBEDL'] and c['SAVE_MEDIA']},
|
||||||
'YOUTUBEDL_ARGS': {'default': lambda c: c['YOUTUBEDL_ARGS'] or []},
|
'YOUTUBEDL_ARGS': {'default': lambda c: c['YOUTUBEDL_ARGS'] or []},
|
||||||
|
'YOUTUBEDL_EXTRA_ARGS': {'default': lambda c: c['YOUTUBEDL_EXTRA_ARGS'] or []},
|
||||||
|
|
||||||
'CHROME_BINARY': {'default': lambda c: c['CHROME_BINARY'] or find_chrome_binary()},
|
'CHROME_BINARY': {'default': lambda c: c['CHROME_BINARY'] or find_chrome_binary()},
|
||||||
'USE_CHROME': {'default': lambda c: c['USE_CHROME'] and c['CHROME_BINARY'] and (c['SAVE_PDF'] or c['SAVE_SCREENSHOT'] or c['SAVE_DOM'] or c['SAVE_SINGLEFILE'])},
|
'USE_CHROME': {'default': lambda c: c['USE_CHROME'] and c['CHROME_BINARY'] and (c['SAVE_PDF'] or c['SAVE_SCREENSHOT'] or c['SAVE_DOM'] or c['SAVE_SINGLEFILE'])},
|
||||||
|
@ -582,6 +596,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
|
||||||
'EXTERNAL_LOCATIONS': {'default': lambda c: get_external_locations(c)},
|
'EXTERNAL_LOCATIONS': {'default': lambda c: get_external_locations(c)},
|
||||||
'DATA_LOCATIONS': {'default': lambda c: get_data_locations(c)},
|
'DATA_LOCATIONS': {'default': lambda c: get_data_locations(c)},
|
||||||
'CHROME_OPTIONS': {'default': lambda c: get_chrome_info(c)},
|
'CHROME_OPTIONS': {'default': lambda c: get_chrome_info(c)},
|
||||||
|
'CHROME_EXTRA_ARGS': {'default': lambda c: c['CHROME_EXTRA_ARGS'] or []},
|
||||||
'SAVE_ALLOWLIST_PTN': {'default': lambda c: c['SAVE_ALLOWLIST'] and {re.compile(k, ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_ALLOWLIST'].items()}},
|
'SAVE_ALLOWLIST_PTN': {'default': lambda c: c['SAVE_ALLOWLIST'] and {re.compile(k, ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_ALLOWLIST'].items()}},
|
||||||
'SAVE_DENYLIST_PTN': {'default': lambda c: c['SAVE_DENYLIST'] and {re.compile(k, ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_DENYLIST'].items()}},
|
'SAVE_DENYLIST_PTN': {'default': lambda c: c['SAVE_DENYLIST'] and {re.compile(k, ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_DENYLIST'].items()}},
|
||||||
}
|
}
|
||||||
|
|
|
@ -10,10 +10,12 @@ from ..system import run, chmod_file
|
||||||
from ..util import (
|
from ..util import (
|
||||||
enforce_types,
|
enforce_types,
|
||||||
is_static_file,
|
is_static_file,
|
||||||
|
dedupe,
|
||||||
)
|
)
|
||||||
from ..config import (
|
from ..config import (
|
||||||
TIMEOUT,
|
TIMEOUT,
|
||||||
CURL_ARGS,
|
CURL_ARGS,
|
||||||
|
CURL_EXTRA_ARGS,
|
||||||
CHECK_SSL_VALIDITY,
|
CHECK_SSL_VALIDITY,
|
||||||
SAVE_ARCHIVE_DOT_ORG,
|
SAVE_ARCHIVE_DOT_ORG,
|
||||||
CURL_BINARY,
|
CURL_BINARY,
|
||||||
|
@ -44,13 +46,18 @@ def save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, timeout: int=
|
||||||
output: ArchiveOutput = 'archive.org.txt'
|
output: ArchiveOutput = 'archive.org.txt'
|
||||||
archive_org_url = None
|
archive_org_url = None
|
||||||
submit_url = 'https://web.archive.org/save/{}'.format(link.url)
|
submit_url = 'https://web.archive.org/save/{}'.format(link.url)
|
||||||
cmd = [
|
# later options take precedence
|
||||||
CURL_BINARY,
|
options = [
|
||||||
*CURL_ARGS,
|
*CURL_ARGS,
|
||||||
|
*CURL_EXTRA_ARGS,
|
||||||
'--head',
|
'--head',
|
||||||
'--max-time', str(timeout),
|
'--max-time', str(timeout),
|
||||||
*(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
|
*(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
|
||||||
*([] if CHECK_SSL_VALIDITY else ['--insecure']),
|
*([] if CHECK_SSL_VALIDITY else ['--insecure']),
|
||||||
|
]
|
||||||
|
cmd = [
|
||||||
|
CURL_BINARY,
|
||||||
|
*dedupe(options),
|
||||||
submit_url,
|
submit_url,
|
||||||
]
|
]
|
||||||
status = 'succeeded'
|
status = 'succeeded'
|
||||||
|
|
|
@ -6,13 +6,18 @@ from typing import Optional
|
||||||
|
|
||||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput
|
from ..index.schema import Link, ArchiveResult, ArchiveOutput
|
||||||
from ..system import chmod_file, run
|
from ..system import chmod_file, run
|
||||||
from ..util import enforce_types, domain
|
from ..util import (
|
||||||
|
enforce_types,
|
||||||
|
domain,
|
||||||
|
dedupe,
|
||||||
|
)
|
||||||
from ..config import (
|
from ..config import (
|
||||||
TIMEOUT,
|
TIMEOUT,
|
||||||
SAVE_FAVICON,
|
SAVE_FAVICON,
|
||||||
FAVICON_PROVIDER,
|
FAVICON_PROVIDER,
|
||||||
CURL_BINARY,
|
CURL_BINARY,
|
||||||
CURL_ARGS,
|
CURL_ARGS,
|
||||||
|
CURL_EXTRA_ARGS,
|
||||||
CURL_VERSION,
|
CURL_VERSION,
|
||||||
CHECK_SSL_VALIDITY,
|
CHECK_SSL_VALIDITY,
|
||||||
CURL_USER_AGENT,
|
CURL_USER_AGENT,
|
||||||
|
@ -34,13 +39,18 @@ def save_favicon(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT)
|
||||||
|
|
||||||
out_dir = out_dir or link.link_dir
|
out_dir = out_dir or link.link_dir
|
||||||
output: ArchiveOutput = 'favicon.ico'
|
output: ArchiveOutput = 'favicon.ico'
|
||||||
cmd = [
|
# later options take precedence
|
||||||
CURL_BINARY,
|
options = [
|
||||||
*CURL_ARGS,
|
*CURL_ARGS,
|
||||||
|
*CURL_EXTRA_ARGS,
|
||||||
'--max-time', str(timeout),
|
'--max-time', str(timeout),
|
||||||
'--output', str(output),
|
'--output', str(output),
|
||||||
*(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
|
*(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
|
||||||
*([] if CHECK_SSL_VALIDITY else ['--insecure']),
|
*([] if CHECK_SSL_VALIDITY else ['--insecure']),
|
||||||
|
]
|
||||||
|
cmd = [
|
||||||
|
CURL_BINARY,
|
||||||
|
*dedupe(options),
|
||||||
FAVICON_PROVIDER.format(domain(link.url)),
|
FAVICON_PROVIDER.format(domain(link.url)),
|
||||||
]
|
]
|
||||||
status = 'failed'
|
status = 'failed'
|
||||||
|
|
|
@ -9,11 +9,13 @@ from ..system import atomic_write
|
||||||
from ..util import (
|
from ..util import (
|
||||||
enforce_types,
|
enforce_types,
|
||||||
get_headers,
|
get_headers,
|
||||||
|
dedupe,
|
||||||
)
|
)
|
||||||
from ..config import (
|
from ..config import (
|
||||||
TIMEOUT,
|
TIMEOUT,
|
||||||
CURL_BINARY,
|
CURL_BINARY,
|
||||||
CURL_ARGS,
|
CURL_ARGS,
|
||||||
|
CURL_EXTRA_ARGS,
|
||||||
CURL_USER_AGENT,
|
CURL_USER_AGENT,
|
||||||
CURL_VERSION,
|
CURL_VERSION,
|
||||||
CHECK_SSL_VALIDITY,
|
CHECK_SSL_VALIDITY,
|
||||||
|
@ -40,14 +42,18 @@ def save_headers(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT)
|
||||||
|
|
||||||
status = 'succeeded'
|
status = 'succeeded'
|
||||||
timer = TimedProgress(timeout, prefix=' ')
|
timer = TimedProgress(timeout, prefix=' ')
|
||||||
|
# later options take precedence
|
||||||
cmd = [
|
options = [
|
||||||
CURL_BINARY,
|
|
||||||
*CURL_ARGS,
|
*CURL_ARGS,
|
||||||
|
*CURL_EXTRA_ARGS,
|
||||||
'--head',
|
'--head',
|
||||||
'--max-time', str(timeout),
|
'--max-time', str(timeout),
|
||||||
*(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
|
*(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
|
||||||
*([] if CHECK_SSL_VALIDITY else ['--insecure']),
|
*([] if CHECK_SSL_VALIDITY else ['--insecure']),
|
||||||
|
]
|
||||||
|
cmd = [
|
||||||
|
CURL_BINARY,
|
||||||
|
*dedupe(options),
|
||||||
link.url,
|
link.url,
|
||||||
]
|
]
|
||||||
try:
|
try:
|
||||||
|
|
|
@ -8,11 +8,13 @@ from ..system import run, chmod_file
|
||||||
from ..util import (
|
from ..util import (
|
||||||
enforce_types,
|
enforce_types,
|
||||||
is_static_file,
|
is_static_file,
|
||||||
|
dedupe,
|
||||||
)
|
)
|
||||||
from ..config import (
|
from ..config import (
|
||||||
MEDIA_TIMEOUT,
|
MEDIA_TIMEOUT,
|
||||||
SAVE_MEDIA,
|
SAVE_MEDIA,
|
||||||
YOUTUBEDL_ARGS,
|
YOUTUBEDL_ARGS,
|
||||||
|
YOUTUBEDL_EXTRA_ARGS,
|
||||||
YOUTUBEDL_BINARY,
|
YOUTUBEDL_BINARY,
|
||||||
YOUTUBEDL_VERSION,
|
YOUTUBEDL_VERSION,
|
||||||
CHECK_SSL_VALIDITY
|
CHECK_SSL_VALIDITY
|
||||||
|
@ -39,11 +41,16 @@ def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIME
|
||||||
output: ArchiveOutput = 'media'
|
output: ArchiveOutput = 'media'
|
||||||
output_path = out_dir / output
|
output_path = out_dir / output
|
||||||
output_path.mkdir(exist_ok=True)
|
output_path.mkdir(exist_ok=True)
|
||||||
cmd = [
|
# later options take precedence
|
||||||
YOUTUBEDL_BINARY,
|
options = [
|
||||||
*YOUTUBEDL_ARGS,
|
*YOUTUBEDL_ARGS,
|
||||||
|
*YOUTUBEDL_EXTRA_ARGS,
|
||||||
*([] if CHECK_SSL_VALIDITY else ['--no-check-certificate']),
|
*([] if CHECK_SSL_VALIDITY else ['--no-check-certificate']),
|
||||||
# TODO: add --cookies-from-browser={CHROME_USER_DATA_DIR}
|
# TODO: add --cookies-from-browser={CHROME_USER_DATA_DIR}
|
||||||
|
]
|
||||||
|
cmd = [
|
||||||
|
YOUTUBEDL_BINARY,
|
||||||
|
*dedupe(options),
|
||||||
link.url,
|
link.url,
|
||||||
]
|
]
|
||||||
status = 'succeeded'
|
status = 'succeeded'
|
||||||
|
|
|
@ -11,13 +11,15 @@ from ..system import run, atomic_write
|
||||||
from ..util import (
|
from ..util import (
|
||||||
enforce_types,
|
enforce_types,
|
||||||
is_static_file,
|
is_static_file,
|
||||||
|
dedupe,
|
||||||
)
|
)
|
||||||
from ..config import (
|
from ..config import (
|
||||||
TIMEOUT,
|
TIMEOUT,
|
||||||
SAVE_MERCURY,
|
SAVE_MERCURY,
|
||||||
DEPENDENCIES,
|
DEPENDENCIES,
|
||||||
MERCURY_VERSION,
|
MERCURY_VERSION,
|
||||||
|
MERCURY_ARGS,
|
||||||
|
MERCURY_EXTRA_ARGS,
|
||||||
)
|
)
|
||||||
from ..logging_util import TimedProgress
|
from ..logging_util import TimedProgress
|
||||||
|
|
||||||
|
@ -60,12 +62,16 @@ def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT)
|
||||||
timer = TimedProgress(timeout, prefix=' ')
|
timer = TimedProgress(timeout, prefix=' ')
|
||||||
try:
|
try:
|
||||||
output_folder.mkdir(exist_ok=True)
|
output_folder.mkdir(exist_ok=True)
|
||||||
|
# later options take precedence
|
||||||
# Get plain text version of article
|
options = [
|
||||||
|
*MERCURY_ARGS,
|
||||||
|
*MERCURY_EXTRA_ARGS,
|
||||||
|
]
|
||||||
|
# By default, get plain text version of article
|
||||||
cmd = [
|
cmd = [
|
||||||
DEPENDENCIES['MERCURY_BINARY']['path'],
|
DEPENDENCIES['MERCURY_BINARY']['path'],
|
||||||
link.url,
|
link.url,
|
||||||
"--format=text"
|
*dedupe(options)
|
||||||
]
|
]
|
||||||
result = run(cmd, cwd=out_dir, timeout=timeout)
|
result = run(cmd, cwd=out_dir, timeout=timeout)
|
||||||
try:
|
try:
|
||||||
|
|
|
@ -11,6 +11,7 @@ from ..util import (
|
||||||
enforce_types,
|
enforce_types,
|
||||||
is_static_file,
|
is_static_file,
|
||||||
chrome_args,
|
chrome_args,
|
||||||
|
dedupe,
|
||||||
)
|
)
|
||||||
from ..config import (
|
from ..config import (
|
||||||
TIMEOUT,
|
TIMEOUT,
|
||||||
|
@ -18,6 +19,7 @@ from ..config import (
|
||||||
DEPENDENCIES,
|
DEPENDENCIES,
|
||||||
SINGLEFILE_VERSION,
|
SINGLEFILE_VERSION,
|
||||||
SINGLEFILE_ARGS,
|
SINGLEFILE_ARGS,
|
||||||
|
SINGLEFILE_EXTRA_ARGS,
|
||||||
CHROME_BINARY,
|
CHROME_BINARY,
|
||||||
)
|
)
|
||||||
from ..logging_util import TimedProgress
|
from ..logging_util import TimedProgress
|
||||||
|
@ -46,31 +48,16 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
|
||||||
|
|
||||||
# SingleFile CLI Docs: https://github.com/gildas-lormeau/SingleFile/tree/master/cli
|
# SingleFile CLI Docs: https://github.com/gildas-lormeau/SingleFile/tree/master/cli
|
||||||
browser_args = '--browser-args={}'.format(json.dumps(browser_args[1:]))
|
browser_args = '--browser-args={}'.format(json.dumps(browser_args[1:]))
|
||||||
|
# later options take precedence
|
||||||
options = [
|
options = [
|
||||||
*SINGLEFILE_ARGS,
|
*SINGLEFILE_ARGS,
|
||||||
'--browser-executable-path={}'.format(CHROME_BINARY),
|
*SINGLEFILE_EXTRA_ARGS,
|
||||||
browser_args,
|
browser_args,
|
||||||
|
'--browser-executable-path={}'.format(CHROME_BINARY),
|
||||||
]
|
]
|
||||||
|
|
||||||
# Deduplicate options (single-file doesn't like when you use the same option two times)
|
|
||||||
#
|
|
||||||
# NOTE: Options names that come first clobber conflicting names that come later
|
|
||||||
# My logic is SINGLEFILE_ARGS is the option that affects the singlefile command with most
|
|
||||||
# specificity, therefore the user sets it with a lot intent, therefore it should take precedence
|
|
||||||
# kind of like the ergonomic principle of lexical scope in programming languages.
|
|
||||||
seen_option_names = []
|
|
||||||
def test_seen(argument):
|
|
||||||
option_name = argument.split("=")[0]
|
|
||||||
if option_name in seen_option_names:
|
|
||||||
return False
|
|
||||||
else:
|
|
||||||
seen_option_names.append(option_name)
|
|
||||||
return True
|
|
||||||
deduped_options = list(filter(test_seen, options))
|
|
||||||
|
|
||||||
cmd = [
|
cmd = [
|
||||||
DEPENDENCIES['SINGLEFILE_BINARY']['path'],
|
DEPENDENCIES['SINGLEFILE_BINARY']['path'],
|
||||||
*deduped_options,
|
*dedupe(options),
|
||||||
link.url,
|
link.url,
|
||||||
output,
|
output,
|
||||||
]
|
]
|
||||||
|
|
|
@ -10,6 +10,7 @@ from ..util import (
|
||||||
enforce_types,
|
enforce_types,
|
||||||
download_url,
|
download_url,
|
||||||
htmldecode,
|
htmldecode,
|
||||||
|
dedupe,
|
||||||
)
|
)
|
||||||
from ..config import (
|
from ..config import (
|
||||||
TIMEOUT,
|
TIMEOUT,
|
||||||
|
@ -17,6 +18,7 @@ from ..config import (
|
||||||
SAVE_TITLE,
|
SAVE_TITLE,
|
||||||
CURL_BINARY,
|
CURL_BINARY,
|
||||||
CURL_ARGS,
|
CURL_ARGS,
|
||||||
|
CURL_EXTRA_ARGS,
|
||||||
CURL_VERSION,
|
CURL_VERSION,
|
||||||
CURL_USER_AGENT,
|
CURL_USER_AGENT,
|
||||||
)
|
)
|
||||||
|
@ -102,12 +104,17 @@ def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -
|
||||||
from core.models import Snapshot
|
from core.models import Snapshot
|
||||||
|
|
||||||
output: ArchiveOutput = None
|
output: ArchiveOutput = None
|
||||||
cmd = [
|
# later options take precedence
|
||||||
CURL_BINARY,
|
options = [
|
||||||
*CURL_ARGS,
|
*CURL_ARGS,
|
||||||
|
*CURL_EXTRA_ARGS,
|
||||||
'--max-time', str(timeout),
|
'--max-time', str(timeout),
|
||||||
*(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
|
*(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
|
||||||
*([] if CHECK_SSL_VALIDITY else ['--insecure']),
|
*([] if CHECK_SSL_VALIDITY else ['--insecure']),
|
||||||
|
]
|
||||||
|
cmd = [
|
||||||
|
CURL_BINARY,
|
||||||
|
*dedupe(options),
|
||||||
link.url,
|
link.url,
|
||||||
]
|
]
|
||||||
status = 'succeeded'
|
status = 'succeeded'
|
||||||
|
|
|
@ -15,9 +15,11 @@ from ..util import (
|
||||||
path,
|
path,
|
||||||
domain,
|
domain,
|
||||||
urldecode,
|
urldecode,
|
||||||
|
dedupe,
|
||||||
)
|
)
|
||||||
from ..config import (
|
from ..config import (
|
||||||
WGET_ARGS,
|
WGET_ARGS,
|
||||||
|
WGET_EXTRA_ARGS,
|
||||||
TIMEOUT,
|
TIMEOUT,
|
||||||
SAVE_WGET,
|
SAVE_WGET,
|
||||||
SAVE_WARC,
|
SAVE_WARC,
|
||||||
|
@ -55,10 +57,10 @@ def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
|
||||||
|
|
||||||
# WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
|
# WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
|
||||||
output: ArchiveOutput = None
|
output: ArchiveOutput = None
|
||||||
cmd = [
|
# later options take precedence
|
||||||
WGET_BINARY,
|
options = [
|
||||||
# '--server-response', # print headers for better error parsing
|
|
||||||
*WGET_ARGS,
|
*WGET_ARGS,
|
||||||
|
*WGET_EXTRA_ARGS,
|
||||||
'--timeout={}'.format(timeout),
|
'--timeout={}'.format(timeout),
|
||||||
*(['--restrict-file-names={}'.format(RESTRICT_FILE_NAMES)] if RESTRICT_FILE_NAMES else []),
|
*(['--restrict-file-names={}'.format(RESTRICT_FILE_NAMES)] if RESTRICT_FILE_NAMES else []),
|
||||||
*(['--warc-file={}'.format(str(warc_path))] if SAVE_WARC else []),
|
*(['--warc-file={}'.format(str(warc_path))] if SAVE_WARC else []),
|
||||||
|
@ -68,6 +70,11 @@ def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
|
||||||
*(['--compression=auto'] if WGET_AUTO_COMPRESSION else []),
|
*(['--compression=auto'] if WGET_AUTO_COMPRESSION else []),
|
||||||
*([] if SAVE_WARC else ['--timestamping']),
|
*([] if SAVE_WARC else ['--timestamping']),
|
||||||
*([] if CHECK_SSL_VALIDITY else ['--no-check-certificate', '--no-hsts']),
|
*([] if CHECK_SSL_VALIDITY else ['--no-check-certificate', '--no-hsts']),
|
||||||
|
# '--server-response', # print headers for better error parsing
|
||||||
|
]
|
||||||
|
cmd = [
|
||||||
|
WGET_BINARY,
|
||||||
|
*dedupe(options),
|
||||||
link.url,
|
link.url,
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
|
@ -241,7 +241,11 @@ def chrome_args(**options) -> List[str]:
|
||||||
|
|
||||||
# Chrome CLI flag documentation: https://peter.sh/experiments/chromium-command-line-switches/
|
# Chrome CLI flag documentation: https://peter.sh/experiments/chromium-command-line-switches/
|
||||||
|
|
||||||
from .config import CHROME_OPTIONS, CHROME_VERSION
|
from .config import (
|
||||||
|
CHROME_OPTIONS,
|
||||||
|
CHROME_VERSION,
|
||||||
|
CHROME_EXTRA_ARGS,
|
||||||
|
)
|
||||||
|
|
||||||
options = {**CHROME_OPTIONS, **options}
|
options = {**CHROME_OPTIONS, **options}
|
||||||
|
|
||||||
|
@ -250,6 +254,8 @@ def chrome_args(**options) -> List[str]:
|
||||||
|
|
||||||
cmd_args = [options['CHROME_BINARY']]
|
cmd_args = [options['CHROME_BINARY']]
|
||||||
|
|
||||||
|
cmd_args += CHROME_EXTRA_ARGS
|
||||||
|
|
||||||
if options['CHROME_HEADLESS']:
|
if options['CHROME_HEADLESS']:
|
||||||
chrome_major_version = int(re.search(r'\s(\d+)\.\d', CHROME_VERSION)[1])
|
chrome_major_version = int(re.search(r'\s(\d+)\.\d', CHROME_VERSION)[1])
|
||||||
if chrome_major_version >= 111:
|
if chrome_major_version >= 111:
|
||||||
|
@ -294,7 +300,8 @@ def chrome_args(**options) -> List[str]:
|
||||||
if options['CHROME_USER_DATA_DIR']:
|
if options['CHROME_USER_DATA_DIR']:
|
||||||
cmd_args.append('--user-data-dir={}'.format(options['CHROME_USER_DATA_DIR']))
|
cmd_args.append('--user-data-dir={}'.format(options['CHROME_USER_DATA_DIR']))
|
||||||
|
|
||||||
return cmd_args
|
|
||||||
|
return dedupe(cmd_args)
|
||||||
|
|
||||||
def chrome_cleanup():
|
def chrome_cleanup():
|
||||||
"""
|
"""
|
||||||
|
@ -331,6 +338,20 @@ def ansi_to_html(text):
|
||||||
return COLOR_REGEX.sub(single_sub, text)
|
return COLOR_REGEX.sub(single_sub, text)
|
||||||
|
|
||||||
|
|
||||||
|
@enforce_types
|
||||||
|
def dedupe(options: List[str]) -> List[str]:
    """
    Deduplicates the given CLI options. Options that come later clobber earlier
    conflicting options.

    Tokens are grouped before deduplication so an option and its value are
    always kept or replaced together:

    - '--key=value' is self-contained and keyed on '--key'.
    - A bare flag ('--max-time', '-e') followed by a token that does not start
      with '-' is treated as a flag/value pair ('--max-time', '60') and keyed
      on the flag name.
    - Any other token that does not start with '-' (e.g. a leading binary
      path) is a positional argument; positionals are never merged or dropped.

    A surviving option keeps the position of its first occurrence but carries
    the value of its last occurrence.

    Limitations: options that may legitimately be repeated are collapsed to
    their last occurrence, and a value that itself starts with '-' (such as a
    negative number) is not recognised as a value.
    """
    # Insertion-ordered: re-assigning an existing key keeps its original slot.
    groups = {}
    # Key of the most recent bare flag that may still receive a value token.
    awaiting_value = None

    for position, token in enumerate(options):
        # A lone '-' (stdin/stdout) or '--' (end of options) is not an option name.
        if token.startswith('-') and token not in ('-', '--'):
            key = token.split('=')[0]
            groups[key] = [token]
            # '--key=value' already carries its value; a bare flag might not.
            awaiting_value = None if '=' in token else key
        elif awaiting_value is not None:
            groups[awaiting_value].append(token)
            awaiting_value = None
        else:
            # Tuple keys cannot collide with the string keys used for options.
            groups[('positional', position)] = [token]

    return [token for group in groups.values() for token in group]
|
||||||
|
|
||||||
|
|
||||||
class AttributeDict(dict):
|
class AttributeDict(dict):
|
||||||
"""Helper to allow accessing dict values via Example.key or Example['key']"""
|
"""Helper to allow accessing dict values via Example.key or Example['key']"""
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue