Mirror of https://github.com/ArchiveBox/ArchiveBox.git (synced 2025-05-13 14:44:29 -04:00)
Add ARGS and EXTRA_ARGS for Mercury extractor
This commit is contained in:
parent d8cf09c21e
commit f4deb97f59
2 changed files with 14 additions and 4 deletions
|
@@ -199,6 +199,8 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
|
||||||
'GIT_ARGS': {'type': list, 'default': ['--recursive']},
|
'GIT_ARGS': {'type': list, 'default': ['--recursive']},
|
||||||
'SINGLEFILE_ARGS': {'type': list, 'default': None},
|
'SINGLEFILE_ARGS': {'type': list, 'default': None},
|
||||||
'SINGLEFILE_EXTRA_ARGS': {'type': list, 'default': None},
|
'SINGLEFILE_EXTRA_ARGS': {'type': list, 'default': None},
|
||||||
|
'MERCURY_ARGS': {'type': list, 'default': ['--format=text']},
|
||||||
|
'MERCURY_EXTRA_ARGS': {'type': list, 'default': None},
|
||||||
'FAVICON_PROVIDER': {'type': str, 'default': 'https://www.google.com/s2/favicons?domain={}'},
|
'FAVICON_PROVIDER': {'type': str, 'default': 'https://www.google.com/s2/favicons?domain={}'},
|
||||||
},
|
},
|
||||||
|
|
||||||
|
@@ -561,6 +563,8 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
|
||||||
|
|
||||||
'USE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['SAVE_MERCURY']},
|
'USE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['SAVE_MERCURY']},
|
||||||
'MERCURY_VERSION': {'default': lambda c: '1.0.0' if shutil.which(str(bin_path(c['MERCURY_BINARY']))) else None}, # mercury doesnt expose version info until this is merged https://github.com/postlight/parser/pull/750
|
'MERCURY_VERSION': {'default': lambda c: '1.0.0' if shutil.which(str(bin_path(c['MERCURY_BINARY']))) else None}, # mercury doesnt expose version info until this is merged https://github.com/postlight/parser/pull/750
|
||||||
|
'MERCURY_ARGS': {'default': lambda c: c['MERCURY_ARGS'] or []},
|
||||||
|
'MERCURY_EXTRA_ARGS': {'default': lambda c: c['MERCURY_EXTRA_ARGS'] or []},
|
||||||
|
|
||||||
'USE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
|
'USE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
|
||||||
'GIT_VERSION': {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None},
|
'GIT_VERSION': {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None},
|
||||||
|
|
|
@@ -11,13 +11,15 @@ from ..system import run, atomic_write
|
||||||
from ..util import (
|
from ..util import (
|
||||||
enforce_types,
|
enforce_types,
|
||||||
is_static_file,
|
is_static_file,
|
||||||
|
dedupe,
|
||||||
)
|
)
|
||||||
from ..config import (
|
from ..config import (
|
||||||
TIMEOUT,
|
TIMEOUT,
|
||||||
SAVE_MERCURY,
|
SAVE_MERCURY,
|
||||||
DEPENDENCIES,
|
DEPENDENCIES,
|
||||||
MERCURY_VERSION,
|
MERCURY_VERSION,
|
||||||
|
MERCURY_ARGS,
|
||||||
|
MERCURY_EXTRA_ARGS,
|
||||||
)
|
)
|
||||||
from ..logging_util import TimedProgress
|
from ..logging_util import TimedProgress
|
||||||
|
|
||||||
|
@@ -60,12 +62,16 @@ def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT)
|
||||||
timer = TimedProgress(timeout, prefix=' ')
|
timer = TimedProgress(timeout, prefix=' ')
|
||||||
try:
|
try:
|
||||||
output_folder.mkdir(exist_ok=True)
|
output_folder.mkdir(exist_ok=True)
|
||||||
|
# later options take precedence
|
||||||
# Get plain text version of article
|
options = [
|
||||||
|
*MERCURY_ARGS,
|
||||||
|
*MERCURY_EXTRA_ARGS,
|
||||||
|
]
|
||||||
|
# By default, get plain text version of article
|
||||||
cmd = [
|
cmd = [
|
||||||
DEPENDENCIES['MERCURY_BINARY']['path'],
|
DEPENDENCIES['MERCURY_BINARY']['path'],
|
||||||
link.url,
|
link.url,
|
||||||
"--format=text"
|
*dedupe(options)
|
||||||
]
|
]
|
||||||
result = run(cmd, cwd=out_dir, timeout=timeout)
|
result = run(cmd, cwd=out_dir, timeout=timeout)
|
||||||
try:
|
try:
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue