mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-09 12:21:57 -04:00
move pdf, screenshot, dom, singlefile, and ytdlp extractor config to new plugin system
This commit is contained in:
parent
a2a586e369
commit
a5ffd4e9d3
11 changed files with 333 additions and 353 deletions
|
@ -30,6 +30,7 @@ import inspect
|
||||||
import getpass
|
import getpass
|
||||||
import shutil
|
import shutil
|
||||||
import requests
|
import requests
|
||||||
|
import archivebox
|
||||||
|
|
||||||
from hashlib import md5
|
from hashlib import md5
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
@ -62,7 +63,6 @@ from .misc.logging import (
|
||||||
stderr,
|
stderr,
|
||||||
hint,
|
hint,
|
||||||
)
|
)
|
||||||
from .misc.checks import check_system_config
|
|
||||||
|
|
||||||
# print('STARTING CONFIG LOADING')
|
# print('STARTING CONFIG LOADING')
|
||||||
|
|
||||||
|
@ -167,15 +167,8 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
|
||||||
'USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)'},
|
'USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)'},
|
||||||
'CURL_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT'] + ' curl/{CURL_VERSION}'},
|
'CURL_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT'] + ' curl/{CURL_VERSION}'},
|
||||||
'WGET_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT'] + ' wget/{WGET_VERSION}'},
|
'WGET_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT'] + ' wget/{WGET_VERSION}'},
|
||||||
'CHROME_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT']},
|
|
||||||
|
|
||||||
'COOKIES_FILE': {'type': str, 'default': None},
|
'COOKIES_FILE': {'type': str, 'default': None},
|
||||||
'CHROME_USER_DATA_DIR': {'type': str, 'default': None},
|
|
||||||
|
|
||||||
'CHROME_TIMEOUT': {'type': int, 'default': 0},
|
|
||||||
'CHROME_HEADLESS': {'type': bool, 'default': True},
|
|
||||||
'CHROME_SANDBOX': {'type': bool, 'default': lambda c: not c['IN_DOCKER']},
|
|
||||||
'CHROME_EXTRA_ARGS': {'type': list, 'default': None},
|
|
||||||
|
|
||||||
'YOUTUBEDL_ARGS': {'type': list, 'default': lambda c: [
|
'YOUTUBEDL_ARGS': {'type': list, 'default': lambda c: [
|
||||||
'--restrict-filenames',
|
'--restrict-filenames',
|
||||||
|
@ -267,7 +260,6 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
|
||||||
'YOUTUBEDL_BINARY': {'type': str, 'default': 'yt-dlp'}, # also can accept youtube-dl
|
'YOUTUBEDL_BINARY': {'type': str, 'default': 'yt-dlp'}, # also can accept youtube-dl
|
||||||
'NODE_BINARY': {'type': str, 'default': 'node'},
|
'NODE_BINARY': {'type': str, 'default': 'node'},
|
||||||
'RIPGREP_BINARY': {'type': str, 'default': 'rg'},
|
'RIPGREP_BINARY': {'type': str, 'default': 'rg'},
|
||||||
'CHROME_BINARY': {'type': str, 'default': None},
|
|
||||||
|
|
||||||
'POCKET_CONSUMER_KEY': {'type': str, 'default': None},
|
'POCKET_CONSUMER_KEY': {'type': str, 'default': None},
|
||||||
'POCKET_ACCESS_TOKENS': {'type': dict, 'default': {}},
|
'POCKET_ACCESS_TOKENS': {'type': dict, 'default': {}},
|
||||||
|
@ -551,7 +543,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
|
||||||
'PERSONAS_DIR': {'default': lambda c: c['OUTPUT_DIR'] / PERSONAS_DIR_NAME},
|
'PERSONAS_DIR': {'default': lambda c: c['OUTPUT_DIR'] / PERSONAS_DIR_NAME},
|
||||||
'CONFIG_FILE': {'default': lambda c: Path(c['CONFIG_FILE']).resolve() if c['CONFIG_FILE'] else c['OUTPUT_DIR'] / CONFIG_FILENAME},
|
'CONFIG_FILE': {'default': lambda c: Path(c['CONFIG_FILE']).resolve() if c['CONFIG_FILE'] else c['OUTPUT_DIR'] / CONFIG_FILENAME},
|
||||||
'COOKIES_FILE': {'default': lambda c: c['COOKIES_FILE'] and Path(c['COOKIES_FILE']).resolve()},
|
'COOKIES_FILE': {'default': lambda c: c['COOKIES_FILE'] and Path(c['COOKIES_FILE']).resolve()},
|
||||||
'CHROME_USER_DATA_DIR': {'default': lambda c: Path(c['CHROME_USER_DATA_DIR']).resolve() if c['CHROME_USER_DATA_DIR'] else None},
|
|
||||||
'URL_DENYLIST_PTN': {'default': lambda c: c['URL_DENYLIST'] and re.compile(c['URL_DENYLIST'] or '', ALLOWDENYLIST_REGEX_FLAGS)},
|
'URL_DENYLIST_PTN': {'default': lambda c: c['URL_DENYLIST'] and re.compile(c['URL_DENYLIST'] or '', ALLOWDENYLIST_REGEX_FLAGS)},
|
||||||
'URL_ALLOWLIST_PTN': {'default': lambda c: c['URL_ALLOWLIST'] and re.compile(c['URL_ALLOWLIST'] or '', ALLOWDENYLIST_REGEX_FLAGS)},
|
'URL_ALLOWLIST_PTN': {'default': lambda c: c['URL_ALLOWLIST'] and re.compile(c['URL_ALLOWLIST'] or '', ALLOWDENYLIST_REGEX_FLAGS)},
|
||||||
'DIR_OUTPUT_PERMISSIONS': {'default': lambda c: c['OUTPUT_PERMISSIONS'].replace('6', '7').replace('4', '5')}, # exec is always needed to list directories
|
'DIR_OUTPUT_PERMISSIONS': {'default': lambda c: c['OUTPUT_PERMISSIONS'].replace('6', '7').replace('4', '5')}, # exec is always needed to list directories
|
||||||
|
@ -595,7 +587,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
|
||||||
'WGET_ARGS': {'default': lambda c: c['WGET_ARGS'] or []},
|
'WGET_ARGS': {'default': lambda c: c['WGET_ARGS'] or []},
|
||||||
'WGET_EXTRA_ARGS': {'default': lambda c: c['WGET_EXTRA_ARGS'] or []},
|
'WGET_EXTRA_ARGS': {'default': lambda c: c['WGET_EXTRA_ARGS'] or []},
|
||||||
|
|
||||||
'RIPGREP_VERSION': {'default': lambda c: bin_version(c['RIPGREP_BINARY']) if c['USE_RIPGREP'] else None},
|
# 'RIPGREP_VERSION': {'default': lambda c: bin_version(c['RIPGREP_BINARY']) if c['USE_RIPGREP'] else None},
|
||||||
|
|
||||||
'USE_SINGLEFILE': {'default': lambda c: c['USE_SINGLEFILE'] and c['SAVE_SINGLEFILE']},
|
'USE_SINGLEFILE': {'default': lambda c: c['USE_SINGLEFILE'] and c['SAVE_SINGLEFILE']},
|
||||||
'SINGLEFILE_VERSION': {'default': lambda c: bin_version(c['SINGLEFILE_BINARY']) if c['USE_SINGLEFILE'] else None},
|
'SINGLEFILE_VERSION': {'default': lambda c: bin_version(c['SINGLEFILE_BINARY']) if c['USE_SINGLEFILE'] else None},
|
||||||
|
@ -620,15 +612,6 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
|
||||||
'YOUTUBEDL_ARGS': {'default': lambda c: c['YOUTUBEDL_ARGS'] or []},
|
'YOUTUBEDL_ARGS': {'default': lambda c: c['YOUTUBEDL_ARGS'] or []},
|
||||||
'YOUTUBEDL_EXTRA_ARGS': {'default': lambda c: c['YOUTUBEDL_EXTRA_ARGS'] or []},
|
'YOUTUBEDL_EXTRA_ARGS': {'default': lambda c: c['YOUTUBEDL_EXTRA_ARGS'] or []},
|
||||||
|
|
||||||
'CHROME_BINARY': {'default': lambda c: c['CHROME_BINARY'] or find_chrome_binary()},
|
|
||||||
'USE_CHROME': {'default': lambda c: c['USE_CHROME'] and c['CHROME_BINARY'] and (c['SAVE_PDF'] or c['SAVE_SCREENSHOT'] or c['SAVE_DOM'] or c['SAVE_SINGLEFILE'])},
|
|
||||||
'CHROME_VERSION': {'default': lambda c: bin_version(c['CHROME_BINARY']) if c['USE_CHROME'] else None},
|
|
||||||
'CHROME_USER_AGENT': {'default': lambda c: c['CHROME_USER_AGENT'].format(**c)},
|
|
||||||
|
|
||||||
'SAVE_PDF': {'default': lambda c: c['USE_CHROME'] and c['SAVE_PDF']},
|
|
||||||
'SAVE_SCREENSHOT': {'default': lambda c: c['USE_CHROME'] and c['SAVE_SCREENSHOT']},
|
|
||||||
'SAVE_DOM': {'default': lambda c: c['USE_CHROME'] and c['SAVE_DOM']},
|
|
||||||
'SAVE_SINGLEFILE': {'default': lambda c: c['USE_CHROME'] and c['SAVE_SINGLEFILE'] and c['USE_NODE']},
|
|
||||||
'SAVE_READABILITY': {'default': lambda c: c['USE_READABILITY'] and c['USE_NODE']},
|
'SAVE_READABILITY': {'default': lambda c: c['USE_READABILITY'] and c['USE_NODE']},
|
||||||
'SAVE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['USE_NODE']},
|
'SAVE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['USE_NODE']},
|
||||||
|
|
||||||
|
@ -638,8 +621,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
|
||||||
'DEPENDENCIES': {'default': lambda c: get_dependency_info(c)},
|
'DEPENDENCIES': {'default': lambda c: get_dependency_info(c)},
|
||||||
'CODE_LOCATIONS': {'default': lambda c: get_code_locations(c)},
|
'CODE_LOCATIONS': {'default': lambda c: get_code_locations(c)},
|
||||||
'DATA_LOCATIONS': {'default': lambda c: get_data_locations(c)},
|
'DATA_LOCATIONS': {'default': lambda c: get_data_locations(c)},
|
||||||
'CHROME_OPTIONS': {'default': lambda c: get_chrome_info(c)},
|
|
||||||
'CHROME_EXTRA_ARGS': {'default': lambda c: c['CHROME_EXTRA_ARGS'] or []},
|
|
||||||
'SAVE_ALLOWLIST_PTN': {'default': lambda c: c['SAVE_ALLOWLIST'] and {re.compile(k, ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_ALLOWLIST'].items()}},
|
'SAVE_ALLOWLIST_PTN': {'default': lambda c: c['SAVE_ALLOWLIST'] and {re.compile(k, ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_ALLOWLIST'].items()}},
|
||||||
'SAVE_DENYLIST_PTN': {'default': lambda c: c['SAVE_DENYLIST'] and {re.compile(k, ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_DENYLIST'].items()}},
|
'SAVE_DENYLIST_PTN': {'default': lambda c: c['SAVE_DENYLIST'] and {re.compile(k, ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_DENYLIST'].items()}},
|
||||||
}
|
}
|
||||||
|
@ -1183,21 +1165,20 @@ def get_dependency_info(config: ConfigDict) -> ConfigValue:
|
||||||
'enabled': config['USE_YOUTUBEDL'],
|
'enabled': config['USE_YOUTUBEDL'],
|
||||||
'is_valid': bool(config['YOUTUBEDL_VERSION']),
|
'is_valid': bool(config['YOUTUBEDL_VERSION']),
|
||||||
},
|
},
|
||||||
'CHROME_BINARY': {
|
# 'CHROME_BINARY': {
|
||||||
'path': bin_path(config['CHROME_BINARY']),
|
# 'path': bin_path(config['CHROME_BINARY']),
|
||||||
'version': config['CHROME_VERSION'],
|
# 'version': config['CHROME_VERSION'],
|
||||||
'hash': bin_hash(config['CHROME_BINARY']),
|
# 'hash': bin_hash(config['CHROME_BINARY']),
|
||||||
'enabled': config['USE_CHROME'],
|
# 'enabled': config['USE_CHROME'],
|
||||||
'is_valid': bool(config['CHROME_VERSION']),
|
# 'is_valid': bool(config['CHROME_VERSION']),
|
||||||
},
|
# },
|
||||||
'RIPGREP_BINARY': {
|
# 'RIPGREP_BINARY': {
|
||||||
'path': bin_path(config['RIPGREP_BINARY']),
|
# 'path': bin_path(config['RIPGREP_BINARY']),
|
||||||
'version': config['RIPGREP_VERSION'],
|
# 'version': config['RIPGREP_VERSION'],
|
||||||
'hash': bin_hash(config['RIPGREP_BINARY']),
|
# 'hash': bin_hash(config['RIPGREP_BINARY']),
|
||||||
'enabled': config['USE_RIPGREP'],
|
# 'enabled': config['USE_RIPGREP'],
|
||||||
'is_valid': bool(config['RIPGREP_VERSION']),
|
# 'is_valid': bool(config['RIPGREP_VERSION']),
|
||||||
},
|
# },
|
||||||
# TODO: add an entry for the sonic search backend?
|
|
||||||
# 'SONIC_BINARY': {
|
# 'SONIC_BINARY': {
|
||||||
# 'path': bin_path(config['SONIC_BINARY']),
|
# 'path': bin_path(config['SONIC_BINARY']),
|
||||||
# 'version': config['SONIC_VERSION'],
|
# 'version': config['SONIC_VERSION'],
|
||||||
|
@ -1207,20 +1188,6 @@ def get_dependency_info(config: ConfigDict) -> ConfigValue:
|
||||||
# },
|
# },
|
||||||
}
|
}
|
||||||
|
|
||||||
def get_chrome_info(config: ConfigDict) -> ConfigValue:
|
|
||||||
return {
|
|
||||||
'TIMEOUT': config['TIMEOUT'],
|
|
||||||
'RESOLUTION': config['RESOLUTION'],
|
|
||||||
'CHECK_SSL_VALIDITY': config['CHECK_SSL_VALIDITY'],
|
|
||||||
'CHROME_BINARY': bin_path(config['CHROME_BINARY']),
|
|
||||||
'CHROME_TIMEOUT': config['CHROME_TIMEOUT'],
|
|
||||||
'CHROME_HEADLESS': config['CHROME_HEADLESS'],
|
|
||||||
'CHROME_SANDBOX': config['CHROME_SANDBOX'],
|
|
||||||
'CHROME_USER_AGENT': config['CHROME_USER_AGENT'],
|
|
||||||
'CHROME_USER_DATA_DIR': config['CHROME_USER_DATA_DIR'],
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
# ******************************************************************************
|
# ******************************************************************************
|
||||||
# ******************************************************************************
|
# ******************************************************************************
|
||||||
# ******************************** Load Config *********************************
|
# ******************************** Load Config *********************************
|
||||||
|
@ -1264,27 +1231,6 @@ os.umask(0o777 - int(DIR_OUTPUT_PERMISSIONS, base=8)) # n
|
||||||
# add ./node_modules/.bin to $PATH so we can use node scripts in extractors
|
# add ./node_modules/.bin to $PATH so we can use node scripts in extractors
|
||||||
sys.path.append(CONFIG.NODE_BIN_PATH)
|
sys.path.append(CONFIG.NODE_BIN_PATH)
|
||||||
|
|
||||||
# OPTIONAL: also look around the host system for node modules to use
|
|
||||||
# avoid enabling this unless absolutely needed,
|
|
||||||
# having overlapping potential sources of libs is a big source of bugs/confusing to users
|
|
||||||
# DEV_NODE_BIN_PATH = str((Path(CONFIG["PACKAGE_DIR"]).absolute() / '..' / 'node_modules' / '.bin'))
|
|
||||||
# sys.path.append(DEV_NODE_BIN_PATH)
|
|
||||||
# USER_NODE_BIN_PATH = str(Path('~/.node_modules/.bin').resolve())
|
|
||||||
# sys.path.append(USER_NODE_BIN_PATH)
|
|
||||||
|
|
||||||
# disable stderr "you really shouldnt disable ssl" warnings with library config
|
|
||||||
if not CONFIG['CHECK_SSL_VALIDITY']:
|
|
||||||
import urllib3
|
|
||||||
requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)
|
|
||||||
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
|
||||||
|
|
||||||
# get SQLite database version, compile options, and runtime options
|
|
||||||
# TODO: make this a less hacky proper assertion checker helper function in somewhere like setup_django
|
|
||||||
#cursor = sqlite3.connect(':memory:').cursor()
|
|
||||||
#DYNAMIC_CONFIG_SCHEMA['SQLITE_VERSION'] = lambda c: cursor.execute("SELECT sqlite_version();").fetchone()[0]
|
|
||||||
#DYNAMIC_CONFIG_SCHEMA['SQLITE_JOURNAL_MODE'] = lambda c: cursor.execute('PRAGMA journal_mode;').fetchone()[0]
|
|
||||||
#DYNAMIC_CONFIG_SCHEMA['SQLITE_OPTIONS'] = lambda c: [option[0] for option in cursor.execute('PRAGMA compile_options;').fetchall()]
|
|
||||||
#cursor.close()
|
|
||||||
|
|
||||||
########################### Config Validity Checkers ###########################
|
########################### Config Validity Checkers ###########################
|
||||||
|
|
||||||
|
@ -1308,13 +1254,19 @@ def bump_startup_progress_bar():
|
||||||
if INITIAL_STARTUP_PROGRESS:
|
if INITIAL_STARTUP_PROGRESS:
|
||||||
INITIAL_STARTUP_PROGRESS.update(INITIAL_STARTUP_PROGRESS_TASK, advance=1) # type: ignore
|
INITIAL_STARTUP_PROGRESS.update(INITIAL_STARTUP_PROGRESS_TASK, advance=1) # type: ignore
|
||||||
|
|
||||||
|
|
||||||
|
def setup_django_minimal():
|
||||||
|
sys.path.append(str(archivebox.PACKAGE_DIR))
|
||||||
|
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings')
|
||||||
|
django.setup()
|
||||||
|
|
||||||
|
|
||||||
def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG, in_memory_db=False) -> None:
|
def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG, in_memory_db=False) -> None:
|
||||||
global INITIAL_STARTUP_PROGRESS
|
global INITIAL_STARTUP_PROGRESS
|
||||||
global INITIAL_STARTUP_PROGRESS_TASK
|
global INITIAL_STARTUP_PROGRESS_TASK
|
||||||
|
|
||||||
with Progress(transient=True, expand=True, console=CONSOLE) as INITIAL_STARTUP_PROGRESS:
|
with Progress(transient=True, expand=True, console=CONSOLE) as INITIAL_STARTUP_PROGRESS:
|
||||||
INITIAL_STARTUP_PROGRESS_TASK = INITIAL_STARTUP_PROGRESS.add_task("[green]Loading modules...", total=25)
|
INITIAL_STARTUP_PROGRESS_TASK = INITIAL_STARTUP_PROGRESS.add_task("[green]Loading modules...", total=25)
|
||||||
check_system_config(config)
|
|
||||||
|
|
||||||
output_dir = out_dir or Path(config['OUTPUT_DIR'])
|
output_dir = out_dir or Path(config['OUTPUT_DIR'])
|
||||||
|
|
||||||
|
|
|
@ -8,13 +8,6 @@ from ..system import run, chmod_file, atomic_write
|
||||||
from ..util import (
|
from ..util import (
|
||||||
enforce_types,
|
enforce_types,
|
||||||
is_static_file,
|
is_static_file,
|
||||||
chrome_args,
|
|
||||||
chrome_cleanup,
|
|
||||||
)
|
|
||||||
from ..config import (
|
|
||||||
TIMEOUT,
|
|
||||||
SAVE_DOM,
|
|
||||||
CHROME_VERSION,
|
|
||||||
)
|
)
|
||||||
from ..logging_util import TimedProgress
|
from ..logging_util import TimedProgress
|
||||||
|
|
||||||
|
@ -25,6 +18,8 @@ def get_output_path():
|
||||||
|
|
||||||
@enforce_types
|
@enforce_types
|
||||||
def should_save_dom(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
|
def should_save_dom(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
|
||||||
|
from plugins_extractor.chrome.apps import CHROME_CONFIG
|
||||||
|
|
||||||
if is_static_file(link.url):
|
if is_static_file(link.url):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
@ -33,42 +28,48 @@ def should_save_dom(link: Link, out_dir: Optional[Path]=None, overwrite: Optiona
|
||||||
if (out_dir / get_output_path()).stat().st_size > 1:
|
if (out_dir / get_output_path()).stat().st_size > 1:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
return SAVE_DOM
|
return CHROME_CONFIG.SAVE_DOM
|
||||||
|
|
||||||
@enforce_types
|
@enforce_types
|
||||||
def save_dom(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
|
def save_dom(link: Link, out_dir: Optional[Path]=None, timeout: int=60) -> ArchiveResult:
|
||||||
"""print HTML of site to file using chrome --dump-html"""
|
"""print HTML of site to file using chrome --dump-html"""
|
||||||
|
|
||||||
|
from plugins_extractor.chrome.apps import CHROME_CONFIG, CHROME_BINARY
|
||||||
|
|
||||||
|
CHROME_BIN = CHROME_BINARY.load()
|
||||||
|
assert CHROME_BIN.abspath and CHROME_BIN.version
|
||||||
|
|
||||||
out_dir = out_dir or Path(link.link_dir)
|
out_dir = out_dir or Path(link.link_dir)
|
||||||
output: ArchiveOutput = get_output_path()
|
output: ArchiveOutput = get_output_path()
|
||||||
output_path = out_dir / output
|
output_path = out_dir / output
|
||||||
cmd = [
|
cmd = [
|
||||||
*chrome_args(),
|
str(CHROME_BIN.abspath),
|
||||||
|
*CHROME_CONFIG.chrome_args(),
|
||||||
'--dump-dom',
|
'--dump-dom',
|
||||||
link.url
|
link.url
|
||||||
]
|
]
|
||||||
status = 'succeeded'
|
status = 'succeeded'
|
||||||
timer = TimedProgress(timeout, prefix=' ')
|
timer = TimedProgress(timeout, prefix=' ')
|
||||||
try:
|
try:
|
||||||
result = run(cmd, cwd=str(out_dir), timeout=timeout)
|
result = run(cmd, cwd=str(out_dir), timeout=timeout, text=True)
|
||||||
atomic_write(output_path, result.stdout)
|
atomic_write(output_path, result.stdout)
|
||||||
|
|
||||||
if result.returncode:
|
if result.returncode:
|
||||||
hints = result.stderr.decode()
|
hints = result.stderr
|
||||||
raise ArchiveError('Failed to save DOM', hints)
|
raise ArchiveError('Failed to save DOM', hints)
|
||||||
|
|
||||||
chmod_file(output, cwd=str(out_dir))
|
chmod_file(output, cwd=str(out_dir))
|
||||||
except Exception as err:
|
except Exception as err:
|
||||||
status = 'failed'
|
status = 'failed'
|
||||||
output = err
|
output = err
|
||||||
chrome_cleanup()
|
CHROME_BINARY.chrome_cleanup_lockfile()
|
||||||
finally:
|
finally:
|
||||||
timer.end()
|
timer.end()
|
||||||
|
|
||||||
return ArchiveResult(
|
return ArchiveResult(
|
||||||
cmd=cmd,
|
cmd=cmd,
|
||||||
pwd=str(out_dir),
|
pwd=str(out_dir),
|
||||||
cmd_version=CHROME_VERSION,
|
cmd_version=str(CHROME_BIN.version),
|
||||||
output=output,
|
output=output,
|
||||||
status=status,
|
status=status,
|
||||||
**timer.stats,
|
**timer.stats,
|
||||||
|
|
|
@ -5,20 +5,7 @@ from typing import Optional
|
||||||
|
|
||||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
||||||
from ..system import run, chmod_file
|
from ..system import run, chmod_file
|
||||||
from ..util import (
|
from ..util import enforce_types, is_static_file, dedupe
|
||||||
enforce_types,
|
|
||||||
is_static_file,
|
|
||||||
dedupe,
|
|
||||||
)
|
|
||||||
from ..config import (
|
|
||||||
MEDIA_TIMEOUT,
|
|
||||||
SAVE_MEDIA,
|
|
||||||
YOUTUBEDL_ARGS,
|
|
||||||
YOUTUBEDL_EXTRA_ARGS,
|
|
||||||
YOUTUBEDL_BINARY,
|
|
||||||
YOUTUBEDL_VERSION,
|
|
||||||
CHECK_SSL_VALIDITY
|
|
||||||
)
|
|
||||||
from ..logging_util import TimedProgress
|
from ..logging_util import TimedProgress
|
||||||
|
|
||||||
|
|
||||||
|
@ -38,6 +25,8 @@ def get_embed_path(archiveresult=None):
|
||||||
|
|
||||||
@enforce_types
|
@enforce_types
|
||||||
def should_save_media(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
|
def should_save_media(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
|
||||||
|
from plugins_extractor.ytdlp.apps import YTDLP_CONFIG
|
||||||
|
|
||||||
if is_static_file(link.url):
|
if is_static_file(link.url):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
@ -45,45 +34,52 @@ def should_save_media(link: Link, out_dir: Optional[Path]=None, overwrite: Optio
|
||||||
if not overwrite and (out_dir / get_output_path()).exists():
|
if not overwrite and (out_dir / get_output_path()).exists():
|
||||||
return False
|
return False
|
||||||
|
|
||||||
return SAVE_MEDIA
|
return YTDLP_CONFIG.USE_YTDLP
|
||||||
|
|
||||||
@enforce_types
|
@enforce_types
|
||||||
def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIMEOUT) -> ArchiveResult:
|
def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=0) -> ArchiveResult:
|
||||||
"""Download playlists or individual video, audio, and subtitles using youtube-dl or yt-dlp"""
|
"""Download playlists or individual video, audio, and subtitles using youtube-dl or yt-dlp"""
|
||||||
|
|
||||||
|
|
||||||
|
# from plugins_extractor.chrome.apps import CHROME_CONFIG
|
||||||
|
from plugins_extractor.ytdlp.apps import YTDLP_BINARY, YTDLP_CONFIG
|
||||||
|
|
||||||
|
YTDLP_BIN = YTDLP_BINARY.load()
|
||||||
|
assert YTDLP_BIN.abspath and YTDLP_BIN.version
|
||||||
|
|
||||||
|
timeout = timeout or YTDLP_CONFIG.YTDLP_TIMEOUT
|
||||||
out_dir = out_dir or Path(link.link_dir)
|
out_dir = out_dir or Path(link.link_dir)
|
||||||
output: ArchiveOutput = get_output_path()
|
output: ArchiveOutput = get_output_path()
|
||||||
output_path = out_dir / output
|
output_path = out_dir / output
|
||||||
output_path.mkdir(exist_ok=True)
|
output_path.mkdir(exist_ok=True)
|
||||||
# later options take precedence
|
# later options take precedence
|
||||||
options = [
|
options = [
|
||||||
*YOUTUBEDL_ARGS,
|
*YTDLP_CONFIG.YTDLP_EXTRA_ARGS,
|
||||||
*YOUTUBEDL_EXTRA_ARGS,
|
*([] if YTDLP_CONFIG.YTDLP_CHECK_SSL_VALIDITY else ['--no-check-certificate']),
|
||||||
*([] if CHECK_SSL_VALIDITY else ['--no-check-certificate']),
|
|
||||||
# TODO: add --cookies-from-browser={CHROME_USER_DATA_DIR}
|
# TODO: add --cookies-from-browser={CHROME_USER_DATA_DIR}
|
||||||
]
|
]
|
||||||
cmd = [
|
cmd = [
|
||||||
YOUTUBEDL_BINARY,
|
str(YTDLP_BIN.abspath),
|
||||||
*dedupe(options),
|
*dedupe(options),
|
||||||
link.url,
|
link.url,
|
||||||
]
|
]
|
||||||
status = 'succeeded'
|
status = 'succeeded'
|
||||||
timer = TimedProgress(timeout, prefix=' ')
|
timer = TimedProgress(timeout, prefix=' ')
|
||||||
try:
|
try:
|
||||||
result = run(cmd, cwd=str(output_path), timeout=timeout + 1)
|
result = run(cmd, cwd=str(output_path), timeout=timeout + 1, text=True)
|
||||||
chmod_file(output, cwd=str(out_dir))
|
chmod_file(output, cwd=str(out_dir))
|
||||||
if result.returncode:
|
if result.returncode:
|
||||||
if (b'ERROR: Unsupported URL' in result.stderr
|
if ('ERROR: Unsupported URL' in result.stderr
|
||||||
or b'HTTP Error 404' in result.stderr
|
or 'HTTP Error 404' in result.stderr
|
||||||
or b'HTTP Error 403' in result.stderr
|
or 'HTTP Error 403' in result.stderr
|
||||||
or b'URL could be a direct video link' in result.stderr
|
or 'URL could be a direct video link' in result.stderr
|
||||||
or b'Unable to extract container ID' in result.stderr):
|
or 'Unable to extract container ID' in result.stderr):
|
||||||
# These happen too frequently on non-media pages to warrant printing to console
|
# These happen too frequently on non-media pages to warrant printing to console
|
||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
hints = (
|
hints = (
|
||||||
'Got youtube-dl (or yt-dlp) response code: {}.'.format(result.returncode),
|
'Got yt-dlp response code: {}.'.format(result.returncode),
|
||||||
*result.stderr.decode().split('\n'),
|
*result.stderr.split('\n'),
|
||||||
)
|
)
|
||||||
raise ArchiveError('Failed to save media', hints)
|
raise ArchiveError('Failed to save media', hints)
|
||||||
except Exception as err:
|
except Exception as err:
|
||||||
|
@ -117,7 +113,7 @@ def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIME
|
||||||
return ArchiveResult(
|
return ArchiveResult(
|
||||||
cmd=cmd,
|
cmd=cmd,
|
||||||
pwd=str(out_dir),
|
pwd=str(out_dir),
|
||||||
cmd_version=YOUTUBEDL_VERSION,
|
cmd_version=str(YTDLP_BIN.version),
|
||||||
output=output,
|
output=output,
|
||||||
status=status,
|
status=status,
|
||||||
index_texts=index_texts,
|
index_texts=index_texts,
|
||||||
|
|
|
@ -8,13 +8,6 @@ from ..system import run, chmod_file
|
||||||
from ..util import (
|
from ..util import (
|
||||||
enforce_types,
|
enforce_types,
|
||||||
is_static_file,
|
is_static_file,
|
||||||
chrome_args,
|
|
||||||
chrome_cleanup,
|
|
||||||
)
|
|
||||||
from ..config import (
|
|
||||||
TIMEOUT,
|
|
||||||
SAVE_PDF,
|
|
||||||
CHROME_VERSION,
|
|
||||||
)
|
)
|
||||||
from ..logging_util import TimedProgress
|
from ..logging_util import TimedProgress
|
||||||
|
|
||||||
|
@ -25,6 +18,8 @@ def get_output_path():
|
||||||
|
|
||||||
@enforce_types
|
@enforce_types
|
||||||
def should_save_pdf(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
|
def should_save_pdf(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
|
||||||
|
from plugins_extractor.chrome.apps import CHROME_CONFIG
|
||||||
|
|
||||||
if is_static_file(link.url):
|
if is_static_file(link.url):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
@ -32,34 +27,40 @@ def should_save_pdf(link: Link, out_dir: Optional[Path]=None, overwrite: Optiona
|
||||||
if not overwrite and (out_dir / get_output_path()).exists():
|
if not overwrite and (out_dir / get_output_path()).exists():
|
||||||
return False
|
return False
|
||||||
|
|
||||||
return SAVE_PDF
|
return CHROME_CONFIG.SAVE_PDF
|
||||||
|
|
||||||
|
|
||||||
@enforce_types
|
@enforce_types
|
||||||
def save_pdf(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
|
def save_pdf(link: Link, out_dir: Optional[Path]=None, timeout: int=60) -> ArchiveResult:
|
||||||
"""print PDF of site to file using chrome --headless"""
|
"""print PDF of site to file using chrome --headless"""
|
||||||
|
|
||||||
|
from plugins_extractor.chrome.apps import CHROME_CONFIG, CHROME_BINARY
|
||||||
|
|
||||||
|
CHROME_BIN = CHROME_BINARY.load()
|
||||||
|
assert CHROME_BIN.abspath and CHROME_BIN.version
|
||||||
|
|
||||||
out_dir = out_dir or Path(link.link_dir)
|
out_dir = out_dir or Path(link.link_dir)
|
||||||
output: ArchiveOutput = get_output_path()
|
output: ArchiveOutput = get_output_path()
|
||||||
cmd = [
|
cmd = [
|
||||||
*chrome_args(),
|
str(CHROME_BIN.abspath),
|
||||||
|
*CHROME_CONFIG.chrome_args(),
|
||||||
'--print-to-pdf',
|
'--print-to-pdf',
|
||||||
link.url,
|
link.url,
|
||||||
]
|
]
|
||||||
status = 'succeeded'
|
status = 'succeeded'
|
||||||
timer = TimedProgress(timeout, prefix=' ')
|
timer = TimedProgress(timeout, prefix=' ')
|
||||||
try:
|
try:
|
||||||
result = run(cmd, cwd=str(out_dir), timeout=timeout)
|
result = run(cmd, cwd=str(out_dir), timeout=timeout, text=True)
|
||||||
|
|
||||||
if result.returncode:
|
if result.returncode:
|
||||||
hints = (result.stderr or result.stdout).decode()
|
hints = (result.stderr or result.stdout)
|
||||||
raise ArchiveError('Failed to save PDF', hints)
|
raise ArchiveError('Failed to save PDF', hints)
|
||||||
|
|
||||||
chmod_file(get_output_path(), cwd=str(out_dir))
|
chmod_file(get_output_path(), cwd=str(out_dir))
|
||||||
except Exception as err:
|
except Exception as err:
|
||||||
status = 'failed'
|
status = 'failed'
|
||||||
output = err
|
output = err
|
||||||
chrome_cleanup()
|
CHROME_BINARY.chrome_cleanup_lockfile()
|
||||||
finally:
|
finally:
|
||||||
timer.end()
|
timer.end()
|
||||||
|
|
||||||
|
@ -67,7 +68,7 @@ def save_pdf(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
|
||||||
return ArchiveResult(
|
return ArchiveResult(
|
||||||
cmd=cmd,
|
cmd=cmd,
|
||||||
pwd=str(out_dir),
|
pwd=str(out_dir),
|
||||||
cmd_version=CHROME_VERSION,
|
cmd_version=str(CHROME_BINARY.version),
|
||||||
output=output,
|
output=output,
|
||||||
status=status,
|
status=status,
|
||||||
**timer.stats,
|
**timer.stats,
|
||||||
|
|
|
@ -5,17 +5,7 @@ from typing import Optional
|
||||||
|
|
||||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
||||||
from ..system import run, chmod_file
|
from ..system import run, chmod_file
|
||||||
from ..util import (
|
from ..util import enforce_types, is_static_file
|
||||||
enforce_types,
|
|
||||||
is_static_file,
|
|
||||||
chrome_args,
|
|
||||||
chrome_cleanup,
|
|
||||||
)
|
|
||||||
from ..config import (
|
|
||||||
TIMEOUT,
|
|
||||||
SAVE_SCREENSHOT,
|
|
||||||
CHROME_VERSION,
|
|
||||||
)
|
|
||||||
from ..logging_util import TimedProgress
|
from ..logging_util import TimedProgress
|
||||||
|
|
||||||
|
|
||||||
|
@ -25,6 +15,8 @@ def get_output_path():
|
||||||
|
|
||||||
@enforce_types
|
@enforce_types
|
||||||
def should_save_screenshot(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
|
def should_save_screenshot(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
|
||||||
|
from plugins_extractor.chrome.apps import CHROME_CONFIG
|
||||||
|
|
||||||
if is_static_file(link.url):
|
if is_static_file(link.url):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
@ -32,40 +24,45 @@ def should_save_screenshot(link: Link, out_dir: Optional[Path]=None, overwrite:
|
||||||
if not overwrite and (out_dir / get_output_path()).exists():
|
if not overwrite and (out_dir / get_output_path()).exists():
|
||||||
return False
|
return False
|
||||||
|
|
||||||
return SAVE_SCREENSHOT
|
return CHROME_CONFIG.SAVE_SCREENSHOT
|
||||||
|
|
||||||
@enforce_types
|
@enforce_types
|
||||||
def save_screenshot(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
|
def save_screenshot(link: Link, out_dir: Optional[Path]=None, timeout: int=60) -> ArchiveResult:
|
||||||
"""take screenshot of site using chrome --headless"""
|
"""take screenshot of site using chrome --headless"""
|
||||||
|
|
||||||
|
from plugins_extractor.chrome.apps import CHROME_CONFIG, CHROME_BINARY
|
||||||
|
CHROME_BIN = CHROME_BINARY.load()
|
||||||
|
assert CHROME_BIN.abspath and CHROME_BIN.version
|
||||||
|
|
||||||
out_dir = out_dir or Path(link.link_dir)
|
out_dir = out_dir or Path(link.link_dir)
|
||||||
output: ArchiveOutput = get_output_path()
|
output: ArchiveOutput = get_output_path()
|
||||||
cmd = [
|
cmd = [
|
||||||
*chrome_args(),
|
str(CHROME_BIN.abspath),
|
||||||
|
*CHROME_CONFIG.chrome_args(),
|
||||||
'--screenshot',
|
'--screenshot',
|
||||||
link.url,
|
link.url,
|
||||||
]
|
]
|
||||||
status = 'succeeded'
|
status = 'succeeded'
|
||||||
timer = TimedProgress(timeout, prefix=' ')
|
timer = TimedProgress(timeout, prefix=' ')
|
||||||
try:
|
try:
|
||||||
result = run(cmd, cwd=str(out_dir), timeout=timeout)
|
result = run(cmd, cwd=str(out_dir), timeout=timeout, text=True)
|
||||||
|
|
||||||
if result.returncode:
|
if result.returncode:
|
||||||
hints = (result.stderr or result.stdout).decode()
|
hints = (result.stderr or result.stdout)
|
||||||
raise ArchiveError('Failed to save screenshot', hints)
|
raise ArchiveError('Failed to save screenshot', hints)
|
||||||
|
|
||||||
chmod_file(output, cwd=str(out_dir))
|
chmod_file(output, cwd=str(out_dir))
|
||||||
except Exception as err:
|
except Exception as err:
|
||||||
status = 'failed'
|
status = 'failed'
|
||||||
output = err
|
output = err
|
||||||
chrome_cleanup()
|
CHROME_BINARY.chrome_cleanup_lockfile()
|
||||||
finally:
|
finally:
|
||||||
timer.end()
|
timer.end()
|
||||||
|
|
||||||
return ArchiveResult(
|
return ArchiveResult(
|
||||||
cmd=cmd,
|
cmd=cmd,
|
||||||
pwd=str(out_dir),
|
pwd=str(out_dir),
|
||||||
cmd_version=CHROME_VERSION,
|
cmd_version=str(CHROME_BIN.version),
|
||||||
output=output,
|
output=output,
|
||||||
status=status,
|
status=status,
|
||||||
**timer.stats,
|
**timer.stats,
|
||||||
|
|
|
@ -7,22 +7,7 @@ import json
|
||||||
|
|
||||||
from ..index.schema import Link, ArchiveResult, ArchiveError
|
from ..index.schema import Link, ArchiveResult, ArchiveError
|
||||||
from ..system import run, chmod_file
|
from ..system import run, chmod_file
|
||||||
from ..util import (
|
from ..util import enforce_types, is_static_file, dedupe
|
||||||
enforce_types,
|
|
||||||
is_static_file,
|
|
||||||
chrome_args,
|
|
||||||
dedupe,
|
|
||||||
)
|
|
||||||
from ..config import (
|
|
||||||
TIMEOUT,
|
|
||||||
SAVE_SINGLEFILE,
|
|
||||||
DEPENDENCIES,
|
|
||||||
SINGLEFILE_VERSION,
|
|
||||||
SINGLEFILE_ARGS,
|
|
||||||
SINGLEFILE_EXTRA_ARGS,
|
|
||||||
CHROME_BINARY,
|
|
||||||
COOKIES_FILE,
|
|
||||||
)
|
|
||||||
from ..logging_util import TimedProgress
|
from ..logging_util import TimedProgress
|
||||||
|
|
||||||
|
|
||||||
|
@ -32,6 +17,8 @@ def get_output_path():
|
||||||
|
|
||||||
@enforce_types
|
@enforce_types
|
||||||
def should_save_singlefile(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
|
def should_save_singlefile(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
|
||||||
|
from plugins_extractor.singlefile.apps import SINGLEFILE_CONFIG
|
||||||
|
|
||||||
if is_static_file(link.url):
|
if is_static_file(link.url):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
@ -39,30 +26,35 @@ def should_save_singlefile(link: Link, out_dir: Optional[Path]=None, overwrite:
|
||||||
if not overwrite and (out_dir / get_output_path()).exists():
|
if not overwrite and (out_dir / get_output_path()).exists():
|
||||||
return False
|
return False
|
||||||
|
|
||||||
return SAVE_SINGLEFILE
|
return SINGLEFILE_CONFIG.SAVE_SINGLEFILE
|
||||||
|
|
||||||
|
|
||||||
@enforce_types
|
@enforce_types
|
||||||
def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
|
def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=60) -> ArchiveResult:
|
||||||
"""download full site using single-file"""
|
"""download full site using single-file"""
|
||||||
|
|
||||||
|
from plugins_extractor.chrome.apps import CHROME_CONFIG, CHROME_BINARY
|
||||||
|
from plugins_extractor.singlefile.apps import SINGLEFILE_CONFIG, SINGLEFILE_BINARY
|
||||||
|
|
||||||
|
CHROME_BIN = CHROME_BINARY.load()
|
||||||
|
assert CHROME_BIN.abspath and CHROME_BIN.version
|
||||||
|
SINGLEFILE_BIN = SINGLEFILE_BINARY.load()
|
||||||
|
assert SINGLEFILE_BIN.abspath and SINGLEFILE_BIN.version
|
||||||
|
|
||||||
out_dir = out_dir or Path(link.link_dir)
|
out_dir = out_dir or Path(link.link_dir)
|
||||||
output = get_output_path()
|
output = get_output_path()
|
||||||
|
|
||||||
browser_args = chrome_args(CHROME_TIMEOUT=0)
|
browser_args = CHROME_CONFIG.chrome_args(CHROME_TIMEOUT=0)
|
||||||
|
|
||||||
# SingleFile CLI Docs: https://github.com/gildas-lormeau/SingleFile/tree/master/cli
|
# SingleFile CLI Docs: https://github.com/gildas-lormeau/SingleFile/tree/master/cli
|
||||||
browser_args = '--browser-args={}'.format(json.dumps(browser_args[1:]))
|
|
||||||
# later options take precedence
|
|
||||||
options = [
|
options = [
|
||||||
'--browser-executable-path={}'.format(CHROME_BINARY),
|
'--browser-executable-path={}'.format(CHROME_BIN.abspath),
|
||||||
*(["--browser-cookies-file={}".format(COOKIES_FILE)] if COOKIES_FILE else []),
|
*(["--browser-cookies-file={}".format(SINGLEFILE_CONFIG.SINGLEFILE_COOKIES_FILE)] if SINGLEFILE_CONFIG.SINGLEFILE_COOKIES_FILE else []),
|
||||||
browser_args,
|
'--browser-args={}'.format(json.dumps(browser_args)),
|
||||||
*SINGLEFILE_ARGS,
|
*SINGLEFILE_CONFIG.SINGLEFILE_EXTRA_ARGS,
|
||||||
*SINGLEFILE_EXTRA_ARGS,
|
|
||||||
]
|
]
|
||||||
cmd = [
|
cmd = [
|
||||||
DEPENDENCIES['SINGLEFILE_BINARY']['path'],
|
str(SINGLEFILE_BIN.abspath),
|
||||||
*dedupe(options),
|
*dedupe(options),
|
||||||
link.url,
|
link.url,
|
||||||
output,
|
output,
|
||||||
|
@ -72,13 +64,13 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
|
||||||
timer = TimedProgress(timeout, prefix=' ')
|
timer = TimedProgress(timeout, prefix=' ')
|
||||||
result = None
|
result = None
|
||||||
try:
|
try:
|
||||||
result = run(cmd, cwd=str(out_dir), timeout=timeout)
|
result = run(cmd, cwd=str(out_dir), timeout=timeout, text=True, capture_output=True)
|
||||||
|
|
||||||
# parse out number of files downloaded from last line of stderr:
|
# parse out number of files downloaded from last line of stderr:
|
||||||
# "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
|
# "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
|
||||||
output_tail = [
|
output_tail = [
|
||||||
line.strip()
|
line.strip()
|
||||||
for line in (result.stdout + result.stderr).decode().rsplit('\n', 5)[-5:]
|
for line in (result.stdout + result.stderr).rsplit('\n', 5)[-5:]
|
||||||
if line.strip()
|
if line.strip()
|
||||||
]
|
]
|
||||||
hints = (
|
hints = (
|
||||||
|
@ -93,9 +85,9 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
|
||||||
except (Exception, OSError) as err:
|
except (Exception, OSError) as err:
|
||||||
status = 'failed'
|
status = 'failed'
|
||||||
# TODO: Make this prettier. This is necessary to run the command (escape JSON internal quotes).
|
# TODO: Make this prettier. This is necessary to run the command (escape JSON internal quotes).
|
||||||
cmd[2] = browser_args.replace('"', "\\\"")
|
cmd[2] = cmd[2].replace('"', "\\\"")
|
||||||
if result:
|
if result:
|
||||||
err.hints = (result.stdout + result.stderr).decode().split('\n')
|
err.hints = (result.stdout + result.stderr).split('\n')
|
||||||
output = err
|
output = err
|
||||||
finally:
|
finally:
|
||||||
timer.end()
|
timer.end()
|
||||||
|
@ -103,7 +95,7 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
|
||||||
return ArchiveResult(
|
return ArchiveResult(
|
||||||
cmd=cmd,
|
cmd=cmd,
|
||||||
pwd=str(out_dir),
|
pwd=str(out_dir),
|
||||||
cmd_version=SINGLEFILE_VERSION,
|
cmd_version=str(SINGLEFILE_BIN.version),
|
||||||
output=output,
|
output=output,
|
||||||
status=status,
|
status=status,
|
||||||
**timer.stats,
|
**timer.stats,
|
||||||
|
|
|
@ -1,10 +1,10 @@
|
||||||
__package__ = 'archivebox'
|
__package__ = 'archivebox'
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import time
|
|
||||||
import sys
|
import sys
|
||||||
import shutil
|
import shutil
|
||||||
import platform
|
import platform
|
||||||
|
import archivebox
|
||||||
|
|
||||||
from typing import Dict, List, Optional, Iterable, IO, Union
|
from typing import Dict, List, Optional, Iterable, IO, Union
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
@ -69,6 +69,7 @@ from .extractors import archive_links, archive_link, ignore_methods
|
||||||
from .misc.logging import stderr, hint
|
from .misc.logging import stderr, hint
|
||||||
from .misc.checks import check_data_folder, check_dependencies
|
from .misc.checks import check_data_folder, check_dependencies
|
||||||
from .config import (
|
from .config import (
|
||||||
|
setup_django_minimal,
|
||||||
ConfigDict,
|
ConfigDict,
|
||||||
ANSI,
|
ANSI,
|
||||||
IS_TTY,
|
IS_TTY,
|
||||||
|
@ -81,8 +82,6 @@ from .config import (
|
||||||
TIMEZONE,
|
TIMEZONE,
|
||||||
ENFORCE_ATOMIC_WRITES,
|
ENFORCE_ATOMIC_WRITES,
|
||||||
OUTPUT_PERMISSIONS,
|
OUTPUT_PERMISSIONS,
|
||||||
PYTHON_BINARY,
|
|
||||||
ARCHIVEBOX_BINARY,
|
|
||||||
ONLY_NEW,
|
ONLY_NEW,
|
||||||
OUTPUT_DIR,
|
OUTPUT_DIR,
|
||||||
SOURCES_DIR,
|
SOURCES_DIR,
|
||||||
|
@ -95,31 +94,22 @@ from .config import (
|
||||||
HTML_INDEX_FILENAME,
|
HTML_INDEX_FILENAME,
|
||||||
SQL_INDEX_FILENAME,
|
SQL_INDEX_FILENAME,
|
||||||
ALLOWED_IN_OUTPUT_DIR,
|
ALLOWED_IN_OUTPUT_DIR,
|
||||||
SEARCH_BACKEND_ENGINE,
|
|
||||||
LDAP,
|
LDAP,
|
||||||
get_version,
|
|
||||||
write_config_file,
|
write_config_file,
|
||||||
VERSION,
|
VERSION,
|
||||||
VERSIONS_AVAILABLE,
|
|
||||||
CAN_UPGRADE,
|
|
||||||
COMMIT_HASH,
|
COMMIT_HASH,
|
||||||
BUILD_TIME,
|
BUILD_TIME,
|
||||||
CODE_LOCATIONS,
|
CODE_LOCATIONS,
|
||||||
DATA_LOCATIONS,
|
DATA_LOCATIONS,
|
||||||
DEPENDENCIES,
|
DEPENDENCIES,
|
||||||
CHROME_BINARY,
|
|
||||||
CHROME_VERSION,
|
|
||||||
YOUTUBEDL_BINARY,
|
YOUTUBEDL_BINARY,
|
||||||
YOUTUBEDL_VERSION,
|
YOUTUBEDL_VERSION,
|
||||||
SINGLEFILE_VERSION,
|
SINGLEFILE_VERSION,
|
||||||
READABILITY_VERSION,
|
READABILITY_VERSION,
|
||||||
MERCURY_VERSION,
|
MERCURY_VERSION,
|
||||||
NODE_VERSION,
|
|
||||||
load_all_config,
|
load_all_config,
|
||||||
CONFIG,
|
CONFIG,
|
||||||
USER_CONFIG,
|
USER_CONFIG,
|
||||||
ADMIN_USERNAME,
|
|
||||||
ADMIN_PASSWORD,
|
|
||||||
get_real_name,
|
get_real_name,
|
||||||
setup_django,
|
setup_django,
|
||||||
)
|
)
|
||||||
|
@ -216,6 +206,11 @@ def version(quiet: bool=False,
|
||||||
out_dir: Path=OUTPUT_DIR) -> None:
|
out_dir: Path=OUTPUT_DIR) -> None:
|
||||||
"""Print the ArchiveBox version and dependency information"""
|
"""Print the ArchiveBox version and dependency information"""
|
||||||
|
|
||||||
|
setup_django_minimal()
|
||||||
|
from plugins_sys.config.apps import SEARCH_BACKEND_CONFIG, STORAGE_CONFIG, SHELL_CONFIG
|
||||||
|
from plugins_auth.ldap.apps import LDAP_CONFIG
|
||||||
|
from django.conf import settings
|
||||||
|
|
||||||
print(VERSION)
|
print(VERSION)
|
||||||
|
|
||||||
if not quiet:
|
if not quiet:
|
||||||
|
@ -227,7 +222,7 @@ def version(quiet: bool=False,
|
||||||
|
|
||||||
p = platform.uname()
|
p = platform.uname()
|
||||||
print(
|
print(
|
||||||
'ArchiveBox v{}'.format(get_version(CONFIG)),
|
'ArchiveBox v{}'.format(archivebox.__version__),
|
||||||
f'COMMIT_HASH={COMMIT_HASH[:7] if COMMIT_HASH else "unknown"}',
|
f'COMMIT_HASH={COMMIT_HASH[:7] if COMMIT_HASH else "unknown"}',
|
||||||
f'BUILD_TIME={BUILD_TIME}',
|
f'BUILD_TIME={BUILD_TIME}',
|
||||||
)
|
)
|
||||||
|
@ -241,29 +236,35 @@ def version(quiet: bool=False,
|
||||||
)
|
)
|
||||||
OUTPUT_IS_REMOTE_FS = DATA_LOCATIONS['OUTPUT_DIR']['is_mount'] or DATA_LOCATIONS['ARCHIVE_DIR']['is_mount']
|
OUTPUT_IS_REMOTE_FS = DATA_LOCATIONS['OUTPUT_DIR']['is_mount'] or DATA_LOCATIONS['ARCHIVE_DIR']['is_mount']
|
||||||
print(
|
print(
|
||||||
f'FS_ATOMIC={ENFORCE_ATOMIC_WRITES}',
|
f'FS_ATOMIC={STORAGE_CONFIG.ENFORCE_ATOMIC_WRITES}',
|
||||||
f'FS_REMOTE={OUTPUT_IS_REMOTE_FS}',
|
f'FS_REMOTE={OUTPUT_IS_REMOTE_FS}',
|
||||||
f'FS_USER={PUID}:{PGID}',
|
f'FS_USER={SHELL_CONFIG.PUID}:{SHELL_CONFIG.PGID}',
|
||||||
f'FS_PERMS={OUTPUT_PERMISSIONS}',
|
f'FS_PERMS={STORAGE_CONFIG.OUTPUT_PERMISSIONS}',
|
||||||
)
|
)
|
||||||
print(
|
print(
|
||||||
f'DEBUG={DEBUG}',
|
f'DEBUG={SHELL_CONFIG.DEBUG}',
|
||||||
f'IS_TTY={IS_TTY}',
|
f'IS_TTY={SHELL_CONFIG.IS_TTY}',
|
||||||
f'TZ={TIMEZONE}',
|
f'TZ={TIMEZONE}',
|
||||||
f'SEARCH_BACKEND={SEARCH_BACKEND_ENGINE}',
|
f'SEARCH_BACKEND={SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE}',
|
||||||
f'LDAP={LDAP}',
|
f'LDAP={LDAP_CONFIG.LDAP_ENABLED}',
|
||||||
#f'DB=django.db.backends.sqlite3 (({CONFIG["SQLITE_JOURNAL_MODE"]})', # add this if we have more useful info to show eventually
|
#f'DB=django.db.backends.sqlite3 (({CONFIG["SQLITE_JOURNAL_MODE"]})', # add this if we have more useful info to show eventually
|
||||||
)
|
)
|
||||||
print()
|
print()
|
||||||
|
|
||||||
print('{white}[i] Dependency versions:{reset}'.format(**ANSI))
|
print('{white}[i] Old dependency versions:{reset}'.format(**ANSI))
|
||||||
for name, dependency in DEPENDENCIES.items():
|
for name, dependency in DEPENDENCIES.items():
|
||||||
print(printable_dependency_version(name, dependency))
|
print(printable_dependency_version(name, dependency))
|
||||||
|
|
||||||
# add a newline between core dependencies and extractor dependencies for easier reading
|
# add a newline between core dependencies and extractor dependencies for easier reading
|
||||||
if name == 'ARCHIVEBOX_BINARY':
|
if name == 'ARCHIVEBOX_BINARY':
|
||||||
print()
|
print()
|
||||||
|
|
||||||
|
print()
|
||||||
|
print('{white}[i] New dependency versions:{reset}'.format(**ANSI))
|
||||||
|
for name, binary in settings.BINARIES.items():
|
||||||
|
loaded_bin = binary.load()
|
||||||
|
print('', '√' if loaded_bin.is_valid else 'X', '', loaded_bin.name.ljust(21), str(loaded_bin.version).ljust(15), loaded_bin.abspath)
|
||||||
|
|
||||||
print()
|
print()
|
||||||
print('{white}[i] Source-code locations:{reset}'.format(**ANSI))
|
print('{white}[i] Source-code locations:{reset}'.format(**ANSI))
|
||||||
for name, path in CODE_LOCATIONS.items():
|
for name, path in CODE_LOCATIONS.items():
|
||||||
|
@ -431,10 +432,11 @@ def init(force: bool=False, quick: bool=False, setup: bool=False, out_dir: Path=
|
||||||
print('\n{green}----------------------------------------------------------------------{reset}'.format(**ANSI))
|
print('\n{green}----------------------------------------------------------------------{reset}'.format(**ANSI))
|
||||||
|
|
||||||
from django.contrib.auth.models import User
|
from django.contrib.auth.models import User
|
||||||
|
from plugins_sys.config.apps import SERVER_CONFIG
|
||||||
|
|
||||||
if (ADMIN_USERNAME and ADMIN_PASSWORD) and not User.objects.filter(username=ADMIN_USERNAME).exists():
|
if (SERVER_CONFIG.ADMIN_USERNAME and SERVER_CONFIG.ADMIN_PASSWORD) and not User.objects.filter(username=SERVER_CONFIG.ADMIN_USERNAME).exists():
|
||||||
print('{green}[+] Found ADMIN_USERNAME and ADMIN_PASSWORD configuration options, creating new admin user.{reset}'.format(**ANSI))
|
print('{green}[+] Found ADMIN_USERNAME and ADMIN_PASSWORD configuration options, creating new admin user.{reset}'.format(**ANSI))
|
||||||
User.objects.create_superuser(username=ADMIN_USERNAME, password=ADMIN_PASSWORD)
|
User.objects.create_superuser(username=SERVER_CONFIG.ADMIN_USERNAME, password=SERVER_CONFIG.ADMIN_PASSWORD)
|
||||||
|
|
||||||
if existing_index:
|
if existing_index:
|
||||||
print('{green}[√] Done. Verified and updated the existing ArchiveBox collection.{reset}'.format(**ANSI))
|
print('{green}[√] Done. Verified and updated the existing ArchiveBox collection.{reset}'.format(**ANSI))
|
||||||
|
@ -693,8 +695,8 @@ def add(urls: Union[str, List[str]],
|
||||||
|
|
||||||
# tail_worker_logs(worker['stdout_logfile'])
|
# tail_worker_logs(worker['stdout_logfile'])
|
||||||
|
|
||||||
if CAN_UPGRADE:
|
# if CAN_UPGRADE:
|
||||||
hint(f"There's a new version of ArchiveBox available! Your current version is {VERSION}. You can upgrade to {VERSIONS_AVAILABLE['recommended_version']['tag_name']} ({VERSIONS_AVAILABLE['recommended_version']['html_url']}). For more on how to upgrade: https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives\n")
|
# hint(f"There's a new version of ArchiveBox available! Your current version is {VERSION}. You can upgrade to {VERSIONS_AVAILABLE['recommended_version']['tag_name']} ({VERSIONS_AVAILABLE['recommended_version']['html_url']}). For more on how to upgrade: https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives\n")
|
||||||
|
|
||||||
return new_links
|
return new_links
|
||||||
|
|
||||||
|
@ -967,6 +969,8 @@ def list_folders(links: List[Link],
|
||||||
def setup(out_dir: Path=OUTPUT_DIR) -> None:
|
def setup(out_dir: Path=OUTPUT_DIR) -> None:
|
||||||
"""Automatically install all ArchiveBox dependencies and extras"""
|
"""Automatically install all ArchiveBox dependencies and extras"""
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if not (out_dir / ARCHIVE_DIR_NAME).exists():
|
if not (out_dir / ARCHIVE_DIR_NAME).exists():
|
||||||
run_subcommand('init', stdin=None, pwd=out_dir)
|
run_subcommand('init', stdin=None, pwd=out_dir)
|
||||||
|
|
||||||
|
@ -980,24 +984,26 @@ def setup(out_dir: Path=OUTPUT_DIR) -> None:
|
||||||
|
|
||||||
stderr('\n[+] Installing enabled ArchiveBox dependencies automatically...', color='green')
|
stderr('\n[+] Installing enabled ArchiveBox dependencies automatically...', color='green')
|
||||||
|
|
||||||
|
from plugins_pkg.pip.apps import PYTHON_BINARY
|
||||||
|
|
||||||
stderr('\n Installing YOUTUBEDL_BINARY automatically using pip...')
|
stderr('\n Installing YOUTUBEDL_BINARY automatically using pip...')
|
||||||
if YOUTUBEDL_VERSION:
|
if YOUTUBEDL_VERSION:
|
||||||
print(f'{YOUTUBEDL_VERSION} is already installed', YOUTUBEDL_BINARY)
|
print(f'{YOUTUBEDL_VERSION} is already installed', YOUTUBEDL_BINARY)
|
||||||
else:
|
else:
|
||||||
try:
|
try:
|
||||||
run_shell([
|
run_shell([
|
||||||
PYTHON_BINARY, '-m', 'pip',
|
PYTHON_BINARY.load().abspath, '-m', 'pip',
|
||||||
'install',
|
'install',
|
||||||
'--upgrade',
|
'--upgrade',
|
||||||
'--no-cache-dir',
|
'--no-cache-dir',
|
||||||
'--no-warn-script-location',
|
'--no-warn-script-location',
|
||||||
'yt-dlp',
|
'yt-dlp',
|
||||||
], capture_output=False, cwd=out_dir)
|
], capture_output=False, cwd=out_dir, text=True)
|
||||||
pkg_path = run_shell([
|
pkg_path = run_shell([
|
||||||
PYTHON_BINARY, '-m', 'pip',
|
PYTHON_BINARY.load().abspath, '-m', 'pip',
|
||||||
'show',
|
'show',
|
||||||
'yt-dlp',
|
'yt-dlp',
|
||||||
], capture_output=True, text=True, cwd=out_dir).stdout.decode().split('Location: ')[-1].split('\n', 1)[0]
|
], capture_output=True, text=True, cwd=out_dir).stdout.split('Location: ')[-1].split('\n', 1)[0]
|
||||||
NEW_YOUTUBEDL_BINARY = Path(pkg_path) / 'yt-dlp' / '__main__.py'
|
NEW_YOUTUBEDL_BINARY = Path(pkg_path) / 'yt-dlp' / '__main__.py'
|
||||||
os.chmod(NEW_YOUTUBEDL_BINARY, 0o777)
|
os.chmod(NEW_YOUTUBEDL_BINARY, 0o777)
|
||||||
assert NEW_YOUTUBEDL_BINARY.exists(), f'yt-dlp must exist inside {pkg_path}'
|
assert NEW_YOUTUBEDL_BINARY.exists(), f'yt-dlp must exist inside {pkg_path}'
|
||||||
|
@ -1006,33 +1012,18 @@ def setup(out_dir: Path=OUTPUT_DIR) -> None:
|
||||||
stderr(f'[X] Failed to install python packages: {e}', color='red')
|
stderr(f'[X] Failed to install python packages: {e}', color='red')
|
||||||
raise SystemExit(1)
|
raise SystemExit(1)
|
||||||
|
|
||||||
if platform.machine() == 'armv7l':
|
|
||||||
stderr('\n Skip the automatic installation of CHROME_BINARY because playwright is not available on armv7.')
|
from plugins_extractor.chrome.apps import CHROME_BINARY
|
||||||
else:
|
|
||||||
stderr('\n Installing CHROME_BINARY automatically using playwright...')
|
CHROME_BINARY.load_or_install()
|
||||||
if CHROME_VERSION:
|
|
||||||
print(f'{CHROME_VERSION} is already installed', CHROME_BINARY)
|
from plugins_pkg.npm.apps import NPM_BINARY
|
||||||
else:
|
from plugins_extractor.singlefile.apps import SINGLEFILE_BINARY
|
||||||
try:
|
|
||||||
run_shell([
|
SINGLEFILE_BINARY.load_or_install()
|
||||||
PYTHON_BINARY, '-m', 'pip',
|
|
||||||
'install',
|
|
||||||
'--upgrade',
|
|
||||||
'--no-cache-dir',
|
|
||||||
'--no-warn-script-location',
|
|
||||||
'playwright',
|
|
||||||
], capture_output=False, cwd=out_dir)
|
|
||||||
run_shell([PYTHON_BINARY, '-m', 'playwright', 'install', 'chromium'], capture_output=False, cwd=out_dir)
|
|
||||||
proc = run_shell([PYTHON_BINARY, '-c', 'from playwright.sync_api import sync_playwright; print(sync_playwright().start().chromium.executable_path)'], capture_output=True, text=True, cwd=out_dir)
|
|
||||||
NEW_CHROME_BINARY = proc.stdout.decode().strip() if isinstance(proc.stdout, bytes) else proc.stdout.strip()
|
|
||||||
assert NEW_CHROME_BINARY and len(NEW_CHROME_BINARY), 'CHROME_BINARY must contain a path'
|
|
||||||
config(f'CHROME_BINARY={NEW_CHROME_BINARY}', set=True, out_dir=out_dir)
|
|
||||||
except BaseException as e: # lgtm [py/catch-base-exception]
|
|
||||||
stderr(f'[X] Failed to install chromium using playwright: {e.__class__.__name__} {e}', color='red')
|
|
||||||
raise SystemExit(1)
|
|
||||||
|
|
||||||
stderr('\n Installing SINGLEFILE_BINARY, READABILITY_BINARY, MERCURY_BINARY automatically using npm...')
|
stderr('\n Installing SINGLEFILE_BINARY, READABILITY_BINARY, MERCURY_BINARY automatically using npm...')
|
||||||
if not NODE_VERSION:
|
if not NPM_BINARY.load().version:
|
||||||
stderr('[X] You must first install node & npm using your system package manager', color='red')
|
stderr('[X] You must first install node & npm using your system package manager', color='red')
|
||||||
hint([
|
hint([
|
||||||
'https://github.com/nodesource/distributions#table-of-contents',
|
'https://github.com/nodesource/distributions#table-of-contents',
|
||||||
|
@ -1077,7 +1068,9 @@ def setup(out_dir: Path=OUTPUT_DIR) -> None:
|
||||||
|
|
||||||
stderr('\n[√] Set up ArchiveBox and its dependencies successfully.', color='green')
|
stderr('\n[√] Set up ArchiveBox and its dependencies successfully.', color='green')
|
||||||
|
|
||||||
run_shell([PYTHON_BINARY, ARCHIVEBOX_BINARY, '--version'], capture_output=False, cwd=out_dir)
|
from plugins_pkg.pip.apps import ARCHIVEBOX_BINARY
|
||||||
|
|
||||||
|
run_shell([ARCHIVEBOX_BINARY.load().abspath, '--version'], capture_output=False, cwd=out_dir)
|
||||||
|
|
||||||
@enforce_types
|
@enforce_types
|
||||||
def config(config_options_str: Optional[str]=None,
|
def config(config_options_str: Optional[str]=None,
|
||||||
|
@ -1192,6 +1185,8 @@ def schedule(add: bool=False,
|
||||||
"""Set ArchiveBox to regularly import URLs at specific times using cron"""
|
"""Set ArchiveBox to regularly import URLs at specific times using cron"""
|
||||||
|
|
||||||
check_data_folder(CONFIG)
|
check_data_folder(CONFIG)
|
||||||
|
setup_django_minimal()
|
||||||
|
from plugins_pkg.pip.apps import ARCHIVEBOX_BINARY
|
||||||
|
|
||||||
Path(LOGS_DIR).mkdir(exist_ok=True)
|
Path(LOGS_DIR).mkdir(exist_ok=True)
|
||||||
|
|
||||||
|
@ -1212,7 +1207,7 @@ def schedule(add: bool=False,
|
||||||
'cd',
|
'cd',
|
||||||
quoted(out_dir),
|
quoted(out_dir),
|
||||||
'&&',
|
'&&',
|
||||||
quoted(ARCHIVEBOX_BINARY),
|
quoted(ARCHIVEBOX_BINARY.load().abspath),
|
||||||
*([
|
*([
|
||||||
'add',
|
'add',
|
||||||
*(['--overwrite'] if overwrite else []),
|
*(['--overwrite'] if overwrite else []),
|
||||||
|
@ -1300,8 +1295,8 @@ def schedule(add: bool=False,
|
||||||
print('\n{green}[√] Stopped.{reset}'.format(**ANSI))
|
print('\n{green}[√] Stopped.{reset}'.format(**ANSI))
|
||||||
raise SystemExit(1)
|
raise SystemExit(1)
|
||||||
|
|
||||||
if CAN_UPGRADE:
|
# if CAN_UPGRADE:
|
||||||
hint(f"There's a new version of ArchiveBox available! Your current version is {VERSION}. You can upgrade to {VERSIONS_AVAILABLE['recommended_version']['tag_name']} ({VERSIONS_AVAILABLE['recommended_version']['html_url']}). For more on how to upgrade: https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives\n")
|
# hint(f"There's a new version of ArchiveBox available! Your current version is {VERSION}. You can upgrade to {VERSIONS_AVAILABLE['recommended_version']['tag_name']} ({VERSIONS_AVAILABLE['recommended_version']['html_url']}). For more on how to upgrade: https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives\n")
|
||||||
|
|
||||||
|
|
||||||
@enforce_types
|
@enforce_types
|
||||||
|
@ -1386,6 +1381,7 @@ def manage(args: Optional[List[str]]=None, out_dir: Path=OUTPUT_DIR) -> None:
|
||||||
"""Run an ArchiveBox Django management command"""
|
"""Run an ArchiveBox Django management command"""
|
||||||
|
|
||||||
check_data_folder(CONFIG)
|
check_data_folder(CONFIG)
|
||||||
|
setup_django_minimal()
|
||||||
from django.core.management import execute_from_command_line
|
from django.core.management import execute_from_command_line
|
||||||
|
|
||||||
if (args and "createsuperuser" in args) and (IN_DOCKER and not IS_TTY):
|
if (args and "createsuperuser" in args) and (IN_DOCKER and not IS_TTY):
|
||||||
|
@ -1393,7 +1389,9 @@ def manage(args: Optional[List[str]]=None, out_dir: Path=OUTPUT_DIR) -> None:
|
||||||
stderr(' docker run -it archivebox manage {}'.format(' '.join(args or ['...'])), color='lightyellow')
|
stderr(' docker run -it archivebox manage {}'.format(' '.join(args or ['...'])), color='lightyellow')
|
||||||
stderr('')
|
stderr('')
|
||||||
|
|
||||||
execute_from_command_line([f'{ARCHIVEBOX_BINARY} manage', *(args or ['help'])])
|
from plugins_pkg.pip.apps import ARCHIVEBOX_BINARY
|
||||||
|
|
||||||
|
execute_from_command_line([ARCHIVEBOX_BINARY.load().abspath, 'manage', *(args or ['help'])])
|
||||||
|
|
||||||
|
|
||||||
@enforce_types
|
@enforce_types
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
__package__ = 'archivebox.plugins_extractor.chrome'
|
__package__ = 'archivebox.plugins_extractor.chrome'
|
||||||
|
|
||||||
|
import sys
|
||||||
import platform
|
import platform
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List, Optional, Dict, ClassVar
|
from typing import List, Optional, Dict, ClassVar
|
||||||
|
@ -7,7 +8,8 @@ from typing import List, Optional, Dict, ClassVar
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
|
|
||||||
# Depends on other PyPI/vendor packages:
|
# Depends on other PyPI/vendor packages:
|
||||||
from pydantic import InstanceOf, Field
|
from rich import print
|
||||||
|
from pydantic import InstanceOf, Field, model_validator
|
||||||
from pydantic_pkgr import (
|
from pydantic_pkgr import (
|
||||||
BinProvider,
|
BinProvider,
|
||||||
BinName,
|
BinName,
|
||||||
|
@ -25,9 +27,12 @@ from plugantic.base_binary import BaseBinary, env
|
||||||
from plugantic.base_hook import BaseHook
|
from plugantic.base_hook import BaseHook
|
||||||
|
|
||||||
# Depends on Other Plugins:
|
# Depends on Other Plugins:
|
||||||
|
from plugins_sys.config.apps import ARCHIVING_CONFIG, SHELL_CONFIG
|
||||||
from plugins_pkg.puppeteer.apps import PUPPETEER_BINPROVIDER
|
from plugins_pkg.puppeteer.apps import PUPPETEER_BINPROVIDER
|
||||||
from plugins_pkg.playwright.apps import PLAYWRIGHT_BINPROVIDER
|
from plugins_pkg.playwright.apps import PLAYWRIGHT_BINPROVIDER
|
||||||
|
|
||||||
|
from ...util import dedupe
|
||||||
|
|
||||||
|
|
||||||
CHROMIUM_BINARY_NAMES_LINUX = [
|
CHROMIUM_BINARY_NAMES_LINUX = [
|
||||||
"chromium",
|
"chromium",
|
||||||
|
@ -82,11 +87,113 @@ def create_macos_app_symlink(target: Path, shortcut: Path):
|
||||||
class ChromeConfig(BaseConfigSet):
|
class ChromeConfig(BaseConfigSet):
|
||||||
section: ClassVar[ConfigSectionName] = "DEPENDENCY_CONFIG"
|
section: ClassVar[ConfigSectionName] = "DEPENDENCY_CONFIG"
|
||||||
|
|
||||||
CHROME_BINARY: str = Field(default='chrome')
|
USE_CHROME: bool = Field(default=True)
|
||||||
CHROME_ARGS: List[str] | None = Field(default=None)
|
|
||||||
CHROME_EXTRA_ARGS: List[str] = Field(default=[])
|
|
||||||
CHROME_DEFAULT_ARGS: List[str] = Field(default=lambda: ['--timeout={TIMEOUT-10}'])
|
|
||||||
|
|
||||||
|
# Chrome Binary
|
||||||
|
CHROME_BINARY: str = Field(default='chrome')
|
||||||
|
CHROME_EXTRA_ARGS: List[str] = Field(default=[])
|
||||||
|
|
||||||
|
# Chrome Options Tuning
|
||||||
|
CHROME_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT - 10)
|
||||||
|
CHROME_HEADLESS: bool = Field(default=True)
|
||||||
|
CHROME_SANDBOX: bool = Field(default=lambda: not SHELL_CONFIG.IN_DOCKER)
|
||||||
|
CHROME_RESOLUTION: str = Field(default=lambda: ARCHIVING_CONFIG.RESOLUTION)
|
||||||
|
CHROME_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
|
||||||
|
|
||||||
|
# Cookies & Auth
|
||||||
|
CHROME_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
|
||||||
|
CHROME_USER_DATA_DIR: Path | None = Field(default=None)
|
||||||
|
CHROME_PROFILE_NAME: str = Field(default='Default')
|
||||||
|
|
||||||
|
# Extractor Toggles
|
||||||
|
SAVE_SCREENSHOT: bool = Field(default=True, alias='FETCH_SCREENSHOT')
|
||||||
|
SAVE_DOM: bool = Field(default=True, alias='FETCH_DOM')
|
||||||
|
SAVE_PDF: bool = Field(default=True, alias='FETCH_PDF')
|
||||||
|
|
||||||
|
@model_validator(mode='after')
|
||||||
|
def validate_use_chrome(self):
|
||||||
|
if self.USE_CHROME and self.CHROME_TIMEOUT < 15:
|
||||||
|
print(f'[red][!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={self.CHROME_TIMEOUT} seconds)[/red]', file=sys.stderr)
|
||||||
|
print(' Chrome will fail to archive all sites if set to less than ~15 seconds.', file=sys.stderr)
|
||||||
|
print(' (Setting it to somewhere between 30 and 300 seconds is recommended)', file=sys.stderr)
|
||||||
|
print(file=sys.stderr)
|
||||||
|
print(' If you want to make ArchiveBox run faster, disable specific archive methods instead:', file=sys.stderr)
|
||||||
|
print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles', file=sys.stderr)
|
||||||
|
print(file=sys.stderr)
|
||||||
|
|
||||||
|
# if user has specified a user data dir, make sure it's valid
|
||||||
|
if self.CHROME_USER_DATA_DIR and self.CHROME_USER_DATA_DIR.exists():
|
||||||
|
# check to make sure user_data_dir/<profile_name> exists
|
||||||
|
if not (self.CHROME_USER_DATA_DIR / self.CHROME_PROFILE_NAME).exists():
|
||||||
|
print(f'[red][X] Could not find profile "{self.CHROME_PROFILE_NAME}" in CHROME_USER_DATA_DIR.[/red]', file=sys.stderr)
|
||||||
|
print(f' {self.CHROME_USER_DATA_DIR}', file=sys.stderr)
|
||||||
|
print(' Make sure you set it to a Chrome user data directory containing a Default profile folder.', file=sys.stderr)
|
||||||
|
print(' For more info see:', file=sys.stderr)
|
||||||
|
print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#CHROME_USER_DATA_DIR', file=sys.stderr)
|
||||||
|
if '/Default' in str(self.CHROME_USER_DATA_DIR):
|
||||||
|
print(file=sys.stderr)
|
||||||
|
print(' Try removing /Default from the end e.g.:', file=sys.stderr)
|
||||||
|
print(' CHROME_USER_DATA_DIR="{}"'.format(str(self.CHROME_USER_DATA_DIR).split('/Default')[0]), file=sys.stderr)
|
||||||
|
|
||||||
|
# hard error is too annoying here, instead just set it to nothing
|
||||||
|
# raise SystemExit(2)
|
||||||
|
self.CHROME_USER_DATA_DIR = None
|
||||||
|
else:
|
||||||
|
self.CHROME_USER_DATA_DIR = None
|
||||||
|
|
||||||
|
return self
|
||||||
|
|
||||||
|
def chrome_args(self, **options) -> List[str]:
|
||||||
|
"""helper to build up a chrome shell command with arguments"""
|
||||||
|
|
||||||
|
# Chrome CLI flag documentation: https://peter.sh/experiments/chromium-command-line-switches/
|
||||||
|
|
||||||
|
options = self.model_copy(update=options)
|
||||||
|
|
||||||
|
cmd_args = [*options.CHROME_EXTRA_ARGS]
|
||||||
|
|
||||||
|
if options.CHROME_HEADLESS:
|
||||||
|
cmd_args += ["--headless=new"] # expects chrome version >= 111
|
||||||
|
|
||||||
|
if not options.CHROME_SANDBOX:
|
||||||
|
# assume this means we are running inside a docker container
|
||||||
|
# in docker, GPU support is limited, sandboxing is unnecessary,
|
||||||
|
# and SHM is limited to 64MB by default (which is too low to be usable).
|
||||||
|
cmd_args += (
|
||||||
|
"--no-sandbox",
|
||||||
|
"--no-zygote",
|
||||||
|
"--disable-dev-shm-usage",
|
||||||
|
"--disable-software-rasterizer",
|
||||||
|
"--run-all-compositor-stages-before-draw",
|
||||||
|
"--hide-scrollbars",
|
||||||
|
"--autoplay-policy=no-user-gesture-required",
|
||||||
|
"--no-first-run",
|
||||||
|
"--use-fake-ui-for-media-stream",
|
||||||
|
"--use-fake-device-for-media-stream",
|
||||||
|
"--disable-sync",
|
||||||
|
# "--password-store=basic",
|
||||||
|
)
|
||||||
|
|
||||||
|
# disable automatic updating when running headless, as there's no user to see the upgrade prompts
|
||||||
|
cmd_args += ("--simulate-outdated-no-au='Tue, 31 Dec 2099 23:59:59 GMT'",)
|
||||||
|
|
||||||
|
# set window size for screenshot/pdf/etc. rendering
|
||||||
|
cmd_args += ('--window-size={}'.format(options.CHROME_RESOLUTION),)
|
||||||
|
|
||||||
|
if not options.CHROME_CHECK_SSL_VALIDITY:
|
||||||
|
cmd_args += ('--disable-web-security', '--ignore-certificate-errors')
|
||||||
|
|
||||||
|
if options.CHROME_USER_AGENT:
|
||||||
|
cmd_args += ('--user-agent={}'.format(options.CHROME_USER_AGENT),)
|
||||||
|
|
||||||
|
if options.CHROME_TIMEOUT:
|
||||||
|
cmd_args += ('--timeout={}'.format(options.CHROME_TIMEOUT * 1000),)
|
||||||
|
|
||||||
|
if options.CHROME_USER_DATA_DIR:
|
||||||
|
cmd_args.append('--user-data-dir={}'.format(options.CHROME_USER_DATA_DIR))
|
||||||
|
cmd_args.append('--profile-directory={}'.format(options.CHROME_PROFILE_NAME))
|
||||||
|
|
||||||
|
return dedupe(cmd_args)
|
||||||
|
|
||||||
CHROME_CONFIG = ChromeConfig()
|
CHROME_CONFIG = ChromeConfig()
|
||||||
|
|
||||||
|
@ -122,6 +229,18 @@ class ChromeBinary(BaseBinary):
|
||||||
# otherwise on linux we can symlink directly to binary executable
|
# otherwise on linux we can symlink directly to binary executable
|
||||||
symlink.symlink_to(binary.abspath)
|
symlink.symlink_to(binary.abspath)
|
||||||
|
|
||||||
|
@staticmethod
def chrome_cleanup_lockfile():
    """
    Cleans up any state or runtime files that chrome leaves behind when killed by
    a timeout or other error.

    Only acts when SHELL_CONFIG.IN_DOCKER is truthy; in that case it removes
    ~/.config/chromium/SingletonLock (resolved against the current user's home
    directory) if it is present. Returns None.
    """
    # pathlib never expands "~" implicitly; without expanduser() this would be a
    # relative path to a directory literally named "~" and never match the real lock.
    lock_file = Path("~/.config/chromium/SingletonLock").expanduser()

    # exists() follows symlinks, so a dangling symlink is reported as missing.
    # NOTE(review): the lock appears to be a symlink whose target is not a real
    # file - confirm; is_symlink() gives os.path.lexists()-style detection either way.
    if SHELL_CONFIG.IN_DOCKER and (lock_file.is_symlink() or lock_file.exists()):
        # missing_ok avoids a crash if the lock disappears between the check and the unlink
        lock_file.unlink(missing_ok=True)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
CHROME_BINARY = ChromeBinary()
|
CHROME_BINARY = ChromeBinary()
|
||||||
|
|
||||||
|
|
|
@ -24,40 +24,21 @@ from plugins_pkg.npm.apps import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER
|
||||||
|
|
||||||
###################### Config ##########################
|
###################### Config ##########################
|
||||||
|
|
||||||
class SinglefileToggleConfigs(BaseConfigSet):
|
class SinglefileConfig(BaseConfigSet):
|
||||||
section: ClassVar[ConfigSectionName] = 'ARCHIVE_METHOD_TOGGLES'
|
section: ClassVar[ConfigSectionName] = 'ARCHIVING_CONFIG'
|
||||||
|
|
||||||
SAVE_SINGLEFILE: bool = True
|
SAVE_SINGLEFILE: bool = True
|
||||||
|
|
||||||
|
|
||||||
class SinglefileOptionsConfigs(BaseConfigSet):
|
|
||||||
section: ClassVar[ConfigSectionName] = 'ARCHIVE_METHOD_OPTIONS'
|
|
||||||
|
|
||||||
SINGLEFILE_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
|
SINGLEFILE_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
|
||||||
SINGLEFILE_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
|
SINGLEFILE_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
|
||||||
SINGLEFILE_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
|
SINGLEFILE_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
|
||||||
SINGLEFILE_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE)
|
SINGLEFILE_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE)
|
||||||
|
|
||||||
|
|
||||||
class SinglefileDependencyConfigs(BaseConfigSet):
|
|
||||||
section: ClassVar[ConfigSectionName] = 'DEPENDENCY_CONFIG'
|
|
||||||
|
|
||||||
SINGLEFILE_BINARY: str = Field(default='wget')
|
SINGLEFILE_BINARY: str = Field(default='wget')
|
||||||
SINGLEFILE_ARGS: Optional[List[str]] = Field(default=None)
|
|
||||||
SINGLEFILE_EXTRA_ARGS: List[str] = []
|
SINGLEFILE_EXTRA_ARGS: List[str] = []
|
||||||
SINGLEFILE_DEFAULT_ARGS: List[str] = ['--timeout={TIMEOUT-10}']
|
|
||||||
|
|
||||||
class SinglefileConfigs(SinglefileToggleConfigs, SinglefileOptionsConfigs, SinglefileDependencyConfigs):
|
|
||||||
# section: ClassVar[ConfigSectionName] = 'ALL_CONFIGS'
|
|
||||||
pass
|
|
||||||
|
|
||||||
DEFAULT_GLOBAL_CONFIG = {
|
SINGLEFILE_CONFIG = SinglefileConfig()
|
||||||
'CHECK_SSL_VALIDITY': False,
|
|
||||||
'SAVE_SINGLEFILE': True,
|
|
||||||
'TIMEOUT': 120,
|
|
||||||
}
|
|
||||||
|
|
||||||
SINGLEFILE_CONFIG = SinglefileConfigs(**DEFAULT_GLOBAL_CONFIG)
|
|
||||||
|
|
||||||
|
|
||||||
SINGLEFILE_MIN_VERSION = '1.1.54'
|
SINGLEFILE_MIN_VERSION = '1.1.54'
|
||||||
|
|
|
@ -1,6 +1,8 @@
|
||||||
|
import sys
|
||||||
from typing import List, Dict, ClassVar
|
from typing import List, Dict, ClassVar
|
||||||
from subprocess import run, PIPE
|
from subprocess import run, PIPE
|
||||||
from pydantic import InstanceOf, Field
|
|
||||||
|
from pydantic import InstanceOf, Field, model_validator, AliasChoices
|
||||||
|
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
|
|
||||||
|
@ -10,20 +12,37 @@ from plugantic.base_configset import BaseConfigSet, ConfigSectionName
|
||||||
from plugantic.base_binary import BaseBinary, env, apt, brew
|
from plugantic.base_binary import BaseBinary, env, apt, brew
|
||||||
from plugantic.base_hook import BaseHook
|
from plugantic.base_hook import BaseHook
|
||||||
|
|
||||||
|
from plugins_sys.config.apps import ARCHIVING_CONFIG
|
||||||
from plugins_pkg.pip.apps import pip
|
from plugins_pkg.pip.apps import pip
|
||||||
|
|
||||||
###################### Config ##########################
|
###################### Config ##########################
|
||||||
|
|
||||||
|
|
||||||
class YtdlpDependencyConfigs(BaseConfigSet):
|
class YtdlpConfig(BaseConfigSet):
|
||||||
section: ClassVar[ConfigSectionName] = "DEPENDENCY_CONFIG"
|
section: ClassVar[ConfigSectionName] = "DEPENDENCY_CONFIG"
|
||||||
|
|
||||||
USE_YTDLP: bool = True
|
USE_YTDLP: bool = Field(default=True, validation_alias=AliasChoices('USE_YOUTUBEDL', 'SAVE_MEDIA'))
|
||||||
|
|
||||||
YTDLP_BINARY: str = Field(default='yt-dlp')
|
YTDLP_BINARY: str = Field(default='yt-dlp', alias='YOUTUBEDL_BINARY')
|
||||||
|
YTDLP_EXTRA_ARGS: List[str] = Field(default=[], alias='YOUTUBEDL_EXTRA_ARGS')
|
||||||
|
|
||||||
|
YTDLP_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
|
||||||
|
YTDLP_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.MEDIA_TIMEOUT)
|
||||||
|
|
||||||
|
@model_validator(mode='after')
|
||||||
|
def validate_use_ytdlp(self):
|
||||||
|
if self.USE_YTDLP and self.YTDLP_TIMEOUT < 20:
|
||||||
|
print(f'[red][!] Warning: MEDIA_TIMEOUT is set too low! (currently set to MEDIA_TIMEOUT={self.YTDLP_TIMEOUT} seconds)[/red]', file=sys.stderr)
|
||||||
|
print(' youtube-dl/yt-dlp will fail to archive any media if set to less than ~20 seconds.', file=sys.stderr)
|
||||||
|
print(' (Setting it somewhere over 60 seconds is recommended)', file=sys.stderr)
|
||||||
|
print(file=sys.stderr)
|
||||||
|
print(' If you want to disable media archiving entirely, set SAVE_MEDIA=False instead:', file=sys.stderr)
|
||||||
|
print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#save_media', file=sys.stderr)
|
||||||
|
print(file=sys.stderr)
|
||||||
|
return self
|
||||||
|
|
||||||
DEFAULT_GLOBAL_CONFIG = {}
|
|
||||||
YTDLP_CONFIG = YtdlpDependencyConfigs(**DEFAULT_GLOBAL_CONFIG)
|
YTDLP_CONFIG = YtdlpConfig()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -31,6 +50,9 @@ class YtdlpBinary(BaseBinary):
|
||||||
name: BinName = YTDLP_CONFIG.YTDLP_BINARY
|
name: BinName = YTDLP_CONFIG.YTDLP_BINARY
|
||||||
binproviders_supported: List[InstanceOf[BinProvider]] = [pip, apt, brew, env]
|
binproviders_supported: List[InstanceOf[BinProvider]] = [pip, apt, brew, env]
|
||||||
|
|
||||||
|
YTDLP_BINARY = YtdlpBinary()
|
||||||
|
|
||||||
|
|
||||||
class FfmpegBinary(BaseBinary):
|
class FfmpegBinary(BaseBinary):
|
||||||
name: BinName = 'ffmpeg'
|
name: BinName = 'ffmpeg'
|
||||||
binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
|
binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
|
||||||
|
@ -53,10 +75,9 @@ class FfmpegBinary(BaseBinary):
|
||||||
# def get_ffmpeg_version(self) -> Optional[str]:
|
# def get_ffmpeg_version(self) -> Optional[str]:
|
||||||
# return self.exec(cmd=['-version']).stdout
|
# return self.exec(cmd=['-version']).stdout
|
||||||
|
|
||||||
|
|
||||||
YTDLP_BINARY = YtdlpBinary()
|
|
||||||
FFMPEG_BINARY = FfmpegBinary()
|
FFMPEG_BINARY = FfmpegBinary()
|
||||||
|
|
||||||
|
|
||||||
# class YtdlpExtractor(BaseExtractor):
|
# class YtdlpExtractor(BaseExtractor):
|
||||||
# name: str = 'ytdlp'
|
# name: str = 'ytdlp'
|
||||||
# binary: str = 'ytdlp'
|
# binary: str = 'ytdlp'
|
||||||
|
|
|
@ -18,8 +18,6 @@ from requests.exceptions import RequestException, ReadTimeout
|
||||||
|
|
||||||
from base32_crockford import encode as base32_encode # type: ignore
|
from base32_crockford import encode as base32_encode # type: ignore
|
||||||
from w3lib.encoding import html_body_declared_encoding, http_content_type_encoding
|
from w3lib.encoding import html_body_declared_encoding, http_content_type_encoding
|
||||||
from os.path import lexists
|
|
||||||
from os import remove as remove_file
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import chardet
|
import chardet
|
||||||
|
@ -282,82 +280,6 @@ def get_headers(url: str, timeout: int=None) -> str:
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@enforce_types
|
|
||||||
def chrome_args(**options) -> List[str]:
|
|
||||||
"""helper to build up a chrome shell command with arguments"""
|
|
||||||
|
|
||||||
# Chrome CLI flag documentation: https://peter.sh/experiments/chromium-command-line-switches/
|
|
||||||
|
|
||||||
from .config import (
|
|
||||||
CHROME_OPTIONS,
|
|
||||||
CHROME_VERSION,
|
|
||||||
CHROME_EXTRA_ARGS,
|
|
||||||
)
|
|
||||||
|
|
||||||
options = {**CHROME_OPTIONS, **options}
|
|
||||||
|
|
||||||
if not options['CHROME_BINARY']:
|
|
||||||
raise Exception('Could not find any CHROME_BINARY installed on your system')
|
|
||||||
|
|
||||||
cmd_args = [options['CHROME_BINARY']]
|
|
||||||
|
|
||||||
cmd_args += CHROME_EXTRA_ARGS
|
|
||||||
|
|
||||||
if options['CHROME_HEADLESS']:
|
|
||||||
cmd_args += ("--headless=new",) # expects chrome version >= 111
|
|
||||||
|
|
||||||
if not options['CHROME_SANDBOX']:
|
|
||||||
# assume this means we are running inside a docker container
|
|
||||||
# in docker, GPU support is limited, sandboxing is unnecessary,
|
|
||||||
# and SHM is limited to 64MB by default (which is too low to be usable).
|
|
||||||
cmd_args += (
|
|
||||||
"--no-sandbox",
|
|
||||||
"--no-zygote",
|
|
||||||
"--disable-dev-shm-usage",
|
|
||||||
"--disable-software-rasterizer",
|
|
||||||
"--run-all-compositor-stages-before-draw",
|
|
||||||
"--hide-scrollbars",
|
|
||||||
"--autoplay-policy=no-user-gesture-required",
|
|
||||||
"--no-first-run",
|
|
||||||
"--use-fake-ui-for-media-stream",
|
|
||||||
"--use-fake-device-for-media-stream",
|
|
||||||
"--disable-sync",
|
|
||||||
# "--password-store=basic",
|
|
||||||
)
|
|
||||||
|
|
||||||
# disable automatic updating when running headless, as there's no user to see the upgrade prompts
|
|
||||||
cmd_args += ("--simulate-outdated-no-au='Tue, 31 Dec 2099 23:59:59 GMT'",)
|
|
||||||
|
|
||||||
# set window size for screenshot/pdf/etc. rendering
|
|
||||||
cmd_args += ('--window-size={}'.format(options['RESOLUTION']),)
|
|
||||||
|
|
||||||
if not options['CHECK_SSL_VALIDITY']:
|
|
||||||
cmd_args += ('--disable-web-security', '--ignore-certificate-errors')
|
|
||||||
|
|
||||||
if options['CHROME_USER_AGENT']:
|
|
||||||
cmd_args += ('--user-agent={}'.format(options['CHROME_USER_AGENT']),)
|
|
||||||
|
|
||||||
if options['CHROME_TIMEOUT']:
|
|
||||||
cmd_args += ('--timeout={}'.format(options['CHROME_TIMEOUT'] * 1000),)
|
|
||||||
|
|
||||||
if options['CHROME_USER_DATA_DIR']:
|
|
||||||
cmd_args.append('--user-data-dir={}'.format(options['CHROME_USER_DATA_DIR']))
|
|
||||||
cmd_args.append('--profile-directory=Default')
|
|
||||||
|
|
||||||
return dedupe(cmd_args)
|
|
||||||
|
|
||||||
|
|
||||||
def chrome_cleanup():
|
|
||||||
"""
|
|
||||||
Cleans up any state or runtime files that chrome leaves behind when killed by
|
|
||||||
a timeout or other error
|
|
||||||
"""
|
|
||||||
|
|
||||||
from .config import IN_DOCKER
|
|
||||||
|
|
||||||
if IN_DOCKER and lexists("/home/archivebox/.config/chromium/SingletonLock"):
|
|
||||||
remove_file("/home/archivebox/.config/chromium/SingletonLock")
|
|
||||||
|
|
||||||
@enforce_types
|
@enforce_types
|
||||||
def ansi_to_html(text: str) -> str:
|
def ansi_to_html(text: str) -> str:
|
||||||
"""
|
"""
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue