From a5ffd4e9d3e0549f0edd27e19e31879f7ff6b427 Mon Sep 17 00:00:00 2001
From: Nick Sweeting
Date: Wed, 25 Sep 2024 00:42:26 -0700
Subject: [PATCH] move pdf, screenshot, dom, singlefile, and ytdlp extractor
 config to new plugin system

---
 archivebox/config.py                        |  98 ++++---------
 archivebox/extractors/dom.py                |  29 ++--
 archivebox/extractors/media.py              |  54 ++++----
 archivebox/extractors/pdf.py                |  29 ++--
 archivebox/extractors/screenshot.py         |  33 ++---
 archivebox/extractors/singlefile.py         |  56 ++++----
 archivebox/main.py                          | 118 ++++++++--------
 archivebox/plugins_extractor/chrome/apps.py | 129 +++++++++++++++++-
 .../plugins_extractor/singlefile/apps.py    |  25 +---
 archivebox/plugins_extractor/ytdlp/apps.py  |  37 +++--
 archivebox/util.py                          |  78 -----------
 11 files changed, 333 insertions(+), 353 deletions(-)

diff --git a/archivebox/config.py b/archivebox/config.py
index 2035c792..6a209621 100644
--- a/archivebox/config.py
+++ b/archivebox/config.py
@@ -30,6 +30,7 @@ import inspect
 import getpass
 import shutil
 import requests
+import archivebox

 from hashlib import md5
 from pathlib import Path
@@ -62,7 +63,6 @@ from .misc.logging import (
     stderr,
     hint,
 )
-from .misc.checks import check_system_config


 # print('STARTING CONFIG LOADING')
@@ -167,15 +167,8 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
     'USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)'},
     'CURL_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT'] + ' curl/{CURL_VERSION}'},
     'WGET_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT'] + ' wget/{WGET_VERSION}'},
-    'CHROME_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT']},
     'COOKIES_FILE': {'type': str, 'default': None},
-    'CHROME_USER_DATA_DIR': {'type': str, 'default': None},
-
-    'CHROME_TIMEOUT': {'type': int, 'default': 0},
-    'CHROME_HEADLESS': {'type': bool, 'default': True},
-    'CHROME_SANDBOX': {'type': bool, 'default': lambda c: not c['IN_DOCKER']},
-    'CHROME_EXTRA_ARGS': {'type': list, 'default': None},

     'YOUTUBEDL_ARGS': {'type': list, 'default': lambda c: [
         '--restrict-filenames',
@@ -267,7 +260,6 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
     'YOUTUBEDL_BINARY': {'type': str, 'default': 'yt-dlp'},  # also can accept youtube-dl
     'NODE_BINARY': {'type': str, 'default': 'node'},
     'RIPGREP_BINARY': {'type': str, 'default': 'rg'},
-    'CHROME_BINARY': {'type': str, 'default': None},

     'POCKET_CONSUMER_KEY': {'type': str, 'default': None},
     'POCKET_ACCESS_TOKENS': {'type': dict, 'default': {}},
@@ -551,7 +543,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
     'PERSONAS_DIR': {'default': lambda c: c['OUTPUT_DIR'] / PERSONAS_DIR_NAME},
     'CONFIG_FILE': {'default': lambda c: Path(c['CONFIG_FILE']).resolve() if c['CONFIG_FILE'] else c['OUTPUT_DIR'] / CONFIG_FILENAME},
     'COOKIES_FILE': {'default': lambda c: c['COOKIES_FILE'] and Path(c['COOKIES_FILE']).resolve()},
-    'CHROME_USER_DATA_DIR': {'default': lambda c: Path(c['CHROME_USER_DATA_DIR']).resolve() if c['CHROME_USER_DATA_DIR'] else None},
+
     'URL_DENYLIST_PTN': {'default': lambda c: c['URL_DENYLIST'] and re.compile(c['URL_DENYLIST'] or '', ALLOWDENYLIST_REGEX_FLAGS)},
     'URL_ALLOWLIST_PTN': {'default': lambda c: c['URL_ALLOWLIST'] and re.compile(c['URL_ALLOWLIST'] or '', ALLOWDENYLIST_REGEX_FLAGS)},
     'DIR_OUTPUT_PERMISSIONS': {'default': lambda c: c['OUTPUT_PERMISSIONS'].replace('6', '7').replace('4', '5')},  # exec is always needed to list directories
@@ -595,7 +587,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
     'WGET_ARGS': {'default': lambda c: c['WGET_ARGS'] or []},
     'WGET_EXTRA_ARGS': {'default': lambda c: c['WGET_EXTRA_ARGS'] or []},

-    'RIPGREP_VERSION': {'default': lambda c: bin_version(c['RIPGREP_BINARY']) if c['USE_RIPGREP'] else None},
+    # 'RIPGREP_VERSION': {'default': lambda c: bin_version(c['RIPGREP_BINARY']) if c['USE_RIPGREP'] else None},

     'USE_SINGLEFILE': {'default': lambda c: c['USE_SINGLEFILE'] and c['SAVE_SINGLEFILE']},
     'SINGLEFILE_VERSION': {'default': lambda c: bin_version(c['SINGLEFILE_BINARY']) if c['USE_SINGLEFILE'] else None},
@@ -620,15 +612,6 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
     'YOUTUBEDL_ARGS': {'default': lambda c: c['YOUTUBEDL_ARGS'] or []},
     'YOUTUBEDL_EXTRA_ARGS': {'default': lambda c: c['YOUTUBEDL_EXTRA_ARGS'] or []},

-    'CHROME_BINARY': {'default': lambda c: c['CHROME_BINARY'] or find_chrome_binary()},
-    'USE_CHROME': {'default': lambda c: c['USE_CHROME'] and c['CHROME_BINARY'] and (c['SAVE_PDF'] or c['SAVE_SCREENSHOT'] or c['SAVE_DOM'] or c['SAVE_SINGLEFILE'])},
-    'CHROME_VERSION': {'default': lambda c: bin_version(c['CHROME_BINARY']) if c['USE_CHROME'] else None},
-    'CHROME_USER_AGENT': {'default': lambda c: c['CHROME_USER_AGENT'].format(**c)},
-
-    'SAVE_PDF': {'default': lambda c: c['USE_CHROME'] and c['SAVE_PDF']},
-    'SAVE_SCREENSHOT': {'default': lambda c: c['USE_CHROME'] and c['SAVE_SCREENSHOT']},
-    'SAVE_DOM': {'default': lambda c: c['USE_CHROME'] and c['SAVE_DOM']},
-    'SAVE_SINGLEFILE': {'default': lambda c: c['USE_CHROME'] and c['SAVE_SINGLEFILE'] and c['USE_NODE']},
     'SAVE_READABILITY': {'default': lambda c: c['USE_READABILITY'] and c['USE_NODE']},
     'SAVE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['USE_NODE']},
@@ -638,8 +621,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
     'DEPENDENCIES': {'default': lambda c: get_dependency_info(c)},
     'CODE_LOCATIONS': {'default': lambda c: get_code_locations(c)},
     'DATA_LOCATIONS': {'default': lambda c: get_data_locations(c)},
-    'CHROME_OPTIONS': {'default': lambda c: get_chrome_info(c)},
-    'CHROME_EXTRA_ARGS': {'default': lambda c: c['CHROME_EXTRA_ARGS'] or []},
+
     'SAVE_ALLOWLIST_PTN': {'default': lambda c: c['SAVE_ALLOWLIST'] and {re.compile(k, ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_ALLOWLIST'].items()}},
     'SAVE_DENYLIST_PTN': {'default': lambda c: c['SAVE_DENYLIST'] and {re.compile(k, ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_DENYLIST'].items()}},
 }
@@ -1183,21 +1165,20 @@ def get_dependency_info(config: ConfigDict) -> ConfigValue:
         'enabled': config['USE_YOUTUBEDL'],
         'is_valid': bool(config['YOUTUBEDL_VERSION']),
     },
-    'CHROME_BINARY': {
-        'path': bin_path(config['CHROME_BINARY']),
-        'version': config['CHROME_VERSION'],
-        'hash': bin_hash(config['CHROME_BINARY']),
-        'enabled': config['USE_CHROME'],
-        'is_valid': bool(config['CHROME_VERSION']),
-    },
-    'RIPGREP_BINARY': {
-        'path': bin_path(config['RIPGREP_BINARY']),
-        'version': config['RIPGREP_VERSION'],
-        'hash': bin_hash(config['RIPGREP_BINARY']),
-        'enabled': config['USE_RIPGREP'],
-        'is_valid': bool(config['RIPGREP_VERSION']),
-    },
-    # TODO: add an entry for the sonic search backend?
+    # 'CHROME_BINARY': {
+    #     'path': bin_path(config['CHROME_BINARY']),
+    #     'version': config['CHROME_VERSION'],
+    #     'hash': bin_hash(config['CHROME_BINARY']),
+    #     'enabled': config['USE_CHROME'],
+    #     'is_valid': bool(config['CHROME_VERSION']),
+    # },
+    # 'RIPGREP_BINARY': {
+    #     'path': bin_path(config['RIPGREP_BINARY']),
+    #     'version': config['RIPGREP_VERSION'],
+    #     'hash': bin_hash(config['RIPGREP_BINARY']),
+    #     'enabled': config['USE_RIPGREP'],
+    #     'is_valid': bool(config['RIPGREP_VERSION']),
+    # },
     # 'SONIC_BINARY': {
     #     'path': bin_path(config['SONIC_BINARY']),
     #     'version': config['SONIC_VERSION'],
@@ -1207,20 +1188,6 @@ def get_dependency_info(config: ConfigDict) -> ConfigValue:
     #     },
     }

-def get_chrome_info(config: ConfigDict) -> ConfigValue:
-    return {
-        'TIMEOUT': config['TIMEOUT'],
-        'RESOLUTION': config['RESOLUTION'],
-        'CHECK_SSL_VALIDITY': config['CHECK_SSL_VALIDITY'],
-        'CHROME_BINARY': bin_path(config['CHROME_BINARY']),
-        'CHROME_TIMEOUT': config['CHROME_TIMEOUT'],
-        'CHROME_HEADLESS': config['CHROME_HEADLESS'],
-        'CHROME_SANDBOX': config['CHROME_SANDBOX'],
-        'CHROME_USER_AGENT': config['CHROME_USER_AGENT'],
-        'CHROME_USER_DATA_DIR': config['CHROME_USER_DATA_DIR'],
-    }
-
-
 # ******************************************************************************
 # ******************************************************************************
 # ******************************** Load Config *********************************
@@ -1264,27 +1231,6 @@ os.umask(0o777 - int(DIR_OUTPUT_PERMISSIONS, base=8))  # n
 # add ./node_modules/.bin to $PATH so we can use node scripts in extractors
 sys.path.append(CONFIG.NODE_BIN_PATH)

-# OPTIONAL: also look around the host system for node modules to use
-# avoid enabling this unless absolutely needed,
-# having overlapping potential sources of libs is a big source of bugs/confusing to users
-# DEV_NODE_BIN_PATH = str((Path(CONFIG["PACKAGE_DIR"]).absolute() / '..' / 'node_modules' / '.bin'))
-# sys.path.append(DEV_NODE_BIN_PATH)
-# USER_NODE_BIN_PATH = str(Path('~/.node_modules/.bin').resolve())
-# sys.path.append(USER_NODE_BIN_PATH)
-
-# disable stderr "you really shouldnt disable ssl" warnings with library config
-if not CONFIG['CHECK_SSL_VALIDITY']:
-    import urllib3
-    requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)
-    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
-
-# get SQLite database version, compile options, and runtime options
-# TODO: make this a less hacky proper assertion checker helper function in somewhere like setup_django
-#cursor = sqlite3.connect(':memory:').cursor()
-#DYNAMIC_CONFIG_SCHEMA['SQLITE_VERSION'] = lambda c: cursor.execute("SELECT sqlite_version();").fetchone()[0]
-#DYNAMIC_CONFIG_SCHEMA['SQLITE_JOURNAL_MODE'] = lambda c: cursor.execute('PRAGMA journal_mode;').fetchone()[0]
-#DYNAMIC_CONFIG_SCHEMA['SQLITE_OPTIONS'] = lambda c: [option[0] for option in cursor.execute('PRAGMA compile_options;').fetchall()]
-#cursor.close()


 ########################### Config Validity Checkers ###########################
@@ -1308,13 +1254,19 @@ def bump_startup_progress_bar():
     if INITIAL_STARTUP_PROGRESS:
         INITIAL_STARTUP_PROGRESS.update(INITIAL_STARTUP_PROGRESS_TASK, advance=1)  # type: ignore

+
+def setup_django_minimal():
+    sys.path.append(str(archivebox.PACKAGE_DIR))
+    os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings')
+    django.setup()
+
+
 def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG, in_memory_db=False) -> None:
     global INITIAL_STARTUP_PROGRESS
     global INITIAL_STARTUP_PROGRESS_TASK
     with Progress(transient=True, expand=True, console=CONSOLE) as INITIAL_STARTUP_PROGRESS:
         INITIAL_STARTUP_PROGRESS_TASK = INITIAL_STARTUP_PROGRESS.add_task("[green]Loading modules...", total=25)

-        check_system_config(config)

         output_dir = out_dir or Path(config['OUTPUT_DIR'])
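The new setup_django_minimal() above exists so that lightweight subcommands can bootstrap Django (and therefore the plugin registry) without the full collection checks that setup_django() performs; the plugin config objects used throughout the rest of this patch are only importable after that call. A minimal sketch of the intended call order (module paths taken from the hunks below; not part of the diff itself):

    from archivebox.config import setup_django_minimal

    setup_django_minimal()   # loads core.settings + plugin apps, skips DB checks

    # plugin configs live inside Django app modules, so import them only after setup:
    from plugins_extractor.chrome.apps import CHROME_CONFIG
    print(CHROME_CONFIG.CHROME_TIMEOUT, CHROME_CONFIG.SAVE_PDF)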
diff --git a/archivebox/extractors/dom.py b/archivebox/extractors/dom.py
index 0035ec87..675aa62e 100644
--- a/archivebox/extractors/dom.py
+++ b/archivebox/extractors/dom.py
@@ -8,13 +8,6 @@ from ..system import run, chmod_file, atomic_write
 from ..util import (
     enforce_types,
     is_static_file,
-    chrome_args,
-    chrome_cleanup,
-)
-from ..config import (
-    TIMEOUT,
-    SAVE_DOM,
-    CHROME_VERSION,
 )
 from ..logging_util import TimedProgress

@@ -25,6 +18,8 @@ def get_output_path():

 @enforce_types
 def should_save_dom(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
+    from plugins_extractor.chrome.apps import CHROME_CONFIG
+
     if is_static_file(link.url):
         return False

@@ -33,42 +28,48 @@ def should_save_dom(link: Link, out_dir: Optional[Path]=None, overwrite: Optiona
         if (out_dir / get_output_path()).stat().st_size > 1:
             return False

-    return SAVE_DOM
+    return CHROME_CONFIG.SAVE_DOM

 @enforce_types
-def save_dom(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+def save_dom(link: Link, out_dir: Optional[Path]=None, timeout: int=60) -> ArchiveResult:
     """print HTML of site to file using chrome --dump-html"""

+    from plugins_extractor.chrome.apps import CHROME_CONFIG, CHROME_BINARY
+
+    CHROME_BIN = CHROME_BINARY.load()
+    assert CHROME_BIN.abspath and CHROME_BIN.version
+
     out_dir = out_dir or Path(link.link_dir)
     output: ArchiveOutput = get_output_path()
     output_path = out_dir / output
     cmd = [
-        *chrome_args(),
+        str(CHROME_BIN.abspath),
+        *CHROME_CONFIG.chrome_args(),
         '--dump-dom',
         link.url
     ]
     status = 'succeeded'
     timer = TimedProgress(timeout, prefix=' ')
     try:
-        result = run(cmd, cwd=str(out_dir), timeout=timeout)
+        result = run(cmd, cwd=str(out_dir), timeout=timeout, text=True)
         atomic_write(output_path, result.stdout)

         if result.returncode:
-            hints = result.stderr.decode()
+            hints = result.stderr
             raise ArchiveError('Failed to save DOM', hints)
         chmod_file(output, cwd=str(out_dir))
     except Exception as err:
         status = 'failed'
         output = err
-        chrome_cleanup()
+        CHROME_BINARY.chrome_cleanup_lockfile()
     finally:
         timer.end()

     return ArchiveResult(
         cmd=cmd,
         pwd=str(out_dir),
-        cmd_version=CHROME_VERSION,
+        cmd_version=str(CHROME_BIN.version),
         output=output,
         status=status,
         **timer.stats,
diff --git a/archivebox/extractors/media.py b/archivebox/extractors/media.py
index 8c33e92d..9952fc1d 100644
--- a/archivebox/extractors/media.py
+++ b/archivebox/extractors/media.py
@@ -5,20 +5,7 @@ from typing import Optional

 from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
 from ..system import run, chmod_file
-from ..util import (
-    enforce_types,
-    is_static_file,
-    dedupe,
-)
-from ..config import (
-    MEDIA_TIMEOUT,
-    SAVE_MEDIA,
-    YOUTUBEDL_ARGS,
-    YOUTUBEDL_EXTRA_ARGS,
-    YOUTUBEDL_BINARY,
-    YOUTUBEDL_VERSION,
-    CHECK_SSL_VALIDITY
-)
+from ..util import enforce_types, is_static_file, dedupe
 from ..logging_util import TimedProgress

@@ -38,6 +25,8 @@ def get_embed_path(archiveresult=None):

 @enforce_types
 def should_save_media(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
+    from plugins_extractor.ytdlp.apps import YTDLP_CONFIG
+
     if is_static_file(link.url):
         return False

@@ -45,45 +34,52 @@ def should_save_media(link: Link, out_dir: Optional[Path]=None, overwrite: Optio
     if not overwrite and (out_dir / get_output_path()).exists():
         return False

-    return SAVE_MEDIA
+    return YTDLP_CONFIG.USE_YTDLP

 @enforce_types
-def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIMEOUT) -> ArchiveResult:
+def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=0) -> ArchiveResult:
     """Download playlists or individual video, audio, and subtitles using youtube-dl or yt-dlp"""

+    # from plugins_extractor.chrome.apps import CHROME_CONFIG
+    from plugins_extractor.ytdlp.apps import YTDLP_BINARY, YTDLP_CONFIG
+
+    YTDLP_BIN = YTDLP_BINARY.load()
+    assert YTDLP_BIN.abspath and YTDLP_BIN.version
+
+    timeout = timeout or YTDLP_CONFIG.YTDLP_TIMEOUT

     out_dir = out_dir or Path(link.link_dir)
     output: ArchiveOutput = get_output_path()
     output_path = out_dir / output
     output_path.mkdir(exist_ok=True)
     # later options take precedence
     options = [
-        *YOUTUBEDL_ARGS,
-        *YOUTUBEDL_EXTRA_ARGS,
-        *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate']),
+        *YTDLP_CONFIG.YTDLP_EXTRA_ARGS,
+        *([] if YTDLP_CONFIG.YTDLP_CHECK_SSL_VALIDITY else ['--no-check-certificate']),
         # TODO: add --cookies-from-browser={CHROME_USER_DATA_DIR}
     ]
     cmd = [
-        YOUTUBEDL_BINARY,
+        str(YTDLP_BIN.abspath),
         *dedupe(options),
         link.url,
     ]
     status = 'succeeded'
     timer = TimedProgress(timeout, prefix=' ')
     try:
-        result = run(cmd, cwd=str(output_path), timeout=timeout + 1)
+        result = run(cmd, cwd=str(output_path), timeout=timeout + 1, text=True)
         chmod_file(output, cwd=str(out_dir))
         if result.returncode:
-            if (b'ERROR: Unsupported URL' in result.stderr
-                or b'HTTP Error 404' in result.stderr
-                or b'HTTP Error 403' in result.stderr
-                or b'URL could be a direct video link' in result.stderr
-                or b'Unable to extract container ID' in result.stderr):
+            if ('ERROR: Unsupported URL' in result.stderr
+                or 'HTTP Error 404' in result.stderr
+                or 'HTTP Error 403' in result.stderr
+                or 'URL could be a direct video link' in result.stderr
+                or 'Unable to extract container ID' in result.stderr):
                 # These happen too frequently on non-media pages to warrant printing to console
                 pass
             else:
                 hints = (
-                    'Got youtube-dl (or yt-dlp) response code: {}.'.format(result.returncode),
-                    *result.stderr.decode().split('\n'),
+                    'Got yt-dlp response code: {}.'.format(result.returncode),
+                    *result.stderr.split('\n'),
                 )
                 raise ArchiveError('Failed to save media', hints)
     except Exception as err:
@@ -117,7 +113,7 @@ def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIME
     return ArchiveResult(
         cmd=cmd,
         pwd=str(out_dir),
-        cmd_version=YOUTUBEDL_VERSION,
+        cmd_version=str(YTDLP_BIN.version),
         output=output,
         status=status,
         index_texts=index_texts,
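The `# later options take precedence` comment in save_media() relies on util.dedupe() collapsing repeated flags, which is what lets YTDLP_EXTRA_ARGS override defaults. Roughly, dedupe() keys each argument on its flag name so later occurrences win; a sketch of that behavior (not the exact util.py implementation):

    def dedupe(options):
        """Deduplicate CLI args by flag name; later occurrences clobber earlier ones."""
        deduped = {}
        for option in options:
            deduped[option.split('=')[0]] = option   # '--timeout=60' is keyed as '--timeout'
        return list(deduped.values())

    assert dedupe(['--timeout=10', '--timeout=60']) == ['--timeout=60']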
diff --git a/archivebox/extractors/pdf.py b/archivebox/extractors/pdf.py
index 17bdd47f..e3c2330e 100644
--- a/archivebox/extractors/pdf.py
+++ b/archivebox/extractors/pdf.py
@@ -8,13 +8,6 @@ from ..system import run, chmod_file
 from ..util import (
     enforce_types,
     is_static_file,
-    chrome_args,
-    chrome_cleanup,
-)
-from ..config import (
-    TIMEOUT,
-    SAVE_PDF,
-    CHROME_VERSION,
 )
 from ..logging_util import TimedProgress

@@ -25,6 +18,8 @@ def get_output_path():

 @enforce_types
 def should_save_pdf(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
+    from plugins_extractor.chrome.apps import CHROME_CONFIG
+
     if is_static_file(link.url):
         return False

@@ -32,34 +27,40 @@ def should_save_pdf(link: Link, out_dir: Optional[Path]=None, overwrite: Optiona
     if not overwrite and (out_dir / get_output_path()).exists():
         return False

-    return SAVE_PDF
+    return CHROME_CONFIG.SAVE_PDF

 @enforce_types
-def save_pdf(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+def save_pdf(link: Link, out_dir: Optional[Path]=None, timeout: int=60) -> ArchiveResult:
     """print PDF of site to file using chrome --headless"""

+    from plugins_extractor.chrome.apps import CHROME_CONFIG, CHROME_BINARY
+
+    CHROME_BIN = CHROME_BINARY.load()
+    assert CHROME_BIN.abspath and CHROME_BIN.version
+
     out_dir = out_dir or Path(link.link_dir)
     output: ArchiveOutput = get_output_path()
     cmd = [
-        *chrome_args(),
+        str(CHROME_BIN.abspath),
+        *CHROME_CONFIG.chrome_args(),
         '--print-to-pdf',
         link.url,
     ]
     status = 'succeeded'
     timer = TimedProgress(timeout, prefix=' ')
     try:
-        result = run(cmd, cwd=str(out_dir), timeout=timeout)
+        result = run(cmd, cwd=str(out_dir), timeout=timeout, text=True)

         if result.returncode:
-            hints = (result.stderr or result.stdout).decode()
+            hints = (result.stderr or result.stdout)
             raise ArchiveError('Failed to save PDF', hints)

         chmod_file(get_output_path(), cwd=str(out_dir))
     except Exception as err:
         status = 'failed'
         output = err
-        chrome_cleanup()
+        CHROME_BINARY.chrome_cleanup_lockfile()
     finally:
         timer.end()

@@ -67,7 +68,7 @@ def save_pdf(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
     return ArchiveResult(
         cmd=cmd,
         pwd=str(out_dir),
-        cmd_version=CHROME_VERSION,
+        cmd_version=str(CHROME_BIN.version),
         output=output,
         status=status,
         **timer.stats,
diff --git a/archivebox/extractors/screenshot.py b/archivebox/extractors/screenshot.py
index ae380e6f..d10554b7 100644
--- a/archivebox/extractors/screenshot.py
+++ b/archivebox/extractors/screenshot.py
@@ -5,17 +5,7 @@ from typing import Optional

 from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
 from ..system import run, chmod_file
-from ..util import (
-    enforce_types,
-    is_static_file,
-    chrome_args,
-    chrome_cleanup,
-)
-from ..config import (
-    TIMEOUT,
-    SAVE_SCREENSHOT,
-    CHROME_VERSION,
-)
+from ..util import enforce_types, is_static_file
 from ..logging_util import TimedProgress

@@ -25,6 +15,8 @@ def get_output_path():

 @enforce_types
 def should_save_screenshot(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
+    from plugins_extractor.chrome.apps import CHROME_CONFIG
+
     if is_static_file(link.url):
         return False

@@ -32,40 +24,45 @@ def should_save_screenshot(link: Link, out_dir: Optional[Path]=None, overwrite:
     if not overwrite and (out_dir / get_output_path()).exists():
         return False

-    return SAVE_SCREENSHOT
+    return CHROME_CONFIG.SAVE_SCREENSHOT

 @enforce_types
-def save_screenshot(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+def save_screenshot(link: Link, out_dir: Optional[Path]=None, timeout: int=60) -> ArchiveResult:
     """take screenshot of site using chrome --headless"""

+    from plugins_extractor.chrome.apps import CHROME_CONFIG, CHROME_BINARY
+    CHROME_BIN = CHROME_BINARY.load()
+    assert CHROME_BIN.abspath and CHROME_BIN.version
+
     out_dir = out_dir or Path(link.link_dir)
     output: ArchiveOutput = get_output_path()
     cmd = [
-        *chrome_args(),
+        str(CHROME_BIN.abspath),
+        *CHROME_CONFIG.chrome_args(),
         '--screenshot',
         link.url,
     ]
     status = 'succeeded'
     timer = TimedProgress(timeout, prefix=' ')
     try:
-        result = run(cmd, cwd=str(out_dir), timeout=timeout)
+        result = run(cmd, cwd=str(out_dir), timeout=timeout, text=True)

         if result.returncode:
-            hints = (result.stderr or result.stdout).decode()
+            hints = (result.stderr or result.stdout)
             raise ArchiveError('Failed to save screenshot', hints)
         chmod_file(output, cwd=str(out_dir))
     except Exception as err:
         status = 'failed'
         output = err
-        chrome_cleanup()
+        CHROME_BINARY.chrome_cleanup_lockfile()
     finally:
         timer.end()

     return ArchiveResult(
         cmd=cmd,
         pwd=str(out_dir),
-        cmd_version=CHROME_VERSION,
+        cmd_version=str(CHROME_BIN.version),
         output=output,
         status=status,
         **timer.stats,
diff --git a/archivebox/extractors/singlefile.py b/archivebox/extractors/singlefile.py
index 950ccd9c..c7184a94 100644
--- a/archivebox/extractors/singlefile.py
+++ b/archivebox/extractors/singlefile.py
@@ -7,22 +7,7 @@ import json

 from ..index.schema import Link, ArchiveResult, ArchiveError
 from ..system import run, chmod_file
-from ..util import (
-    enforce_types,
-    is_static_file,
-    chrome_args,
-    dedupe,
-)
-from ..config import (
-    TIMEOUT,
-    SAVE_SINGLEFILE,
-    DEPENDENCIES,
-    SINGLEFILE_VERSION,
-    SINGLEFILE_ARGS,
-    SINGLEFILE_EXTRA_ARGS,
-    CHROME_BINARY,
-    COOKIES_FILE,
-)
+from ..util import enforce_types, is_static_file, dedupe
 from ..logging_util import TimedProgress

@@ -32,6 +17,8 @@ def get_output_path():

 @enforce_types
 def should_save_singlefile(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
+    from plugins_extractor.singlefile.apps import SINGLEFILE_CONFIG
+
     if is_static_file(link.url):
         return False

@@ -39,30 +26,35 @@ def should_save_singlefile(link: Link, out_dir: Optional[Path]=None, overwrite:
     if not overwrite and (out_dir / get_output_path()).exists():
         return False

-    return SAVE_SINGLEFILE
+    return SINGLEFILE_CONFIG.SAVE_SINGLEFILE

 @enforce_types
-def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=60) -> ArchiveResult:
     """download full site using single-file"""

+    from plugins_extractor.chrome.apps import CHROME_CONFIG, CHROME_BINARY
+    from plugins_extractor.singlefile.apps import SINGLEFILE_CONFIG, SINGLEFILE_BINARY
+
+    CHROME_BIN = CHROME_BINARY.load()
+    assert CHROME_BIN.abspath and CHROME_BIN.version
+
+    SINGLEFILE_BIN = SINGLEFILE_BINARY.load()
+    assert SINGLEFILE_BIN.abspath and SINGLEFILE_BIN.version

     out_dir = out_dir or Path(link.link_dir)
     output = get_output_path()

-    browser_args = chrome_args(CHROME_TIMEOUT=0)
+    browser_args = CHROME_CONFIG.chrome_args(CHROME_TIMEOUT=0)

     # SingleFile CLI Docs: https://github.com/gildas-lormeau/SingleFile/tree/master/cli
-    browser_args = '--browser-args={}'.format(json.dumps(browser_args[1:]))
-
     # later options take precedence
     options = [
-        '--browser-executable-path={}'.format(CHROME_BINARY),
-        *(["--browser-cookies-file={}".format(COOKIES_FILE)] if COOKIES_FILE else []),
-        browser_args,
-        *SINGLEFILE_ARGS,
-        *SINGLEFILE_EXTRA_ARGS,
+        '--browser-executable-path={}'.format(CHROME_BIN.abspath),
+        *(["--browser-cookies-file={}".format(SINGLEFILE_CONFIG.SINGLEFILE_COOKIES_FILE)] if SINGLEFILE_CONFIG.SINGLEFILE_COOKIES_FILE else []),
+        '--browser-args={}'.format(json.dumps(browser_args)),
+        *SINGLEFILE_CONFIG.SINGLEFILE_EXTRA_ARGS,
     ]
     cmd = [
-        DEPENDENCIES['SINGLEFILE_BINARY']['path'],
+        str(SINGLEFILE_BIN.abspath),
         *dedupe(options),
         link.url,
         output,
@@ -72,13 +64,13 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
     timer = TimedProgress(timeout, prefix=' ')
     result = None
     try:
-        result = run(cmd, cwd=str(out_dir), timeout=timeout)
+        result = run(cmd, cwd=str(out_dir), timeout=timeout, text=True, capture_output=True)

         # parse out number of files downloaded from last line of stderr:
         # "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
         output_tail = [
             line.strip()
-            for line in (result.stdout + result.stderr).decode().rsplit('\n', 5)[-5:]
+            for line in (result.stdout + result.stderr).rsplit('\n', 5)[-5:]
             if line.strip()
         ]
         hints = (
@@ -93,9 +85,9 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
     except (Exception, OSError) as err:
         status = 'failed'
         # TODO: Make this prettier. This is necessary to run the command (escape JSON internal quotes).
-        cmd[2] = browser_args.replace('"', "\\\"")
+        cmd[2] = cmd[2].replace('"', "\\\"")
         if result:
-            err.hints = (result.stdout + result.stderr).decode().split('\n')
+            err.hints = (result.stdout + result.stderr).split('\n')
         output = err
     finally:
         timer.end()

@@ -103,7 +95,7 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
     return ArchiveResult(
         cmd=cmd,
         pwd=str(out_dir),
-        cmd_version=SINGLEFILE_VERSION,
+        cmd_version=str(SINGLEFILE_BIN.version),
         output=output,
         status=status,
         **timer.stats,
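save_singlefile() passes CHROME_TIMEOUT=0, presumably because single-file manages its own timeout; a falsy CHROME_TIMEOUT makes ChromeConfig.chrome_args() (defined later in this patch) omit Chrome's --timeout flag entirely. A sketch of how that per-call override flows through:

    from plugins_extractor.chrome.apps import CHROME_CONFIG

    args = CHROME_CONFIG.chrome_args(CHROME_TIMEOUT=0)   # merged via model_copy(update=...)
    assert not any(arg.startswith('--timeout=') for arg in args)   # falsy timeout, flag omitted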
diff --git a/archivebox/main.py b/archivebox/main.py
index 32b2b14c..6d802b86 100755
--- a/archivebox/main.py
+++ b/archivebox/main.py
@@ -1,10 +1,10 @@
 __package__ = 'archivebox'

 import os
-import time
 import sys
 import shutil
 import platform
+import archivebox

 from typing import Dict, List, Optional, Iterable, IO, Union
 from pathlib import Path
@@ -69,6 +69,7 @@ from .extractors import archive_links, archive_link, ignore_methods
 from .misc.logging import stderr, hint
 from .misc.checks import check_data_folder, check_dependencies
 from .config import (
+    setup_django_minimal,
     ConfigDict,
     ANSI,
     IS_TTY,
@@ -81,8 +82,6 @@ from .config import (
     TIMEZONE,
     ENFORCE_ATOMIC_WRITES,
     OUTPUT_PERMISSIONS,
-    PYTHON_BINARY,
-    ARCHIVEBOX_BINARY,
     ONLY_NEW,
     OUTPUT_DIR,
     SOURCES_DIR,
@@ -95,31 +94,22 @@ from .config import (
     HTML_INDEX_FILENAME,
     SQL_INDEX_FILENAME,
     ALLOWED_IN_OUTPUT_DIR,
-    SEARCH_BACKEND_ENGINE,
     LDAP,
-    get_version,
     write_config_file,
     VERSION,
-    VERSIONS_AVAILABLE,
-    CAN_UPGRADE,
     COMMIT_HASH,
     BUILD_TIME,
     CODE_LOCATIONS,
     DATA_LOCATIONS,
     DEPENDENCIES,
-    CHROME_BINARY,
-    CHROME_VERSION,
     YOUTUBEDL_BINARY,
     YOUTUBEDL_VERSION,
     SINGLEFILE_VERSION,
     READABILITY_VERSION,
     MERCURY_VERSION,
-    NODE_VERSION,
     load_all_config,
     CONFIG,
     USER_CONFIG,
-    ADMIN_USERNAME,
-    ADMIN_PASSWORD,
     get_real_name,
     setup_django,
 )
@@ -216,6 +206,11 @@ def version(quiet: bool=False,
             out_dir: Path=OUTPUT_DIR) -> None:
     """Print the ArchiveBox version and dependency information"""

+    setup_django_minimal()
+    from plugins_sys.config.apps import SEARCH_BACKEND_CONFIG, STORAGE_CONFIG, SHELL_CONFIG
+    from plugins_auth.ldap.apps import LDAP_CONFIG
+    from django.conf import settings
+
     print(VERSION)
     if not quiet:
@@ -227,7 +222,7 @@ def version(quiet: bool=False,
         p = platform.uname()
         print(
-            'ArchiveBox v{}'.format(get_version(CONFIG)),
+            'ArchiveBox v{}'.format(archivebox.__version__),
             f'COMMIT_HASH={COMMIT_HASH[:7] if COMMIT_HASH else "unknown"}',
             f'BUILD_TIME={BUILD_TIME}',
         )
@@ -241,29 +236,35 @@ def version(quiet: bool=False,
         )
         OUTPUT_IS_REMOTE_FS = DATA_LOCATIONS['OUTPUT_DIR']['is_mount'] or DATA_LOCATIONS['ARCHIVE_DIR']['is_mount']
         print(
-            f'FS_ATOMIC={ENFORCE_ATOMIC_WRITES}',
+            f'FS_ATOMIC={STORAGE_CONFIG.ENFORCE_ATOMIC_WRITES}',
             f'FS_REMOTE={OUTPUT_IS_REMOTE_FS}',
-            f'FS_USER={PUID}:{PGID}',
-            f'FS_PERMS={OUTPUT_PERMISSIONS}',
+            f'FS_USER={SHELL_CONFIG.PUID}:{SHELL_CONFIG.PGID}',
+            f'FS_PERMS={STORAGE_CONFIG.OUTPUT_PERMISSIONS}',
         )
         print(
-            f'DEBUG={DEBUG}',
-            f'IS_TTY={IS_TTY}',
+            f'DEBUG={SHELL_CONFIG.DEBUG}',
+            f'IS_TTY={SHELL_CONFIG.IS_TTY}',
             f'TZ={TIMEZONE}',
-            f'SEARCH_BACKEND={SEARCH_BACKEND_ENGINE}',
-            f'LDAP={LDAP}',
+            f'SEARCH_BACKEND={SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE}',
+            f'LDAP={LDAP_CONFIG.LDAP_ENABLED}',
             #f'DB=django.db.backends.sqlite3 (({CONFIG["SQLITE_JOURNAL_MODE"]})',  # add this if we have more useful info to show eventually
         )
         print()
-        print('{white}[i] Dependency versions:{reset}'.format(**ANSI))
+        print('{white}[i] Old dependency versions:{reset}'.format(**ANSI))
         for name, dependency in DEPENDENCIES.items():
             print(printable_dependency_version(name, dependency))

             # add a newline between core dependencies and extractor dependencies for easier reading
             if name == 'ARCHIVEBOX_BINARY':
                 print()
-
+
+        print()
+        print('{white}[i] New dependency versions:{reset}'.format(**ANSI))
+        for name, binary in settings.BINARIES.items():
+            loaded_bin = binary.load()
+            print('', '√' if loaded_bin.is_valid else 'X', '', loaded_bin.name.ljust(21), str(loaded_bin.version).ljust(15), loaded_bin.abspath)
+
         print()
         print('{white}[i] Source-code locations:{reset}'.format(**ANSI))
         for name, path in CODE_LOCATIONS.items():
@@ -431,10 +432,11 @@ def init(force: bool=False, quick: bool=False, setup: bool=False, out_dir: Path=
         print('\n{green}----------------------------------------------------------------------{reset}'.format(**ANSI))

         from django.contrib.auth.models import User
+        from plugins_sys.config.apps import SERVER_CONFIG

-        if (ADMIN_USERNAME and ADMIN_PASSWORD) and not User.objects.filter(username=ADMIN_USERNAME).exists():
+        if (SERVER_CONFIG.ADMIN_USERNAME and SERVER_CONFIG.ADMIN_PASSWORD) and not User.objects.filter(username=SERVER_CONFIG.ADMIN_USERNAME).exists():
             print('{green}[+] Found ADMIN_USERNAME and ADMIN_PASSWORD configuration options, creating new admin user.{reset}'.format(**ANSI))
-            User.objects.create_superuser(username=ADMIN_USERNAME, password=ADMIN_PASSWORD)
+            User.objects.create_superuser(username=SERVER_CONFIG.ADMIN_USERNAME, password=SERVER_CONFIG.ADMIN_PASSWORD)

     if existing_index:
         print('{green}[√] Done. Verified and updated the existing ArchiveBox collection.{reset}'.format(**ANSI))
@@ -693,8 +695,8 @@ def add(urls: Union[str, List[str]],
     # tail_worker_logs(worker['stdout_logfile'])

-    if CAN_UPGRADE:
-        hint(f"There's a new version of ArchiveBox available! Your current version is {VERSION}. You can upgrade to {VERSIONS_AVAILABLE['recommended_version']['tag_name']} ({VERSIONS_AVAILABLE['recommended_version']['html_url']}). For more on how to upgrade: https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives\n")
+    # if CAN_UPGRADE:
+    #     hint(f"There's a new version of ArchiveBox available! Your current version is {VERSION}. You can upgrade to {VERSIONS_AVAILABLE['recommended_version']['tag_name']} ({VERSIONS_AVAILABLE['recommended_version']['html_url']}). For more on how to upgrade: https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives\n")

     return new_links
@@ -967,6 +969,8 @@ def list_folders(links: List[Link],
 def setup(out_dir: Path=OUTPUT_DIR) -> None:
     """Automatically install all ArchiveBox dependencies and extras"""

+
+
     if not (out_dir / ARCHIVE_DIR_NAME).exists():
         run_subcommand('init', stdin=None, pwd=out_dir)
@@ -980,24 +984,26 @@ def setup(out_dir: Path=OUTPUT_DIR) -> None:

     stderr('\n[+] Installing enabled ArchiveBox dependencies automatically...', color='green')

+    from plugins_pkg.pip.apps import PYTHON_BINARY
+
     stderr('\n Installing YOUTUBEDL_BINARY automatically using pip...')
     if YOUTUBEDL_VERSION:
         print(f'{YOUTUBEDL_VERSION} is already installed', YOUTUBEDL_BINARY)
     else:
         try:
             run_shell([
-                PYTHON_BINARY, '-m', 'pip',
+                PYTHON_BINARY.load().abspath, '-m', 'pip',
                 'install',
                 '--upgrade',
                 '--no-cache-dir',
                 '--no-warn-script-location',
                 'yt-dlp',
-            ], capture_output=False, cwd=out_dir)
+            ], capture_output=False, cwd=out_dir, text=True)
             pkg_path = run_shell([
-                PYTHON_BINARY, '-m', 'pip',
+                PYTHON_BINARY.load().abspath, '-m', 'pip',
                 'show',
                 'yt-dlp',
-            ], capture_output=True, text=True, cwd=out_dir).stdout.decode().split('Location: ')[-1].split('\n', 1)[0]
+            ], capture_output=True, text=True, cwd=out_dir).stdout.split('Location: ')[-1].split('\n', 1)[0]
             NEW_YOUTUBEDL_BINARY = Path(pkg_path) / 'yt-dlp' / '__main__.py'
             os.chmod(NEW_YOUTUBEDL_BINARY, 0o777)
             assert NEW_YOUTUBEDL_BINARY.exists(), f'yt-dlp must exist inside {pkg_path}'
@@ -1006,33 +1012,18 @@ def setup(out_dir: Path=OUTPUT_DIR) -> None:
             stderr(f'[X] Failed to install python packages: {e}', color='red')
             raise SystemExit(1)

-    if platform.machine() == 'armv7l':
-        stderr('\n Skip the automatic installation of CHROME_BINARY because playwright is not available on armv7.')
-    else:
-        stderr('\n Installing CHROME_BINARY automatically using playwright...')
-        if CHROME_VERSION:
-            print(f'{CHROME_VERSION} is already installed', CHROME_BINARY)
-        else:
-            try:
-                run_shell([
-                    PYTHON_BINARY, '-m', 'pip',
-                    'install',
-                    '--upgrade',
-                    '--no-cache-dir',
-                    '--no-warn-script-location',
-                    'playwright',
-                ], capture_output=False, cwd=out_dir)
-                run_shell([PYTHON_BINARY, '-m', 'playwright', 'install', 'chromium'], capture_output=False, cwd=out_dir)
-                proc = run_shell([PYTHON_BINARY, '-c', 'from playwright.sync_api import sync_playwright; print(sync_playwright().start().chromium.executable_path)'], capture_output=True, text=True, cwd=out_dir)
-                NEW_CHROME_BINARY = proc.stdout.decode().strip() if isinstance(proc.stdout, bytes) else proc.stdout.strip()
-                assert NEW_CHROME_BINARY and len(NEW_CHROME_BINARY), 'CHROME_BINARY must contain a path'
-                config(f'CHROME_BINARY={NEW_CHROME_BINARY}', set=True, out_dir=out_dir)
-            except BaseException as e:  # lgtm [py/catch-base-exception]
-                stderr(f'[X] Failed to install chromium using playwright: {e.__class__.__name__} {e}', color='red')
-                raise SystemExit(1)
+
+    from plugins_extractor.chrome.apps import CHROME_BINARY
+
+    CHROME_BINARY.load_or_install()
+
+    from plugins_pkg.npm.apps import NPM_BINARY
+    from plugins_extractor.singlefile.apps import SINGLEFILE_BINARY
+
+    SINGLEFILE_BINARY.load_or_install()

     stderr('\n Installing SINGLEFILE_BINARY, READABILITY_BINARY, MERCURY_BINARY automatically using npm...')
-    if not NODE_VERSION:
+    if not NPM_BINARY.load().version:
         stderr('[X] You must first install node & npm using your system package manager', color='red')
         hint([
             'https://github.com/nodesource/distributions#table-of-contents',
@@ -1077,7 +1068,9 @@ def setup(out_dir: Path=OUTPUT_DIR) -> None:

     stderr('\n[√] Set up ArchiveBox and its dependencies successfully.', color='green')

-    run_shell([PYTHON_BINARY, ARCHIVEBOX_BINARY, '--version'], capture_output=False, cwd=out_dir)
+    from plugins_pkg.pip.apps import ARCHIVEBOX_BINARY
+
+    run_shell([ARCHIVEBOX_BINARY.load().abspath, '--version'], capture_output=False, cwd=out_dir)

 @enforce_types
 def config(config_options_str: Optional[str]=None,
@@ -1192,6 +1185,8 @@ def schedule(add: bool=False,
     """Set ArchiveBox to regularly import URLs at specific times using cron"""

     check_data_folder(CONFIG)
+    setup_django_minimal()
+    from plugins_pkg.pip.apps import ARCHIVEBOX_BINARY

     Path(LOGS_DIR).mkdir(exist_ok=True)

@@ -1212,7 +1207,7 @@ def schedule(add: bool=False,
             'cd',
             quoted(out_dir),
             '&&',
-            quoted(ARCHIVEBOX_BINARY),
+            quoted(ARCHIVEBOX_BINARY.load().abspath),
             *([
                 'add',
                 *(['--overwrite'] if overwrite else []),
@@ -1300,8 +1295,8 @@ def schedule(add: bool=False,
                 print('\n{green}[√] Stopped.{reset}'.format(**ANSI))
                 raise SystemExit(1)

-    if CAN_UPGRADE:
-        hint(f"There's a new version of ArchiveBox available! Your current version is {VERSION}. You can upgrade to {VERSIONS_AVAILABLE['recommended_version']['tag_name']} ({VERSIONS_AVAILABLE['recommended_version']['html_url']}). For more on how to upgrade: https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives\n")
+    # if CAN_UPGRADE:
+    #     hint(f"There's a new version of ArchiveBox available! Your current version is {VERSION}. You can upgrade to {VERSIONS_AVAILABLE['recommended_version']['tag_name']} ({VERSIONS_AVAILABLE['recommended_version']['html_url']}). For more on how to upgrade: https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives\n")


 @enforce_types
@@ -1386,6 +1381,7 @@ def manage(args: Optional[List[str]]=None, out_dir: Path=OUTPUT_DIR) -> None:
     """Run an ArchiveBox Django management command"""

     check_data_folder(CONFIG)
+    setup_django_minimal()
    from django.core.management import execute_from_command_line

     if (args and "createsuperuser" in args) and (IN_DOCKER and not IS_TTY):
@@ -1393,7 +1389,9 @@ def manage(args: Optional[List[str]]=None, out_dir: Path=OUTPUT_DIR) -> None:
         stderr(' docker run -it archivebox manage {}'.format(' '.join(args or ['...'])), color='lightyellow')
         stderr('')

-    execute_from_command_line([f'{ARCHIVEBOX_BINARY} manage', *(args or ['help'])])
+    from plugins_pkg.pip.apps import ARCHIVEBOX_BINARY
+
+    execute_from_command_line([str(ARCHIVEBOX_BINARY.load().abspath), 'manage', *(args or ['help'])])
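The new loop in version() assumes every entry in settings.BINARIES follows the pydantic_pkgr Binary interface: load() resolves the executable through the registered BinProviders and returns an object carrying name, abspath, version, and is_valid. A sketch of the shape the loop depends on (illustrative, not part of the diff):

    from plugins_extractor.chrome.apps import CHROME_BINARY

    loaded = CHROME_BINARY.load()   # resolves via puppeteer/playwright/env providers
    print(loaded.name, loaded.abspath, loaded.version, loaded.is_valid)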
diff --git a/archivebox/plugins_extractor/chrome/apps.py b/archivebox/plugins_extractor/chrome/apps.py
index 6f2aa94a..61405e0f 100644
--- a/archivebox/plugins_extractor/chrome/apps.py
+++ b/archivebox/plugins_extractor/chrome/apps.py
@@ -1,5 +1,6 @@
 __package__ = 'archivebox.plugins_extractor.chrome'

+import sys
 import platform
 from pathlib import Path
 from typing import List, Optional, Dict, ClassVar
@@ -7,7 +8,8 @@ from typing import List, Optional, Dict, ClassVar
 from django.conf import settings

 # Depends on other PyPI/vendor packages:
-from pydantic import InstanceOf, Field
+from rich import print
+from pydantic import InstanceOf, Field, model_validator
 from pydantic_pkgr import (
     BinProvider,
     BinName,
@@ -25,9 +27,12 @@ from plugantic.base_binary import BaseBinary, env
 from plugantic.base_hook import BaseHook

 # Depends on Other Plugins:
+from plugins_sys.config.apps import ARCHIVING_CONFIG, SHELL_CONFIG
 from plugins_pkg.puppeteer.apps import PUPPETEER_BINPROVIDER
 from plugins_pkg.playwright.apps import PLAYWRIGHT_BINPROVIDER

+from ...util import dedupe
+

 CHROMIUM_BINARY_NAMES_LINUX = [
     "chromium",
@@ -82,11 +87,113 @@ def create_macos_app_symlink(target: Path, shortcut: Path):
 class ChromeConfig(BaseConfigSet):
     section: ClassVar[ConfigSectionName] = "DEPENDENCY_CONFIG"

-    CHROME_BINARY: str = Field(default='chrome')
-    CHROME_ARGS: List[str] | None = Field(default=None)
-    CHROME_EXTRA_ARGS: List[str] = Field(default=[])
-    CHROME_DEFAULT_ARGS: List[str] = Field(default=lambda: ['--timeout={TIMEOUT-10}'])
+    USE_CHROME: bool = Field(default=True)
+
+    # Chrome Binary
+    CHROME_BINARY: str = Field(default='chrome')
+    CHROME_EXTRA_ARGS: List[str] = Field(default=[])
+
+    # Chrome Options Tuning
+    CHROME_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT - 10)
+    CHROME_HEADLESS: bool = Field(default=True)
+    CHROME_SANDBOX: bool = Field(default=lambda: not SHELL_CONFIG.IN_DOCKER)
+    CHROME_RESOLUTION: str = Field(default=lambda: ARCHIVING_CONFIG.RESOLUTION)
+    CHROME_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
+
+    # Cookies & Auth
+    CHROME_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
+    CHROME_USER_DATA_DIR: Path | None = Field(default=None)
+    CHROME_PROFILE_NAME: str = Field(default='Default')
+
+    # Extractor Toggles
+    SAVE_SCREENSHOT: bool = Field(default=True, alias='FETCH_SCREENSHOT')
+    SAVE_DOM: bool = Field(default=True, alias='FETCH_DOM')
+    SAVE_PDF: bool = Field(default=True, alias='FETCH_PDF')
+
+    @model_validator(mode='after')
+    def validate_use_chrome(self):
+        if self.USE_CHROME and self.CHROME_TIMEOUT < 15:
+            print(f'[red][!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={self.CHROME_TIMEOUT} seconds)[/red]', file=sys.stderr)
+            print(' Chrome will fail to archive all sites if set to less than ~15 seconds.', file=sys.stderr)
+            print(' (Setting it to somewhere between 30 and 300 seconds is recommended)', file=sys.stderr)
+            print(file=sys.stderr)
+            print(' If you want to make ArchiveBox run faster, disable specific archive methods instead:', file=sys.stderr)
+            print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles', file=sys.stderr)
+            print(file=sys.stderr)
+
+        # if user has specified a user data dir, make sure it's valid
+        if self.CHROME_USER_DATA_DIR and self.CHROME_USER_DATA_DIR.exists():
+            # check to make sure user_data_dir/<profile_name> exists
+            if not (self.CHROME_USER_DATA_DIR / self.CHROME_PROFILE_NAME).exists():
+                print(f'[red][X] Could not find profile "{self.CHROME_PROFILE_NAME}" in CHROME_USER_DATA_DIR.[/red]', file=sys.stderr)
+                print(f' {self.CHROME_USER_DATA_DIR}', file=sys.stderr)
+                print(' Make sure you set it to a Chrome user data directory containing a Default profile folder.', file=sys.stderr)
+                print(' For more info see:', file=sys.stderr)
+                print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#CHROME_USER_DATA_DIR', file=sys.stderr)
+                if '/Default' in str(self.CHROME_USER_DATA_DIR):
+                    print(file=sys.stderr)
+                    print(' Try removing /Default from the end e.g.:', file=sys.stderr)
+                    print(' CHROME_USER_DATA_DIR="{}"'.format(str(self.CHROME_USER_DATA_DIR).split('/Default')[0]), file=sys.stderr)
+
+                # hard error is too annoying here, instead just set it to nothing
+                # raise SystemExit(2)
+                self.CHROME_USER_DATA_DIR = None
+        else:
+            self.CHROME_USER_DATA_DIR = None
+
+        return self
+
+    def chrome_args(self, **options) -> List[str]:
+        """helper to build up a chrome shell command with arguments"""
+
+        # Chrome CLI flag documentation: https://peter.sh/experiments/chromium-command-line-switches/
+
+        options = self.model_copy(update=options)
+
+        cmd_args = [*options.CHROME_EXTRA_ARGS]
+
+        if options.CHROME_HEADLESS:
+            cmd_args += ["--headless=new"]  # expects chrome version >= 111
+
+        if not options.CHROME_SANDBOX:
+            # assume this means we are running inside a docker container
+            # in docker, GPU support is limited, sandboxing is unnecessary,
+            # and SHM is limited to 64MB by default (which is too low to be usable).
+            cmd_args += (
+                "--no-sandbox",
+                "--no-zygote",
+                "--disable-dev-shm-usage",
+                "--disable-software-rasterizer",
+                "--run-all-compositor-stages-before-draw",
+                "--hide-scrollbars",
+                "--autoplay-policy=no-user-gesture-required",
+                "--no-first-run",
+                "--use-fake-ui-for-media-stream",
+                "--use-fake-device-for-media-stream",
+                "--disable-sync",
+                # "--password-store=basic",
+            )
+
+        # disable automatic updating when running headless, as there's no user to see the upgrade prompts
+        cmd_args += ("--simulate-outdated-no-au='Tue, 31 Dec 2099 23:59:59 GMT'",)
+
+        # set window size for screenshot/pdf/etc. rendering
+        cmd_args += ('--window-size={}'.format(options.CHROME_RESOLUTION),)
+
+        if not options.CHROME_CHECK_SSL_VALIDITY:
+            cmd_args += ('--disable-web-security', '--ignore-certificate-errors')
+
+        if options.CHROME_USER_AGENT:
+            cmd_args += ('--user-agent={}'.format(options.CHROME_USER_AGENT),)
+
+        if options.CHROME_TIMEOUT:
+            cmd_args += ('--timeout={}'.format(options.CHROME_TIMEOUT * 1000),)
+
+        if options.CHROME_USER_DATA_DIR:
+            cmd_args.append('--user-data-dir={}'.format(options.CHROME_USER_DATA_DIR))
+            cmd_args.append('--profile-directory={}'.format(options.CHROME_PROFILE_NAME))
+
+        return dedupe(cmd_args)


 CHROME_CONFIG = ChromeConfig()
@@ -122,6 +229,18 @@ class ChromeBinary(BaseBinary):
             # otherwise on linux we can symlink directly to binary executable
             symlink.symlink_to(binary.abspath)

+    @staticmethod
+    def chrome_cleanup_lockfile():
+        """
+        Cleans up any state or runtime files that chrome leaves behind when killed by
+        a timeout or other error
+        """
+        lock_file = Path("~/.config/chromium/SingletonLock").expanduser()
+
+        if SHELL_CONFIG.IN_DOCKER and lock_file.exists():
+            lock_file.unlink()
+

 CHROME_BINARY = ChromeBinary()
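Taken together, the two chrome/apps.py additions replace the old util.chrome_args() helper: the binary path now comes from CHROME_BINARY.load() and the flags from CHROME_CONFIG.chrome_args(), which is exactly how the dom/pdf/screenshot hunks above assemble their commands. A sketch of the combined pattern:

    from plugins_extractor.chrome.apps import CHROME_CONFIG, CHROME_BINARY

    chrome = CHROME_BINARY.load()
    cmd = [
        str(chrome.abspath),
        *CHROME_CONFIG.chrome_args(),   # CHROME_* settings plus any per-call overrides
        '--print-to-pdf',               # output flag varies per extractor
        'https://example.com',
    ]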
diff --git a/archivebox/plugins_extractor/singlefile/apps.py b/archivebox/plugins_extractor/singlefile/apps.py
index e2b610f3..b937b7db 100644
--- a/archivebox/plugins_extractor/singlefile/apps.py
+++ b/archivebox/plugins_extractor/singlefile/apps.py
@@ -24,40 +24,21 @@ from plugins_pkg.npm.apps import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER

 ###################### Config ##########################

-class SinglefileToggleConfigs(BaseConfigSet):
-    section: ClassVar[ConfigSectionName] = 'ARCHIVE_METHOD_TOGGLES'
+class SinglefileConfig(BaseConfigSet):
+    section: ClassVar[ConfigSectionName] = 'ARCHIVING_CONFIG'

     SAVE_SINGLEFILE: bool = True

-
-class SinglefileOptionsConfigs(BaseConfigSet):
-    section: ClassVar[ConfigSectionName] = 'ARCHIVE_METHOD_OPTIONS'
-
     SINGLEFILE_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
     SINGLEFILE_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
     SINGLEFILE_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
     SINGLEFILE_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE)

-
-class SinglefileDependencyConfigs(BaseConfigSet):
-    section: ClassVar[ConfigSectionName] = 'DEPENDENCY_CONFIG'
-
     SINGLEFILE_BINARY: str = Field(default='wget')
-    SINGLEFILE_ARGS: Optional[List[str]] = Field(default=None)
     SINGLEFILE_EXTRA_ARGS: List[str] = []
-    SINGLEFILE_DEFAULT_ARGS: List[str] = ['--timeout={TIMEOUT-10}']

-class SinglefileConfigs(SinglefileToggleConfigs, SinglefileOptionsConfigs, SinglefileDependencyConfigs):
-    # section: ClassVar[ConfigSectionName] = 'ALL_CONFIGS'
-    pass

-DEFAULT_GLOBAL_CONFIG = {
-    'CHECK_SSL_VALIDITY': False,
-    'SAVE_SINGLEFILE': True,
-    'TIMEOUT': 120,
-}
-
-SINGLEFILE_CONFIG = SinglefileConfigs(**DEFAULT_GLOBAL_CONFIG)
+SINGLEFILE_CONFIG = SinglefileConfig()


 SINGLEFILE_MIN_VERSION = '1.1.54'
diff --git a/archivebox/plugins_extractor/ytdlp/apps.py b/archivebox/plugins_extractor/ytdlp/apps.py
index 2897a97a..4385f41f 100644
--- a/archivebox/plugins_extractor/ytdlp/apps.py
+++ b/archivebox/plugins_extractor/ytdlp/apps.py
@@ -1,6 +1,8 @@
+import sys
 from typing import List, Dict, ClassVar
 from subprocess import run, PIPE
-from pydantic import InstanceOf, Field
+
+from pydantic import InstanceOf, Field, model_validator, AliasChoices

 from django.conf import settings

@@ -10,20 +12,37 @@ from plugantic.base_configset import BaseConfigSet, ConfigSectionName
 from plugantic.base_binary import BaseBinary, env, apt, brew
 from plugantic.base_hook import BaseHook

+from plugins_sys.config.apps import ARCHIVING_CONFIG
 from plugins_pkg.pip.apps import pip


 ###################### Config ##########################

-class YtdlpDependencyConfigs(BaseConfigSet):
+class YtdlpConfig(BaseConfigSet):
     section: ClassVar[ConfigSectionName] = "DEPENDENCY_CONFIG"

-    USE_YTDLP: bool = True
+    USE_YTDLP: bool = Field(default=True, validation_alias=AliasChoices('USE_YOUTUBEDL', 'SAVE_MEDIA'))

-    YTDLP_BINARY: str = Field(default='yt-dlp')
+    YTDLP_BINARY: str = Field(default='yt-dlp', alias='YOUTUBEDL_BINARY')
+    YTDLP_EXTRA_ARGS: List[str] = Field(default=[], alias='YOUTUBEDL_EXTRA_ARGS')
+
+    YTDLP_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
+    YTDLP_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.MEDIA_TIMEOUT)
+
+    @model_validator(mode='after')
+    def validate_use_ytdlp(self):
+        if self.USE_YTDLP and self.YTDLP_TIMEOUT < 20:
+            print(f'[red][!] Warning: MEDIA_TIMEOUT is set too low! (currently set to MEDIA_TIMEOUT={self.YTDLP_TIMEOUT} seconds)[/red]', file=sys.stderr)
+            print(' youtube-dl/yt-dlp will fail to archive any media if set to less than ~20 seconds.', file=sys.stderr)
+            print(' (Setting it somewhere over 60 seconds is recommended)', file=sys.stderr)
+            print(file=sys.stderr)
+            print(' If you want to disable media archiving entirely, set SAVE_MEDIA=False instead:', file=sys.stderr)
+            print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#save_media', file=sys.stderr)
+            print(file=sys.stderr)
+        return self

-DEFAULT_GLOBAL_CONFIG = {}
-YTDLP_CONFIG = YtdlpDependencyConfigs(**DEFAULT_GLOBAL_CONFIG)
+
+YTDLP_CONFIG = YtdlpConfig()
@@ -31,6 +50,9 @@ class YtdlpBinary(BaseBinary):
     name: BinName = YTDLP_CONFIG.YTDLP_BINARY
     binproviders_supported: List[InstanceOf[BinProvider]] = [pip, apt, brew, env]

+YTDLP_BINARY = YtdlpBinary()
+
+
 class FfmpegBinary(BaseBinary):
     name: BinName = 'ffmpeg'
     binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
@@ -53,10 +75,9 @@ class FfmpegBinary(BaseBinary):
     # def get_ffmpeg_version(self) -> Optional[str]:
     #     return self.exec(cmd=['-version']).stdout

-
-YTDLP_BINARY = YtdlpBinary()
 FFMPEG_BINARY = FfmpegBinary()

+
 # class YtdlpExtractor(BaseExtractor):
 #     name: str = 'ytdlp'
 #     binary: str = 'ytdlp'
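The alias and validation_alias fields above are what keep pre-existing config keys working after the rename: USE_YOUTUBEDL or SAVE_MEDIA feed USE_YTDLP, and YOUTUBEDL_BINARY/YOUTUBEDL_EXTRA_ARGS feed the renamed YTDLP_* fields. Assuming pydantic's standard alias handling applies to BaseConfigSet, the mapping behaves like this sketch:

    from plugins_extractor.ytdlp.apps import YtdlpConfig

    cfg = YtdlpConfig(SAVE_MEDIA=False, YOUTUBEDL_BINARY='youtube-dl')
    assert cfg.USE_YTDLP is False          # populated via AliasChoices
    assert cfg.YTDLP_BINARY == 'youtube-dl'  # populated via alias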
diff --git a/archivebox/util.py b/archivebox/util.py
index 7349a008..33409c3c 100644
--- a/archivebox/util.py
+++ b/archivebox/util.py
@@ -18,8 +18,6 @@ from requests.exceptions import RequestException, ReadTimeout
 from base32_crockford import encode as base32_encode  # type: ignore
 from w3lib.encoding import html_body_declared_encoding, http_content_type_encoding
-from os.path import lexists
-from os import remove as remove_file

 try:
     import chardet
@@ -282,82 +280,6 @@ def get_headers(url: str, timeout: int=None) -> str:
     )


-@enforce_types
-def chrome_args(**options) -> List[str]:
-    """helper to build up a chrome shell command with arguments"""
-
-    # Chrome CLI flag documentation: https://peter.sh/experiments/chromium-command-line-switches/
-
-    from .config import (
-        CHROME_OPTIONS,
-        CHROME_VERSION,
-        CHROME_EXTRA_ARGS,
-    )
-
-    options = {**CHROME_OPTIONS, **options}
-
-    if not options['CHROME_BINARY']:
-        raise Exception('Could not find any CHROME_BINARY installed on your system')
-
-    cmd_args = [options['CHROME_BINARY']]
-
-    cmd_args += CHROME_EXTRA_ARGS
-
-    if options['CHROME_HEADLESS']:
-        cmd_args += ("--headless=new",)  # expects chrome version >= 111
-
-    if not options['CHROME_SANDBOX']:
-        # assume this means we are running inside a docker container
-        # in docker, GPU support is limited, sandboxing is unecessary,
-        # and SHM is limited to 64MB by default (which is too low to be usable).
-        cmd_args += (
-            "--no-sandbox",
-            "--no-zygote",
-            "--disable-dev-shm-usage",
-            "--disable-software-rasterizer",
-            "--run-all-compositor-stages-before-draw",
-            "--hide-scrollbars",
-            "--autoplay-policy=no-user-gesture-required",
-            "--no-first-run",
-            "--use-fake-ui-for-media-stream",
-            "--use-fake-device-for-media-stream",
-            "--disable-sync",
-            # "--password-store=basic",
-        )
-
-    # disable automatic updating when running headless, as there's no user to see the upgrade prompts
-    cmd_args += ("--simulate-outdated-no-au='Tue, 31 Dec 2099 23:59:59 GMT'",)
-
-    # set window size for screenshot/pdf/etc. rendering
-    cmd_args += ('--window-size={}'.format(options['RESOLUTION']),)
-
-    if not options['CHECK_SSL_VALIDITY']:
-        cmd_args += ('--disable-web-security', '--ignore-certificate-errors')
-
-    if options['CHROME_USER_AGENT']:
-        cmd_args += ('--user-agent={}'.format(options['CHROME_USER_AGENT']),)
-
-    if options['CHROME_TIMEOUT']:
-        cmd_args += ('--timeout={}'.format(options['CHROME_TIMEOUT'] * 1000),)
-
-    if options['CHROME_USER_DATA_DIR']:
-        cmd_args.append('--user-data-dir={}'.format(options['CHROME_USER_DATA_DIR']))
-        cmd_args.append('--profile-directory=Default')
-
-    return dedupe(cmd_args)
-
-
-def chrome_cleanup():
-    """
-    Cleans up any state or runtime files that chrome leaves behind when killed by
-    a timeout or other error
-    """
-
-    from .config import IN_DOCKER
-
-    if IN_DOCKER and lexists("/home/archivebox/.config/chromium/SingletonLock"):
-        remove_file("/home/archivebox/.config/chromium/SingletonLock")
-

 @enforce_types
 def ansi_to_html(text: str) -> str:
     """