__package__ = 'archivebox.plugins_extractor.chrome' import sys import platform from pathlib import Path from typing import List, Optional, Dict, ClassVar # Depends on other PyPI/vendor packages: from rich import print from pydantic import InstanceOf, Field, model_validator from pydantic_pkgr import ( BinProvider, BinName, BinProviderName, ProviderLookupDict, bin_abspath, ) # Depends on other Django apps: from abx.archivebox.base_plugin import BasePlugin from abx.archivebox.base_configset import BaseConfigSet from abx.archivebox.base_binary import BaseBinary, env # from abx.archivebox.base_extractor import BaseExtractor # from abx.archivebox.base_queue import BaseQueue from abx.archivebox.base_hook import BaseHook # Depends on Other Plugins: from archivebox.config import CONSTANTS, ARCHIVING_CONFIG, SHELL_CONFIG from plugins_pkg.puppeteer.apps import PUPPETEER_BINPROVIDER from plugins_pkg.playwright.apps import PLAYWRIGHT_BINPROVIDER from ...util import dedupe CHROMIUM_BINARY_NAMES_LINUX = [ "chromium", "chromium-browser", "chromium-browser-beta", "chromium-browser-unstable", "chromium-browser-canary", "chromium-browser-dev", ] CHROMIUM_BINARY_NAMES_MACOS = ["/Applications/Chromium.app/Contents/MacOS/Chromium"] CHROMIUM_BINARY_NAMES = CHROMIUM_BINARY_NAMES_LINUX + CHROMIUM_BINARY_NAMES_MACOS CHROME_BINARY_NAMES_LINUX = [ "google-chrome", "google-chrome-stable", "google-chrome-beta", "google-chrome-canary", "google-chrome-unstable", "google-chrome-dev", "chrome" ] CHROME_BINARY_NAMES_MACOS = [ "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome", "/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary", ] CHROME_BINARY_NAMES = CHROME_BINARY_NAMES_LINUX + CHROME_BINARY_NAMES_MACOS def autodetect_system_chrome_install(PATH=None) -> Optional[Path]: for bin_name in CHROME_BINARY_NAMES + CHROMIUM_BINARY_NAMES: abspath = bin_abspath(bin_name, PATH=env.PATH) if abspath: return abspath return None def create_macos_app_symlink(target: Path, shortcut: Path): """ on macOS, some binaries are inside of .app, so we need to create a tiny bash script instead of a symlink (so that ../ parent relationships are relative to original .app instead of callsite dir) """ # TODO: should we enforce this? is it useful in any other situation? # if platform.system().lower() != 'darwin': # raise Exception(...) shortcut.write_text(f"""#!/usr/bin/env bash\nexec '{target}' "$@"\n""") shortcut.chmod(0o777) # make sure its executable by everyone ###################### Config ########################## class ChromeConfig(BaseConfigSet): USE_CHROME: bool = Field(default=True) # Chrome Binary CHROME_BINARY: str = Field(default='chrome') CHROME_EXTRA_ARGS: List[str] = Field(default=[]) # Chrome Options Tuning CHROME_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT - 10) CHROME_HEADLESS: bool = Field(default=True) CHROME_SANDBOX: bool = Field(default=lambda: not SHELL_CONFIG.IN_DOCKER) CHROME_RESOLUTION: str = Field(default=lambda: ARCHIVING_CONFIG.RESOLUTION) CHROME_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY) # Cookies & Auth CHROME_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT) CHROME_USER_DATA_DIR: Path | None = Field(default=None) CHROME_PROFILE_NAME: str = Field(default='Default') # Extractor Toggles SAVE_SCREENSHOT: bool = Field(default=True, alias='FETCH_SCREENSHOT') SAVE_DOM: bool = Field(default=True, alias='FETCH_DOM') SAVE_PDF: bool = Field(default=True, alias='FETCH_PDF') @model_validator(mode='after') def validate_use_chrome(self): if self.USE_CHROME and self.CHROME_TIMEOUT < 15: print(f'[red][!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={self.CHROME_TIMEOUT} seconds)[/red]', file=sys.stderr) print(' Chrome will fail to archive all sites if set to less than ~15 seconds.', file=sys.stderr) print(' (Setting it to somewhere between 30 and 300 seconds is recommended)', file=sys.stderr) print(file=sys.stderr) print(' If you want to make ArchiveBox run faster, disable specific archive methods instead:', file=sys.stderr) print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles', file=sys.stderr) print(file=sys.stderr) # if user has specified a user data dir, make sure its valid if self.CHROME_USER_DATA_DIR and self.CHROME_USER_DATA_DIR.exists(): # check to make sure user_data_dir/ exists if not (self.CHROME_USER_DATA_DIR / self.CHROME_PROFILE_NAME).exists(): print(f'[red][X] Could not find profile "{self.CHROME_PROFILE_NAME}" in CHROME_USER_DATA_DIR.[/red]', file=sys.stderr) print(f' {self.CHROME_USER_DATA_DIR}', file=sys.stderr) print(' Make sure you set it to a Chrome user data directory containing a Default profile folder.', file=sys.stderr) print(' For more info see:', file=sys.stderr) print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#CHROME_USER_DATA_DIR', file=sys.stderr) if '/Default' in str(self.CHROME_USER_DATA_DIR): print(file=sys.stderr) print(' Try removing /Default from the end e.g.:', file=sys.stderr) print(' CHROME_USER_DATA_DIR="{}"'.format(str(self.CHROME_USER_DATA_DIR).split('/Default')[0]), file=sys.stderr) # hard error is too annoying here, instead just set it to nothing # raise SystemExit(2) self.CHROME_USER_DATA_DIR = None else: self.CHROME_USER_DATA_DIR = None return self def chrome_args(self, **options) -> List[str]: """helper to build up a chrome shell command with arguments""" # Chrome CLI flag documentation: https://peter.sh/experiments/chromium-command-line-switches/ options = self.model_copy(update=options) cmd_args = [*options.CHROME_EXTRA_ARGS] if options.CHROME_HEADLESS: cmd_args += ["--headless=new"] # expects chrome version >= 111 if not options.CHROME_SANDBOX: # assume this means we are running inside a docker container # in docker, GPU support is limited, sandboxing is unecessary, # and SHM is limited to 64MB by default (which is too low to be usable). cmd_args += ( "--no-sandbox", "--no-zygote", "--disable-dev-shm-usage", "--disable-software-rasterizer", "--run-all-compositor-stages-before-draw", "--hide-scrollbars", "--autoplay-policy=no-user-gesture-required", "--no-first-run", "--use-fake-ui-for-media-stream", "--use-fake-device-for-media-stream", "--disable-sync", # "--password-store=basic", ) # disable automatic updating when running headless, as there's no user to see the upgrade prompts cmd_args += ("--simulate-outdated-no-au='Tue, 31 Dec 2099 23:59:59 GMT'",) # set window size for screenshot/pdf/etc. rendering cmd_args += ('--window-size={}'.format(options.CHROME_RESOLUTION),) if not options.CHROME_CHECK_SSL_VALIDITY: cmd_args += ('--disable-web-security', '--ignore-certificate-errors') if options.CHROME_USER_AGENT: cmd_args += ('--user-agent={}'.format(options.CHROME_USER_AGENT),) if options.CHROME_TIMEOUT: cmd_args += ('--timeout={}'.format(options.CHROME_TIMEOUT * 1000),) if options.CHROME_USER_DATA_DIR: cmd_args.append('--user-data-dir={}'.format(options.CHROME_USER_DATA_DIR)) cmd_args.append('--profile-directory={}'.format(options.CHROME_PROFILE_NAME)) return dedupe(cmd_args) CHROME_CONFIG = ChromeConfig() class ChromeBinary(BaseBinary): name: BinName = CHROME_CONFIG.CHROME_BINARY binproviders_supported: List[InstanceOf[BinProvider]] = [PUPPETEER_BINPROVIDER, env, PLAYWRIGHT_BINPROVIDER] provider_overrides: Dict[BinProviderName, ProviderLookupDict] = { env.name: { 'abspath': lambda: autodetect_system_chrome_install(PATH=env.PATH), # /usr/bin/google-chrome-stable }, PUPPETEER_BINPROVIDER.name: { 'packages': lambda: ['chrome@stable'], # npx @puppeteer/browsers install chrome@stable }, PLAYWRIGHT_BINPROVIDER.name: { 'packages': lambda: ['chromium'], # playwright install chromium }, } @staticmethod def symlink_to_lib(binary, bin_dir=CONSTANTS.LIB_BIN_DIR) -> None: if not (binary.abspath and binary.abspath.exists()): return bin_dir.mkdir(parents=True, exist_ok=True) symlink = bin_dir / binary.name if platform.system().lower() == 'darwin': # if on macOS, browser binary is inside a .app, so we need to create a tiny bash script instead of a symlink create_macos_app_symlink(binary.abspath, symlink) else: # otherwise on linux we can symlink directly to binary executable symlink.symlink_to(binary.abspath) @staticmethod def chrome_cleanup_lockfile(): """ Cleans up any state or runtime files that chrome leaves behind when killed by a timeout or other error """ lock_file = Path("~/.config/chromium/SingletonLock") if SHELL_CONFIG.IN_DOCKER and lock_file.exists(): lock_file.unlink() CHROME_BINARY = ChromeBinary() class ChromePlugin(BasePlugin): app_label: str = 'chrome' verbose_name: str = 'Chrome Browser' hooks: List[InstanceOf[BaseHook]] = [ CHROME_CONFIG, CHROME_BINARY, ] PLUGIN = ChromePlugin() # PLUGIN.register(settings) DJANGO_APP = PLUGIN.AppConfig