mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-24 19:54:25 -04:00
split puppeteer plugin into Puppeteer, Playwright, and Chrome
This commit is contained in:
parent
33fd7fe439
commit
541cd6c5a1
10 changed files with 414 additions and 124 deletions
|
@ -6,33 +6,38 @@ from django.conf import settings
|
|||
|
||||
# Depends on other PyPI/vendor packages:
|
||||
from pydantic import InstanceOf, Field
|
||||
from pydantic_pkgr import BinProvider, BinName, BinProviderName, ProviderLookupDict, InstallArgs, HostBinPath, bin_abspath
|
||||
from pydantic_pkgr import (
|
||||
BinProvider,
|
||||
BinName,
|
||||
BinProviderName,
|
||||
ProviderLookupDict,
|
||||
InstallArgs,
|
||||
PATHStr,
|
||||
HostBinPath,
|
||||
)
|
||||
|
||||
# Depends on other Django apps:
|
||||
from plugantic.base_plugin import BasePlugin
|
||||
from plugantic.base_configset import BaseConfigSet, ConfigSectionName
|
||||
from plugantic.base_configset import BaseConfigSet
|
||||
from plugantic.base_binary import BaseBinary, BaseBinProvider, env
|
||||
# from plugantic.base_extractor import BaseExtractor
|
||||
# from plugantic.base_queue import BaseQueue
|
||||
from plugantic.base_hook import BaseHook
|
||||
|
||||
# Depends on Other Plugins:
|
||||
from builtin_plugins.npm.apps import SYS_NPM_BINPROVIDER
|
||||
from builtin_plugins.npm.apps import LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER
|
||||
|
||||
|
||||
###################### Config ##########################
|
||||
|
||||
|
||||
class PuppeteerDependencyConfigs(BaseConfigSet):
|
||||
section: ConfigSectionName = 'DEPENDENCY_CONFIG'
|
||||
class PuppeteerConfigs(BaseConfigSet):
|
||||
# section: ConfigSectionName = 'DEPENDENCY_CONFIG'
|
||||
|
||||
PUPPETEER_BINARY: str = Field(default='wget')
|
||||
PUPPETEER_ARGS: Optional[List[str]] = Field(default=None)
|
||||
PUPPETEER_EXTRA_ARGS: List[str] = []
|
||||
PUPPETEER_DEFAULT_ARGS: List[str] = ['--timeout={TIMEOUT-10}']
|
||||
|
||||
class PuppeteerConfigs(PuppeteerDependencyConfigs):
|
||||
# section: ConfigSectionName = 'ALL_CONFIGS'
|
||||
# PUPPETEER_BINARY: str = Field(default='wget')
|
||||
# PUPPETEER_ARGS: Optional[List[str]] = Field(default=None)
|
||||
# PUPPETEER_EXTRA_ARGS: List[str] = []
|
||||
# PUPPETEER_DEFAULT_ARGS: List[str] = ['--timeout={TIMEOUT-10}']
|
||||
pass
|
||||
|
||||
DEFAULT_GLOBAL_CONFIG = {
|
||||
|
@ -42,17 +47,29 @@ PUPPETEER_CONFIG = PuppeteerConfigs(**DEFAULT_GLOBAL_CONFIG)
|
|||
|
||||
LIB_DIR_BROWSERS = settings.CONFIG.OUTPUT_DIR / "lib" / "browsers"
|
||||
|
||||
|
||||
class PuppeteerBinary(BaseBinary):
    # Declares the `puppeteer` binary and the providers that may supply it.
    # NOTE(review): the list order (lib npm, system npm, env) presumably sets
    # lookup priority -- confirm against BaseBinary / pydantic_pkgr.
    name: BinName = "puppeteer"

    binproviders_supported: List[InstanceOf[BinProvider]] = [
        LIB_NPM_BINPROVIDER,
        SYS_NPM_BINPROVIDER,
        env,
    ]
|
||||
|
||||
|
||||
PUPPETEER_BINARY = PuppeteerBinary()
|
||||
|
||||
|
||||
class PuppeteerBinProvider(BaseBinProvider):
|
||||
name: BinProviderName = "puppeteer"
|
||||
INSTALLER_BIN: BinName = "npx"
|
||||
|
||||
PATH: PATHStr = str(settings.CONFIG.BIN_DIR)
|
||||
|
||||
puppeteer_browsers_dir: Optional[Path] = LIB_DIR_BROWSERS
|
||||
puppeteer_install_args: List[str] = ["@puppeteer/browsers", "install", "--path", str(LIB_DIR_BROWSERS)]
|
||||
|
||||
# packages_handler: ProviderLookupDict = {
|
||||
# "chrome": lambda:
|
||||
# ['chrome@stable'],
|
||||
# }
|
||||
packages_handler: ProviderLookupDict = Field(default={
|
||||
"chrome": lambda:
|
||||
['chrome@stable'],
|
||||
}, exclude=True)
|
||||
|
||||
_browser_abspaths: ClassVar[Dict[str, HostBinPath]] = {}
|
||||
|
||||
|
@ -61,6 +78,15 @@ class PuppeteerBinProvider(BaseBinProvider):
|
|||
|
||||
if self.puppeteer_browsers_dir:
|
||||
self.puppeteer_browsers_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
def installed_browser_bins(self, browser_name: str='*') -> List[Path]:
    """List the browser executables found under ``self.puppeteer_browsers_dir``.

    Args:
        browser_name: Name of the top-level browser directory to search
            (e.g. ``'chrome'``). It is interpolated into a glob pattern, so
            the default ``'*'`` matches every browser directory.

    Returns:
        The matching executable paths sorted in ascending (lexicographic)
        path order, or an empty list when no browsers directory is
        configured or nothing matches.

    Note:
        The ordering is a plain path sort, not a version-aware one, so
        "last item == newest version" only holds while the version
        components have the same number of digits.
    """
    browsers_dir = self.puppeteer_browsers_dir
    if not browsers_dir:
        # puppeteer_browsers_dir is Optional: without it nothing can be installed
        return []

    # if on macOS, browser binary is inside a .app, otherwise it's just a plain binary
    if platform.system().lower() == 'darwin':
        # /data/lib/browsers/chrome/mac_arm-129.0.6668.58/chrome-mac-arm64/Google Chrome for Testing.app/Contents/MacOS/Google Chrome for Testing
        pattern = f'{browser_name}/mac*/chrome*/Google Chrome for Testing.app/Contents/MacOS/Google Chrome for Testing'
    else:
        # /data/lib/browsers/chrome/linux-131.0.6730.0/chrome-linux64/chrome
        pattern = f'{browser_name}/linux*/chrome*/chrome'

    return sorted(browsers_dir.glob(pattern))
|
||||
|
||||
def on_get_abspath(self, bin_name: BinName, **context) -> Optional[HostBinPath]:
|
||||
assert bin_name == 'chrome', 'Only chrome is supported using the @puppeteer/browsers install method currently.'
|
||||
|
@ -70,21 +96,13 @@ class PuppeteerBinProvider(BaseBinProvider):
|
|||
return self._browser_abspaths[bin_name]
|
||||
|
||||
# first time loading, find browser in self.puppeteer_browsers_dir by searching filesystem for installed binaries
|
||||
browsers_present = [d.name for d in self.puppeteer_browsers_dir.glob("*")]
|
||||
if bin_name in browsers_present:
|
||||
candidates = []
|
||||
# if on macOS, browser binary is inside a .app, otherwise it's just a plain binary
|
||||
if platform.system().lower() == 'darwin':
|
||||
# /data/lib/browsers/chrome/mac_arm-129.0.6668.58/chrome-mac-arm64/Google Chrome for Testing.app/Contents/MacOS/Google Chrome for Testing
|
||||
candidates = sorted(self.puppeteer_browsers_dir.glob(f'/{bin_name}/mac*/chrome*/Google Chrome for Testing.app/Contents/MacOS/Google Chrome for Testing'))
|
||||
else:
|
||||
# /data/lib/browsers/chrome/linux-131.0.6730.0/chrome-linux64/chrome
|
||||
candidates = sorted(self.puppeteer_browsers_dir.glob(f'/{bin_name}/linux*/chrome*/chrome'))
|
||||
if candidates:
|
||||
self._browser_abspaths[bin_name] = candidates[-1]
|
||||
return self._browser_abspaths[bin_name]
|
||||
matching_bins = [abspath for abspath in self.installed_browser_bins() if bin_name in str(abspath)]
|
||||
if matching_bins:
|
||||
newest_bin = matching_bins[-1] # already sorted alphabetically, last should theoretically be highest version number
|
||||
self._browser_abspaths[bin_name] = newest_bin
|
||||
return self._browser_abspaths[bin_name]
|
||||
|
||||
return super().on_get_abspath(bin_name, **context)
|
||||
return None
|
||||
|
||||
def on_install(self, bin_name: str, packages: Optional[InstallArgs] = None, **context) -> str:
|
||||
"""npx @puppeteer/browsers install chrome@stable"""
|
||||
|
@ -119,64 +137,6 @@ class PuppeteerBinProvider(BaseBinProvider):
|
|||
|
||||
PUPPETEER_BINPROVIDER = PuppeteerBinProvider()
|
||||
|
||||
# Executable names / absolute paths that a system Chromium install may use.
# Every entry needs its own trailing comma: adjacent string literals are
# silently concatenated by Python, which would merge two candidates into one
# bogus name.
CHROMIUM_BINARY_NAMES = [
    'chromium',
    'chromium-browser',
    'chromium-browser-beta',
    'chromium-browser-unstable',
    'chromium-browser-canary',
    'chromium-browser-dev',
    '/Applications/Chromium.app/Contents/MacOS/Chromium',
]
|
||||
# Executable names / absolute paths that a system Google Chrome install may use:
# the bare name, one name per release channel, then the macOS app bundle binaries.
CHROME_BINARY_NAMES = [
    'google-chrome',
    *(f'google-chrome-{channel}' for channel in ('stable', 'beta', 'canary', 'unstable', 'dev')),
    # 'chrome',
    '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
    '/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary',
]
|
||||
|
||||
def autodetect_system_chrome_install(PATH=None):
    """Return the first Chrome/Chromium binary that can be resolved, or None.

    Candidates are tried in order: every entry of CHROME_BINARY_NAMES, then
    every entry of CHROMIUM_BINARY_NAMES. The first one for which
    ``bin_abspath`` returns a truthy value wins.

    Args:
        PATH: Search path handed to ``bin_abspath``. When omitted (None),
            ``env.PATH`` is used. Previously this argument was accepted but
            ignored and ``env.PATH`` was always used.

    Returns:
        Whatever ``bin_abspath`` returned for the first match, or None when
        no candidate resolves.
    """
    search_path = env.PATH if PATH is None else PATH

    for bin_name in CHROME_BINARY_NAMES + CHROMIUM_BINARY_NAMES:
        abspath = bin_abspath(bin_name, PATH=search_path)
        if abspath:
            return abspath

    return None
|
||||
|
||||
class ChromeBinary(BaseBinary):
|
||||
name: BinName = 'chrome'
|
||||
binproviders_supported: List[InstanceOf[BinProvider]] = [PUPPETEER_BINPROVIDER, env]
|
||||
|
||||
provider_overrides: Dict[BinProviderName, ProviderLookupDict] = {
|
||||
env.name: {
|
||||
'abspath': lambda:
|
||||
autodetect_system_chrome_install(PATH=env.PATH),
|
||||
},
|
||||
PUPPETEER_BINPROVIDER.name: {
|
||||
'packages': lambda:
|
||||
['chrome@stable'],
|
||||
}
|
||||
}
|
||||
|
||||
@staticmethod
def symlink_to_lib(binary, bin_dir=settings.CONFIG.BIN_DIR) -> None:
    """Expose ``binary`` inside ``bin_dir`` under the name ``binary.name``.

    Does nothing when ``binary.abspath`` is unset or does not exist on disk,
    or when the binary already lives at the target location.

    Args:
        binary: Object providing ``name`` and ``abspath`` (a path object with
            ``exists()``).
        bin_dir: Directory that receives the link/launcher; created if missing.

    Raises:
        OSError: if the target directory or link cannot be created/replaced.
    """
    if not (binary.abspath and binary.abspath.exists()):
        return

    bin_dir.mkdir(parents=True, exist_ok=True)
    symlink = bin_dir / binary.name

    if symlink == binary.abspath:
        # The binary already *is* the entry in bin_dir; replacing it would
        # create a link (or launcher script) that points at itself.
        return

    # Drop any stale link/launcher from a previous run first:
    #  - symlink_to() raises FileExistsError when the path already exists
    #  - write_text() would follow an old symlink and overwrite the real browser binary
    symlink.unlink(missing_ok=True)

    if platform.system().lower() == 'darwin':
        # if on macOS, browser binary is inside a .app, so we need to create a tiny bash script instead of a symlink
        import shlex  # local import: only needed for the macOS launcher script

        # shlex.quote keeps the script valid for paths containing spaces or quotes
        symlink.write_text(f"""#!/usr/bin/env bash\nexec {shlex.quote(str(binary.abspath))} "$@"\n""")
        symlink.chmod(0o777)  # make sure its executable by everyone
    else:
        # otherwise on linux we can symlink directly to binary executable
        symlink.symlink_to(binary.abspath)
|
||||
|
||||
|
||||
# ALTERNATIVE INSTALL METHOD using Ansible:
|
||||
# install_playbook = self.plugin_dir / 'install_puppeteer.yml'
|
||||
|
@ -192,18 +152,14 @@ class ChromeBinary(BaseBinary):
|
|||
# )
|
||||
|
||||
|
||||
CHROME_BINARY = ChromeBinary()
|
||||
|
||||
PLUGIN_BINARIES = [CHROME_BINARY]
|
||||
|
||||
class PuppeteerPlugin(BasePlugin):
|
||||
app_label: str ='puppeteer'
|
||||
verbose_name: str = 'SingleFile'
|
||||
verbose_name: str = 'Puppeteer & Playwright'
|
||||
|
||||
hooks: List[InstanceOf[BaseHook]] = [
|
||||
PUPPETEER_CONFIG,
|
||||
PUPPETEER_BINPROVIDER,
|
||||
CHROME_BINARY,
|
||||
PUPPETEER_BINARY,
|
||||
]
|
||||
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue