new vastly simplified plugin spec without pydantic
Some checks are pending
Build Debian package / build (push) Waiting to run
Build Docker image / buildx (push) Waiting to run
Build Homebrew package / build (push) Waiting to run
Run linters / lint (push) Waiting to run
Build Pip package / build (push) Waiting to run
Run tests / python_tests (ubuntu-22.04, 3.11) (push) Waiting to run
Run tests / docker_tests (push) Waiting to run

This commit is contained in:
Nick Sweeting 2024-10-14 21:50:47 -07:00
parent abf75f49f4
commit 01ba6d49d3
No known key found for this signature in database
115 changed files with 2466 additions and 2301 deletions

View file

@ -0,0 +1,46 @@
__package__ = 'plugins_extractor.chrome'
__label__ = 'chrome'
__version__ = '2024.10.14'
__author__ = 'Nick Sweeting'
__homepage__ = 'https://github.com/ArchiveBox/ArchiveBox/tree/main/archivebox/plugins_extractor/chrome'
__dependencies__ = []
import abx
@abx.hookimpl
def get_PLUGIN():
return {
'chrome': {
'PACKAGE': __package__,
'LABEL': __label__,
'VERSION': __version__,
'AUTHOR': __author__,
'HOMEPAGE': __homepage__,
'DEPENDENCIES': __dependencies__,
}
}
@abx.hookimpl
def get_CONFIG():
from .config import CHROME_CONFIG
return {
'chrome': CHROME_CONFIG
}
@abx.hookimpl
def get_BINARIES():
from .binaries import CHROME_BINARY
return {
'chrome': CHROME_BINARY,
}
# @abx.hookimpl
# def get_EXTRACTORS():
# return {
# 'pdf': PDF_EXTRACTOR,
# 'screenshot': SCREENSHOT_EXTRACTOR,
# 'dom': DOM_EXTRACTOR,
# }

View file

@ -0,0 +1,145 @@
__package__ = 'plugins_extractor.chrome'
import os
import platform
from pathlib import Path
from typing import List, Optional
from pydantic import InstanceOf
from pydantic_pkgr import (
BinProvider,
BinName,
BinaryOverrides,
bin_abspath,
)
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
# Depends on Other Plugins:
from archivebox.config import CONSTANTS
from archivebox.config.common import SHELL_CONFIG
from plugins_pkg.puppeteer.binproviders import PUPPETEER_BINPROVIDER
from plugins_pkg.playwright.binproviders import PLAYWRIGHT_BINPROVIDER
from .config import CHROME_CONFIG
CHROMIUM_BINARY_NAMES_LINUX = [
"chromium",
"chromium-browser",
"chromium-browser-beta",
"chromium-browser-unstable",
"chromium-browser-canary",
"chromium-browser-dev",
]
CHROMIUM_BINARY_NAMES_MACOS = ["/Applications/Chromium.app/Contents/MacOS/Chromium"]
CHROMIUM_BINARY_NAMES = CHROMIUM_BINARY_NAMES_LINUX + CHROMIUM_BINARY_NAMES_MACOS
CHROME_BINARY_NAMES_LINUX = [
"google-chrome",
"google-chrome-stable",
"google-chrome-beta",
"google-chrome-canary",
"google-chrome-unstable",
"google-chrome-dev",
"chrome"
]
CHROME_BINARY_NAMES_MACOS = [
"/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
"/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary",
]
CHROME_BINARY_NAMES = CHROME_BINARY_NAMES_LINUX + CHROME_BINARY_NAMES_MACOS
APT_DEPENDENCIES = [
'apt-transport-https', 'at-spi2-common', 'chromium-browser',
'fontconfig', 'fonts-freefont-ttf', 'fonts-ipafont-gothic', 'fonts-kacst', 'fonts-khmeros', 'fonts-liberation', 'fonts-noto', 'fonts-noto-color-emoji', 'fonts-symbola', 'fonts-thai-tlwg', 'fonts-tlwg-loma-otf', 'fonts-unifont', 'fonts-wqy-zenhei',
'libasound2', 'libatk-bridge2.0-0', 'libatk1.0-0', 'libatspi2.0-0', 'libavahi-client3', 'libavahi-common-data', 'libavahi-common3', 'libcairo2', 'libcups2',
'libdbus-1-3', 'libdrm2', 'libfontenc1', 'libgbm1', 'libglib2.0-0', 'libice6', 'libnspr4', 'libnss3', 'libsm6', 'libunwind8', 'libx11-6', 'libxaw7', 'libxcb1',
'libxcomposite1', 'libxdamage1', 'libxext6', 'libxfixes3', 'libxfont2', 'libxkbcommon0', 'libxkbfile1', 'libxmu6', 'libxpm4', 'libxrandr2', 'libxt6', 'x11-utils', 'x11-xkb-utils', 'xfonts-encodings',
]
def autodetect_system_chrome_install(PATH=None) -> Optional[Path]:
for bin_name in CHROME_BINARY_NAMES + CHROMIUM_BINARY_NAMES:
abspath = bin_abspath(bin_name, PATH=env.PATH)
if abspath:
return abspath
return None
def create_macos_app_symlink(target: Path, shortcut: Path):
"""
on macOS, some binaries are inside of .app, so we need to
create a tiny bash script instead of a symlink
(so that ../ parent relationships are relative to original .app instead of callsite dir)
"""
# TODO: should we enforce this? is it useful in any other situation?
# if platform.system().lower() != 'darwin':
# raise Exception(...)
shortcut.unlink(missing_ok=True)
shortcut.write_text(f"""#!/usr/bin/env bash\nexec '{target}' "$@"\n""")
shortcut.chmod(0o777) # make sure its executable by everyone
###################### Config ##########################
class ChromeBinary(BaseBinary):
name: BinName = CHROME_CONFIG.CHROME_BINARY
binproviders_supported: List[InstanceOf[BinProvider]] = [PUPPETEER_BINPROVIDER, env, PLAYWRIGHT_BINPROVIDER, apt, brew]
overrides: BinaryOverrides = {
env.name: {
'abspath': lambda: autodetect_system_chrome_install(PATH=env.PATH), # /usr/bin/google-chrome-stable
},
PUPPETEER_BINPROVIDER.name: {
'packages': ['chrome@stable'], # npx @puppeteer/browsers install chrome@stable
},
PLAYWRIGHT_BINPROVIDER.name: {
'packages': ['chromium'], # playwright install chromium
},
apt.name: {
'packages': APT_DEPENDENCIES,
},
brew.name: {
'packages': ['--cask', 'chromium'],
},
}
@staticmethod
def symlink_to_lib(binary, bin_dir=CONSTANTS.LIB_BIN_DIR) -> None:
if not (binary.abspath and os.access(binary.abspath, os.F_OK)):
return
bin_dir.mkdir(parents=True, exist_ok=True)
symlink = bin_dir / binary.name
try:
if platform.system().lower() == 'darwin':
# if on macOS, browser binary is inside a .app, so we need to create a tiny bash script instead of a symlink
create_macos_app_symlink(binary.abspath, symlink)
else:
# otherwise on linux we can symlink directly to binary executable
symlink.unlink(missing_ok=True)
symlink.symlink_to(binary.abspath)
except Exception as err:
# print(f'[red]:warning: Failed to symlink {symlink} -> {binary.abspath}[/red] {err}')
# not actually needed, we can just run without it
pass
@staticmethod
def chrome_cleanup_lockfile():
"""
Cleans up any state or runtime files that chrome leaves behind when killed by
a timeout or other error
"""
lock_file = Path("~/.config/chromium/SingletonLock").expanduser()
if SHELL_CONFIG.IN_DOCKER and os.access(lock_file, os.F_OK):
lock_file.unlink()
if CHROME_CONFIG.CHROME_USER_DATA_DIR:
if os.access(CHROME_CONFIG.CHROME_USER_DATA_DIR / 'SingletonLock', os.F_OK):
lock_file.unlink()
CHROME_BINARY = ChromeBinary()

View file

@ -1,35 +1,18 @@
__package__ = 'archivebox.plugins_extractor.chrome'
__package__ = 'plugins_extractor.chrome'
import os
import sys
import platform
from pathlib import Path
from typing import List, Optional
# Depends on other PyPI/vendor packages:
from rich import print
from pydantic import InstanceOf, Field, model_validator
from pydantic_pkgr import (
BinProvider,
BinName,
BinaryOverrides,
bin_abspath,
)
from pydantic import Field, model_validator
from pydantic_pkgr import bin_abspath
# Depends on other Django apps:
from abx.archivebox.base_plugin import BasePlugin
from abx.archivebox.base_configset import BaseConfigSet
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
# from abx.archivebox.base_extractor import BaseExtractor
# from abx.archivebox.base_queue import BaseQueue
from abx.archivebox.base_hook import BaseHook
from abx.archivebox.base_binary import env
# Depends on Other Plugins:
from archivebox.config import CONSTANTS
from archivebox.config.common import ARCHIVING_CONFIG, SHELL_CONFIG
from plugins_pkg.puppeteer.apps import PUPPETEER_BINPROVIDER
from plugins_pkg.playwright.apps import PLAYWRIGHT_BINPROVIDER
from archivebox.misc.logging import STDERR
from archivebox.misc.util import dedupe
@ -129,33 +112,34 @@ class ChromeConfig(BaseConfigSet):
@model_validator(mode='after')
def validate_use_chrome(self):
if self.USE_CHROME and self.CHROME_TIMEOUT < 15:
print(f'[red][!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={self.CHROME_TIMEOUT} seconds)[/red]', file=sys.stderr)
print(' Chrome will fail to archive all sites if set to less than ~15 seconds.', file=sys.stderr)
print(' (Setting it to somewhere between 30 and 300 seconds is recommended)', file=sys.stderr)
print(file=sys.stderr)
print(' If you want to make ArchiveBox run faster, disable specific archive methods instead:', file=sys.stderr)
print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles', file=sys.stderr)
print(file=sys.stderr)
STDERR.print(f'[red][!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={self.CHROME_TIMEOUT} seconds)[/red]')
STDERR.print(' Chrome will fail to archive all sites if set to less than ~15 seconds.')
STDERR.print(' (Setting it to somewhere between 30 and 300 seconds is recommended)')
STDERR.print()
STDERR.print(' If you want to make ArchiveBox run faster, disable specific archive methods instead:')
STDERR.print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles')
STDERR.print()
# if user has specified a user data dir, make sure its valid
if self.CHROME_USER_DATA_DIR and os.access(self.CHROME_USER_DATA_DIR, os.R_OK):
# check to make sure user_data_dir/<profile_name> exists
if not (self.CHROME_USER_DATA_DIR / self.CHROME_PROFILE_NAME).is_dir():
print(f'[red][X] Could not find profile "{self.CHROME_PROFILE_NAME}" in CHROME_USER_DATA_DIR.[/red]', file=sys.stderr)
print(f' {self.CHROME_USER_DATA_DIR}', file=sys.stderr)
print(' Make sure you set it to a Chrome user data directory containing a Default profile folder.', file=sys.stderr)
print(' For more info see:', file=sys.stderr)
print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#CHROME_USER_DATA_DIR', file=sys.stderr)
STDERR.print(f'[red][X] Could not find profile "{self.CHROME_PROFILE_NAME}" in CHROME_USER_DATA_DIR.[/red]')
STDERR.print(f' {self.CHROME_USER_DATA_DIR}')
STDERR.print(' Make sure you set it to a Chrome user data directory containing a Default profile folder.')
STDERR.print(' For more info see:')
STDERR.print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#CHROME_USER_DATA_DIR')
if '/Default' in str(self.CHROME_USER_DATA_DIR):
print(file=sys.stderr)
print(' Try removing /Default from the end e.g.:', file=sys.stderr)
print(' CHROME_USER_DATA_DIR="{}"'.format(str(self.CHROME_USER_DATA_DIR).split('/Default')[0]), file=sys.stderr)
STDERR.print()
STDERR.print(' Try removing /Default from the end e.g.:')
STDERR.print(' CHROME_USER_DATA_DIR="{}"'.format(str(self.CHROME_USER_DATA_DIR).split('/Default')[0]))
# hard error is too annoying here, instead just set it to nothing
# raise SystemExit(2)
self.CHROME_USER_DATA_DIR = None
self.update_in_place(CHROME_USER_DATA_DIR=None)
else:
self.CHROME_USER_DATA_DIR = None
if self.CHROME_USER_DATA_DIR is not None:
self.update_in_place(CHROME_USER_DATA_DIR=None)
return self
@ -206,81 +190,3 @@ class ChromeConfig(BaseConfigSet):
CHROME_CONFIG = ChromeConfig()
class ChromeBinary(BaseBinary):
name: BinName = CHROME_CONFIG.CHROME_BINARY
binproviders_supported: List[InstanceOf[BinProvider]] = [PUPPETEER_BINPROVIDER, env, PLAYWRIGHT_BINPROVIDER, apt, brew]
overrides: BinaryOverrides = {
env.name: {
'abspath': lambda: autodetect_system_chrome_install(PATH=env.PATH), # /usr/bin/google-chrome-stable
},
PUPPETEER_BINPROVIDER.name: {
'packages': ['chrome@stable'], # npx @puppeteer/browsers install chrome@stable
},
PLAYWRIGHT_BINPROVIDER.name: {
'packages': ['chromium'], # playwright install chromium
},
apt.name: {
'packages': APT_DEPENDENCIES,
},
brew.name: {
'packages': ['--cask', 'chromium'],
},
}
@staticmethod
def symlink_to_lib(binary, bin_dir=CONSTANTS.LIB_BIN_DIR) -> None:
if not (binary.abspath and os.access(binary.abspath, os.F_OK)):
return
bin_dir.mkdir(parents=True, exist_ok=True)
symlink = bin_dir / binary.name
try:
if platform.system().lower() == 'darwin':
# if on macOS, browser binary is inside a .app, so we need to create a tiny bash script instead of a symlink
create_macos_app_symlink(binary.abspath, symlink)
else:
# otherwise on linux we can symlink directly to binary executable
symlink.unlink(missing_ok=True)
symlink.symlink_to(binary.abspath)
except Exception as err:
# print(f'[red]:warning: Failed to symlink {symlink} -> {binary.abspath}[/red] {err}')
# not actually needed, we can just run without it
pass
@staticmethod
def chrome_cleanup_lockfile():
"""
Cleans up any state or runtime files that chrome leaves behind when killed by
a timeout or other error
"""
lock_file = Path("~/.config/chromium/SingletonLock").expanduser()
if SHELL_CONFIG.IN_DOCKER and os.access(lock_file, os.F_OK):
lock_file.unlink()
if CHROME_CONFIG.CHROME_USER_DATA_DIR:
if os.access(CHROME_CONFIG.CHROME_USER_DATA_DIR / 'SingletonLock', os.F_OK):
lock_file.unlink()
CHROME_BINARY = ChromeBinary()
class ChromePlugin(BasePlugin):
app_label: str = 'chrome'
verbose_name: str = 'Chrome Browser'
hooks: List[InstanceOf[BaseHook]] = [
CHROME_CONFIG,
CHROME_BINARY,
]
PLUGIN = ChromePlugin()
# PLUGIN.register(settings)
DJANGO_APP = PLUGIN.AppConfig