mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-16 16:14:28 -04:00
new vastly simplified plugin spec without pydantic
Some checks are pending
Build Debian package / build (push) Waiting to run
Build Docker image / buildx (push) Waiting to run
Build Homebrew package / build (push) Waiting to run
Run linters / lint (push) Waiting to run
Build Pip package / build (push) Waiting to run
Run tests / python_tests (ubuntu-22.04, 3.11) (push) Waiting to run
Run tests / docker_tests (push) Waiting to run
Some checks are pending
Build Debian package / build (push) Waiting to run
Build Docker image / buildx (push) Waiting to run
Build Homebrew package / build (push) Waiting to run
Run linters / lint (push) Waiting to run
Build Pip package / build (push) Waiting to run
Run tests / python_tests (ubuntu-22.04, 3.11) (push) Waiting to run
Run tests / docker_tests (push) Waiting to run
This commit is contained in:
parent
abf75f49f4
commit
01ba6d49d3
115 changed files with 2466 additions and 2301 deletions
|
@ -0,0 +1,46 @@
|
|||
__package__ = 'plugins_extractor.chrome'
|
||||
__label__ = 'chrome'
|
||||
__version__ = '2024.10.14'
|
||||
__author__ = 'Nick Sweeting'
|
||||
__homepage__ = 'https://github.com/ArchiveBox/ArchiveBox/tree/main/archivebox/plugins_extractor/chrome'
|
||||
__dependencies__ = []
|
||||
|
||||
import abx
|
||||
|
||||
|
||||
@abx.hookimpl
|
||||
def get_PLUGIN():
|
||||
return {
|
||||
'chrome': {
|
||||
'PACKAGE': __package__,
|
||||
'LABEL': __label__,
|
||||
'VERSION': __version__,
|
||||
'AUTHOR': __author__,
|
||||
'HOMEPAGE': __homepage__,
|
||||
'DEPENDENCIES': __dependencies__,
|
||||
}
|
||||
}
|
||||
|
||||
@abx.hookimpl
|
||||
def get_CONFIG():
|
||||
from .config import CHROME_CONFIG
|
||||
|
||||
return {
|
||||
'chrome': CHROME_CONFIG
|
||||
}
|
||||
|
||||
@abx.hookimpl
|
||||
def get_BINARIES():
|
||||
from .binaries import CHROME_BINARY
|
||||
|
||||
return {
|
||||
'chrome': CHROME_BINARY,
|
||||
}
|
||||
|
||||
# @abx.hookimpl
|
||||
# def get_EXTRACTORS():
|
||||
# return {
|
||||
# 'pdf': PDF_EXTRACTOR,
|
||||
# 'screenshot': SCREENSHOT_EXTRACTOR,
|
||||
# 'dom': DOM_EXTRACTOR,
|
||||
# }
|
145
archivebox/plugins_extractor/chrome/binaries.py
Normal file
145
archivebox/plugins_extractor/chrome/binaries.py
Normal file
|
@ -0,0 +1,145 @@
|
|||
__package__ = 'plugins_extractor.chrome'
|
||||
|
||||
import os
|
||||
import platform
|
||||
from pathlib import Path
|
||||
from typing import List, Optional
|
||||
|
||||
from pydantic import InstanceOf
|
||||
from pydantic_pkgr import (
|
||||
BinProvider,
|
||||
BinName,
|
||||
BinaryOverrides,
|
||||
bin_abspath,
|
||||
)
|
||||
|
||||
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
|
||||
|
||||
# Depends on Other Plugins:
|
||||
from archivebox.config import CONSTANTS
|
||||
from archivebox.config.common import SHELL_CONFIG
|
||||
from plugins_pkg.puppeteer.binproviders import PUPPETEER_BINPROVIDER
|
||||
from plugins_pkg.playwright.binproviders import PLAYWRIGHT_BINPROVIDER
|
||||
|
||||
|
||||
from .config import CHROME_CONFIG
|
||||
CHROMIUM_BINARY_NAMES_LINUX = [
|
||||
"chromium",
|
||||
"chromium-browser",
|
||||
"chromium-browser-beta",
|
||||
"chromium-browser-unstable",
|
||||
"chromium-browser-canary",
|
||||
"chromium-browser-dev",
|
||||
]
|
||||
CHROMIUM_BINARY_NAMES_MACOS = ["/Applications/Chromium.app/Contents/MacOS/Chromium"]
|
||||
CHROMIUM_BINARY_NAMES = CHROMIUM_BINARY_NAMES_LINUX + CHROMIUM_BINARY_NAMES_MACOS
|
||||
|
||||
CHROME_BINARY_NAMES_LINUX = [
|
||||
"google-chrome",
|
||||
"google-chrome-stable",
|
||||
"google-chrome-beta",
|
||||
"google-chrome-canary",
|
||||
"google-chrome-unstable",
|
||||
"google-chrome-dev",
|
||||
"chrome"
|
||||
]
|
||||
CHROME_BINARY_NAMES_MACOS = [
|
||||
"/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
|
||||
"/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary",
|
||||
]
|
||||
CHROME_BINARY_NAMES = CHROME_BINARY_NAMES_LINUX + CHROME_BINARY_NAMES_MACOS
|
||||
|
||||
APT_DEPENDENCIES = [
|
||||
'apt-transport-https', 'at-spi2-common', 'chromium-browser',
|
||||
'fontconfig', 'fonts-freefont-ttf', 'fonts-ipafont-gothic', 'fonts-kacst', 'fonts-khmeros', 'fonts-liberation', 'fonts-noto', 'fonts-noto-color-emoji', 'fonts-symbola', 'fonts-thai-tlwg', 'fonts-tlwg-loma-otf', 'fonts-unifont', 'fonts-wqy-zenhei',
|
||||
'libasound2', 'libatk-bridge2.0-0', 'libatk1.0-0', 'libatspi2.0-0', 'libavahi-client3', 'libavahi-common-data', 'libavahi-common3', 'libcairo2', 'libcups2',
|
||||
'libdbus-1-3', 'libdrm2', 'libfontenc1', 'libgbm1', 'libglib2.0-0', 'libice6', 'libnspr4', 'libnss3', 'libsm6', 'libunwind8', 'libx11-6', 'libxaw7', 'libxcb1',
|
||||
'libxcomposite1', 'libxdamage1', 'libxext6', 'libxfixes3', 'libxfont2', 'libxkbcommon0', 'libxkbfile1', 'libxmu6', 'libxpm4', 'libxrandr2', 'libxt6', 'x11-utils', 'x11-xkb-utils', 'xfonts-encodings',
|
||||
]
|
||||
|
||||
|
||||
def autodetect_system_chrome_install(PATH=None) -> Optional[Path]:
|
||||
for bin_name in CHROME_BINARY_NAMES + CHROMIUM_BINARY_NAMES:
|
||||
abspath = bin_abspath(bin_name, PATH=env.PATH)
|
||||
if abspath:
|
||||
return abspath
|
||||
return None
|
||||
|
||||
def create_macos_app_symlink(target: Path, shortcut: Path):
|
||||
"""
|
||||
on macOS, some binaries are inside of .app, so we need to
|
||||
create a tiny bash script instead of a symlink
|
||||
(so that ../ parent relationships are relative to original .app instead of callsite dir)
|
||||
"""
|
||||
# TODO: should we enforce this? is it useful in any other situation?
|
||||
# if platform.system().lower() != 'darwin':
|
||||
# raise Exception(...)
|
||||
shortcut.unlink(missing_ok=True)
|
||||
shortcut.write_text(f"""#!/usr/bin/env bash\nexec '{target}' "$@"\n""")
|
||||
shortcut.chmod(0o777) # make sure its executable by everyone
|
||||
|
||||
###################### Config ##########################
|
||||
|
||||
|
||||
class ChromeBinary(BaseBinary):
|
||||
name: BinName = CHROME_CONFIG.CHROME_BINARY
|
||||
binproviders_supported: List[InstanceOf[BinProvider]] = [PUPPETEER_BINPROVIDER, env, PLAYWRIGHT_BINPROVIDER, apt, brew]
|
||||
|
||||
overrides: BinaryOverrides = {
|
||||
env.name: {
|
||||
'abspath': lambda: autodetect_system_chrome_install(PATH=env.PATH), # /usr/bin/google-chrome-stable
|
||||
},
|
||||
PUPPETEER_BINPROVIDER.name: {
|
||||
'packages': ['chrome@stable'], # npx @puppeteer/browsers install chrome@stable
|
||||
},
|
||||
PLAYWRIGHT_BINPROVIDER.name: {
|
||||
'packages': ['chromium'], # playwright install chromium
|
||||
},
|
||||
apt.name: {
|
||||
'packages': APT_DEPENDENCIES,
|
||||
},
|
||||
brew.name: {
|
||||
'packages': ['--cask', 'chromium'],
|
||||
},
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
def symlink_to_lib(binary, bin_dir=CONSTANTS.LIB_BIN_DIR) -> None:
|
||||
if not (binary.abspath and os.access(binary.abspath, os.F_OK)):
|
||||
return
|
||||
|
||||
bin_dir.mkdir(parents=True, exist_ok=True)
|
||||
symlink = bin_dir / binary.name
|
||||
|
||||
try:
|
||||
if platform.system().lower() == 'darwin':
|
||||
# if on macOS, browser binary is inside a .app, so we need to create a tiny bash script instead of a symlink
|
||||
create_macos_app_symlink(binary.abspath, symlink)
|
||||
else:
|
||||
# otherwise on linux we can symlink directly to binary executable
|
||||
symlink.unlink(missing_ok=True)
|
||||
symlink.symlink_to(binary.abspath)
|
||||
except Exception as err:
|
||||
# print(f'[red]:warning: Failed to symlink {symlink} -> {binary.abspath}[/red] {err}')
|
||||
# not actually needed, we can just run without it
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def chrome_cleanup_lockfile():
|
||||
"""
|
||||
Cleans up any state or runtime files that chrome leaves behind when killed by
|
||||
a timeout or other error
|
||||
"""
|
||||
lock_file = Path("~/.config/chromium/SingletonLock").expanduser()
|
||||
|
||||
if SHELL_CONFIG.IN_DOCKER and os.access(lock_file, os.F_OK):
|
||||
lock_file.unlink()
|
||||
|
||||
if CHROME_CONFIG.CHROME_USER_DATA_DIR:
|
||||
if os.access(CHROME_CONFIG.CHROME_USER_DATA_DIR / 'SingletonLock', os.F_OK):
|
||||
lock_file.unlink()
|
||||
|
||||
|
||||
|
||||
CHROME_BINARY = ChromeBinary()
|
||||
|
|
@ -1,35 +1,18 @@
|
|||
__package__ = 'archivebox.plugins_extractor.chrome'
|
||||
__package__ = 'plugins_extractor.chrome'
|
||||
|
||||
import os
|
||||
import sys
|
||||
import platform
|
||||
|
||||
from pathlib import Path
|
||||
from typing import List, Optional
|
||||
|
||||
# Depends on other PyPI/vendor packages:
|
||||
from rich import print
|
||||
from pydantic import InstanceOf, Field, model_validator
|
||||
from pydantic_pkgr import (
|
||||
BinProvider,
|
||||
BinName,
|
||||
BinaryOverrides,
|
||||
bin_abspath,
|
||||
)
|
||||
from pydantic import Field, model_validator
|
||||
from pydantic_pkgr import bin_abspath
|
||||
|
||||
# Depends on other Django apps:
|
||||
from abx.archivebox.base_plugin import BasePlugin
|
||||
from abx.archivebox.base_configset import BaseConfigSet
|
||||
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
|
||||
# from abx.archivebox.base_extractor import BaseExtractor
|
||||
# from abx.archivebox.base_queue import BaseQueue
|
||||
from abx.archivebox.base_hook import BaseHook
|
||||
from abx.archivebox.base_binary import env
|
||||
|
||||
# Depends on Other Plugins:
|
||||
from archivebox.config import CONSTANTS
|
||||
from archivebox.config.common import ARCHIVING_CONFIG, SHELL_CONFIG
|
||||
from plugins_pkg.puppeteer.apps import PUPPETEER_BINPROVIDER
|
||||
from plugins_pkg.playwright.apps import PLAYWRIGHT_BINPROVIDER
|
||||
|
||||
from archivebox.misc.logging import STDERR
|
||||
from archivebox.misc.util import dedupe
|
||||
|
||||
|
||||
|
@ -129,33 +112,34 @@ class ChromeConfig(BaseConfigSet):
|
|||
@model_validator(mode='after')
|
||||
def validate_use_chrome(self):
|
||||
if self.USE_CHROME and self.CHROME_TIMEOUT < 15:
|
||||
print(f'[red][!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={self.CHROME_TIMEOUT} seconds)[/red]', file=sys.stderr)
|
||||
print(' Chrome will fail to archive all sites if set to less than ~15 seconds.', file=sys.stderr)
|
||||
print(' (Setting it to somewhere between 30 and 300 seconds is recommended)', file=sys.stderr)
|
||||
print(file=sys.stderr)
|
||||
print(' If you want to make ArchiveBox run faster, disable specific archive methods instead:', file=sys.stderr)
|
||||
print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles', file=sys.stderr)
|
||||
print(file=sys.stderr)
|
||||
STDERR.print(f'[red][!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={self.CHROME_TIMEOUT} seconds)[/red]')
|
||||
STDERR.print(' Chrome will fail to archive all sites if set to less than ~15 seconds.')
|
||||
STDERR.print(' (Setting it to somewhere between 30 and 300 seconds is recommended)')
|
||||
STDERR.print()
|
||||
STDERR.print(' If you want to make ArchiveBox run faster, disable specific archive methods instead:')
|
||||
STDERR.print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles')
|
||||
STDERR.print()
|
||||
|
||||
# if user has specified a user data dir, make sure its valid
|
||||
if self.CHROME_USER_DATA_DIR and os.access(self.CHROME_USER_DATA_DIR, os.R_OK):
|
||||
# check to make sure user_data_dir/<profile_name> exists
|
||||
if not (self.CHROME_USER_DATA_DIR / self.CHROME_PROFILE_NAME).is_dir():
|
||||
print(f'[red][X] Could not find profile "{self.CHROME_PROFILE_NAME}" in CHROME_USER_DATA_DIR.[/red]', file=sys.stderr)
|
||||
print(f' {self.CHROME_USER_DATA_DIR}', file=sys.stderr)
|
||||
print(' Make sure you set it to a Chrome user data directory containing a Default profile folder.', file=sys.stderr)
|
||||
print(' For more info see:', file=sys.stderr)
|
||||
print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#CHROME_USER_DATA_DIR', file=sys.stderr)
|
||||
STDERR.print(f'[red][X] Could not find profile "{self.CHROME_PROFILE_NAME}" in CHROME_USER_DATA_DIR.[/red]')
|
||||
STDERR.print(f' {self.CHROME_USER_DATA_DIR}')
|
||||
STDERR.print(' Make sure you set it to a Chrome user data directory containing a Default profile folder.')
|
||||
STDERR.print(' For more info see:')
|
||||
STDERR.print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#CHROME_USER_DATA_DIR')
|
||||
if '/Default' in str(self.CHROME_USER_DATA_DIR):
|
||||
print(file=sys.stderr)
|
||||
print(' Try removing /Default from the end e.g.:', file=sys.stderr)
|
||||
print(' CHROME_USER_DATA_DIR="{}"'.format(str(self.CHROME_USER_DATA_DIR).split('/Default')[0]), file=sys.stderr)
|
||||
STDERR.print()
|
||||
STDERR.print(' Try removing /Default from the end e.g.:')
|
||||
STDERR.print(' CHROME_USER_DATA_DIR="{}"'.format(str(self.CHROME_USER_DATA_DIR).split('/Default')[0]))
|
||||
|
||||
# hard error is too annoying here, instead just set it to nothing
|
||||
# raise SystemExit(2)
|
||||
self.CHROME_USER_DATA_DIR = None
|
||||
self.update_in_place(CHROME_USER_DATA_DIR=None)
|
||||
else:
|
||||
self.CHROME_USER_DATA_DIR = None
|
||||
if self.CHROME_USER_DATA_DIR is not None:
|
||||
self.update_in_place(CHROME_USER_DATA_DIR=None)
|
||||
|
||||
return self
|
||||
|
||||
|
@ -206,81 +190,3 @@ class ChromeConfig(BaseConfigSet):
|
|||
|
||||
CHROME_CONFIG = ChromeConfig()
|
||||
|
||||
|
||||
class ChromeBinary(BaseBinary):
|
||||
name: BinName = CHROME_CONFIG.CHROME_BINARY
|
||||
binproviders_supported: List[InstanceOf[BinProvider]] = [PUPPETEER_BINPROVIDER, env, PLAYWRIGHT_BINPROVIDER, apt, brew]
|
||||
|
||||
overrides: BinaryOverrides = {
|
||||
env.name: {
|
||||
'abspath': lambda: autodetect_system_chrome_install(PATH=env.PATH), # /usr/bin/google-chrome-stable
|
||||
},
|
||||
PUPPETEER_BINPROVIDER.name: {
|
||||
'packages': ['chrome@stable'], # npx @puppeteer/browsers install chrome@stable
|
||||
},
|
||||
PLAYWRIGHT_BINPROVIDER.name: {
|
||||
'packages': ['chromium'], # playwright install chromium
|
||||
},
|
||||
apt.name: {
|
||||
'packages': APT_DEPENDENCIES,
|
||||
},
|
||||
brew.name: {
|
||||
'packages': ['--cask', 'chromium'],
|
||||
},
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
def symlink_to_lib(binary, bin_dir=CONSTANTS.LIB_BIN_DIR) -> None:
|
||||
if not (binary.abspath and os.access(binary.abspath, os.F_OK)):
|
||||
return
|
||||
|
||||
bin_dir.mkdir(parents=True, exist_ok=True)
|
||||
symlink = bin_dir / binary.name
|
||||
|
||||
try:
|
||||
if platform.system().lower() == 'darwin':
|
||||
# if on macOS, browser binary is inside a .app, so we need to create a tiny bash script instead of a symlink
|
||||
create_macos_app_symlink(binary.abspath, symlink)
|
||||
else:
|
||||
# otherwise on linux we can symlink directly to binary executable
|
||||
symlink.unlink(missing_ok=True)
|
||||
symlink.symlink_to(binary.abspath)
|
||||
except Exception as err:
|
||||
# print(f'[red]:warning: Failed to symlink {symlink} -> {binary.abspath}[/red] {err}')
|
||||
# not actually needed, we can just run without it
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def chrome_cleanup_lockfile():
|
||||
"""
|
||||
Cleans up any state or runtime files that chrome leaves behind when killed by
|
||||
a timeout or other error
|
||||
"""
|
||||
lock_file = Path("~/.config/chromium/SingletonLock").expanduser()
|
||||
|
||||
if SHELL_CONFIG.IN_DOCKER and os.access(lock_file, os.F_OK):
|
||||
lock_file.unlink()
|
||||
|
||||
if CHROME_CONFIG.CHROME_USER_DATA_DIR:
|
||||
if os.access(CHROME_CONFIG.CHROME_USER_DATA_DIR / 'SingletonLock', os.F_OK):
|
||||
lock_file.unlink()
|
||||
|
||||
|
||||
|
||||
CHROME_BINARY = ChromeBinary()
|
||||
|
||||
|
||||
class ChromePlugin(BasePlugin):
|
||||
app_label: str = 'chrome'
|
||||
verbose_name: str = 'Chrome Browser'
|
||||
|
||||
hooks: List[InstanceOf[BaseHook]] = [
|
||||
CHROME_CONFIG,
|
||||
CHROME_BINARY,
|
||||
]
|
||||
|
||||
|
||||
|
||||
PLUGIN = ChromePlugin()
|
||||
# PLUGIN.register(settings)
|
||||
DJANGO_APP = PLUGIN.AppConfig
|
Loading…
Add table
Add a link
Reference in a new issue