new vastly simplified plugin spec without pydantic

Nick Sweeting 2024-10-14 21:50:47 -07:00
parent abf75f49f4
commit 01ba6d49d3
115 changed files with 2466 additions and 2301 deletions
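The diff below drops the pydantic `BasePlugin` / `BaseHook` class hierarchy in favor of plain plugin modules that expose `@abx.hookimpl` functions (`get_PLUGIN`, `get_CONFIG`, `get_BINARIES`, `get_EXTRACTORS`). As a rough sketch of how such hookimpl modules can be discovered and aggregated, assuming `abx` is a thin wrapper around pluggy (the `PluginSpec` class, plugin manager wiring, and import path here are illustrative, not ArchiveBox internals):

# minimal sketch, assuming abx wraps pluggy and uses 'abx' as its project name
import pluggy

hookspec = pluggy.HookspecMarker('abx')

class PluginSpec:
    @hookspec
    def get_PLUGIN(self):
        """Each plugin returns {plugin_id: metadata_dict}."""

    @hookspec
    def get_CONFIG(self):
        """Each plugin returns {plugin_id: configset}."""

pm = pluggy.PluginManager('abx')
pm.add_hookspecs(PluginSpec)

import plugins_extractor.archivedotorg  # hypothetical import path for the module below
pm.register(plugins_extractor.archivedotorg)

# each hook call returns one result per registered plugin; the caller merges them
ALL_PLUGINS = {}
for result in pm.hook.get_PLUGIN():
    ALL_PLUGINS.update(result)

Note how each plugin keeps its heavy imports inside the hookimpl bodies (e.g. `from .config import ...`), so merely registering a plugin module stays cheap.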

@@ -0,0 +1,39 @@
__package__ = 'plugins_extractor.archivedotorg'
__label__ = 'archivedotorg'
__version__ = '2024.10.14'
__author__ = 'Nick Sweeting'
__homepage__ = 'https://archive.org'
__dependencies__ = []
import abx
@abx.hookimpl
def get_PLUGIN():
return {
'archivedotorg': {
'PACKAGE': __package__,
'LABEL': __label__,
'VERSION': __version__,
'AUTHOR': __author__,
'HOMEPAGE': __homepage__,
'DEPENDENCIES': __dependencies__,
}
}
@abx.hookimpl
def get_CONFIG():
from .config import ARCHIVEDOTORG_CONFIG
return {
'archivedotorg': ARCHIVEDOTORG_CONFIG
}
# @abx.hookimpl
# def get_EXTRACTORS():
# from .extractors import ARCHIVEDOTORG_EXTRACTOR
#
# return {
# 'archivedotorg': ARCHIVEDOTORG_EXTRACTOR,
# }

@@ -1,28 +0,0 @@
__package__ = 'archivebox.plugins_extractor.archivedotorg'
from typing import List
from abx.archivebox.base_plugin import BasePlugin
from abx.archivebox.base_configset import BaseConfigSet
from abx.archivebox.base_hook import BaseHook
###################### Config ##########################
class ArchivedotorgConfig(BaseConfigSet):
SAVE_ARCHIVE_DOT_ORG: bool = True
ARCHIVEDOTORG_CONFIG = ArchivedotorgConfig()
class ArchivedotorgPlugin(BasePlugin):
app_label: str = 'archivedotorg'
verbose_name: str = 'Archive.org'
hooks: List[BaseHook] = [
ARCHIVEDOTORG_CONFIG
]
PLUGIN = ArchivedotorgPlugin()
DJANGO_APP = PLUGIN.AppConfig

@@ -0,0 +1,11 @@
__package__ = 'plugins_extractor.archivedotorg'
from abx.archivebox.base_configset import BaseConfigSet
class ArchivedotorgConfig(BaseConfigSet):
SAVE_ARCHIVE_DOT_ORG: bool = True
ARCHIVEDOTORG_CONFIG = ArchivedotorgConfig()

@@ -0,0 +1,46 @@
__package__ = 'plugins_extractor.chrome'
__label__ = 'chrome'
__version__ = '2024.10.14'
__author__ = 'Nick Sweeting'
__homepage__ = 'https://github.com/ArchiveBox/ArchiveBox/tree/main/archivebox/plugins_extractor/chrome'
__dependencies__ = []
import abx
@abx.hookimpl
def get_PLUGIN():
return {
'chrome': {
'PACKAGE': __package__,
'LABEL': __label__,
'VERSION': __version__,
'AUTHOR': __author__,
'HOMEPAGE': __homepage__,
'DEPENDENCIES': __dependencies__,
}
}
@abx.hookimpl
def get_CONFIG():
from .config import CHROME_CONFIG
return {
'chrome': CHROME_CONFIG
}
@abx.hookimpl
def get_BINARIES():
from .binaries import CHROME_BINARY
return {
'chrome': CHROME_BINARY,
}
# @abx.hookimpl
# def get_EXTRACTORS():
# return {
# 'pdf': PDF_EXTRACTOR,
# 'screenshot': SCREENSHOT_EXTRACTOR,
# 'dom': DOM_EXTRACTOR,
# }

@@ -0,0 +1,145 @@
__package__ = 'plugins_extractor.chrome'
import os
import platform
from pathlib import Path
from typing import List, Optional
from pydantic import InstanceOf
from pydantic_pkgr import (
BinProvider,
BinName,
BinaryOverrides,
bin_abspath,
)
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
# Depends on Other Plugins:
from archivebox.config import CONSTANTS
from archivebox.config.common import SHELL_CONFIG
from plugins_pkg.puppeteer.binproviders import PUPPETEER_BINPROVIDER
from plugins_pkg.playwright.binproviders import PLAYWRIGHT_BINPROVIDER
from .config import CHROME_CONFIG
CHROMIUM_BINARY_NAMES_LINUX = [
"chromium",
"chromium-browser",
"chromium-browser-beta",
"chromium-browser-unstable",
"chromium-browser-canary",
"chromium-browser-dev",
]
CHROMIUM_BINARY_NAMES_MACOS = ["/Applications/Chromium.app/Contents/MacOS/Chromium"]
CHROMIUM_BINARY_NAMES = CHROMIUM_BINARY_NAMES_LINUX + CHROMIUM_BINARY_NAMES_MACOS
CHROME_BINARY_NAMES_LINUX = [
"google-chrome",
"google-chrome-stable",
"google-chrome-beta",
"google-chrome-canary",
"google-chrome-unstable",
"google-chrome-dev",
"chrome"
]
CHROME_BINARY_NAMES_MACOS = [
"/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
"/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary",
]
CHROME_BINARY_NAMES = CHROME_BINARY_NAMES_LINUX + CHROME_BINARY_NAMES_MACOS
APT_DEPENDENCIES = [
'apt-transport-https', 'at-spi2-common', 'chromium-browser',
'fontconfig', 'fonts-freefont-ttf', 'fonts-ipafont-gothic', 'fonts-kacst', 'fonts-khmeros', 'fonts-liberation', 'fonts-noto', 'fonts-noto-color-emoji', 'fonts-symbola', 'fonts-thai-tlwg', 'fonts-tlwg-loma-otf', 'fonts-unifont', 'fonts-wqy-zenhei',
'libasound2', 'libatk-bridge2.0-0', 'libatk1.0-0', 'libatspi2.0-0', 'libavahi-client3', 'libavahi-common-data', 'libavahi-common3', 'libcairo2', 'libcups2',
'libdbus-1-3', 'libdrm2', 'libfontenc1', 'libgbm1', 'libglib2.0-0', 'libice6', 'libnspr4', 'libnss3', 'libsm6', 'libunwind8', 'libx11-6', 'libxaw7', 'libxcb1',
'libxcomposite1', 'libxdamage1', 'libxext6', 'libxfixes3', 'libxfont2', 'libxkbcommon0', 'libxkbfile1', 'libxmu6', 'libxpm4', 'libxrandr2', 'libxt6', 'x11-utils', 'x11-xkb-utils', 'xfonts-encodings',
]
def autodetect_system_chrome_install(PATH=None) -> Optional[Path]:
    for bin_name in CHROME_BINARY_NAMES + CHROMIUM_BINARY_NAMES:
        abspath = bin_abspath(bin_name, PATH=PATH or env.PATH)  # respect the passed PATH, fall back to env.PATH
        if abspath:
            return abspath
    return None
def create_macos_app_symlink(target: Path, shortcut: Path):
    """
    On macOS, some binaries live inside a .app bundle, so we create a tiny
    bash wrapper script instead of a symlink
    (so that ../ parent relationships resolve relative to the original .app instead of the callsite dir)
    """
    # TODO: should we enforce this? is it useful in any other situation?
    # if platform.system().lower() != 'darwin':
    #     raise Exception(...)
    shortcut.unlink(missing_ok=True)
    shortcut.write_text(f"""#!/usr/bin/env bash\nexec '{target}' "$@"\n""")
    shortcut.chmod(0o777)  # make sure it's executable by everyone
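# A hedged usage example (both paths are illustrative, not taken from this diff):
#   create_macos_app_symlink(
#       Path('/Applications/Chromium.app/Contents/MacOS/Chromium'),  # real binary inside the .app bundle
#       Path('/opt/archivebox/lib/bin/chrome'),                      # hypothetical shim location
#   )
# The shim exec's the bundled binary, so Chrome's ../-relative resources still resolve inside the .app.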
###################### Config ##########################
class ChromeBinary(BaseBinary):
name: BinName = CHROME_CONFIG.CHROME_BINARY
binproviders_supported: List[InstanceOf[BinProvider]] = [PUPPETEER_BINPROVIDER, env, PLAYWRIGHT_BINPROVIDER, apt, brew]
overrides: BinaryOverrides = {
env.name: {
'abspath': lambda: autodetect_system_chrome_install(PATH=env.PATH), # /usr/bin/google-chrome-stable
},
PUPPETEER_BINPROVIDER.name: {
'packages': ['chrome@stable'], # npx @puppeteer/browsers install chrome@stable
},
PLAYWRIGHT_BINPROVIDER.name: {
'packages': ['chromium'], # playwright install chromium
},
apt.name: {
'packages': APT_DEPENDENCIES,
},
brew.name: {
'packages': ['--cask', 'chromium'],
},
}
@staticmethod
def symlink_to_lib(binary, bin_dir=CONSTANTS.LIB_BIN_DIR) -> None:
if not (binary.abspath and os.access(binary.abspath, os.F_OK)):
return
bin_dir.mkdir(parents=True, exist_ok=True)
symlink = bin_dir / binary.name
try:
if platform.system().lower() == 'darwin':
# if on macOS, browser binary is inside a .app, so we need to create a tiny bash script instead of a symlink
create_macos_app_symlink(binary.abspath, symlink)
else:
# otherwise on linux we can symlink directly to binary executable
symlink.unlink(missing_ok=True)
symlink.symlink_to(binary.abspath)
        except Exception:
            # print(f'[red]:warning: Failed to symlink {symlink} -> {binary.abspath}[/red]')
            # not actually needed, we can just run without it
            pass
    @staticmethod
    def chrome_cleanup_lockfile():
        """
        Cleans up any SingletonLock files that chrome leaves behind when it is
        killed by a timeout or other error
        """
        lock_file = Path("~/.config/chromium/SingletonLock").expanduser()
        if SHELL_CONFIG.IN_DOCKER and os.access(lock_file, os.F_OK):
            lock_file.unlink()
        if CHROME_CONFIG.CHROME_USER_DATA_DIR:
            profile_lock_file = CHROME_CONFIG.CHROME_USER_DATA_DIR / 'SingletonLock'
            if os.access(profile_lock_file, os.F_OK):
                profile_lock_file.unlink()  # unlink the profile's own lockfile, not the home-dir one
CHROME_BINARY = ChromeBinary()
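How a binary like this is resolved at runtime, as a hedged sketch (`load_or_install()` is pydantic_pkgr's documented Binary method; the surrounding calls are illustrative, not part of this diff):

chrome = CHROME_BINARY.load_or_install()    # tries PUPPETEER, env, PLAYWRIGHT, apt, brew in order
if chrome.abspath:
    ChromeBinary.symlink_to_lib(chrome)     # shim it into CONSTANTS.LIB_BIN_DIR for other tools
    print(chrome.name, chrome.version, chrome.abspath)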

@@ -1,35 +1,18 @@
__package__ = 'archivebox.plugins_extractor.chrome'
__package__ = 'plugins_extractor.chrome'
import os
import sys
import platform
from pathlib import Path
from typing import List, Optional
# Depends on other PyPI/vendor packages:
from rich import print
from pydantic import InstanceOf, Field, model_validator
from pydantic_pkgr import (
BinProvider,
BinName,
BinaryOverrides,
bin_abspath,
)
from pydantic import Field, model_validator
from pydantic_pkgr import bin_abspath
# Depends on other Django apps:
from abx.archivebox.base_plugin import BasePlugin
from abx.archivebox.base_configset import BaseConfigSet
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
# from abx.archivebox.base_extractor import BaseExtractor
# from abx.archivebox.base_queue import BaseQueue
from abx.archivebox.base_hook import BaseHook
from abx.archivebox.base_binary import env
# Depends on Other Plugins:
from archivebox.config import CONSTANTS
from archivebox.config.common import ARCHIVING_CONFIG, SHELL_CONFIG
from plugins_pkg.puppeteer.apps import PUPPETEER_BINPROVIDER
from plugins_pkg.playwright.apps import PLAYWRIGHT_BINPROVIDER
from archivebox.misc.logging import STDERR
from archivebox.misc.util import dedupe
@@ -129,33 +112,34 @@ class ChromeConfig(BaseConfigSet):
@model_validator(mode='after')
def validate_use_chrome(self):
if self.USE_CHROME and self.CHROME_TIMEOUT < 15:
print(f'[red][!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={self.CHROME_TIMEOUT} seconds)[/red]', file=sys.stderr)
print(' Chrome will fail to archive all sites if set to less than ~15 seconds.', file=sys.stderr)
print(' (Setting it to somewhere between 30 and 300 seconds is recommended)', file=sys.stderr)
print(file=sys.stderr)
print(' If you want to make ArchiveBox run faster, disable specific archive methods instead:', file=sys.stderr)
print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles', file=sys.stderr)
print(file=sys.stderr)
STDERR.print(f'[red][!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={self.CHROME_TIMEOUT} seconds)[/red]')
STDERR.print(' Chrome will fail to archive all sites if set to less than ~15 seconds.')
STDERR.print(' (Setting it to somewhere between 30 and 300 seconds is recommended)')
STDERR.print()
STDERR.print(' If you want to make ArchiveBox run faster, disable specific archive methods instead:')
STDERR.print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles')
STDERR.print()
# if user has specified a user data dir, make sure its valid
if self.CHROME_USER_DATA_DIR and os.access(self.CHROME_USER_DATA_DIR, os.R_OK):
# check to make sure user_data_dir/<profile_name> exists
if not (self.CHROME_USER_DATA_DIR / self.CHROME_PROFILE_NAME).is_dir():
print(f'[red][X] Could not find profile "{self.CHROME_PROFILE_NAME}" in CHROME_USER_DATA_DIR.[/red]', file=sys.stderr)
print(f' {self.CHROME_USER_DATA_DIR}', file=sys.stderr)
print(' Make sure you set it to a Chrome user data directory containing a Default profile folder.', file=sys.stderr)
print(' For more info see:', file=sys.stderr)
print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#CHROME_USER_DATA_DIR', file=sys.stderr)
STDERR.print(f'[red][X] Could not find profile "{self.CHROME_PROFILE_NAME}" in CHROME_USER_DATA_DIR.[/red]')
STDERR.print(f' {self.CHROME_USER_DATA_DIR}')
STDERR.print(' Make sure you set it to a Chrome user data directory containing a Default profile folder.')
STDERR.print(' For more info see:')
STDERR.print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#CHROME_USER_DATA_DIR')
if '/Default' in str(self.CHROME_USER_DATA_DIR):
print(file=sys.stderr)
print(' Try removing /Default from the end e.g.:', file=sys.stderr)
print(' CHROME_USER_DATA_DIR="{}"'.format(str(self.CHROME_USER_DATA_DIR).split('/Default')[0]), file=sys.stderr)
STDERR.print()
STDERR.print(' Try removing /Default from the end e.g.:')
STDERR.print(' CHROME_USER_DATA_DIR="{}"'.format(str(self.CHROME_USER_DATA_DIR).split('/Default')[0]))
# hard error is too annoying here, instead just set it to nothing
# raise SystemExit(2)
self.CHROME_USER_DATA_DIR = None
self.update_in_place(CHROME_USER_DATA_DIR=None)
else:
self.CHROME_USER_DATA_DIR = None
if self.CHROME_USER_DATA_DIR is not None:
self.update_in_place(CHROME_USER_DATA_DIR=None)
return self
@@ -206,81 +190,3 @@ class ChromeConfig(BaseConfigSet):
CHROME_CONFIG = ChromeConfig()
class ChromeBinary(BaseBinary):
name: BinName = CHROME_CONFIG.CHROME_BINARY
binproviders_supported: List[InstanceOf[BinProvider]] = [PUPPETEER_BINPROVIDER, env, PLAYWRIGHT_BINPROVIDER, apt, brew]
overrides: BinaryOverrides = {
env.name: {
'abspath': lambda: autodetect_system_chrome_install(PATH=env.PATH), # /usr/bin/google-chrome-stable
},
PUPPETEER_BINPROVIDER.name: {
'packages': ['chrome@stable'], # npx @puppeteer/browsers install chrome@stable
},
PLAYWRIGHT_BINPROVIDER.name: {
'packages': ['chromium'], # playwright install chromium
},
apt.name: {
'packages': APT_DEPENDENCIES,
},
brew.name: {
'packages': ['--cask', 'chromium'],
},
}
@staticmethod
def symlink_to_lib(binary, bin_dir=CONSTANTS.LIB_BIN_DIR) -> None:
if not (binary.abspath and os.access(binary.abspath, os.F_OK)):
return
bin_dir.mkdir(parents=True, exist_ok=True)
symlink = bin_dir / binary.name
try:
if platform.system().lower() == 'darwin':
# if on macOS, browser binary is inside a .app, so we need to create a tiny bash script instead of a symlink
create_macos_app_symlink(binary.abspath, symlink)
else:
# otherwise on linux we can symlink directly to binary executable
symlink.unlink(missing_ok=True)
symlink.symlink_to(binary.abspath)
except Exception as err:
# print(f'[red]:warning: Failed to symlink {symlink} -> {binary.abspath}[/red] {err}')
# not actually needed, we can just run without it
pass
@staticmethod
def chrome_cleanup_lockfile():
"""
Cleans up any state or runtime files that chrome leaves behind when killed by
a timeout or other error
"""
lock_file = Path("~/.config/chromium/SingletonLock").expanduser()
if SHELL_CONFIG.IN_DOCKER and os.access(lock_file, os.F_OK):
lock_file.unlink()
if CHROME_CONFIG.CHROME_USER_DATA_DIR:
if os.access(CHROME_CONFIG.CHROME_USER_DATA_DIR / 'SingletonLock', os.F_OK):
lock_file.unlink()
CHROME_BINARY = ChromeBinary()
class ChromePlugin(BasePlugin):
app_label: str = 'chrome'
verbose_name: str = 'Chrome Browser'
hooks: List[InstanceOf[BaseHook]] = [
CHROME_CONFIG,
CHROME_BINARY,
]
PLUGIN = ChromePlugin()
# PLUGIN.register(settings)
DJANGO_APP = PLUGIN.AppConfig

@@ -0,0 +1,38 @@
__package__ = 'plugins_extractor.curl'
__label__ = 'curl'
__version__ = '2024.10.14'
__author__ = 'Nick Sweeting'
__homepage__ = 'https://github.com/curl/curl'
__dependencies__ = []
import abx
@abx.hookimpl
def get_PLUGIN():
return {
'curl': {
'PACKAGE': __package__,
'LABEL': __label__,
'VERSION': __version__,
'AUTHOR': __author__,
'HOMEPAGE': __homepage__,
'DEPENDENCIES': __dependencies__,
}
}
@abx.hookimpl
def get_CONFIG():
from .config import CURL_CONFIG
return {
'curl': CURL_CONFIG
}
@abx.hookimpl
def get_BINARIES():
from .binaries import CURL_BINARY
return {
'curl': CURL_BINARY,
}

@@ -1,79 +0,0 @@
__package__ = 'plugins_extractor.curl'
from typing import List, Optional
from pathlib import Path
from pydantic import InstanceOf, Field
from pydantic_pkgr import BinProvider, BinName
from abx.archivebox.base_plugin import BasePlugin, BaseHook
from abx.archivebox.base_configset import BaseConfigSet
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
# from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
from archivebox.config.common import ARCHIVING_CONFIG
from archivebox.plugins_extractor.favicon.apps import FAVICON_CONFIG
from archivebox.plugins_extractor.archivedotorg.apps import ARCHIVEDOTORG_CONFIG
class CurlConfig(BaseConfigSet):
SAVE_TITLE: bool = Field(default=True)
SAVE_HEADERS: bool = Field(default=True)
USE_CURL: bool = Field(default=lambda c:
ARCHIVEDOTORG_CONFIG.SAVE_ARCHIVE_DOT_ORG
or FAVICON_CONFIG.SAVE_FAVICON
or c.SAVE_HEADERS
or c.SAVE_TITLE
)
CURL_BINARY: str = Field(default='curl')
CURL_ARGS: List[str] = [
'--silent',
'--location',
'--compressed',
]
CURL_EXTRA_ARGS: List[str] = []
CURL_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
CURL_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
CURL_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
CURL_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE)
CURL_CONFIG = CurlConfig()
class CurlBinary(BaseBinary):
name: BinName = CURL_CONFIG.CURL_BINARY
binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
CURL_BINARY = CurlBinary()
# class CurlExtractor(BaseExtractor):
# name: ExtractorName = 'curl'
# binary: str = CURL_BINARY.name
# def get_output_path(self, snapshot) -> Path | None:
# curl_index_path = curl_output_path(snapshot.as_link())
# if curl_index_path:
# return Path(curl_index_path)
# return None
# CURL_EXTRACTOR = CurlExtractor()
class CurlPlugin(BasePlugin):
app_label: str = 'curl'
verbose_name: str = 'CURL'
hooks: List[InstanceOf[BaseHook]] = [
CURL_CONFIG,
CURL_BINARY,
# CURL_EXTRACTOR,
]
PLUGIN = CurlPlugin()
DJANGO_APP = PLUGIN.AppConfig

@@ -0,0 +1,18 @@
__package__ = 'plugins_extractor.curl'
from typing import List
from pydantic import InstanceOf
from pydantic_pkgr import BinProvider, BinName
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
from .config import CURL_CONFIG
class CurlBinary(BaseBinary):
name: BinName = CURL_CONFIG.CURL_BINARY
binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
CURL_BINARY = CurlBinary()

@@ -0,0 +1,33 @@
__package__ = 'plugins_extractor.curl'
from typing import List, Optional
from pathlib import Path
from pydantic import Field
from abx.archivebox.base_configset import BaseConfigSet
from archivebox.config.common import ARCHIVING_CONFIG
class CurlConfig(BaseConfigSet):
SAVE_TITLE: bool = Field(default=True)
SAVE_HEADERS: bool = Field(default=True)
USE_CURL: bool = Field(default=True)
CURL_BINARY: str = Field(default='curl')
CURL_ARGS: List[str] = [
'--silent',
'--location',
'--compressed',
]
CURL_EXTRA_ARGS: List[str] = []
CURL_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
CURL_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
CURL_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
CURL_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE)
CURL_CONFIG = CurlConfig()

@@ -0,0 +1,39 @@
__package__ = 'plugins_extractor.favicon'
__label__ = 'favicon'
__version__ = '2024.10.14'
__author__ = 'Nick Sweeting'
__homepage__ = 'https://github.com/ArchiveBox/archivebox'
__dependencies__ = []
import abx
@abx.hookimpl
def get_PLUGIN():
return {
'favicon': {
'PACKAGE': __package__,
'LABEL': __label__,
'VERSION': __version__,
'AUTHOR': __author__,
'HOMEPAGE': __homepage__,
'DEPENDENCIES': __dependencies__,
}
}
@abx.hookimpl
def get_CONFIG():
from .config import FAVICON_CONFIG
return {
'favicon': FAVICON_CONFIG
}
# @abx.hookimpl
# def get_EXTRACTORS():
# from .extractors import FAVICON_EXTRACTOR
# return {
# 'favicon': FAVICON_EXTRACTOR,
# }

@@ -1,30 +0,0 @@
__package__ = 'archivebox.plugins_extractor.favicon'
from typing import List
from abx.archivebox.base_plugin import BasePlugin
from abx.archivebox.base_configset import BaseConfigSet
from abx.archivebox.base_hook import BaseHook
###################### Config ##########################
class FaviconConfig(BaseConfigSet):
SAVE_FAVICON: bool = True
FAVICON_PROVIDER: str = 'https://www.google.com/s2/favicons?domain={}'
FAVICON_CONFIG = FaviconConfig()
class FaviconPlugin(BasePlugin):
app_label: str = 'favicon'
verbose_name: str = 'Favicon'
hooks: List[BaseHook] = [
FAVICON_CONFIG
]
PLUGIN = FaviconPlugin()
DJANGO_APP = PLUGIN.AppConfig

@@ -0,0 +1,13 @@
__package__ = 'plugins_extractor.favicon'
from abx.archivebox.base_configset import BaseConfigSet
class FaviconConfig(BaseConfigSet):
SAVE_FAVICON: bool = True
FAVICON_PROVIDER: str = 'https://www.google.com/s2/favicons?domain={}'
FAVICON_CONFIG = FaviconConfig()

@@ -0,0 +1,46 @@
__package__ = 'plugins_extractor.git'
__label__ = 'git'
__version__ = '2024.10.14'
__author__ = 'Nick Sweeting'
__homepage__ = 'https://github.com/git/git'
__dependencies__ = []
import abx
@abx.hookimpl
def get_PLUGIN():
return {
'git': {
'PACKAGE': __package__,
'LABEL': __label__,
'VERSION': __version__,
'AUTHOR': __author__,
'HOMEPAGE': __homepage__,
'DEPENDENCIES': __dependencies__,
}
}
@abx.hookimpl
def get_CONFIG():
from .config import GIT_CONFIG
return {
'git': GIT_CONFIG
}
@abx.hookimpl
def get_BINARIES():
from .binaries import GIT_BINARY
return {
'git': GIT_BINARY,
}
@abx.hookimpl
def get_EXTRACTORS():
from .extractors import GIT_EXTRACTOR
return {
'git': GIT_EXTRACTOR,
}

@@ -1,66 +0,0 @@
__package__ = 'plugins_extractor.git'
from typing import List
from pathlib import Path
from pydantic import InstanceOf, Field
from pydantic_pkgr import BinProvider, BinName
from abx.archivebox.base_plugin import BasePlugin, BaseHook
from abx.archivebox.base_configset import BaseConfigSet
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
from archivebox.config.common import ARCHIVING_CONFIG
class GitConfig(BaseConfigSet):
SAVE_GIT: bool = True
GIT_DOMAINS: str = Field(default='github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht')
GIT_BINARY: str = Field(default='git')
GIT_ARGS: List[str] = [
'--recursive',
]
GIT_EXTRA_ARGS: List[str] = []
GIT_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
GIT_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
GIT_CONFIG = GitConfig()
class GitBinary(BaseBinary):
name: BinName = GIT_CONFIG.GIT_BINARY
binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
GIT_BINARY = GitBinary()
class GitExtractor(BaseExtractor):
name: ExtractorName = 'git'
binary: str = GIT_BINARY.name
def get_output_path(self, snapshot) -> Path | None:
return snapshot.as_link() / 'git'
GIT_EXTRACTOR = GitExtractor()
class GitPlugin(BasePlugin):
app_label: str = 'git'
verbose_name: str = 'GIT'
hooks: List[InstanceOf[BaseHook]] = [
GIT_CONFIG,
GIT_BINARY,
GIT_EXTRACTOR,
]
PLUGIN = GitPlugin()
DJANGO_APP = PLUGIN.AppConfig

@@ -0,0 +1,18 @@
__package__ = 'plugins_extractor.git'
from typing import List
from pydantic import InstanceOf
from pydantic_pkgr import BinProvider, BinName
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
from .config import GIT_CONFIG
class GitBinary(BaseBinary):
name: BinName = GIT_CONFIG.GIT_BINARY
binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
GIT_BINARY = GitBinary()

@@ -0,0 +1,28 @@
__package__ = 'plugins_extractor.git'
from typing import List
from pydantic import Field
from abx.archivebox.base_configset import BaseConfigSet
from archivebox.config.common import ARCHIVING_CONFIG
class GitConfig(BaseConfigSet):
SAVE_GIT: bool = True
GIT_DOMAINS: str = Field(default='github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht')
GIT_BINARY: str = Field(default='git')
GIT_ARGS: List[str] = [
'--recursive',
]
GIT_EXTRA_ARGS: List[str] = []
GIT_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
GIT_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
GIT_CONFIG = GitConfig()

@@ -0,0 +1,17 @@
__package__ = 'plugins_extractor.git'
from pathlib import Path
from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
from .binaries import GIT_BINARY
class GitExtractor(BaseExtractor):
name: ExtractorName = 'git'
binary: str = GIT_BINARY.name
def get_output_path(self, snapshot) -> Path | None:
return snapshot.as_link() / 'git'
GIT_EXTRACTOR = GitExtractor()

@@ -0,0 +1,46 @@
__package__ = 'plugins_extractor.mercury'
__label__ = 'mercury'
__version__ = '2024.10.14'
__author__ = 'Nick Sweeting'
__homepage__ = 'https://github.com/postlight/mercury-parser'
__dependencies__ = ['npm']
import abx
@abx.hookimpl
def get_PLUGIN():
return {
'mercury': {
'PACKAGE': __package__,
'LABEL': __label__,
'VERSION': __version__,
'AUTHOR': __author__,
'HOMEPAGE': __homepage__,
'DEPENDENCIES': __dependencies__,
}
}
@abx.hookimpl
def get_CONFIG():
from .config import MERCURY_CONFIG
return {
'mercury': MERCURY_CONFIG
}
@abx.hookimpl
def get_BINARIES():
from .binaries import MERCURY_BINARY
return {
'mercury': MERCURY_BINARY,
}
@abx.hookimpl
def get_EXTRACTORS():
from .extractors import MERCURY_EXTRACTOR
return {
'mercury': MERCURY_EXTRACTOR,
}

@@ -1,80 +0,0 @@
__package__ = 'plugins_extractor.mercury'
from typing import List, Optional
from pathlib import Path
from pydantic import InstanceOf, Field
from pydantic_pkgr import BinProvider, BinName, BinaryOverrides, bin_abspath
from abx.archivebox.base_plugin import BasePlugin, BaseHook
from abx.archivebox.base_configset import BaseConfigSet
from abx.archivebox.base_binary import BaseBinary, env
from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
from archivebox.config.common import ARCHIVING_CONFIG, STORAGE_CONFIG
from archivebox.plugins_pkg.npm.apps import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER
class MercuryConfig(BaseConfigSet):
SAVE_MERCURY: bool = Field(default=True, alias='USE_MERCURY')
MERCURY_BINARY: str = Field(default='postlight-parser')
MERCURY_EXTRA_ARGS: List[str] = []
SAVE_MERCURY_REQUISITES: bool = Field(default=True)
MERCURY_RESTRICT_FILE_NAMES: str = Field(default=lambda: STORAGE_CONFIG.RESTRICT_FILE_NAMES)
MERCURY_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
MERCURY_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
MERCURY_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
MERCURY_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE)
MERCURY_CONFIG = MercuryConfig()
class MercuryBinary(BaseBinary):
name: BinName = MERCURY_CONFIG.MERCURY_BINARY
binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env]
overrides: BinaryOverrides = {
LIB_NPM_BINPROVIDER.name: {
'packages': ['@postlight/parser@^2.2.3'],
},
SYS_NPM_BINPROVIDER.name: {
'packages': ['@postlight/parser@^2.2.3'],
'install': lambda: None, # never try to install things into global prefix
},
env.name: {
'version': lambda: '999.999.999' if bin_abspath('postlight-parser', PATH=env.PATH) else None,
},
}
MERCURY_BINARY = MercuryBinary()
class MercuryExtractor(BaseExtractor):
name: ExtractorName = 'mercury'
binary: str = MERCURY_BINARY.name
def get_output_path(self, snapshot) -> Path | None:
return snapshot.link_dir / 'mercury' / 'content.html'
MERCURY_EXTRACTOR = MercuryExtractor()
class MercuryPlugin(BasePlugin):
app_label: str = 'mercury'
verbose_name: str = 'MERCURY'
hooks: List[InstanceOf[BaseHook]] = [
MERCURY_CONFIG,
MERCURY_BINARY,
MERCURY_EXTRACTOR,
]
PLUGIN = MercuryPlugin()
DJANGO_APP = PLUGIN.AppConfig

@@ -0,0 +1,32 @@
__package__ = 'plugins_extractor.mercury'
from typing import List
from pydantic import InstanceOf
from pydantic_pkgr import BinProvider, BinName, BinaryOverrides, bin_abspath
from abx.archivebox.base_binary import BaseBinary, env
from archivebox.plugins_pkg.npm.binproviders import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER
from .config import MERCURY_CONFIG
class MercuryBinary(BaseBinary):
name: BinName = MERCURY_CONFIG.MERCURY_BINARY
binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env]
overrides: BinaryOverrides = {
LIB_NPM_BINPROVIDER.name: {
'packages': ['@postlight/parser@^2.2.3'],
},
SYS_NPM_BINPROVIDER.name: {
'packages': ['@postlight/parser@^2.2.3'],
'install': lambda: None, # never try to install things into global prefix
},
env.name: {
'version': lambda: '999.999.999' if bin_abspath('postlight-parser', PATH=env.PATH) else None,
},
}
MERCURY_BINARY = MercuryBinary()

@@ -0,0 +1,31 @@
__package__ = 'plugins_extractor.mercury'
from typing import List, Optional
from pathlib import Path
from pydantic import Field
from abx.archivebox.base_configset import BaseConfigSet
from archivebox.config.common import ARCHIVING_CONFIG, STORAGE_CONFIG
class MercuryConfig(BaseConfigSet):
SAVE_MERCURY: bool = Field(default=True, alias='USE_MERCURY')
MERCURY_BINARY: str = Field(default='postlight-parser')
MERCURY_EXTRA_ARGS: List[str] = []
SAVE_MERCURY_REQUISITES: bool = Field(default=True)
MERCURY_RESTRICT_FILE_NAMES: str = Field(default=lambda: STORAGE_CONFIG.RESTRICT_FILE_NAMES)
MERCURY_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
MERCURY_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
MERCURY_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
MERCURY_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE)
MERCURY_CONFIG = MercuryConfig()

@@ -0,0 +1,19 @@
__package__ = 'plugins_extractor.mercury'
from pathlib import Path
from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
from .binaries import MERCURY_BINARY
class MercuryExtractor(BaseExtractor):
name: ExtractorName = 'mercury'
binary: str = MERCURY_BINARY.name
def get_output_path(self, snapshot) -> Path | None:
return snapshot.link_dir / 'mercury' / 'content.html'
MERCURY_EXTRACTOR = MercuryExtractor()

@@ -0,0 +1,46 @@
__package__ = 'plugins_extractor.readability'
__label__ = 'readability'
__version__ = '2024.10.14'
__author__ = 'Nick Sweeting'
__homepage__ = 'https://github.com/ArchiveBox/readability-extractor'
__dependencies__ = ['npm']
import abx
@abx.hookimpl
def get_PLUGIN():
return {
'readability': {
'PACKAGE': __package__,
'LABEL': __label__,
'VERSION': __version__,
'AUTHOR': __author__,
'HOMEPAGE': __homepage__,
'DEPENDENCIES': __dependencies__,
}
}
@abx.hookimpl
def get_CONFIG():
from .config import READABILITY_CONFIG
return {
'readability': READABILITY_CONFIG
}
@abx.hookimpl
def get_BINARIES():
from .binaries import READABILITY_BINARY
return {
'readability': READABILITY_BINARY,
}
@abx.hookimpl
def get_EXTRACTORS():
from .extractors import READABILITY_EXTRACTOR
return {
'readability': READABILITY_EXTRACTOR,
}

@@ -1,86 +0,0 @@
__package__ = 'archivebox.plugins_extractor.readability'
from pathlib import Path
from typing import List
# from typing_extensions import Self
# Depends on other PyPI/vendor packages:
from pydantic import InstanceOf, Field
from pydantic_pkgr import BinProvider, BinaryOverrides, BinName
# Depends on other Django apps:
from abx.archivebox.base_plugin import BasePlugin
from abx.archivebox.base_configset import BaseConfigSet
from abx.archivebox.base_binary import BaseBinary, env
from abx.archivebox.base_extractor import BaseExtractor
from abx.archivebox.base_hook import BaseHook
# Depends on Other Plugins:
from archivebox.config.common import ARCHIVING_CONFIG
from plugins_pkg.npm.apps import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER
###################### Config ##########################
class ReadabilityConfig(BaseConfigSet):
SAVE_READABILITY: bool = Field(default=True, alias='USE_READABILITY')
READABILITY_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
READABILITY_BINARY: str = Field(default='readability-extractor')
# READABILITY_EXTRA_ARGS: List[str] = [] # readability-extractor doesn't take any extra args
READABILITY_CONFIG = ReadabilityConfig()
READABILITY_PACKAGE_NAME = 'github:ArchiveBox/readability-extractor'
class ReadabilityBinary(BaseBinary):
name: BinName = READABILITY_CONFIG.READABILITY_BINARY
binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env]
overrides: BinaryOverrides = {
LIB_NPM_BINPROVIDER.name: {"packages": [READABILITY_PACKAGE_NAME]},
SYS_NPM_BINPROVIDER.name: {"packages": [READABILITY_PACKAGE_NAME], "install": lambda: None}, # prevent modifying system global npm packages
}
READABILITY_BINARY = ReadabilityBinary()
class ReadabilityExtractor(BaseExtractor):
name: str = 'readability'
binary: BinName = READABILITY_BINARY.name
def get_output_path(self, snapshot) -> Path:
return Path(snapshot.link_dir) / 'readability' / 'content.html'
READABILITY_BINARY = ReadabilityBinary()
READABILITY_EXTRACTOR = ReadabilityExtractor()
# class ReadabilityQueue(BaseQueue):
# name: str = 'singlefile'
# binaries: List[InstanceOf[BaseBinary]] = [READABILITY_BINARY]
# READABILITY_QUEUE = ReadabilityQueue()
class ReadabilityPlugin(BasePlugin):
app_label: str ='readability'
verbose_name: str = 'Readability'
hooks: List[InstanceOf[BaseHook]] = [
READABILITY_CONFIG,
READABILITY_BINARY,
READABILITY_EXTRACTOR,
# READABILITY_QUEUE,
]
PLUGIN = ReadabilityPlugin()
# PLUGIN.register(settings)
DJANGO_APP = PLUGIN.AppConfig

@@ -0,0 +1,27 @@
__package__ = 'plugins_extractor.readability'
from typing import List
from pydantic import InstanceOf
from pydantic_pkgr import BinProvider, BinaryOverrides, BinName
from abx.archivebox.base_binary import BaseBinary, env
from plugins_pkg.npm.binproviders import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER
from .config import READABILITY_CONFIG
READABILITY_PACKAGE_NAME = 'github:ArchiveBox/readability-extractor'
class ReadabilityBinary(BaseBinary):
name: BinName = READABILITY_CONFIG.READABILITY_BINARY
binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env]
overrides: BinaryOverrides = {
LIB_NPM_BINPROVIDER.name: {"packages": [READABILITY_PACKAGE_NAME]},
SYS_NPM_BINPROVIDER.name: {"packages": [READABILITY_PACKAGE_NAME], "install": lambda: None}, # prevent modifying system global npm packages
}
READABILITY_BINARY = ReadabilityBinary()

@@ -0,0 +1,19 @@
__package__ = 'plugins_extractor.readability'
from pydantic import Field
from abx.archivebox.base_configset import BaseConfigSet
from archivebox.config.common import ARCHIVING_CONFIG
class ReadabilityConfig(BaseConfigSet):
SAVE_READABILITY: bool = Field(default=True, alias='USE_READABILITY')
READABILITY_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
READABILITY_BINARY: str = Field(default='readability-extractor')
# READABILITY_EXTRA_ARGS: List[str] = [] # readability-extractor doesn't take any extra args
READABILITY_CONFIG = ReadabilityConfig()

@@ -0,0 +1,20 @@
__package__ = 'plugins_extractor.readability'
from pathlib import Path
from pydantic_pkgr import BinName
from abx.archivebox.base_extractor import BaseExtractor
from .binaries import READABILITY_BINARY
class ReadabilityExtractor(BaseExtractor):
name: str = 'readability'
binary: BinName = READABILITY_BINARY.name
def get_output_path(self, snapshot) -> Path:
return Path(snapshot.link_dir) / 'readability' / 'content.html'
READABILITY_EXTRACTOR = ReadabilityExtractor()

@@ -0,0 +1,51 @@
__package__ = 'plugins_extractor.singlefile'
__label__ = 'singlefile'
__version__ = '2024.10.14'
__author__ = 'Nick Sweeting'
__homepage__ = 'https://github.com/gildas-lormeau/singlefile'
__dependencies__ = ['npm']
import abx
@abx.hookimpl
def get_PLUGIN():
return {
'singlefile': {
'PACKAGE': __package__,
'LABEL': __label__,
'VERSION': __version__,
'AUTHOR': __author__,
'HOMEPAGE': __homepage__,
'DEPENDENCIES': __dependencies__,
}
}
@abx.hookimpl
def get_CONFIG():
from .config import SINGLEFILE_CONFIG
return {
'singlefile': SINGLEFILE_CONFIG
}
@abx.hookimpl
def get_BINARIES():
from .binaries import SINGLEFILE_BINARY
return {
'singlefile': SINGLEFILE_BINARY,
}
@abx.hookimpl
def get_EXTRACTORS():
from .extractors import SINGLEFILE_EXTRACTOR
return {
'singlefile': SINGLEFILE_EXTRACTOR,
}
# @abx.hookimpl
# def get_INSTALLED_APPS():
# # needed to load ./models.py
# return [__package__]

@@ -1,110 +0,0 @@
__package__ = 'archivebox.plugins_extractor.singlefile'
from pathlib import Path
from typing import List, Optional
# from typing_extensions import Self
# Depends on other PyPI/vendor packages:
from pydantic import InstanceOf, Field
from pydantic_pkgr import BinProvider, BinaryOverrides, BinName, bin_abspath
# Depends on other Django apps:
from abx.archivebox.base_plugin import BasePlugin
from abx.archivebox.base_configset import BaseConfigSet
from abx.archivebox.base_binary import BaseBinary, env
from abx.archivebox.base_extractor import BaseExtractor
from abx.archivebox.base_queue import BaseQueue
from abx.archivebox.base_hook import BaseHook
# Depends on Other Plugins:
from archivebox.config.common import ARCHIVING_CONFIG
from plugins_pkg.npm.apps import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER
###################### Config ##########################
class SinglefileConfig(BaseConfigSet):
SAVE_SINGLEFILE: bool = True
SINGLEFILE_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
SINGLEFILE_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
SINGLEFILE_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
SINGLEFILE_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE)
SINGLEFILE_BINARY: str = Field(default='single-file')
SINGLEFILE_EXTRA_ARGS: List[str] = []
SINGLEFILE_CONFIG = SinglefileConfig()
SINGLEFILE_MIN_VERSION = '1.1.54'
SINGLEFILE_MAX_VERSION = '1.1.60'
class SinglefileBinary(BaseBinary):
name: BinName = SINGLEFILE_CONFIG.SINGLEFILE_BINARY
binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env]
overrides: BinaryOverrides = {
LIB_NPM_BINPROVIDER.name: {
"abspath": lambda:
bin_abspath(SINGLEFILE_CONFIG.SINGLEFILE_BINARY, PATH=LIB_NPM_BINPROVIDER.PATH)
or bin_abspath("single-file", PATH=LIB_NPM_BINPROVIDER.PATH)
or bin_abspath("single-file-node.js", PATH=LIB_NPM_BINPROVIDER.PATH),
"packages": [f"single-file-cli@>={SINGLEFILE_MIN_VERSION} <{SINGLEFILE_MAX_VERSION}"],
},
SYS_NPM_BINPROVIDER.name: {
"abspath": lambda:
bin_abspath(SINGLEFILE_CONFIG.SINGLEFILE_BINARY, PATH=SYS_NPM_BINPROVIDER.PATH)
or bin_abspath("single-file", PATH=SYS_NPM_BINPROVIDER.PATH)
or bin_abspath("single-file-node.js", PATH=SYS_NPM_BINPROVIDER.PATH),
"packages": [f"single-file-cli@>={SINGLEFILE_MIN_VERSION} <{SINGLEFILE_MAX_VERSION}"],
"install": lambda: None,
},
env.name: {
'abspath': lambda:
bin_abspath(SINGLEFILE_CONFIG.SINGLEFILE_BINARY, PATH=env.PATH)
or bin_abspath('single-file', PATH=env.PATH)
or bin_abspath('single-file-node.js', PATH=env.PATH),
},
}
SINGLEFILE_BINARY = SinglefileBinary()
PLUGIN_BINARIES = [SINGLEFILE_BINARY]
class SinglefileExtractor(BaseExtractor):
name: str = 'singlefile'
binary: BinName = SINGLEFILE_BINARY.name
def get_output_path(self, snapshot) -> Path:
return Path(snapshot.link_dir) / 'singlefile.html'
SINGLEFILE_BINARY = SinglefileBinary()
SINGLEFILE_EXTRACTOR = SinglefileExtractor()
class SinglefileQueue(BaseQueue):
name: str = 'singlefile'
binaries: List[InstanceOf[BaseBinary]] = [SINGLEFILE_BINARY]
SINGLEFILE_QUEUE = SinglefileQueue()
class SinglefilePlugin(BasePlugin):
app_label: str ='singlefile'
verbose_name: str = 'SingleFile'
hooks: List[InstanceOf[BaseHook]] = [
SINGLEFILE_CONFIG,
SINGLEFILE_BINARY,
SINGLEFILE_EXTRACTOR,
SINGLEFILE_QUEUE,
]
PLUGIN = SinglefilePlugin()
# PLUGIN.register(settings)
DJANGO_APP = PLUGIN.AppConfig

@@ -0,0 +1,48 @@
__package__ = 'plugins_extractor.singlefile'
from typing import List
from pydantic import InstanceOf
from pydantic_pkgr import BinProvider, BinaryOverrides, BinName, bin_abspath
from abx.archivebox.base_binary import BaseBinary, env
from plugins_pkg.npm.binproviders import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER
from .config import SINGLEFILE_CONFIG
SINGLEFILE_MIN_VERSION = '1.1.54'
SINGLEFILE_MAX_VERSION = '1.1.60'
class SinglefileBinary(BaseBinary):
name: BinName = SINGLEFILE_CONFIG.SINGLEFILE_BINARY
binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env]
overrides: BinaryOverrides = {
LIB_NPM_BINPROVIDER.name: {
"abspath": lambda:
bin_abspath(SINGLEFILE_CONFIG.SINGLEFILE_BINARY, PATH=LIB_NPM_BINPROVIDER.PATH)
or bin_abspath("single-file", PATH=LIB_NPM_BINPROVIDER.PATH)
or bin_abspath("single-file-node.js", PATH=LIB_NPM_BINPROVIDER.PATH),
"packages": [f"single-file-cli@>={SINGLEFILE_MIN_VERSION} <{SINGLEFILE_MAX_VERSION}"],
},
SYS_NPM_BINPROVIDER.name: {
"abspath": lambda:
bin_abspath(SINGLEFILE_CONFIG.SINGLEFILE_BINARY, PATH=SYS_NPM_BINPROVIDER.PATH)
or bin_abspath("single-file", PATH=SYS_NPM_BINPROVIDER.PATH)
or bin_abspath("single-file-node.js", PATH=SYS_NPM_BINPROVIDER.PATH),
"packages": [f"single-file-cli@>={SINGLEFILE_MIN_VERSION} <{SINGLEFILE_MAX_VERSION}"],
"install": lambda: None,
},
env.name: {
'abspath': lambda:
bin_abspath(SINGLEFILE_CONFIG.SINGLEFILE_BINARY, PATH=env.PATH)
or bin_abspath('single-file', PATH=env.PATH)
or bin_abspath('single-file-node.js', PATH=env.PATH),
},
}
SINGLEFILE_BINARY = SinglefileBinary()

@@ -0,0 +1,25 @@
__package__ = 'plugins_extractor.singlefile'
from pathlib import Path
from typing import List, Optional
from pydantic import Field
from abx.archivebox.base_configset import BaseConfigSet
from archivebox.config.common import ARCHIVING_CONFIG
class SinglefileConfig(BaseConfigSet):
SAVE_SINGLEFILE: bool = True
SINGLEFILE_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
SINGLEFILE_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
SINGLEFILE_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
SINGLEFILE_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE)
SINGLEFILE_BINARY: str = Field(default='single-file')
SINGLEFILE_EXTRA_ARGS: List[str] = []
SINGLEFILE_CONFIG = SinglefileConfig()

@@ -0,0 +1,19 @@
__package__ = 'plugins_extractor.singlefile'
from pathlib import Path
from pydantic_pkgr import BinName
from abx.archivebox.base_extractor import BaseExtractor
from .binaries import SINGLEFILE_BINARY
class SinglefileExtractor(BaseExtractor):
name: str = 'singlefile'
binary: BinName = SINGLEFILE_BINARY.name
def get_output_path(self, snapshot) -> Path:
return Path(snapshot.link_dir) / 'singlefile.html'
SINGLEFILE_EXTRACTOR = SinglefileExtractor()

@@ -1,26 +0,0 @@
# Generated by Django 5.1.1 on 2024-09-10 05:05
from django.db import migrations
class Migration(migrations.Migration):
initial = True
dependencies = [
('core', '0074_alter_snapshot_downloaded_at'),
]
operations = [
migrations.CreateModel(
name='SinglefileResult',
fields=[
],
options={
'proxy': True,
'indexes': [],
'constraints': [],
},
bases=('core.archiveresult',),
),
]

@@ -1,40 +0,0 @@
__package__ = 'archivebox.queues'
import time
from django.core.cache import cache
from huey import crontab
from django_huey import db_task, on_startup, db_periodic_task
from huey_monitor.models import TaskModel
from huey_monitor.tqdm import ProcessInfo
@db_task(queue="singlefile", context=True)
def extract(url, out_dir, config, task=None, parent_task_id=None):
if task and parent_task_id:
TaskModel.objects.set_parent_task(main_task_id=parent_task_id, sub_task_id=task.id)
process_info = ProcessInfo(task, desc="extract_singlefile", parent_task_id=parent_task_id, total=1)
time.sleep(5)
process_info.update(n=1)
return {'output': 'singlefile.html', 'status': 'succeeded'}
# @on_startup(queue='singlefile')
# def start_singlefile_queue():
# print("[+] Starting singlefile worker...")
# update_version.call_local()
# @db_periodic_task(crontab(minute='*/5'), queue='singlefile')
# def update_version():
# print('[*] Updating singlefile version... 5 minute interval')
# from django.conf import settings
# bin = settings.BINARIES.SinglefileBinary.load()
# if bin.version:
# cache.set(f"bin:abspath:{bin.name}", bin.abspath)
# cache.set(f"bin:version:{bin.name}:{bin.abspath}", bin.version)
# print('[√] Updated singlefile version:', bin.version, bin.abspath)

@@ -0,0 +1,47 @@
__package__ = 'plugins_extractor.wget'
__label__ = 'wget'
__version__ = '2024.10.14'
__author__ = 'Nick Sweeting'
__homepage__ = 'https://github.com/ArchiveBox/ArchiveBox/tree/main/archivebox/plugins_extractor/wget'
__dependencies__ = []
import abx
@abx.hookimpl
def get_PLUGIN():
return {
'wget': {
'PACKAGE': __package__,
'LABEL': __label__,
'VERSION': __version__,
'AUTHOR': __author__,
'HOMEPAGE': __homepage__,
'DEPENDENCIES': __dependencies__,
}
}
@abx.hookimpl
def get_CONFIG():
from .config import WGET_CONFIG
return {
'wget': WGET_CONFIG
}
@abx.hookimpl
def get_BINARIES():
from .binaries import WGET_BINARY
return {
'wget': WGET_BINARY,
}
@abx.hookimpl
def get_EXTRACTORS():
from .extractors import WGET_EXTRACTOR, WARC_EXTRACTOR
return {
'wget': WGET_EXTRACTOR,
'warc': WARC_EXTRACTOR,
}

@@ -1,127 +0,0 @@
__package__ = 'plugins_extractor.wget'
import sys
from typing import List, Optional
from pathlib import Path
from subprocess import run, DEVNULL
from rich import print
from pydantic import InstanceOf, Field, model_validator
from pydantic_pkgr import BinProvider, BinName
from abx.archivebox.base_plugin import BasePlugin, BaseHook
from abx.archivebox.base_configset import BaseConfigSet
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
from archivebox.config.common import ARCHIVING_CONFIG, STORAGE_CONFIG
from .wget_util import wget_output_path
class WgetConfig(BaseConfigSet):
SAVE_WGET: bool = True
SAVE_WARC: bool = True
USE_WGET: bool = Field(default=lambda c: c.SAVE_WGET or c.SAVE_WARC)
WGET_BINARY: str = Field(default='wget')
WGET_ARGS: List[str] = [
'--no-verbose',
'--adjust-extension',
'--convert-links',
'--force-directories',
'--backup-converted',
'--span-hosts',
'--no-parent',
'-e', 'robots=off',
]
WGET_EXTRA_ARGS: List[str] = []
SAVE_WGET_REQUISITES: bool = Field(default=True)
WGET_RESTRICT_FILE_NAMES: str = Field(default=lambda: STORAGE_CONFIG.RESTRICT_FILE_NAMES)
WGET_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
WGET_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
WGET_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
WGET_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE)
@model_validator(mode='after')
def validate_use_ytdlp(self):
if self.USE_WGET and self.WGET_TIMEOUT < 10:
print(f'[red][!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={self.WGET_TIMEOUT} seconds)[/red]', file=sys.stderr)
print(' wget will fail to archive any sites if set to less than ~20 seconds.', file=sys.stderr)
print(' (Setting it somewhere over 60 seconds is recommended)', file=sys.stderr)
print(file=sys.stderr)
print(' If you want to disable media archiving entirely, set SAVE_MEDIA=False instead:', file=sys.stderr)
print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#save_media', file=sys.stderr)
print(file=sys.stderr)
return self
@property
def WGET_AUTO_COMPRESSION(self) -> bool:
if hasattr(self, '_WGET_AUTO_COMPRESSION'):
return self._WGET_AUTO_COMPRESSION
try:
cmd = [
self.WGET_BINARY,
"--compression=auto",
"--help",
]
self._WGET_AUTO_COMPRESSION = not run(cmd, stdout=DEVNULL, stderr=DEVNULL, timeout=3).returncode
return self._WGET_AUTO_COMPRESSION
except (FileNotFoundError, OSError):
self._WGET_AUTO_COMPRESSION = False
return False
WGET_CONFIG = WgetConfig()
class WgetBinary(BaseBinary):
name: BinName = WGET_CONFIG.WGET_BINARY
binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
WGET_BINARY = WgetBinary()
class WgetExtractor(BaseExtractor):
name: ExtractorName = 'wget'
binary: BinName = WGET_BINARY.name
def get_output_path(self, snapshot) -> Path | None:
wget_index_path = wget_output_path(snapshot.as_link())
if wget_index_path:
return Path(wget_index_path)
return None
WGET_EXTRACTOR = WgetExtractor()
class WarcExtractor(BaseExtractor):
name: ExtractorName = 'warc'
binary: BinName = WGET_BINARY.name
def get_output_path(self, snapshot) -> Path | None:
warc_files = list((Path(snapshot.link_dir) / 'warc').glob('*.warc.gz'))
if warc_files:
return sorted(warc_files, key=lambda x: x.stat().st_size, reverse=True)[0]
return None
WARC_EXTRACTOR = WarcExtractor()
class WgetPlugin(BasePlugin):
app_label: str = 'wget'
verbose_name: str = 'WGET'
hooks: List[InstanceOf[BaseHook]] = [
WGET_CONFIG,
WGET_BINARY,
WGET_EXTRACTOR,
WARC_EXTRACTOR,
]
PLUGIN = WgetPlugin()
DJANGO_APP = PLUGIN.AppConfig

@@ -0,0 +1,18 @@
__package__ = 'plugins_extractor.wget'
from typing import List
from pydantic import InstanceOf
from pydantic_pkgr import BinProvider, BinName
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
from .config import WGET_CONFIG
class WgetBinary(BaseBinary):
name: BinName = WGET_CONFIG.WGET_BINARY
binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
WGET_BINARY = WgetBinary()

@@ -0,0 +1,72 @@
__package__ = 'plugins_extractor.wget'
import subprocess
from typing import List, Optional
from pathlib import Path
from pydantic import Field, model_validator
from abx.archivebox.base_configset import BaseConfigSet
from archivebox.config.common import ARCHIVING_CONFIG, STORAGE_CONFIG
from archivebox.misc.logging import STDERR
class WgetConfig(BaseConfigSet):
SAVE_WGET: bool = True
SAVE_WARC: bool = True
USE_WGET: bool = Field(default=lambda c: c.SAVE_WGET or c.SAVE_WARC)
WGET_BINARY: str = Field(default='wget')
WGET_ARGS: List[str] = [
'--no-verbose',
'--adjust-extension',
'--convert-links',
'--force-directories',
'--backup-converted',
'--span-hosts',
'--no-parent',
'-e', 'robots=off',
]
WGET_EXTRA_ARGS: List[str] = []
SAVE_WGET_REQUISITES: bool = Field(default=True)
WGET_RESTRICT_FILE_NAMES: str = Field(default=lambda: STORAGE_CONFIG.RESTRICT_FILE_NAMES)
WGET_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
WGET_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
WGET_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
WGET_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE)
    @model_validator(mode='after')
    def validate_use_wget(self):
        if self.USE_WGET and self.WGET_TIMEOUT < 10:
            STDERR.print(f'[red][!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={self.WGET_TIMEOUT} seconds)[/red]')
            STDERR.print(' wget will fail to archive any sites if set to less than ~20 seconds.')
            STDERR.print(' (Setting it somewhere over 60 seconds is recommended)')
            STDERR.print()
            STDERR.print(' If you want to disable wget archiving entirely, set SAVE_WGET=False and SAVE_WARC=False instead:')
            STDERR.print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles')
            STDERR.print()
        return self
@property
def WGET_AUTO_COMPRESSION(self) -> bool:
if hasattr(self, '_WGET_AUTO_COMPRESSION'):
return self._WGET_AUTO_COMPRESSION
try:
cmd = [
self.WGET_BINARY,
"--compression=auto",
"--help",
]
self._WGET_AUTO_COMPRESSION = not subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, timeout=3).returncode
return self._WGET_AUTO_COMPRESSION
except (FileNotFoundError, OSError):
self._WGET_AUTO_COMPRESSION = False
return False
WGET_CONFIG = WgetConfig()

@@ -0,0 +1,37 @@
__package__ = 'plugins_extractor.wget'
from pathlib import Path
from pydantic_pkgr import BinName
from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
from .binaries import WGET_BINARY
from .wget_util import wget_output_path
class WgetExtractor(BaseExtractor):
name: ExtractorName = 'wget'
binary: BinName = WGET_BINARY.name
def get_output_path(self, snapshot) -> Path | None:
wget_index_path = wget_output_path(snapshot.as_link())
if wget_index_path:
return Path(wget_index_path)
return None
WGET_EXTRACTOR = WgetExtractor()
class WarcExtractor(BaseExtractor):
name: ExtractorName = 'warc'
binary: BinName = WGET_BINARY.name
def get_output_path(self, snapshot) -> Path | None:
warc_files = list((Path(snapshot.link_dir) / 'warc').glob('*.warc.gz'))
if warc_files:
return sorted(warc_files, key=lambda x: x.stat().st_size, reverse=True)[0]
return None
WARC_EXTRACTOR = WarcExtractor()

@@ -0,0 +1,37 @@
__package__ = 'plugins_extractor.ytdlp'
__label__ = 'YT-DLP'
__version__ = '2024.10.14'
__author__ = 'Nick Sweeting'
__homepage__ = 'https://github.com/yt-dlp/yt-dlp'
import abx
@abx.hookimpl
def get_PLUGIN():
return {
'ytdlp': {
'PACKAGE': __package__,
'LABEL': __label__,
'VERSION': __version__,
'AUTHOR': __author__,
'HOMEPAGE': __homepage__,
}
}
@abx.hookimpl
def get_CONFIG():
from .config import YTDLP_CONFIG
return {
'ytdlp': YTDLP_CONFIG
}
@abx.hookimpl
def get_BINARIES():
from .binaries import YTDLP_BINARY, FFMPEG_BINARY
return {
'ytdlp': YTDLP_BINARY,
'ffmpeg': FFMPEG_BINARY,
}

@@ -1,98 +0,0 @@
import sys
from typing import List
from subprocess import run, PIPE
from rich import print
from pydantic import InstanceOf, Field, model_validator, AliasChoices
from pydantic_pkgr import BinProvider, BinName, BinaryOverrides
from abx.archivebox.base_plugin import BasePlugin
from abx.archivebox.base_configset import BaseConfigSet
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
from abx.archivebox.base_hook import BaseHook
from archivebox.config.common import ARCHIVING_CONFIG
from plugins_pkg.pip.apps import pip
###################### Config ##########################
class YtdlpConfig(BaseConfigSet):
USE_YTDLP: bool = Field(default=True, validation_alias=AliasChoices('USE_YOUTUBEDL', 'SAVE_MEDIA'))
YTDLP_BINARY: str = Field(default='yt-dlp', alias='YOUTUBEDL_BINARY')
YTDLP_EXTRA_ARGS: List[str] = Field(default=[], alias='YOUTUBEDL_EXTRA_ARGS')
YTDLP_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
YTDLP_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.MEDIA_TIMEOUT)
@model_validator(mode='after')
def validate_use_ytdlp(self):
if self.USE_YTDLP and self.YTDLP_TIMEOUT < 20:
print(f'[red][!] Warning: MEDIA_TIMEOUT is set too low! (currently set to MEDIA_TIMEOUT={self.YTDLP_TIMEOUT} seconds)[/red]', file=sys.stderr)
print(' youtube-dl/yt-dlp will fail to archive any media if set to less than ~20 seconds.', file=sys.stderr)
print(' (Setting it somewhere over 60 seconds is recommended)', file=sys.stderr)
print(file=sys.stderr)
print(' If you want to disable media archiving entirely, set SAVE_MEDIA=False instead:', file=sys.stderr)
print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#save_media', file=sys.stderr)
print(file=sys.stderr)
return self
YTDLP_CONFIG = YtdlpConfig()
class YtdlpBinary(BaseBinary):
name: BinName = YTDLP_CONFIG.YTDLP_BINARY
binproviders_supported: List[InstanceOf[BinProvider]] = [pip, apt, brew, env]
YTDLP_BINARY = YtdlpBinary()
class FfmpegBinary(BaseBinary):
name: BinName = 'ffmpeg'
binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
overrides: BinaryOverrides = {
'env': {
# 'abspath': lambda: shutil.which('ffmpeg', PATH=env.PATH),
'version': lambda: run(['ffmpeg', '-version'], stdout=PIPE, stderr=PIPE, text=True).stdout,
},
'apt': {
# 'abspath': lambda: shutil.which('ffmpeg', PATH=apt.PATH),
'version': lambda: run(['apt', 'show', 'ffmpeg'], stdout=PIPE, stderr=PIPE, text=True).stdout,
},
'brew': {
# 'abspath': lambda: shutil.which('ffmpeg', PATH=brew.PATH),
'version': lambda: run(['brew', 'info', 'ffmpeg', '--quiet'], stdout=PIPE, stderr=PIPE, text=True).stdout,
},
}
# def get_ffmpeg_version(self) -> Optional[str]:
# return self.exec(cmd=['-version']).stdout
FFMPEG_BINARY = FfmpegBinary()
# class YtdlpExtractor(BaseExtractor):
# name: str = 'ytdlp'
# binary: str = 'ytdlp'
class YtdlpPlugin(BasePlugin):
app_label: str = 'ytdlp'
verbose_name: str = 'YT-DLP'
docs_url: str = 'https://github.com/yt-dlp/yt-dlp'
hooks: List[InstanceOf[BaseHook]] = [
YTDLP_CONFIG,
YTDLP_BINARY,
FFMPEG_BINARY,
]
PLUGIN = YtdlpPlugin()
# PLUGIN.register(settings)
DJANGO_APP = PLUGIN.AppConfig

@@ -0,0 +1,42 @@
__package__ = 'plugins_extractor.ytdlp'
import subprocess
from typing import List
from pydantic import InstanceOf
from pydantic_pkgr import BinProvider, BinName, BinaryOverrides
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
from plugins_pkg.pip.binproviders import LIB_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, SYS_PIP_BINPROVIDER
from .config import YTDLP_CONFIG
class YtdlpBinary(BaseBinary):
name: BinName = YTDLP_CONFIG.YTDLP_BINARY
binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, SYS_PIP_BINPROVIDER, apt, brew, env]
YTDLP_BINARY = YtdlpBinary()
class FfmpegBinary(BaseBinary):
name: BinName = 'ffmpeg'
binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
overrides: BinaryOverrides = {
'env': {
# 'abspath': lambda: shutil.which('ffmpeg', PATH=env.PATH),
'version': lambda: subprocess.run(['ffmpeg', '-version'], capture_output=True, text=True).stdout,
},
'apt': {
# 'abspath': lambda: shutil.which('ffmpeg', PATH=apt.PATH),
'version': lambda: subprocess.run(['apt', 'show', 'ffmpeg'], capture_output=True, text=True).stdout,
},
'brew': {
# 'abspath': lambda: shutil.which('ffmpeg', PATH=brew.PATH),
'version': lambda: subprocess.run(['brew', 'info', 'ffmpeg', '--quiet'], capture_output=True, text=True).stdout,
},
}
FFMPEG_BINARY = FfmpegBinary()

@@ -0,0 +1,35 @@
__package__ = 'plugins_extractor.ytdlp'
from typing import List
from pydantic import Field, model_validator, AliasChoices
from abx.archivebox.base_configset import BaseConfigSet
from archivebox.config.common import ARCHIVING_CONFIG
from archivebox.misc.logging import STDERR
class YtdlpConfig(BaseConfigSet):
USE_YTDLP: bool = Field(default=True, validation_alias=AliasChoices('USE_YOUTUBEDL', 'SAVE_MEDIA'))
YTDLP_BINARY: str = Field(default='yt-dlp', alias='YOUTUBEDL_BINARY')
YTDLP_EXTRA_ARGS: List[str] = Field(default=[], alias='YOUTUBEDL_EXTRA_ARGS')
YTDLP_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
YTDLP_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.MEDIA_TIMEOUT)
@model_validator(mode='after')
def validate_use_ytdlp(self):
if self.USE_YTDLP and self.YTDLP_TIMEOUT < 20:
STDERR.print(f'[red][!] Warning: MEDIA_TIMEOUT is set too low! (currently set to MEDIA_TIMEOUT={self.YTDLP_TIMEOUT} seconds)[/red]')
STDERR.print(' youtube-dl/yt-dlp will fail to archive any media if set to less than ~20 seconds.')
STDERR.print(' (Setting it somewhere over 60 seconds is recommended)')
STDERR.print()
STDERR.print(' If you want to disable media archiving entirely, set SAVE_MEDIA=False instead:')
STDERR.print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#save_media')
STDERR.print()
return self
YTDLP_CONFIG = YtdlpConfig()