mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-14 07:04:27 -04:00
new vastly simplified plugin spec without pydantic
Some checks are pending
Build Debian package / build (push) Waiting to run
Build Docker image / buildx (push) Waiting to run
Build Homebrew package / build (push) Waiting to run
Run linters / lint (push) Waiting to run
Build Pip package / build (push) Waiting to run
Run tests / python_tests (ubuntu-22.04, 3.11) (push) Waiting to run
Run tests / docker_tests (push) Waiting to run
Some checks are pending
Build Debian package / build (push) Waiting to run
Build Docker image / buildx (push) Waiting to run
Build Homebrew package / build (push) Waiting to run
Run linters / lint (push) Waiting to run
Build Pip package / build (push) Waiting to run
Run tests / python_tests (ubuntu-22.04, 3.11) (push) Waiting to run
Run tests / docker_tests (push) Waiting to run
This commit is contained in:
parent
abf75f49f4
commit
01ba6d49d3
115 changed files with 2466 additions and 2301 deletions
39
archivebox/plugins_extractor/archivedotorg/__init__.py
Normal file
39
archivebox/plugins_extractor/archivedotorg/__init__.py
Normal file
|
@ -0,0 +1,39 @@
|
|||
__package__ = 'plugins_extractor.archivedotorg'
|
||||
__label__ = 'archivedotorg'
|
||||
__version__ = '2024.10.14'
|
||||
__author__ = 'Nick Sweeting'
|
||||
__homepage__ = 'https://archive.org'
|
||||
__dependencies__ = []
|
||||
|
||||
import abx
|
||||
|
||||
|
||||
@abx.hookimpl
|
||||
def get_PLUGIN():
|
||||
return {
|
||||
'archivedotorg': {
|
||||
'PACKAGE': __package__,
|
||||
'LABEL': __label__,
|
||||
'VERSION': __version__,
|
||||
'AUTHOR': __author__,
|
||||
'HOMEPAGE': __homepage__,
|
||||
'DEPENDENCIES': __dependencies__,
|
||||
}
|
||||
}
|
||||
|
||||
@abx.hookimpl
|
||||
def get_CONFIG():
|
||||
from .config import ARCHIVEDOTORG_CONFIG
|
||||
|
||||
return {
|
||||
'archivedotorg': ARCHIVEDOTORG_CONFIG
|
||||
}
|
||||
|
||||
|
||||
# @abx.hookimpl
|
||||
# def get_EXTRACTORS():
|
||||
# from .extractors import ARCHIVEDOTORG_EXTRACTOR
|
||||
#
|
||||
# return {
|
||||
# 'archivedotorg': ARCHIVEDOTORG_EXTRACTOR,
|
||||
# }
|
|
@ -1,28 +0,0 @@
|
|||
__package__ = 'archivebox.plugins_extractor.archivedotorg'
|
||||
|
||||
from typing import List
|
||||
|
||||
from abx.archivebox.base_plugin import BasePlugin
|
||||
from abx.archivebox.base_configset import BaseConfigSet
|
||||
from abx.archivebox.base_hook import BaseHook
|
||||
|
||||
###################### Config ##########################
|
||||
|
||||
|
||||
class ArchivedotorgConfig(BaseConfigSet):
|
||||
SAVE_ARCHIVE_DOT_ORG: bool = True
|
||||
|
||||
|
||||
ARCHIVEDOTORG_CONFIG = ArchivedotorgConfig()
|
||||
|
||||
|
||||
class ArchivedotorgPlugin(BasePlugin):
|
||||
app_label: str = 'archivedotorg'
|
||||
verbose_name: str = 'Archive.org'
|
||||
|
||||
hooks: List[BaseHook] = [
|
||||
ARCHIVEDOTORG_CONFIG
|
||||
]
|
||||
|
||||
PLUGIN = ArchivedotorgPlugin()
|
||||
DJANGO_APP = PLUGIN.AppConfig
|
11
archivebox/plugins_extractor/archivedotorg/config.py
Normal file
11
archivebox/plugins_extractor/archivedotorg/config.py
Normal file
|
@ -0,0 +1,11 @@
|
|||
__package__ = 'plugins_extractor.archivedotorg'
|
||||
|
||||
|
||||
from abx.archivebox.base_configset import BaseConfigSet
|
||||
|
||||
|
||||
class ArchivedotorgConfig(BaseConfigSet):
|
||||
SAVE_ARCHIVE_DOT_ORG: bool = True
|
||||
|
||||
|
||||
ARCHIVEDOTORG_CONFIG = ArchivedotorgConfig()
|
|
@ -0,0 +1,46 @@
|
|||
__package__ = 'plugins_extractor.chrome'
|
||||
__label__ = 'chrome'
|
||||
__version__ = '2024.10.14'
|
||||
__author__ = 'Nick Sweeting'
|
||||
__homepage__ = 'https://github.com/ArchiveBox/ArchiveBox/tree/main/archivebox/plugins_extractor/chrome'
|
||||
__dependencies__ = []
|
||||
|
||||
import abx
|
||||
|
||||
|
||||
@abx.hookimpl
|
||||
def get_PLUGIN():
|
||||
return {
|
||||
'chrome': {
|
||||
'PACKAGE': __package__,
|
||||
'LABEL': __label__,
|
||||
'VERSION': __version__,
|
||||
'AUTHOR': __author__,
|
||||
'HOMEPAGE': __homepage__,
|
||||
'DEPENDENCIES': __dependencies__,
|
||||
}
|
||||
}
|
||||
|
||||
@abx.hookimpl
|
||||
def get_CONFIG():
|
||||
from .config import CHROME_CONFIG
|
||||
|
||||
return {
|
||||
'chrome': CHROME_CONFIG
|
||||
}
|
||||
|
||||
@abx.hookimpl
|
||||
def get_BINARIES():
|
||||
from .binaries import CHROME_BINARY
|
||||
|
||||
return {
|
||||
'chrome': CHROME_BINARY,
|
||||
}
|
||||
|
||||
# @abx.hookimpl
|
||||
# def get_EXTRACTORS():
|
||||
# return {
|
||||
# 'pdf': PDF_EXTRACTOR,
|
||||
# 'screenshot': SCREENSHOT_EXTRACTOR,
|
||||
# 'dom': DOM_EXTRACTOR,
|
||||
# }
|
145
archivebox/plugins_extractor/chrome/binaries.py
Normal file
145
archivebox/plugins_extractor/chrome/binaries.py
Normal file
|
@ -0,0 +1,145 @@
|
|||
__package__ = 'plugins_extractor.chrome'
|
||||
|
||||
import os
|
||||
import platform
|
||||
from pathlib import Path
|
||||
from typing import List, Optional
|
||||
|
||||
from pydantic import InstanceOf
|
||||
from pydantic_pkgr import (
|
||||
BinProvider,
|
||||
BinName,
|
||||
BinaryOverrides,
|
||||
bin_abspath,
|
||||
)
|
||||
|
||||
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
|
||||
|
||||
# Depends on Other Plugins:
|
||||
from archivebox.config import CONSTANTS
|
||||
from archivebox.config.common import SHELL_CONFIG
|
||||
from plugins_pkg.puppeteer.binproviders import PUPPETEER_BINPROVIDER
|
||||
from plugins_pkg.playwright.binproviders import PLAYWRIGHT_BINPROVIDER
|
||||
|
||||
|
||||
from .config import CHROME_CONFIG
|
||||
CHROMIUM_BINARY_NAMES_LINUX = [
|
||||
"chromium",
|
||||
"chromium-browser",
|
||||
"chromium-browser-beta",
|
||||
"chromium-browser-unstable",
|
||||
"chromium-browser-canary",
|
||||
"chromium-browser-dev",
|
||||
]
|
||||
CHROMIUM_BINARY_NAMES_MACOS = ["/Applications/Chromium.app/Contents/MacOS/Chromium"]
|
||||
CHROMIUM_BINARY_NAMES = CHROMIUM_BINARY_NAMES_LINUX + CHROMIUM_BINARY_NAMES_MACOS
|
||||
|
||||
CHROME_BINARY_NAMES_LINUX = [
|
||||
"google-chrome",
|
||||
"google-chrome-stable",
|
||||
"google-chrome-beta",
|
||||
"google-chrome-canary",
|
||||
"google-chrome-unstable",
|
||||
"google-chrome-dev",
|
||||
"chrome"
|
||||
]
|
||||
CHROME_BINARY_NAMES_MACOS = [
|
||||
"/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
|
||||
"/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary",
|
||||
]
|
||||
CHROME_BINARY_NAMES = CHROME_BINARY_NAMES_LINUX + CHROME_BINARY_NAMES_MACOS
|
||||
|
||||
APT_DEPENDENCIES = [
|
||||
'apt-transport-https', 'at-spi2-common', 'chromium-browser',
|
||||
'fontconfig', 'fonts-freefont-ttf', 'fonts-ipafont-gothic', 'fonts-kacst', 'fonts-khmeros', 'fonts-liberation', 'fonts-noto', 'fonts-noto-color-emoji', 'fonts-symbola', 'fonts-thai-tlwg', 'fonts-tlwg-loma-otf', 'fonts-unifont', 'fonts-wqy-zenhei',
|
||||
'libasound2', 'libatk-bridge2.0-0', 'libatk1.0-0', 'libatspi2.0-0', 'libavahi-client3', 'libavahi-common-data', 'libavahi-common3', 'libcairo2', 'libcups2',
|
||||
'libdbus-1-3', 'libdrm2', 'libfontenc1', 'libgbm1', 'libglib2.0-0', 'libice6', 'libnspr4', 'libnss3', 'libsm6', 'libunwind8', 'libx11-6', 'libxaw7', 'libxcb1',
|
||||
'libxcomposite1', 'libxdamage1', 'libxext6', 'libxfixes3', 'libxfont2', 'libxkbcommon0', 'libxkbfile1', 'libxmu6', 'libxpm4', 'libxrandr2', 'libxt6', 'x11-utils', 'x11-xkb-utils', 'xfonts-encodings',
|
||||
]
|
||||
|
||||
|
||||
def autodetect_system_chrome_install(PATH=None) -> Optional[Path]:
|
||||
for bin_name in CHROME_BINARY_NAMES + CHROMIUM_BINARY_NAMES:
|
||||
abspath = bin_abspath(bin_name, PATH=env.PATH)
|
||||
if abspath:
|
||||
return abspath
|
||||
return None
|
||||
|
||||
def create_macos_app_symlink(target: Path, shortcut: Path):
|
||||
"""
|
||||
on macOS, some binaries are inside of .app, so we need to
|
||||
create a tiny bash script instead of a symlink
|
||||
(so that ../ parent relationships are relative to original .app instead of callsite dir)
|
||||
"""
|
||||
# TODO: should we enforce this? is it useful in any other situation?
|
||||
# if platform.system().lower() != 'darwin':
|
||||
# raise Exception(...)
|
||||
shortcut.unlink(missing_ok=True)
|
||||
shortcut.write_text(f"""#!/usr/bin/env bash\nexec '{target}' "$@"\n""")
|
||||
shortcut.chmod(0o777) # make sure its executable by everyone
|
||||
|
||||
###################### Config ##########################
|
||||
|
||||
|
||||
class ChromeBinary(BaseBinary):
|
||||
name: BinName = CHROME_CONFIG.CHROME_BINARY
|
||||
binproviders_supported: List[InstanceOf[BinProvider]] = [PUPPETEER_BINPROVIDER, env, PLAYWRIGHT_BINPROVIDER, apt, brew]
|
||||
|
||||
overrides: BinaryOverrides = {
|
||||
env.name: {
|
||||
'abspath': lambda: autodetect_system_chrome_install(PATH=env.PATH), # /usr/bin/google-chrome-stable
|
||||
},
|
||||
PUPPETEER_BINPROVIDER.name: {
|
||||
'packages': ['chrome@stable'], # npx @puppeteer/browsers install chrome@stable
|
||||
},
|
||||
PLAYWRIGHT_BINPROVIDER.name: {
|
||||
'packages': ['chromium'], # playwright install chromium
|
||||
},
|
||||
apt.name: {
|
||||
'packages': APT_DEPENDENCIES,
|
||||
},
|
||||
brew.name: {
|
||||
'packages': ['--cask', 'chromium'],
|
||||
},
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
def symlink_to_lib(binary, bin_dir=CONSTANTS.LIB_BIN_DIR) -> None:
|
||||
if not (binary.abspath and os.access(binary.abspath, os.F_OK)):
|
||||
return
|
||||
|
||||
bin_dir.mkdir(parents=True, exist_ok=True)
|
||||
symlink = bin_dir / binary.name
|
||||
|
||||
try:
|
||||
if platform.system().lower() == 'darwin':
|
||||
# if on macOS, browser binary is inside a .app, so we need to create a tiny bash script instead of a symlink
|
||||
create_macos_app_symlink(binary.abspath, symlink)
|
||||
else:
|
||||
# otherwise on linux we can symlink directly to binary executable
|
||||
symlink.unlink(missing_ok=True)
|
||||
symlink.symlink_to(binary.abspath)
|
||||
except Exception as err:
|
||||
# print(f'[red]:warning: Failed to symlink {symlink} -> {binary.abspath}[/red] {err}')
|
||||
# not actually needed, we can just run without it
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def chrome_cleanup_lockfile():
|
||||
"""
|
||||
Cleans up any state or runtime files that chrome leaves behind when killed by
|
||||
a timeout or other error
|
||||
"""
|
||||
lock_file = Path("~/.config/chromium/SingletonLock").expanduser()
|
||||
|
||||
if SHELL_CONFIG.IN_DOCKER and os.access(lock_file, os.F_OK):
|
||||
lock_file.unlink()
|
||||
|
||||
if CHROME_CONFIG.CHROME_USER_DATA_DIR:
|
||||
if os.access(CHROME_CONFIG.CHROME_USER_DATA_DIR / 'SingletonLock', os.F_OK):
|
||||
lock_file.unlink()
|
||||
|
||||
|
||||
|
||||
CHROME_BINARY = ChromeBinary()
|
||||
|
|
@ -1,35 +1,18 @@
|
|||
__package__ = 'archivebox.plugins_extractor.chrome'
|
||||
__package__ = 'plugins_extractor.chrome'
|
||||
|
||||
import os
|
||||
import sys
|
||||
import platform
|
||||
|
||||
from pathlib import Path
|
||||
from typing import List, Optional
|
||||
|
||||
# Depends on other PyPI/vendor packages:
|
||||
from rich import print
|
||||
from pydantic import InstanceOf, Field, model_validator
|
||||
from pydantic_pkgr import (
|
||||
BinProvider,
|
||||
BinName,
|
||||
BinaryOverrides,
|
||||
bin_abspath,
|
||||
)
|
||||
from pydantic import Field, model_validator
|
||||
from pydantic_pkgr import bin_abspath
|
||||
|
||||
# Depends on other Django apps:
|
||||
from abx.archivebox.base_plugin import BasePlugin
|
||||
from abx.archivebox.base_configset import BaseConfigSet
|
||||
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
|
||||
# from abx.archivebox.base_extractor import BaseExtractor
|
||||
# from abx.archivebox.base_queue import BaseQueue
|
||||
from abx.archivebox.base_hook import BaseHook
|
||||
from abx.archivebox.base_binary import env
|
||||
|
||||
# Depends on Other Plugins:
|
||||
from archivebox.config import CONSTANTS
|
||||
from archivebox.config.common import ARCHIVING_CONFIG, SHELL_CONFIG
|
||||
from plugins_pkg.puppeteer.apps import PUPPETEER_BINPROVIDER
|
||||
from plugins_pkg.playwright.apps import PLAYWRIGHT_BINPROVIDER
|
||||
|
||||
from archivebox.misc.logging import STDERR
|
||||
from archivebox.misc.util import dedupe
|
||||
|
||||
|
||||
|
@ -129,33 +112,34 @@ class ChromeConfig(BaseConfigSet):
|
|||
@model_validator(mode='after')
|
||||
def validate_use_chrome(self):
|
||||
if self.USE_CHROME and self.CHROME_TIMEOUT < 15:
|
||||
print(f'[red][!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={self.CHROME_TIMEOUT} seconds)[/red]', file=sys.stderr)
|
||||
print(' Chrome will fail to archive all sites if set to less than ~15 seconds.', file=sys.stderr)
|
||||
print(' (Setting it to somewhere between 30 and 300 seconds is recommended)', file=sys.stderr)
|
||||
print(file=sys.stderr)
|
||||
print(' If you want to make ArchiveBox run faster, disable specific archive methods instead:', file=sys.stderr)
|
||||
print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles', file=sys.stderr)
|
||||
print(file=sys.stderr)
|
||||
STDERR.print(f'[red][!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={self.CHROME_TIMEOUT} seconds)[/red]')
|
||||
STDERR.print(' Chrome will fail to archive all sites if set to less than ~15 seconds.')
|
||||
STDERR.print(' (Setting it to somewhere between 30 and 300 seconds is recommended)')
|
||||
STDERR.print()
|
||||
STDERR.print(' If you want to make ArchiveBox run faster, disable specific archive methods instead:')
|
||||
STDERR.print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles')
|
||||
STDERR.print()
|
||||
|
||||
# if user has specified a user data dir, make sure its valid
|
||||
if self.CHROME_USER_DATA_DIR and os.access(self.CHROME_USER_DATA_DIR, os.R_OK):
|
||||
# check to make sure user_data_dir/<profile_name> exists
|
||||
if not (self.CHROME_USER_DATA_DIR / self.CHROME_PROFILE_NAME).is_dir():
|
||||
print(f'[red][X] Could not find profile "{self.CHROME_PROFILE_NAME}" in CHROME_USER_DATA_DIR.[/red]', file=sys.stderr)
|
||||
print(f' {self.CHROME_USER_DATA_DIR}', file=sys.stderr)
|
||||
print(' Make sure you set it to a Chrome user data directory containing a Default profile folder.', file=sys.stderr)
|
||||
print(' For more info see:', file=sys.stderr)
|
||||
print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#CHROME_USER_DATA_DIR', file=sys.stderr)
|
||||
STDERR.print(f'[red][X] Could not find profile "{self.CHROME_PROFILE_NAME}" in CHROME_USER_DATA_DIR.[/red]')
|
||||
STDERR.print(f' {self.CHROME_USER_DATA_DIR}')
|
||||
STDERR.print(' Make sure you set it to a Chrome user data directory containing a Default profile folder.')
|
||||
STDERR.print(' For more info see:')
|
||||
STDERR.print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#CHROME_USER_DATA_DIR')
|
||||
if '/Default' in str(self.CHROME_USER_DATA_DIR):
|
||||
print(file=sys.stderr)
|
||||
print(' Try removing /Default from the end e.g.:', file=sys.stderr)
|
||||
print(' CHROME_USER_DATA_DIR="{}"'.format(str(self.CHROME_USER_DATA_DIR).split('/Default')[0]), file=sys.stderr)
|
||||
STDERR.print()
|
||||
STDERR.print(' Try removing /Default from the end e.g.:')
|
||||
STDERR.print(' CHROME_USER_DATA_DIR="{}"'.format(str(self.CHROME_USER_DATA_DIR).split('/Default')[0]))
|
||||
|
||||
# hard error is too annoying here, instead just set it to nothing
|
||||
# raise SystemExit(2)
|
||||
self.CHROME_USER_DATA_DIR = None
|
||||
self.update_in_place(CHROME_USER_DATA_DIR=None)
|
||||
else:
|
||||
self.CHROME_USER_DATA_DIR = None
|
||||
if self.CHROME_USER_DATA_DIR is not None:
|
||||
self.update_in_place(CHROME_USER_DATA_DIR=None)
|
||||
|
||||
return self
|
||||
|
||||
|
@ -206,81 +190,3 @@ class ChromeConfig(BaseConfigSet):
|
|||
|
||||
CHROME_CONFIG = ChromeConfig()
|
||||
|
||||
|
||||
class ChromeBinary(BaseBinary):
|
||||
name: BinName = CHROME_CONFIG.CHROME_BINARY
|
||||
binproviders_supported: List[InstanceOf[BinProvider]] = [PUPPETEER_BINPROVIDER, env, PLAYWRIGHT_BINPROVIDER, apt, brew]
|
||||
|
||||
overrides: BinaryOverrides = {
|
||||
env.name: {
|
||||
'abspath': lambda: autodetect_system_chrome_install(PATH=env.PATH), # /usr/bin/google-chrome-stable
|
||||
},
|
||||
PUPPETEER_BINPROVIDER.name: {
|
||||
'packages': ['chrome@stable'], # npx @puppeteer/browsers install chrome@stable
|
||||
},
|
||||
PLAYWRIGHT_BINPROVIDER.name: {
|
||||
'packages': ['chromium'], # playwright install chromium
|
||||
},
|
||||
apt.name: {
|
||||
'packages': APT_DEPENDENCIES,
|
||||
},
|
||||
brew.name: {
|
||||
'packages': ['--cask', 'chromium'],
|
||||
},
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
def symlink_to_lib(binary, bin_dir=CONSTANTS.LIB_BIN_DIR) -> None:
|
||||
if not (binary.abspath and os.access(binary.abspath, os.F_OK)):
|
||||
return
|
||||
|
||||
bin_dir.mkdir(parents=True, exist_ok=True)
|
||||
symlink = bin_dir / binary.name
|
||||
|
||||
try:
|
||||
if platform.system().lower() == 'darwin':
|
||||
# if on macOS, browser binary is inside a .app, so we need to create a tiny bash script instead of a symlink
|
||||
create_macos_app_symlink(binary.abspath, symlink)
|
||||
else:
|
||||
# otherwise on linux we can symlink directly to binary executable
|
||||
symlink.unlink(missing_ok=True)
|
||||
symlink.symlink_to(binary.abspath)
|
||||
except Exception as err:
|
||||
# print(f'[red]:warning: Failed to symlink {symlink} -> {binary.abspath}[/red] {err}')
|
||||
# not actually needed, we can just run without it
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def chrome_cleanup_lockfile():
|
||||
"""
|
||||
Cleans up any state or runtime files that chrome leaves behind when killed by
|
||||
a timeout or other error
|
||||
"""
|
||||
lock_file = Path("~/.config/chromium/SingletonLock").expanduser()
|
||||
|
||||
if SHELL_CONFIG.IN_DOCKER and os.access(lock_file, os.F_OK):
|
||||
lock_file.unlink()
|
||||
|
||||
if CHROME_CONFIG.CHROME_USER_DATA_DIR:
|
||||
if os.access(CHROME_CONFIG.CHROME_USER_DATA_DIR / 'SingletonLock', os.F_OK):
|
||||
lock_file.unlink()
|
||||
|
||||
|
||||
|
||||
CHROME_BINARY = ChromeBinary()
|
||||
|
||||
|
||||
class ChromePlugin(BasePlugin):
|
||||
app_label: str = 'chrome'
|
||||
verbose_name: str = 'Chrome Browser'
|
||||
|
||||
hooks: List[InstanceOf[BaseHook]] = [
|
||||
CHROME_CONFIG,
|
||||
CHROME_BINARY,
|
||||
]
|
||||
|
||||
|
||||
|
||||
PLUGIN = ChromePlugin()
|
||||
# PLUGIN.register(settings)
|
||||
DJANGO_APP = PLUGIN.AppConfig
|
38
archivebox/plugins_extractor/curl/__init__.py
Normal file
38
archivebox/plugins_extractor/curl/__init__.py
Normal file
|
@ -0,0 +1,38 @@
|
|||
__package__ = 'plugins_extractor.curl'
|
||||
__label__ = 'curl'
|
||||
__version__ = '2024.10.14'
|
||||
__author__ = 'Nick Sweeting'
|
||||
__homepage__ = 'https://github.com/curl/curl'
|
||||
__dependencies__ = []
|
||||
|
||||
import abx
|
||||
|
||||
|
||||
@abx.hookimpl
|
||||
def get_PLUGIN():
|
||||
return {
|
||||
'curl': {
|
||||
'PACKAGE': __package__,
|
||||
'LABEL': __label__,
|
||||
'VERSION': __version__,
|
||||
'AUTHOR': __author__,
|
||||
'HOMEPAGE': __homepage__,
|
||||
'DEPENDENCIES': __dependencies__,
|
||||
}
|
||||
}
|
||||
|
||||
@abx.hookimpl
|
||||
def get_CONFIG():
|
||||
from .config import CURL_CONFIG
|
||||
|
||||
return {
|
||||
'curl': CURL_CONFIG
|
||||
}
|
||||
|
||||
@abx.hookimpl
|
||||
def get_BINARIES():
|
||||
from .binaries import CURL_BINARY
|
||||
|
||||
return {
|
||||
'curl': CURL_BINARY,
|
||||
}
|
|
@ -1,79 +0,0 @@
|
|||
__package__ = 'plugins_extractor.curl'
|
||||
|
||||
from typing import List, Optional
|
||||
from pathlib import Path
|
||||
|
||||
from pydantic import InstanceOf, Field
|
||||
from pydantic_pkgr import BinProvider, BinName
|
||||
|
||||
from abx.archivebox.base_plugin import BasePlugin, BaseHook
|
||||
from abx.archivebox.base_configset import BaseConfigSet
|
||||
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
|
||||
# from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
|
||||
|
||||
from archivebox.config.common import ARCHIVING_CONFIG
|
||||
from archivebox.plugins_extractor.favicon.apps import FAVICON_CONFIG
|
||||
from archivebox.plugins_extractor.archivedotorg.apps import ARCHIVEDOTORG_CONFIG
|
||||
|
||||
class CurlConfig(BaseConfigSet):
|
||||
|
||||
SAVE_TITLE: bool = Field(default=True)
|
||||
SAVE_HEADERS: bool = Field(default=True)
|
||||
USE_CURL: bool = Field(default=lambda c:
|
||||
ARCHIVEDOTORG_CONFIG.SAVE_ARCHIVE_DOT_ORG
|
||||
or FAVICON_CONFIG.SAVE_FAVICON
|
||||
or c.SAVE_HEADERS
|
||||
or c.SAVE_TITLE
|
||||
)
|
||||
|
||||
CURL_BINARY: str = Field(default='curl')
|
||||
CURL_ARGS: List[str] = [
|
||||
'--silent',
|
||||
'--location',
|
||||
'--compressed',
|
||||
]
|
||||
CURL_EXTRA_ARGS: List[str] = []
|
||||
|
||||
CURL_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
|
||||
CURL_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
|
||||
CURL_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
|
||||
CURL_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE)
|
||||
|
||||
|
||||
CURL_CONFIG = CurlConfig()
|
||||
|
||||
|
||||
class CurlBinary(BaseBinary):
|
||||
name: BinName = CURL_CONFIG.CURL_BINARY
|
||||
binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
|
||||
|
||||
CURL_BINARY = CurlBinary()
|
||||
|
||||
|
||||
# class CurlExtractor(BaseExtractor):
|
||||
# name: ExtractorName = 'curl'
|
||||
# binary: str = CURL_BINARY.name
|
||||
|
||||
# def get_output_path(self, snapshot) -> Path | None:
|
||||
# curl_index_path = curl_output_path(snapshot.as_link())
|
||||
# if curl_index_path:
|
||||
# return Path(curl_index_path)
|
||||
# return None
|
||||
|
||||
# CURL_EXTRACTOR = CurlExtractor()
|
||||
|
||||
|
||||
|
||||
class CurlPlugin(BasePlugin):
|
||||
app_label: str = 'curl'
|
||||
verbose_name: str = 'CURL'
|
||||
|
||||
hooks: List[InstanceOf[BaseHook]] = [
|
||||
CURL_CONFIG,
|
||||
CURL_BINARY,
|
||||
# CURL_EXTRACTOR,
|
||||
]
|
||||
|
||||
|
||||
PLUGIN = CurlPlugin()
|
||||
DJANGO_APP = PLUGIN.AppConfig
|
18
archivebox/plugins_extractor/curl/binaries.py
Normal file
18
archivebox/plugins_extractor/curl/binaries.py
Normal file
|
@ -0,0 +1,18 @@
|
|||
__package__ = 'plugins_extractor.curl'
|
||||
|
||||
from typing import List
|
||||
|
||||
from pydantic import InstanceOf
|
||||
from pydantic_pkgr import BinProvider, BinName
|
||||
|
||||
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
|
||||
|
||||
|
||||
from .config import CURL_CONFIG
|
||||
|
||||
|
||||
class CurlBinary(BaseBinary):
|
||||
name: BinName = CURL_CONFIG.CURL_BINARY
|
||||
binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
|
||||
|
||||
CURL_BINARY = CurlBinary()
|
33
archivebox/plugins_extractor/curl/config.py
Normal file
33
archivebox/plugins_extractor/curl/config.py
Normal file
|
@ -0,0 +1,33 @@
|
|||
__package__ = 'plugins_extractor.curl'
|
||||
|
||||
from typing import List, Optional
|
||||
from pathlib import Path
|
||||
|
||||
from pydantic import Field
|
||||
|
||||
from abx.archivebox.base_configset import BaseConfigSet
|
||||
|
||||
from archivebox.config.common import ARCHIVING_CONFIG
|
||||
|
||||
|
||||
class CurlConfig(BaseConfigSet):
|
||||
|
||||
SAVE_TITLE: bool = Field(default=True)
|
||||
SAVE_HEADERS: bool = Field(default=True)
|
||||
USE_CURL: bool = Field(default=True)
|
||||
|
||||
CURL_BINARY: str = Field(default='curl')
|
||||
CURL_ARGS: List[str] = [
|
||||
'--silent',
|
||||
'--location',
|
||||
'--compressed',
|
||||
]
|
||||
CURL_EXTRA_ARGS: List[str] = []
|
||||
|
||||
CURL_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
|
||||
CURL_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
|
||||
CURL_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
|
||||
CURL_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE)
|
||||
|
||||
|
||||
CURL_CONFIG = CurlConfig()
|
39
archivebox/plugins_extractor/favicon/__init__.py
Normal file
39
archivebox/plugins_extractor/favicon/__init__.py
Normal file
|
@ -0,0 +1,39 @@
|
|||
__package__ = 'plugins_extractor.favicon'
|
||||
__label__ = 'favicon'
|
||||
__version__ = '2024.10.14'
|
||||
__author__ = 'Nick Sweeting'
|
||||
__homepage__ = 'https://github.com/ArchiveBox/archivebox'
|
||||
__dependencies__ = []
|
||||
|
||||
import abx
|
||||
|
||||
|
||||
@abx.hookimpl
|
||||
def get_PLUGIN():
|
||||
return {
|
||||
'favicon': {
|
||||
'PACKAGE': __package__,
|
||||
'LABEL': __label__,
|
||||
'VERSION': __version__,
|
||||
'AUTHOR': __author__,
|
||||
'HOMEPAGE': __homepage__,
|
||||
'DEPENDENCIES': __dependencies__,
|
||||
}
|
||||
}
|
||||
|
||||
@abx.hookimpl
|
||||
def get_CONFIG():
|
||||
from .config import FAVICON_CONFIG
|
||||
|
||||
return {
|
||||
'favicon': FAVICON_CONFIG
|
||||
}
|
||||
|
||||
|
||||
# @abx.hookimpl
|
||||
# def get_EXTRACTORS():
|
||||
# from .extractors import FAVICON_EXTRACTOR
|
||||
|
||||
# return {
|
||||
# 'favicon': FAVICON_EXTRACTOR,
|
||||
# }
|
|
@ -1,30 +0,0 @@
|
|||
__package__ = 'archivebox.plugins_extractor.favicon'
|
||||
|
||||
from typing import List
|
||||
|
||||
from abx.archivebox.base_plugin import BasePlugin
|
||||
from abx.archivebox.base_configset import BaseConfigSet
|
||||
from abx.archivebox.base_hook import BaseHook
|
||||
|
||||
###################### Config ##########################
|
||||
|
||||
|
||||
class FaviconConfig(BaseConfigSet):
|
||||
SAVE_FAVICON: bool = True
|
||||
|
||||
FAVICON_PROVIDER: str = 'https://www.google.com/s2/favicons?domain={}'
|
||||
|
||||
|
||||
FAVICON_CONFIG = FaviconConfig()
|
||||
|
||||
|
||||
class FaviconPlugin(BasePlugin):
|
||||
app_label: str = 'favicon'
|
||||
verbose_name: str = 'Favicon'
|
||||
|
||||
hooks: List[BaseHook] = [
|
||||
FAVICON_CONFIG
|
||||
]
|
||||
|
||||
PLUGIN = FaviconPlugin()
|
||||
DJANGO_APP = PLUGIN.AppConfig
|
13
archivebox/plugins_extractor/favicon/config.py
Normal file
13
archivebox/plugins_extractor/favicon/config.py
Normal file
|
@ -0,0 +1,13 @@
|
|||
__package__ = 'plugins_extractor.favicon'
|
||||
|
||||
|
||||
from abx.archivebox.base_configset import BaseConfigSet
|
||||
|
||||
|
||||
class FaviconConfig(BaseConfigSet):
|
||||
SAVE_FAVICON: bool = True
|
||||
|
||||
FAVICON_PROVIDER: str = 'https://www.google.com/s2/favicons?domain={}'
|
||||
|
||||
|
||||
FAVICON_CONFIG = FaviconConfig()
|
46
archivebox/plugins_extractor/git/__init__.py
Normal file
46
archivebox/plugins_extractor/git/__init__.py
Normal file
|
@ -0,0 +1,46 @@
|
|||
__package__ = 'plugins_extractor.git'
|
||||
__label__ = 'git'
|
||||
__version__ = '2024.10.14'
|
||||
__author__ = 'Nick Sweeting'
|
||||
__homepage__ = 'https://github.com/git/git'
|
||||
__dependencies__ = []
|
||||
|
||||
import abx
|
||||
|
||||
|
||||
@abx.hookimpl
|
||||
def get_PLUGIN():
|
||||
return {
|
||||
'git': {
|
||||
'PACKAGE': __package__,
|
||||
'LABEL': __label__,
|
||||
'VERSION': __version__,
|
||||
'AUTHOR': __author__,
|
||||
'HOMEPAGE': __homepage__,
|
||||
'DEPENDENCIES': __dependencies__,
|
||||
}
|
||||
}
|
||||
|
||||
@abx.hookimpl
|
||||
def get_CONFIG():
|
||||
from .config import GIT_CONFIG
|
||||
|
||||
return {
|
||||
'git': GIT_CONFIG
|
||||
}
|
||||
|
||||
@abx.hookimpl
|
||||
def get_BINARIES():
|
||||
from .binaries import GIT_BINARY
|
||||
|
||||
return {
|
||||
'git': GIT_BINARY,
|
||||
}
|
||||
|
||||
@abx.hookimpl
|
||||
def get_EXTRACTORS():
|
||||
from .extractors import GIT_EXTRACTOR
|
||||
|
||||
return {
|
||||
'git': GIT_EXTRACTOR,
|
||||
}
|
|
@ -1,66 +0,0 @@
|
|||
__package__ = 'plugins_extractor.git'
|
||||
|
||||
from typing import List
|
||||
from pathlib import Path
|
||||
|
||||
from pydantic import InstanceOf, Field
|
||||
from pydantic_pkgr import BinProvider, BinName
|
||||
|
||||
from abx.archivebox.base_plugin import BasePlugin, BaseHook
|
||||
from abx.archivebox.base_configset import BaseConfigSet
|
||||
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
|
||||
from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
|
||||
|
||||
from archivebox.config.common import ARCHIVING_CONFIG
|
||||
|
||||
|
||||
class GitConfig(BaseConfigSet):
|
||||
|
||||
SAVE_GIT: bool = True
|
||||
|
||||
GIT_DOMAINS: str = Field(default='github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht')
|
||||
|
||||
GIT_BINARY: str = Field(default='git')
|
||||
GIT_ARGS: List[str] = [
|
||||
'--recursive',
|
||||
]
|
||||
GIT_EXTRA_ARGS: List[str] = []
|
||||
|
||||
GIT_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
|
||||
GIT_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
|
||||
|
||||
|
||||
GIT_CONFIG = GitConfig()
|
||||
|
||||
|
||||
class GitBinary(BaseBinary):
|
||||
name: BinName = GIT_CONFIG.GIT_BINARY
|
||||
binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
|
||||
|
||||
GIT_BINARY = GitBinary()
|
||||
|
||||
|
||||
class GitExtractor(BaseExtractor):
|
||||
name: ExtractorName = 'git'
|
||||
binary: str = GIT_BINARY.name
|
||||
|
||||
def get_output_path(self, snapshot) -> Path | None:
|
||||
return snapshot.as_link() / 'git'
|
||||
|
||||
GIT_EXTRACTOR = GitExtractor()
|
||||
|
||||
|
||||
|
||||
class GitPlugin(BasePlugin):
|
||||
app_label: str = 'git'
|
||||
verbose_name: str = 'GIT'
|
||||
|
||||
hooks: List[InstanceOf[BaseHook]] = [
|
||||
GIT_CONFIG,
|
||||
GIT_BINARY,
|
||||
GIT_EXTRACTOR,
|
||||
]
|
||||
|
||||
|
||||
PLUGIN = GitPlugin()
|
||||
DJANGO_APP = PLUGIN.AppConfig
|
18
archivebox/plugins_extractor/git/binaries.py
Normal file
18
archivebox/plugins_extractor/git/binaries.py
Normal file
|
@ -0,0 +1,18 @@
|
|||
__package__ = 'plugins_extractor.git'
|
||||
|
||||
from typing import List
|
||||
|
||||
from pydantic import InstanceOf
|
||||
from pydantic_pkgr import BinProvider, BinName
|
||||
|
||||
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
|
||||
|
||||
from .config import GIT_CONFIG
|
||||
|
||||
|
||||
|
||||
class GitBinary(BaseBinary):
|
||||
name: BinName = GIT_CONFIG.GIT_BINARY
|
||||
binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
|
||||
|
||||
GIT_BINARY = GitBinary()
|
28
archivebox/plugins_extractor/git/config.py
Normal file
28
archivebox/plugins_extractor/git/config.py
Normal file
|
@ -0,0 +1,28 @@
|
|||
__package__ = 'plugins_extractor.git'
|
||||
|
||||
from typing import List
|
||||
|
||||
from pydantic import Field
|
||||
|
||||
from abx.archivebox.base_configset import BaseConfigSet
|
||||
|
||||
from archivebox.config.common import ARCHIVING_CONFIG
|
||||
|
||||
|
||||
class GitConfig(BaseConfigSet):
|
||||
|
||||
SAVE_GIT: bool = True
|
||||
|
||||
GIT_DOMAINS: str = Field(default='github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht')
|
||||
|
||||
GIT_BINARY: str = Field(default='git')
|
||||
GIT_ARGS: List[str] = [
|
||||
'--recursive',
|
||||
]
|
||||
GIT_EXTRA_ARGS: List[str] = []
|
||||
|
||||
GIT_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
|
||||
GIT_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
|
||||
|
||||
|
||||
GIT_CONFIG = GitConfig()
|
17
archivebox/plugins_extractor/git/extractors.py
Normal file
17
archivebox/plugins_extractor/git/extractors.py
Normal file
|
@ -0,0 +1,17 @@
|
|||
__package__ = 'plugins_extractor.git'
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
|
||||
|
||||
from .binaries import GIT_BINARY
|
||||
|
||||
|
||||
class GitExtractor(BaseExtractor):
|
||||
name: ExtractorName = 'git'
|
||||
binary: str = GIT_BINARY.name
|
||||
|
||||
def get_output_path(self, snapshot) -> Path | None:
|
||||
return snapshot.as_link() / 'git'
|
||||
|
||||
GIT_EXTRACTOR = GitExtractor()
|
46
archivebox/plugins_extractor/mercury/__init__.py
Normal file
46
archivebox/plugins_extractor/mercury/__init__.py
Normal file
|
@ -0,0 +1,46 @@
|
|||
__package__ = 'plugins_extractor.mercury'
|
||||
__label__ = 'mercury'
|
||||
__version__ = '2024.10.14'
|
||||
__author__ = 'Nick Sweeting'
|
||||
__homepage__ = 'https://github.com/postlight/mercury-parser'
|
||||
__dependencies__ = ['npm']
|
||||
|
||||
import abx
|
||||
|
||||
|
||||
@abx.hookimpl
|
||||
def get_PLUGIN():
|
||||
return {
|
||||
'mercury': {
|
||||
'PACKAGE': __package__,
|
||||
'LABEL': __label__,
|
||||
'VERSION': __version__,
|
||||
'AUTHOR': __author__,
|
||||
'HOMEPAGE': __homepage__,
|
||||
'DEPENDENCIES': __dependencies__,
|
||||
}
|
||||
}
|
||||
|
||||
@abx.hookimpl
|
||||
def get_CONFIG():
|
||||
from .config import MERCURY_CONFIG
|
||||
|
||||
return {
|
||||
'mercury': MERCURY_CONFIG
|
||||
}
|
||||
|
||||
@abx.hookimpl
|
||||
def get_BINARIES():
|
||||
from .binaries import MERCURY_BINARY
|
||||
|
||||
return {
|
||||
'mercury': MERCURY_BINARY,
|
||||
}
|
||||
|
||||
@abx.hookimpl
|
||||
def get_EXTRACTORS():
|
||||
from .extractors import MERCURY_EXTRACTOR
|
||||
|
||||
return {
|
||||
'mercury': MERCURY_EXTRACTOR,
|
||||
}
|
|
@ -1,80 +0,0 @@
|
|||
__package__ = 'plugins_extractor.mercury'
|
||||
|
||||
from typing import List, Optional
|
||||
from pathlib import Path
|
||||
|
||||
from pydantic import InstanceOf, Field
|
||||
from pydantic_pkgr import BinProvider, BinName, BinaryOverrides, bin_abspath
|
||||
|
||||
from abx.archivebox.base_plugin import BasePlugin, BaseHook
|
||||
from abx.archivebox.base_configset import BaseConfigSet
|
||||
from abx.archivebox.base_binary import BaseBinary, env
|
||||
from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
|
||||
|
||||
from archivebox.config.common import ARCHIVING_CONFIG, STORAGE_CONFIG
|
||||
from archivebox.plugins_pkg.npm.apps import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER
|
||||
|
||||
class MercuryConfig(BaseConfigSet):
|
||||
|
||||
SAVE_MERCURY: bool = Field(default=True, alias='USE_MERCURY')
|
||||
|
||||
MERCURY_BINARY: str = Field(default='postlight-parser')
|
||||
MERCURY_EXTRA_ARGS: List[str] = []
|
||||
|
||||
SAVE_MERCURY_REQUISITES: bool = Field(default=True)
|
||||
MERCURY_RESTRICT_FILE_NAMES: str = Field(default=lambda: STORAGE_CONFIG.RESTRICT_FILE_NAMES)
|
||||
|
||||
MERCURY_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
|
||||
MERCURY_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
|
||||
MERCURY_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
|
||||
MERCURY_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE)
|
||||
|
||||
|
||||
|
||||
MERCURY_CONFIG = MercuryConfig()
|
||||
|
||||
|
||||
class MercuryBinary(BaseBinary):
|
||||
name: BinName = MERCURY_CONFIG.MERCURY_BINARY
|
||||
binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env]
|
||||
|
||||
overrides: BinaryOverrides = {
|
||||
LIB_NPM_BINPROVIDER.name: {
|
||||
'packages': ['@postlight/parser@^2.2.3'],
|
||||
},
|
||||
SYS_NPM_BINPROVIDER.name: {
|
||||
'packages': ['@postlight/parser@^2.2.3'],
|
||||
'install': lambda: None, # never try to install things into global prefix
|
||||
},
|
||||
env.name: {
|
||||
'version': lambda: '999.999.999' if bin_abspath('postlight-parser', PATH=env.PATH) else None,
|
||||
},
|
||||
}
|
||||
|
||||
MERCURY_BINARY = MercuryBinary()
|
||||
|
||||
|
||||
class MercuryExtractor(BaseExtractor):
|
||||
name: ExtractorName = 'mercury'
|
||||
binary: str = MERCURY_BINARY.name
|
||||
|
||||
def get_output_path(self, snapshot) -> Path | None:
|
||||
return snapshot.link_dir / 'mercury' / 'content.html'
|
||||
|
||||
MERCURY_EXTRACTOR = MercuryExtractor()
|
||||
|
||||
|
||||
|
||||
class MercuryPlugin(BasePlugin):
|
||||
app_label: str = 'mercury'
|
||||
verbose_name: str = 'MERCURY'
|
||||
|
||||
hooks: List[InstanceOf[BaseHook]] = [
|
||||
MERCURY_CONFIG,
|
||||
MERCURY_BINARY,
|
||||
MERCURY_EXTRACTOR,
|
||||
]
|
||||
|
||||
|
||||
PLUGIN = MercuryPlugin()
|
||||
DJANGO_APP = PLUGIN.AppConfig
|
32
archivebox/plugins_extractor/mercury/binaries.py
Normal file
32
archivebox/plugins_extractor/mercury/binaries.py
Normal file
|
@ -0,0 +1,32 @@
|
|||
__package__ = 'plugins_extractor.mercury'
|
||||
|
||||
from typing import List
|
||||
|
||||
from pydantic import InstanceOf
|
||||
from pydantic_pkgr import BinProvider, BinName, BinaryOverrides, bin_abspath
|
||||
|
||||
from abx.archivebox.base_binary import BaseBinary, env
|
||||
|
||||
from archivebox.plugins_pkg.npm.binproviders import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER
|
||||
|
||||
from .config import MERCURY_CONFIG
|
||||
|
||||
|
||||
class MercuryBinary(BaseBinary):
|
||||
name: BinName = MERCURY_CONFIG.MERCURY_BINARY
|
||||
binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env]
|
||||
|
||||
overrides: BinaryOverrides = {
|
||||
LIB_NPM_BINPROVIDER.name: {
|
||||
'packages': ['@postlight/parser@^2.2.3'],
|
||||
},
|
||||
SYS_NPM_BINPROVIDER.name: {
|
||||
'packages': ['@postlight/parser@^2.2.3'],
|
||||
'install': lambda: None, # never try to install things into global prefix
|
||||
},
|
||||
env.name: {
|
||||
'version': lambda: '999.999.999' if bin_abspath('postlight-parser', PATH=env.PATH) else None,
|
||||
},
|
||||
}
|
||||
|
||||
MERCURY_BINARY = MercuryBinary()
|
31
archivebox/plugins_extractor/mercury/config.py
Normal file
31
archivebox/plugins_extractor/mercury/config.py
Normal file
|
@ -0,0 +1,31 @@
|
|||
__package__ = 'plugins_extractor.mercury'
|
||||
|
||||
from typing import List, Optional
|
||||
from pathlib import Path
|
||||
|
||||
from pydantic import Field
|
||||
|
||||
from abx.archivebox.base_configset import BaseConfigSet
|
||||
|
||||
from archivebox.config.common import ARCHIVING_CONFIG, STORAGE_CONFIG
|
||||
|
||||
|
||||
|
||||
class MercuryConfig(BaseConfigSet):
|
||||
|
||||
SAVE_MERCURY: bool = Field(default=True, alias='USE_MERCURY')
|
||||
|
||||
MERCURY_BINARY: str = Field(default='postlight-parser')
|
||||
MERCURY_EXTRA_ARGS: List[str] = []
|
||||
|
||||
SAVE_MERCURY_REQUISITES: bool = Field(default=True)
|
||||
MERCURY_RESTRICT_FILE_NAMES: str = Field(default=lambda: STORAGE_CONFIG.RESTRICT_FILE_NAMES)
|
||||
|
||||
MERCURY_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
|
||||
MERCURY_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
|
||||
MERCURY_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
|
||||
MERCURY_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE)
|
||||
|
||||
|
||||
|
||||
MERCURY_CONFIG = MercuryConfig()
|
19
archivebox/plugins_extractor/mercury/extractors.py
Normal file
19
archivebox/plugins_extractor/mercury/extractors.py
Normal file
|
@ -0,0 +1,19 @@
|
|||
__package__ = 'plugins_extractor.mercury'
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
|
||||
|
||||
from .binaries import MERCURY_BINARY
|
||||
|
||||
|
||||
|
||||
class MercuryExtractor(BaseExtractor):
|
||||
name: ExtractorName = 'mercury'
|
||||
binary: str = MERCURY_BINARY.name
|
||||
|
||||
def get_output_path(self, snapshot) -> Path | None:
|
||||
return snapshot.link_dir / 'mercury' / 'content.html'
|
||||
|
||||
|
||||
MERCURY_EXTRACTOR = MercuryExtractor()
|
46
archivebox/plugins_extractor/readability/__init__.py
Normal file
46
archivebox/plugins_extractor/readability/__init__.py
Normal file
|
@ -0,0 +1,46 @@
|
|||
__package__ = 'plugins_extractor.readability'
|
||||
__label__ = 'readability'
|
||||
__version__ = '2024.10.14'
|
||||
__author__ = 'Nick Sweeting'
|
||||
__homepage__ = 'https://github.com/ArchiveBox/readability-extractor'
|
||||
__dependencies__ = ['npm']
|
||||
|
||||
import abx
|
||||
|
||||
|
||||
@abx.hookimpl
|
||||
def get_PLUGIN():
|
||||
return {
|
||||
'readability': {
|
||||
'PACKAGE': __package__,
|
||||
'LABEL': __label__,
|
||||
'VERSION': __version__,
|
||||
'AUTHOR': __author__,
|
||||
'HOMEPAGE': __homepage__,
|
||||
'DEPENDENCIES': __dependencies__,
|
||||
}
|
||||
}
|
||||
|
||||
@abx.hookimpl
|
||||
def get_CONFIG():
|
||||
from .config import READABILITY_CONFIG
|
||||
|
||||
return {
|
||||
'readability': READABILITY_CONFIG
|
||||
}
|
||||
|
||||
@abx.hookimpl
|
||||
def get_BINARIES():
|
||||
from .binaries import READABILITY_BINARY
|
||||
|
||||
return {
|
||||
'readability': READABILITY_BINARY,
|
||||
}
|
||||
|
||||
@abx.hookimpl
|
||||
def get_EXTRACTORS():
|
||||
from .extractors import READABILITY_EXTRACTOR
|
||||
|
||||
return {
|
||||
'readability': READABILITY_EXTRACTOR,
|
||||
}
|
|
@ -1,86 +0,0 @@
|
|||
__package__ = 'archivebox.plugins_extractor.readability'
|
||||
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
# from typing_extensions import Self
|
||||
|
||||
# Depends on other PyPI/vendor packages:
|
||||
from pydantic import InstanceOf, Field
|
||||
from pydantic_pkgr import BinProvider, BinaryOverrides, BinName
|
||||
|
||||
# Depends on other Django apps:
|
||||
from abx.archivebox.base_plugin import BasePlugin
|
||||
from abx.archivebox.base_configset import BaseConfigSet
|
||||
from abx.archivebox.base_binary import BaseBinary, env
|
||||
from abx.archivebox.base_extractor import BaseExtractor
|
||||
from abx.archivebox.base_hook import BaseHook
|
||||
|
||||
# Depends on Other Plugins:
|
||||
from archivebox.config.common import ARCHIVING_CONFIG
|
||||
from plugins_pkg.npm.apps import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER
|
||||
|
||||
###################### Config ##########################
|
||||
|
||||
class ReadabilityConfig(BaseConfigSet):
|
||||
SAVE_READABILITY: bool = Field(default=True, alias='USE_READABILITY')
|
||||
|
||||
READABILITY_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
|
||||
|
||||
READABILITY_BINARY: str = Field(default='readability-extractor')
|
||||
# READABILITY_EXTRA_ARGS: List[str] = [] # readability-extractor doesn't take any extra args
|
||||
|
||||
|
||||
READABILITY_CONFIG = ReadabilityConfig()
|
||||
|
||||
|
||||
READABILITY_PACKAGE_NAME = 'github:ArchiveBox/readability-extractor'
|
||||
|
||||
class ReadabilityBinary(BaseBinary):
|
||||
name: BinName = READABILITY_CONFIG.READABILITY_BINARY
|
||||
binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env]
|
||||
|
||||
overrides: BinaryOverrides = {
|
||||
LIB_NPM_BINPROVIDER.name: {"packages": [READABILITY_PACKAGE_NAME]},
|
||||
SYS_NPM_BINPROVIDER.name: {"packages": [READABILITY_PACKAGE_NAME], "install": lambda: None}, # prevent modifying system global npm packages
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
READABILITY_BINARY = ReadabilityBinary()
|
||||
|
||||
|
||||
class ReadabilityExtractor(BaseExtractor):
|
||||
name: str = 'readability'
|
||||
binary: BinName = READABILITY_BINARY.name
|
||||
|
||||
def get_output_path(self, snapshot) -> Path:
|
||||
return Path(snapshot.link_dir) / 'readability' / 'content.html'
|
||||
|
||||
|
||||
READABILITY_BINARY = ReadabilityBinary()
|
||||
READABILITY_EXTRACTOR = ReadabilityExtractor()
|
||||
|
||||
# class ReadabilityQueue(BaseQueue):
|
||||
# name: str = 'singlefile'
|
||||
|
||||
# binaries: List[InstanceOf[BaseBinary]] = [READABILITY_BINARY]
|
||||
|
||||
# READABILITY_QUEUE = ReadabilityQueue()
|
||||
|
||||
class ReadabilityPlugin(BasePlugin):
|
||||
app_label: str ='readability'
|
||||
verbose_name: str = 'Readability'
|
||||
|
||||
hooks: List[InstanceOf[BaseHook]] = [
|
||||
READABILITY_CONFIG,
|
||||
READABILITY_BINARY,
|
||||
READABILITY_EXTRACTOR,
|
||||
# READABILITY_QUEUE,
|
||||
]
|
||||
|
||||
|
||||
|
||||
PLUGIN = ReadabilityPlugin()
|
||||
# PLUGIN.register(settings)
|
||||
DJANGO_APP = PLUGIN.AppConfig
|
27
archivebox/plugins_extractor/readability/binaries.py
Normal file
27
archivebox/plugins_extractor/readability/binaries.py
Normal file
|
@ -0,0 +1,27 @@
|
|||
__package__ = 'plugins_extractor.readability'
|
||||
|
||||
from typing import List
|
||||
|
||||
from pydantic import InstanceOf
|
||||
from pydantic_pkgr import BinProvider, BinaryOverrides, BinName
|
||||
|
||||
from abx.archivebox.base_binary import BaseBinary, env
|
||||
|
||||
from plugins_pkg.npm.binproviders import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER
|
||||
|
||||
from .config import READABILITY_CONFIG
|
||||
|
||||
|
||||
READABILITY_PACKAGE_NAME = 'github:ArchiveBox/readability-extractor'
|
||||
|
||||
class ReadabilityBinary(BaseBinary):
|
||||
name: BinName = READABILITY_CONFIG.READABILITY_BINARY
|
||||
binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env]
|
||||
|
||||
overrides: BinaryOverrides = {
|
||||
LIB_NPM_BINPROVIDER.name: {"packages": [READABILITY_PACKAGE_NAME]},
|
||||
SYS_NPM_BINPROVIDER.name: {"packages": [READABILITY_PACKAGE_NAME], "install": lambda: None}, # prevent modifying system global npm packages
|
||||
}
|
||||
|
||||
|
||||
READABILITY_BINARY = ReadabilityBinary()
|
19
archivebox/plugins_extractor/readability/config.py
Normal file
19
archivebox/plugins_extractor/readability/config.py
Normal file
|
@ -0,0 +1,19 @@
|
|||
__package__ = 'plugins_extractor.readability'
|
||||
|
||||
from pydantic import Field
|
||||
|
||||
from abx.archivebox.base_configset import BaseConfigSet
|
||||
|
||||
from archivebox.config.common import ARCHIVING_CONFIG
|
||||
|
||||
|
||||
class ReadabilityConfig(BaseConfigSet):
|
||||
SAVE_READABILITY: bool = Field(default=True, alias='USE_READABILITY')
|
||||
|
||||
READABILITY_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
|
||||
|
||||
READABILITY_BINARY: str = Field(default='readability-extractor')
|
||||
# READABILITY_EXTRA_ARGS: List[str] = [] # readability-extractor doesn't take any extra args
|
||||
|
||||
|
||||
READABILITY_CONFIG = ReadabilityConfig()
|
20
archivebox/plugins_extractor/readability/extractors.py
Normal file
20
archivebox/plugins_extractor/readability/extractors.py
Normal file
|
@ -0,0 +1,20 @@
|
|||
__package__ = 'plugins_extractor.readability'
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from pydantic_pkgr import BinName
|
||||
|
||||
from abx.archivebox.base_extractor import BaseExtractor
|
||||
|
||||
from .binaries import READABILITY_BINARY
|
||||
|
||||
|
||||
class ReadabilityExtractor(BaseExtractor):
|
||||
name: str = 'readability'
|
||||
binary: BinName = READABILITY_BINARY.name
|
||||
|
||||
def get_output_path(self, snapshot) -> Path:
|
||||
return Path(snapshot.link_dir) / 'readability' / 'content.html'
|
||||
|
||||
|
||||
READABILITY_EXTRACTOR = ReadabilityExtractor()
|
|
@ -0,0 +1,51 @@
|
|||
__package__ = 'plugins_extractor.singlefile'
|
||||
__label__ = 'singlefile'
|
||||
__version__ = '2024.10.14'
|
||||
__author__ = 'Nick Sweeting'
|
||||
__homepage__ = 'https://github.com/gildas-lormeau/singlefile'
|
||||
__dependencies__ = ['npm']
|
||||
|
||||
import abx
|
||||
|
||||
|
||||
@abx.hookimpl
|
||||
def get_PLUGIN():
|
||||
return {
|
||||
'singlefile': {
|
||||
'PACKAGE': __package__,
|
||||
'LABEL': __label__,
|
||||
'VERSION': __version__,
|
||||
'AUTHOR': __author__,
|
||||
'HOMEPAGE': __homepage__,
|
||||
'DEPENDENCIES': __dependencies__,
|
||||
}
|
||||
}
|
||||
|
||||
@abx.hookimpl
|
||||
def get_CONFIG():
|
||||
from .config import SINGLEFILE_CONFIG
|
||||
|
||||
return {
|
||||
'singlefile': SINGLEFILE_CONFIG
|
||||
}
|
||||
|
||||
@abx.hookimpl
|
||||
def get_BINARIES():
|
||||
from .binaries import SINGLEFILE_BINARY
|
||||
|
||||
return {
|
||||
'singlefile': SINGLEFILE_BINARY,
|
||||
}
|
||||
|
||||
@abx.hookimpl
|
||||
def get_EXTRACTORS():
|
||||
from .extractors import SINGLEFILE_EXTRACTOR
|
||||
|
||||
return {
|
||||
'singlefile': SINGLEFILE_EXTRACTOR,
|
||||
}
|
||||
|
||||
# @abx.hookimpl
|
||||
# def get_INSTALLED_APPS():
|
||||
# # needed to load ./models.py
|
||||
# return [__package__]
|
|
@ -1,110 +0,0 @@
|
|||
__package__ = 'archivebox.plugins_extractor.singlefile'
|
||||
|
||||
from pathlib import Path
|
||||
from typing import List, Optional
|
||||
# from typing_extensions import Self
|
||||
|
||||
# Depends on other PyPI/vendor packages:
|
||||
from pydantic import InstanceOf, Field
|
||||
from pydantic_pkgr import BinProvider, BinaryOverrides, BinName, bin_abspath
|
||||
|
||||
# Depends on other Django apps:
|
||||
from abx.archivebox.base_plugin import BasePlugin
|
||||
from abx.archivebox.base_configset import BaseConfigSet
|
||||
from abx.archivebox.base_binary import BaseBinary, env
|
||||
from abx.archivebox.base_extractor import BaseExtractor
|
||||
from abx.archivebox.base_queue import BaseQueue
|
||||
from abx.archivebox.base_hook import BaseHook
|
||||
|
||||
# Depends on Other Plugins:
|
||||
from archivebox.config.common import ARCHIVING_CONFIG
|
||||
from plugins_pkg.npm.apps import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER
|
||||
|
||||
###################### Config ##########################
|
||||
|
||||
class SinglefileConfig(BaseConfigSet):
|
||||
SAVE_SINGLEFILE: bool = True
|
||||
|
||||
SINGLEFILE_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
|
||||
SINGLEFILE_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
|
||||
SINGLEFILE_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
|
||||
SINGLEFILE_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE)
|
||||
|
||||
SINGLEFILE_BINARY: str = Field(default='single-file')
|
||||
SINGLEFILE_EXTRA_ARGS: List[str] = []
|
||||
|
||||
|
||||
SINGLEFILE_CONFIG = SinglefileConfig()
|
||||
|
||||
|
||||
SINGLEFILE_MIN_VERSION = '1.1.54'
|
||||
SINGLEFILE_MAX_VERSION = '1.1.60'
|
||||
|
||||
|
||||
class SinglefileBinary(BaseBinary):
|
||||
name: BinName = SINGLEFILE_CONFIG.SINGLEFILE_BINARY
|
||||
binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env]
|
||||
|
||||
overrides: BinaryOverrides = {
|
||||
LIB_NPM_BINPROVIDER.name: {
|
||||
"abspath": lambda:
|
||||
bin_abspath(SINGLEFILE_CONFIG.SINGLEFILE_BINARY, PATH=LIB_NPM_BINPROVIDER.PATH)
|
||||
or bin_abspath("single-file", PATH=LIB_NPM_BINPROVIDER.PATH)
|
||||
or bin_abspath("single-file-node.js", PATH=LIB_NPM_BINPROVIDER.PATH),
|
||||
"packages": [f"single-file-cli@>={SINGLEFILE_MIN_VERSION} <{SINGLEFILE_MAX_VERSION}"],
|
||||
},
|
||||
SYS_NPM_BINPROVIDER.name: {
|
||||
"abspath": lambda:
|
||||
bin_abspath(SINGLEFILE_CONFIG.SINGLEFILE_BINARY, PATH=SYS_NPM_BINPROVIDER.PATH)
|
||||
or bin_abspath("single-file", PATH=SYS_NPM_BINPROVIDER.PATH)
|
||||
or bin_abspath("single-file-node.js", PATH=SYS_NPM_BINPROVIDER.PATH),
|
||||
"packages": [f"single-file-cli@>={SINGLEFILE_MIN_VERSION} <{SINGLEFILE_MAX_VERSION}"],
|
||||
"install": lambda: None,
|
||||
},
|
||||
env.name: {
|
||||
'abspath': lambda:
|
||||
bin_abspath(SINGLEFILE_CONFIG.SINGLEFILE_BINARY, PATH=env.PATH)
|
||||
or bin_abspath('single-file', PATH=env.PATH)
|
||||
or bin_abspath('single-file-node.js', PATH=env.PATH),
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
SINGLEFILE_BINARY = SinglefileBinary()
|
||||
|
||||
PLUGIN_BINARIES = [SINGLEFILE_BINARY]
|
||||
|
||||
class SinglefileExtractor(BaseExtractor):
|
||||
name: str = 'singlefile'
|
||||
binary: BinName = SINGLEFILE_BINARY.name
|
||||
|
||||
def get_output_path(self, snapshot) -> Path:
|
||||
return Path(snapshot.link_dir) / 'singlefile.html'
|
||||
|
||||
|
||||
SINGLEFILE_BINARY = SinglefileBinary()
|
||||
SINGLEFILE_EXTRACTOR = SinglefileExtractor()
|
||||
|
||||
class SinglefileQueue(BaseQueue):
|
||||
name: str = 'singlefile'
|
||||
|
||||
binaries: List[InstanceOf[BaseBinary]] = [SINGLEFILE_BINARY]
|
||||
|
||||
SINGLEFILE_QUEUE = SinglefileQueue()
|
||||
|
||||
class SinglefilePlugin(BasePlugin):
|
||||
app_label: str ='singlefile'
|
||||
verbose_name: str = 'SingleFile'
|
||||
|
||||
hooks: List[InstanceOf[BaseHook]] = [
|
||||
SINGLEFILE_CONFIG,
|
||||
SINGLEFILE_BINARY,
|
||||
SINGLEFILE_EXTRACTOR,
|
||||
SINGLEFILE_QUEUE,
|
||||
]
|
||||
|
||||
|
||||
|
||||
PLUGIN = SinglefilePlugin()
|
||||
# PLUGIN.register(settings)
|
||||
DJANGO_APP = PLUGIN.AppConfig
|
48
archivebox/plugins_extractor/singlefile/binaries.py
Normal file
48
archivebox/plugins_extractor/singlefile/binaries.py
Normal file
|
@ -0,0 +1,48 @@
|
|||
__package__ = 'plugins_extractor.singlefile'
|
||||
|
||||
from typing import List
|
||||
|
||||
from pydantic import InstanceOf
|
||||
from pydantic_pkgr import BinProvider, BinaryOverrides, BinName, bin_abspath
|
||||
|
||||
from abx.archivebox.base_binary import BaseBinary, env
|
||||
|
||||
from plugins_pkg.npm.binproviders import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER
|
||||
|
||||
from .config import SINGLEFILE_CONFIG
|
||||
|
||||
|
||||
SINGLEFILE_MIN_VERSION = '1.1.54'
|
||||
SINGLEFILE_MAX_VERSION = '1.1.60'
|
||||
|
||||
|
||||
class SinglefileBinary(BaseBinary):
|
||||
name: BinName = SINGLEFILE_CONFIG.SINGLEFILE_BINARY
|
||||
binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env]
|
||||
|
||||
overrides: BinaryOverrides = {
|
||||
LIB_NPM_BINPROVIDER.name: {
|
||||
"abspath": lambda:
|
||||
bin_abspath(SINGLEFILE_CONFIG.SINGLEFILE_BINARY, PATH=LIB_NPM_BINPROVIDER.PATH)
|
||||
or bin_abspath("single-file", PATH=LIB_NPM_BINPROVIDER.PATH)
|
||||
or bin_abspath("single-file-node.js", PATH=LIB_NPM_BINPROVIDER.PATH),
|
||||
"packages": [f"single-file-cli@>={SINGLEFILE_MIN_VERSION} <{SINGLEFILE_MAX_VERSION}"],
|
||||
},
|
||||
SYS_NPM_BINPROVIDER.name: {
|
||||
"abspath": lambda:
|
||||
bin_abspath(SINGLEFILE_CONFIG.SINGLEFILE_BINARY, PATH=SYS_NPM_BINPROVIDER.PATH)
|
||||
or bin_abspath("single-file", PATH=SYS_NPM_BINPROVIDER.PATH)
|
||||
or bin_abspath("single-file-node.js", PATH=SYS_NPM_BINPROVIDER.PATH),
|
||||
"packages": [f"single-file-cli@>={SINGLEFILE_MIN_VERSION} <{SINGLEFILE_MAX_VERSION}"],
|
||||
"install": lambda: None,
|
||||
},
|
||||
env.name: {
|
||||
'abspath': lambda:
|
||||
bin_abspath(SINGLEFILE_CONFIG.SINGLEFILE_BINARY, PATH=env.PATH)
|
||||
or bin_abspath('single-file', PATH=env.PATH)
|
||||
or bin_abspath('single-file-node.js', PATH=env.PATH),
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
SINGLEFILE_BINARY = SinglefileBinary()
|
25
archivebox/plugins_extractor/singlefile/config.py
Normal file
25
archivebox/plugins_extractor/singlefile/config.py
Normal file
|
@ -0,0 +1,25 @@
|
|||
__package__ = 'plugins_extractor.singlefile'
|
||||
|
||||
from pathlib import Path
|
||||
from typing import List, Optional
|
||||
|
||||
from pydantic import Field
|
||||
|
||||
from abx.archivebox.base_configset import BaseConfigSet
|
||||
|
||||
from archivebox.config.common import ARCHIVING_CONFIG
|
||||
|
||||
|
||||
class SinglefileConfig(BaseConfigSet):
|
||||
SAVE_SINGLEFILE: bool = True
|
||||
|
||||
SINGLEFILE_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
|
||||
SINGLEFILE_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
|
||||
SINGLEFILE_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
|
||||
SINGLEFILE_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE)
|
||||
|
||||
SINGLEFILE_BINARY: str = Field(default='single-file')
|
||||
SINGLEFILE_EXTRA_ARGS: List[str] = []
|
||||
|
||||
|
||||
SINGLEFILE_CONFIG = SinglefileConfig()
|
19
archivebox/plugins_extractor/singlefile/extractors.py
Normal file
19
archivebox/plugins_extractor/singlefile/extractors.py
Normal file
|
@ -0,0 +1,19 @@
|
|||
__package__ = 'plugins_extractor.singlefile'
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from pydantic_pkgr import BinName
|
||||
from abx.archivebox.base_extractor import BaseExtractor
|
||||
|
||||
from .binaries import SINGLEFILE_BINARY
|
||||
|
||||
|
||||
class SinglefileExtractor(BaseExtractor):
|
||||
name: str = 'singlefile'
|
||||
binary: BinName = SINGLEFILE_BINARY.name
|
||||
|
||||
def get_output_path(self, snapshot) -> Path:
|
||||
return Path(snapshot.link_dir) / 'singlefile.html'
|
||||
|
||||
|
||||
SINGLEFILE_EXTRACTOR = SinglefileExtractor()
|
|
@ -1,26 +0,0 @@
|
|||
# Generated by Django 5.1.1 on 2024-09-10 05:05
|
||||
|
||||
from django.db import migrations
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
initial = True
|
||||
|
||||
dependencies = [
|
||||
('core', '0074_alter_snapshot_downloaded_at'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.CreateModel(
|
||||
name='SinglefileResult',
|
||||
fields=[
|
||||
],
|
||||
options={
|
||||
'proxy': True,
|
||||
'indexes': [],
|
||||
'constraints': [],
|
||||
},
|
||||
bases=('core.archiveresult',),
|
||||
),
|
||||
]
|
|
@ -1,40 +0,0 @@
|
|||
__package__ = 'archivebox.queues'
|
||||
|
||||
import time
|
||||
|
||||
from django.core.cache import cache
|
||||
|
||||
from huey import crontab
|
||||
from django_huey import db_task, on_startup, db_periodic_task
|
||||
from huey_monitor.models import TaskModel
|
||||
from huey_monitor.tqdm import ProcessInfo
|
||||
|
||||
@db_task(queue="singlefile", context=True)
|
||||
def extract(url, out_dir, config, task=None, parent_task_id=None):
|
||||
if task and parent_task_id:
|
||||
TaskModel.objects.set_parent_task(main_task_id=parent_task_id, sub_task_id=task.id)
|
||||
|
||||
process_info = ProcessInfo(task, desc="extract_singlefile", parent_task_id=parent_task_id, total=1)
|
||||
|
||||
time.sleep(5)
|
||||
|
||||
process_info.update(n=1)
|
||||
return {'output': 'singlefile.html', 'status': 'succeeded'}
|
||||
|
||||
|
||||
# @on_startup(queue='singlefile')
|
||||
# def start_singlefile_queue():
|
||||
# print("[+] Starting singlefile worker...")
|
||||
# update_version.call_local()
|
||||
|
||||
|
||||
# @db_periodic_task(crontab(minute='*/5'), queue='singlefile')
|
||||
# def update_version():
|
||||
# print('[*] Updating singlefile version... 5 minute interval')
|
||||
# from django.conf import settings
|
||||
|
||||
# bin = settings.BINARIES.SinglefileBinary.load()
|
||||
# if bin.version:
|
||||
# cache.set(f"bin:abspath:{bin.name}", bin.abspath)
|
||||
# cache.set(f"bin:version:{bin.name}:{bin.abspath}", bin.version)
|
||||
# print('[√] Updated singlefile version:', bin.version, bin.abspath)
|
47
archivebox/plugins_extractor/wget/__init__.py
Normal file
47
archivebox/plugins_extractor/wget/__init__.py
Normal file
|
@ -0,0 +1,47 @@
|
|||
__package__ = 'plugins_extractor.wget'
|
||||
__label__ = 'wget'
|
||||
__version__ = '2024.10.14'
|
||||
__author__ = 'Nick Sweeting'
|
||||
__homepage__ = 'https://github.com/ArchiveBox/ArchiveBox/tree/main/archivebox/plugins_extractor/wget'
|
||||
__dependencies__ = []
|
||||
|
||||
import abx
|
||||
|
||||
|
||||
@abx.hookimpl
|
||||
def get_PLUGIN():
|
||||
return {
|
||||
'wget': {
|
||||
'PACKAGE': __package__,
|
||||
'LABEL': __label__,
|
||||
'VERSION': __version__,
|
||||
'AUTHOR': __author__,
|
||||
'HOMEPAGE': __homepage__,
|
||||
'DEPENDENCIES': __dependencies__,
|
||||
}
|
||||
}
|
||||
|
||||
@abx.hookimpl
|
||||
def get_CONFIG():
|
||||
from .config import WGET_CONFIG
|
||||
|
||||
return {
|
||||
'wget': WGET_CONFIG
|
||||
}
|
||||
|
||||
@abx.hookimpl
|
||||
def get_BINARIES():
|
||||
from .binaries import WGET_BINARY
|
||||
|
||||
return {
|
||||
'wget': WGET_BINARY,
|
||||
}
|
||||
|
||||
@abx.hookimpl
|
||||
def get_EXTRACTORS():
|
||||
from .extractors import WGET_EXTRACTOR, WARC_EXTRACTOR
|
||||
|
||||
return {
|
||||
'wget': WGET_EXTRACTOR,
|
||||
'warc': WARC_EXTRACTOR,
|
||||
}
|
|
@ -1,127 +0,0 @@
|
|||
__package__ = 'plugins_extractor.wget'
|
||||
|
||||
import sys
|
||||
from typing import List, Optional
|
||||
from pathlib import Path
|
||||
from subprocess import run, DEVNULL
|
||||
|
||||
from rich import print
|
||||
from pydantic import InstanceOf, Field, model_validator
|
||||
from pydantic_pkgr import BinProvider, BinName
|
||||
|
||||
from abx.archivebox.base_plugin import BasePlugin, BaseHook
|
||||
from abx.archivebox.base_configset import BaseConfigSet
|
||||
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
|
||||
from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
|
||||
|
||||
from archivebox.config.common import ARCHIVING_CONFIG, STORAGE_CONFIG
|
||||
from .wget_util import wget_output_path
|
||||
|
||||
|
||||
class WgetConfig(BaseConfigSet):
|
||||
|
||||
SAVE_WGET: bool = True
|
||||
SAVE_WARC: bool = True
|
||||
|
||||
USE_WGET: bool = Field(default=lambda c: c.SAVE_WGET or c.SAVE_WARC)
|
||||
|
||||
WGET_BINARY: str = Field(default='wget')
|
||||
WGET_ARGS: List[str] = [
|
||||
'--no-verbose',
|
||||
'--adjust-extension',
|
||||
'--convert-links',
|
||||
'--force-directories',
|
||||
'--backup-converted',
|
||||
'--span-hosts',
|
||||
'--no-parent',
|
||||
'-e', 'robots=off',
|
||||
]
|
||||
WGET_EXTRA_ARGS: List[str] = []
|
||||
|
||||
SAVE_WGET_REQUISITES: bool = Field(default=True)
|
||||
WGET_RESTRICT_FILE_NAMES: str = Field(default=lambda: STORAGE_CONFIG.RESTRICT_FILE_NAMES)
|
||||
|
||||
WGET_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
|
||||
WGET_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
|
||||
WGET_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
|
||||
WGET_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE)
|
||||
|
||||
@model_validator(mode='after')
|
||||
def validate_use_ytdlp(self):
|
||||
if self.USE_WGET and self.WGET_TIMEOUT < 10:
|
||||
print(f'[red][!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={self.WGET_TIMEOUT} seconds)[/red]', file=sys.stderr)
|
||||
print(' wget will fail to archive any sites if set to less than ~20 seconds.', file=sys.stderr)
|
||||
print(' (Setting it somewhere over 60 seconds is recommended)', file=sys.stderr)
|
||||
print(file=sys.stderr)
|
||||
print(' If you want to disable media archiving entirely, set SAVE_MEDIA=False instead:', file=sys.stderr)
|
||||
print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#save_media', file=sys.stderr)
|
||||
print(file=sys.stderr)
|
||||
return self
|
||||
|
||||
@property
|
||||
def WGET_AUTO_COMPRESSION(self) -> bool:
|
||||
if hasattr(self, '_WGET_AUTO_COMPRESSION'):
|
||||
return self._WGET_AUTO_COMPRESSION
|
||||
try:
|
||||
cmd = [
|
||||
self.WGET_BINARY,
|
||||
"--compression=auto",
|
||||
"--help",
|
||||
]
|
||||
self._WGET_AUTO_COMPRESSION = not run(cmd, stdout=DEVNULL, stderr=DEVNULL, timeout=3).returncode
|
||||
return self._WGET_AUTO_COMPRESSION
|
||||
except (FileNotFoundError, OSError):
|
||||
self._WGET_AUTO_COMPRESSION = False
|
||||
return False
|
||||
|
||||
WGET_CONFIG = WgetConfig()
|
||||
|
||||
|
||||
class WgetBinary(BaseBinary):
|
||||
name: BinName = WGET_CONFIG.WGET_BINARY
|
||||
binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
|
||||
|
||||
WGET_BINARY = WgetBinary()
|
||||
|
||||
|
||||
class WgetExtractor(BaseExtractor):
|
||||
name: ExtractorName = 'wget'
|
||||
binary: BinName = WGET_BINARY.name
|
||||
|
||||
def get_output_path(self, snapshot) -> Path | None:
|
||||
wget_index_path = wget_output_path(snapshot.as_link())
|
||||
if wget_index_path:
|
||||
return Path(wget_index_path)
|
||||
return None
|
||||
|
||||
WGET_EXTRACTOR = WgetExtractor()
|
||||
|
||||
|
||||
class WarcExtractor(BaseExtractor):
|
||||
name: ExtractorName = 'warc'
|
||||
binary: BinName = WGET_BINARY.name
|
||||
|
||||
def get_output_path(self, snapshot) -> Path | None:
|
||||
warc_files = list((Path(snapshot.link_dir) / 'warc').glob('*.warc.gz'))
|
||||
if warc_files:
|
||||
return sorted(warc_files, key=lambda x: x.stat().st_size, reverse=True)[0]
|
||||
return None
|
||||
|
||||
|
||||
WARC_EXTRACTOR = WarcExtractor()
|
||||
|
||||
|
||||
class WgetPlugin(BasePlugin):
|
||||
app_label: str = 'wget'
|
||||
verbose_name: str = 'WGET'
|
||||
|
||||
hooks: List[InstanceOf[BaseHook]] = [
|
||||
WGET_CONFIG,
|
||||
WGET_BINARY,
|
||||
WGET_EXTRACTOR,
|
||||
WARC_EXTRACTOR,
|
||||
]
|
||||
|
||||
|
||||
PLUGIN = WgetPlugin()
|
||||
DJANGO_APP = PLUGIN.AppConfig
|
18
archivebox/plugins_extractor/wget/binaries.py
Normal file
18
archivebox/plugins_extractor/wget/binaries.py
Normal file
|
@ -0,0 +1,18 @@
|
|||
__package__ = 'plugins_extractor.wget'
|
||||
|
||||
from typing import List
|
||||
|
||||
|
||||
from pydantic import InstanceOf
|
||||
from pydantic_pkgr import BinProvider, BinName
|
||||
|
||||
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
|
||||
|
||||
from .config import WGET_CONFIG
|
||||
|
||||
|
||||
class WgetBinary(BaseBinary):
    """Locator for the `wget` executable used by this plugin."""

    # Binary name is configurable (WGET_BINARY setting) so users can point at
    # an alternate wget build.
    name: BinName = WGET_CONFIG.WGET_BINARY
    # Resolution order: apt, then brew, then a plain $PATH lookup.
    binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]

WGET_BINARY = WgetBinary()
|
72
archivebox/plugins_extractor/wget/config.py
Normal file
72
archivebox/plugins_extractor/wget/config.py
Normal file
|
@ -0,0 +1,72 @@
|
|||
__package__ = 'plugins_extractor.wget'
|
||||
|
||||
import subprocess
|
||||
from typing import List, Optional
|
||||
from pathlib import Path
|
||||
|
||||
from pydantic import Field, model_validator
|
||||
|
||||
from abx.archivebox.base_configset import BaseConfigSet
|
||||
|
||||
from archivebox.config.common import ARCHIVING_CONFIG, STORAGE_CONFIG
|
||||
from archivebox.misc.logging import STDERR
|
||||
|
||||
|
||||
class WgetConfig(BaseConfigSet):
    """User-facing settings for the wget extractor (plain mirror + WARC capture)."""

    SAVE_WGET: bool = True
    SAVE_WARC: bool = True

    # wget is needed if either of the two outputs it produces is enabled.
    USE_WGET: bool = Field(default=lambda c: c.SAVE_WGET or c.SAVE_WARC)

    WGET_BINARY: str = Field(default='wget')
    WGET_ARGS: List[str] = [
        '--no-verbose',
        '--adjust-extension',
        '--convert-links',
        '--force-directories',
        '--backup-converted',
        '--span-hosts',
        '--no-parent',
        '-e', 'robots=off',
    ]
    WGET_EXTRA_ARGS: List[str] = []

    SAVE_WGET_REQUISITES: bool = Field(default=True)
    WGET_RESTRICT_FILE_NAMES: str = Field(default=lambda: STORAGE_CONFIG.RESTRICT_FILE_NAMES)

    # These fall back to the shared archiving defaults unless overridden.
    WGET_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
    WGET_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
    WGET_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
    WGET_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE)

    @model_validator(mode='after')
    def validate_use_wget(self):
        """Warn when wget archiving is enabled with a timeout too short to work.

        Renamed from validate_use_ytdlp: the old name was a copy-paste leftover
        from the ytdlp plugin config (registration is via the decorator, so the
        rename is caller-safe).
        """
        if self.USE_WGET and self.WGET_TIMEOUT < 10:
            STDERR.print(f'[red][!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={self.WGET_TIMEOUT} seconds)[/red]')
            STDERR.print(' wget will fail to archive any sites if set to less than ~20 seconds.')
            STDERR.print(' (Setting it somewhere over 60 seconds is recommended)')
            STDERR.print()
            # Fixed hint: this config gates wget/warc output, not media
            # (SAVE_MEDIA belongs to the ytdlp plugin it was copied from).
            STDERR.print(' If you want to disable wget archiving entirely, set SAVE_WGET=False and SAVE_WARC=False instead:')
            STDERR.print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#save_wget')
            STDERR.print()
        return self

    @property
    def WGET_AUTO_COMPRESSION(self) -> bool:
        """Whether this wget build supports --compression=auto (probed once, then cached)."""
        # NOTE(review): assumes BaseConfigSet permits setting an underscore
        # attribute on the instance for caching — confirm against the base class.
        if hasattr(self, '_WGET_AUTO_COMPRESSION'):
            return self._WGET_AUTO_COMPRESSION
        try:
            cmd = [
                self.WGET_BINARY,
                "--compression=auto",
                "--help",
            ]
            # Exit code 0 means this wget build recognizes the flag.
            self._WGET_AUTO_COMPRESSION = not subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, timeout=3).returncode
            return self._WGET_AUTO_COMPRESSION
        except (FileNotFoundError, OSError, subprocess.TimeoutExpired):
            # TimeoutExpired added: it is not an OSError subclass, so a hung
            # `wget --help` previously escaped this property instead of being
            # treated as "unsupported".
            self._WGET_AUTO_COMPRESSION = False
            return False

WGET_CONFIG = WgetConfig()
|
||||
|
37
archivebox/plugins_extractor/wget/extractors.py
Normal file
37
archivebox/plugins_extractor/wget/extractors.py
Normal file
|
@ -0,0 +1,37 @@
|
|||
__package__ = 'plugins_extractor.wget'
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from pydantic_pkgr import BinName
|
||||
|
||||
from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
|
||||
|
||||
from .binaries import WGET_BINARY
|
||||
from .wget_util import wget_output_path
|
||||
|
||||
class WgetExtractor(BaseExtractor):
    """Extractor that mirrors a page to disk using wget."""

    name: ExtractorName = 'wget'
    binary: BinName = WGET_BINARY.name

    def get_output_path(self, snapshot) -> Path | None:
        """Return the wget-generated index file path for *snapshot*, or None."""
        index_file = wget_output_path(snapshot.as_link())
        return Path(index_file) if index_file else None

WGET_EXTRACTOR = WgetExtractor()
|
||||
|
||||
|
||||
class WarcExtractor(BaseExtractor):
    """Extractor that records the wget crawl as a WARC archive."""

    name: ExtractorName = 'warc'
    binary: BinName = WGET_BINARY.name

    def get_output_path(self, snapshot) -> Path | None:
        """Return the largest .warc.gz produced for *snapshot*, or None if none exist.

        Multiple WARC files may exist in the snapshot's warc/ dir; the largest
        one is treated as the main capture.
        """
        warc_files = list((Path(snapshot.link_dir) / 'warc').glob('*.warc.gz'))
        if not warc_files:
            return None
        # max() is a single O(n) pass; no need to fully sort just to take the
        # biggest file (ties resolve to the first-seen file, same as the old
        # stable reverse-sort + [0]).
        return max(warc_files, key=lambda warc: warc.stat().st_size)


WARC_EXTRACTOR = WarcExtractor()
|
||||
|
|
@ -0,0 +1,37 @@
|
|||
__package__ = 'plugins_extractor.ytdlp'
|
||||
__label__ = 'YT-DLP'
|
||||
__version__ = '2024.10.14'
|
||||
__author__ = 'Nick Sweeting'
|
||||
__homepage__ = 'https://github.com/yt-dlp/yt-dlp'
|
||||
|
||||
import abx
|
||||
|
||||
|
||||
@abx.hookimpl
def get_PLUGIN():
    """Expose this plugin's metadata to the abx plugin registry."""
    metadata = {
        'PACKAGE': __package__,
        'LABEL': __label__,
        'VERSION': __version__,
        'AUTHOR': __author__,
        'HOMEPAGE': __homepage__,
    }
    return {'ytdlp': metadata}
|
||||
|
||||
@abx.hookimpl
def get_CONFIG():
    """Expose the ytdlp config set, keyed by plugin name."""
    # Deferred import: the config module is only loaded when the hook fires.
    from .config import YTDLP_CONFIG
    return {'ytdlp': YTDLP_CONFIG}
|
||||
|
||||
@abx.hookimpl
def get_BINARIES():
    """Expose the binaries this plugin provides (yt-dlp itself plus ffmpeg)."""
    # Deferred import: binaries are only resolved when the hook fires.
    from .binaries import YTDLP_BINARY, FFMPEG_BINARY
    return {'ytdlp': YTDLP_BINARY, 'ffmpeg': FFMPEG_BINARY}
|
@ -1,98 +0,0 @@
|
|||
import sys
|
||||
from typing import List
|
||||
from subprocess import run, PIPE
|
||||
|
||||
from rich import print
|
||||
from pydantic import InstanceOf, Field, model_validator, AliasChoices
|
||||
from pydantic_pkgr import BinProvider, BinName, BinaryOverrides
|
||||
|
||||
from abx.archivebox.base_plugin import BasePlugin
|
||||
from abx.archivebox.base_configset import BaseConfigSet
|
||||
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
|
||||
from abx.archivebox.base_hook import BaseHook
|
||||
|
||||
from archivebox.config.common import ARCHIVING_CONFIG
|
||||
from plugins_pkg.pip.apps import pip
|
||||
|
||||
###################### Config ##########################
|
||||
|
||||
|
||||
class YtdlpConfig(BaseConfigSet):
    """User-facing settings for the yt-dlp media extractor (legacy aliases kept)."""

    USE_YTDLP: bool = Field(default=True, validation_alias=AliasChoices('USE_YOUTUBEDL', 'SAVE_MEDIA'))

    YTDLP_BINARY: str = Field(default='yt-dlp', alias='YOUTUBEDL_BINARY')
    YTDLP_EXTRA_ARGS: List[str] = Field(default=[], alias='YOUTUBEDL_EXTRA_ARGS')

    YTDLP_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
    YTDLP_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.MEDIA_TIMEOUT)

    @model_validator(mode='after')
    def validate_use_ytdlp(self):
        """Warn when media archiving is enabled with a timeout too short to work."""
        # Guard clause: nothing to warn about unless media saving is on AND the
        # timeout is below the practical minimum.
        if not (self.USE_YTDLP and self.YTDLP_TIMEOUT < 20):
            return self
        print(f'[red][!] Warning: MEDIA_TIMEOUT is set too low! (currently set to MEDIA_TIMEOUT={self.YTDLP_TIMEOUT} seconds)[/red]', file=sys.stderr)
        print(' youtube-dl/yt-dlp will fail to archive any media if set to less than ~20 seconds.', file=sys.stderr)
        print(' (Setting it somewhere over 60 seconds is recommended)', file=sys.stderr)
        print(file=sys.stderr)
        print(' If you want to disable media archiving entirely, set SAVE_MEDIA=False instead:', file=sys.stderr)
        print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#save_media', file=sys.stderr)
        print(file=sys.stderr)
        return self


YTDLP_CONFIG = YtdlpConfig()
|
||||
|
||||
|
||||
|
||||
class YtdlpBinary(BaseBinary):
    """Locator for the yt-dlp executable."""

    # Name is configurable (YTDLP_BINARY / legacy YOUTUBEDL_BINARY setting).
    name: BinName = YTDLP_CONFIG.YTDLP_BINARY
    # Prefer a pip-installed yt-dlp, then system package managers, then $PATH.
    binproviders_supported: List[InstanceOf[BinProvider]] = [pip, apt, brew, env]

YTDLP_BINARY = YtdlpBinary()
|
||||
|
||||
|
||||
class FfmpegBinary(BaseBinary):
    """Locator for the ffmpeg binary shipped as part of the ytdlp plugin."""

    name: BinName = 'ffmpeg'
    binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]

    # Each provider gets a custom `version` probe whose raw stdout is used as
    # the version string (ffmpeg/apt/brew each print their own banner format).
    overrides: BinaryOverrides = {
        'env': {
            # 'abspath': lambda: shutil.which('ffmpeg', PATH=env.PATH),
            'version': lambda: run(['ffmpeg', '-version'], stdout=PIPE, stderr=PIPE, text=True).stdout,
        },
        'apt': {
            # 'abspath': lambda: shutil.which('ffmpeg', PATH=apt.PATH),
            'version': lambda: run(['apt', 'show', 'ffmpeg'], stdout=PIPE, stderr=PIPE, text=True).stdout,
        },
        'brew': {
            # 'abspath': lambda: shutil.which('ffmpeg', PATH=brew.PATH),
            'version': lambda: run(['brew', 'info', 'ffmpeg', '--quiet'], stdout=PIPE, stderr=PIPE, text=True).stdout,
        },
    }

    # def get_ffmpeg_version(self) -> Optional[str]:
    #     return self.exec(cmd=['-version']).stdout

FFMPEG_BINARY = FfmpegBinary()
|
||||
|
||||
|
||||
# class YtdlpExtractor(BaseExtractor):
|
||||
# name: str = 'ytdlp'
|
||||
# binary: str = 'ytdlp'
|
||||
|
||||
|
||||
|
||||
class YtdlpPlugin(BasePlugin):
    """Plugin wrapper bundling the ytdlp config and binaries."""

    # Django app registration metadata (exposed via PLUGIN.AppConfig below).
    app_label: str = 'ytdlp'
    verbose_name: str = 'YT-DLP'
    docs_url: str = 'https://github.com/yt-dlp/yt-dlp'

    # Hook instances this plugin contributes to the plugin system.
    hooks: List[InstanceOf[BaseHook]] = [
        YTDLP_CONFIG,
        YTDLP_BINARY,
        FFMPEG_BINARY,
    ]


# Module-level singletons consumed by the plugin loader.
PLUGIN = YtdlpPlugin()
# PLUGIN.register(settings)
DJANGO_APP = PLUGIN.AppConfig
|
42
archivebox/plugins_extractor/ytdlp/binaries.py
Normal file
42
archivebox/plugins_extractor/ytdlp/binaries.py
Normal file
|
@ -0,0 +1,42 @@
|
|||
__package__ = 'plugins_extractor.ytdlp'
|
||||
|
||||
import subprocess
|
||||
from typing import List
|
||||
|
||||
from pydantic import InstanceOf
|
||||
from pydantic_pkgr import BinProvider, BinName, BinaryOverrides
|
||||
|
||||
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
|
||||
|
||||
from plugins_pkg.pip.binproviders import LIB_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, SYS_PIP_BINPROVIDER
|
||||
|
||||
from .config import YTDLP_CONFIG
|
||||
|
||||
|
||||
class YtdlpBinary(BaseBinary):
    """Locator for the yt-dlp executable."""

    # Name is configurable (YTDLP_BINARY / legacy YOUTUBEDL_BINARY setting).
    name: BinName = YTDLP_CONFIG.YTDLP_BINARY
    # Prefer pip-installed yt-dlp (lib/venv/system pip) over package managers or $PATH.
    binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, SYS_PIP_BINPROVIDER, apt, brew, env]

YTDLP_BINARY = YtdlpBinary()
|
||||
|
||||
|
||||
def _cmd_stdout(cmd: List[str]) -> str:
    """Run *cmd* and return its captured stdout (used as an ffmpeg version probe)."""
    # NOTE(review): no timeout here, matching the original probes — a hung
    # package manager would block; consider adding one.
    return subprocess.run(cmd, capture_output=True, text=True).stdout


class FfmpegBinary(BaseBinary):
    """Locator for the ffmpeg binary shipped as part of the ytdlp plugin."""

    name: BinName = 'ffmpeg'
    binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]

    # Each provider gets a custom `version` probe whose raw stdout is used as
    # the version string. The triplicated subprocess boilerplate is factored
    # into _cmd_stdout above.
    overrides: BinaryOverrides = {
        'env': {
            # 'abspath': lambda: shutil.which('ffmpeg', PATH=env.PATH),
            'version': lambda: _cmd_stdout(['ffmpeg', '-version']),
        },
        'apt': {
            # 'abspath': lambda: shutil.which('ffmpeg', PATH=apt.PATH),
            'version': lambda: _cmd_stdout(['apt', 'show', 'ffmpeg']),
        },
        'brew': {
            # 'abspath': lambda: shutil.which('ffmpeg', PATH=brew.PATH),
            'version': lambda: _cmd_stdout(['brew', 'info', 'ffmpeg', '--quiet']),
        },
    }

FFMPEG_BINARY = FfmpegBinary()
|
35
archivebox/plugins_extractor/ytdlp/config.py
Normal file
35
archivebox/plugins_extractor/ytdlp/config.py
Normal file
|
@ -0,0 +1,35 @@
|
|||
__package__ = 'plugins_extractor.ytdlp'
|
||||
|
||||
from typing import List
|
||||
|
||||
from pydantic import Field, model_validator, AliasChoices
|
||||
|
||||
from abx.archivebox.base_configset import BaseConfigSet
|
||||
|
||||
from archivebox.config.common import ARCHIVING_CONFIG
|
||||
from archivebox.misc.logging import STDERR
|
||||
|
||||
|
||||
class YtdlpConfig(BaseConfigSet):
    """User-facing settings for the yt-dlp media extractor (legacy aliases kept)."""

    USE_YTDLP: bool = Field(default=True, validation_alias=AliasChoices('USE_YOUTUBEDL', 'SAVE_MEDIA'))

    YTDLP_BINARY: str = Field(default='yt-dlp', alias='YOUTUBEDL_BINARY')
    YTDLP_EXTRA_ARGS: List[str] = Field(default=[], alias='YOUTUBEDL_EXTRA_ARGS')

    YTDLP_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
    YTDLP_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.MEDIA_TIMEOUT)

    @model_validator(mode='after')
    def validate_use_ytdlp(self):
        """Warn when media archiving is enabled with a timeout too short to work."""
        # Guard clause: nothing to warn about unless media saving is on AND the
        # timeout is below the practical minimum.
        if not (self.USE_YTDLP and self.YTDLP_TIMEOUT < 20):
            return self
        STDERR.print(f'[red][!] Warning: MEDIA_TIMEOUT is set too low! (currently set to MEDIA_TIMEOUT={self.YTDLP_TIMEOUT} seconds)[/red]')
        STDERR.print(' youtube-dl/yt-dlp will fail to archive any media if set to less than ~20 seconds.')
        STDERR.print(' (Setting it somewhere over 60 seconds is recommended)')
        STDERR.print()
        STDERR.print(' If you want to disable media archiving entirely, set SAVE_MEDIA=False instead:')
        STDERR.print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#save_media')
        STDERR.print()
        return self


YTDLP_CONFIG = YtdlpConfig()
|
Loading…
Add table
Add a link
Reference in a new issue