mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-12 22:25:44 -04:00
move pdf, screenshot, dom, singlefile, and ytdlp extractor config to new plugin system
This commit is contained in:
parent
a2a586e369
commit
a5ffd4e9d3
11 changed files with 333 additions and 353 deletions
|
@ -1,10 +1,10 @@
|
|||
__package__ = 'archivebox'
|
||||
|
||||
import os
|
||||
import time
|
||||
import sys
|
||||
import shutil
|
||||
import platform
|
||||
import archivebox
|
||||
|
||||
from typing import Dict, List, Optional, Iterable, IO, Union
|
||||
from pathlib import Path
|
||||
|
@ -69,6 +69,7 @@ from .extractors import archive_links, archive_link, ignore_methods
|
|||
from .misc.logging import stderr, hint
|
||||
from .misc.checks import check_data_folder, check_dependencies
|
||||
from .config import (
|
||||
setup_django_minimal,
|
||||
ConfigDict,
|
||||
ANSI,
|
||||
IS_TTY,
|
||||
|
@ -81,8 +82,6 @@ from .config import (
|
|||
TIMEZONE,
|
||||
ENFORCE_ATOMIC_WRITES,
|
||||
OUTPUT_PERMISSIONS,
|
||||
PYTHON_BINARY,
|
||||
ARCHIVEBOX_BINARY,
|
||||
ONLY_NEW,
|
||||
OUTPUT_DIR,
|
||||
SOURCES_DIR,
|
||||
|
@ -95,31 +94,22 @@ from .config import (
|
|||
HTML_INDEX_FILENAME,
|
||||
SQL_INDEX_FILENAME,
|
||||
ALLOWED_IN_OUTPUT_DIR,
|
||||
SEARCH_BACKEND_ENGINE,
|
||||
LDAP,
|
||||
get_version,
|
||||
write_config_file,
|
||||
VERSION,
|
||||
VERSIONS_AVAILABLE,
|
||||
CAN_UPGRADE,
|
||||
COMMIT_HASH,
|
||||
BUILD_TIME,
|
||||
CODE_LOCATIONS,
|
||||
DATA_LOCATIONS,
|
||||
DEPENDENCIES,
|
||||
CHROME_BINARY,
|
||||
CHROME_VERSION,
|
||||
YOUTUBEDL_BINARY,
|
||||
YOUTUBEDL_VERSION,
|
||||
SINGLEFILE_VERSION,
|
||||
READABILITY_VERSION,
|
||||
MERCURY_VERSION,
|
||||
NODE_VERSION,
|
||||
load_all_config,
|
||||
CONFIG,
|
||||
USER_CONFIG,
|
||||
ADMIN_USERNAME,
|
||||
ADMIN_PASSWORD,
|
||||
get_real_name,
|
||||
setup_django,
|
||||
)
|
||||
|
@ -216,6 +206,11 @@ def version(quiet: bool=False,
|
|||
out_dir: Path=OUTPUT_DIR) -> None:
|
||||
"""Print the ArchiveBox version and dependency information"""
|
||||
|
||||
setup_django_minimal()
|
||||
from plugins_sys.config.apps import SEARCH_BACKEND_CONFIG, STORAGE_CONFIG, SHELL_CONFIG
|
||||
from plugins_auth.ldap.apps import LDAP_CONFIG
|
||||
from django.conf import settings
|
||||
|
||||
print(VERSION)
|
||||
|
||||
if not quiet:
|
||||
|
@ -227,7 +222,7 @@ def version(quiet: bool=False,
|
|||
|
||||
p = platform.uname()
|
||||
print(
|
||||
'ArchiveBox v{}'.format(get_version(CONFIG)),
|
||||
'ArchiveBox v{}'.format(archivebox.__version__),
|
||||
f'COMMIT_HASH={COMMIT_HASH[:7] if COMMIT_HASH else "unknown"}',
|
||||
f'BUILD_TIME={BUILD_TIME}',
|
||||
)
|
||||
|
@ -241,29 +236,35 @@ def version(quiet: bool=False,
|
|||
)
|
||||
OUTPUT_IS_REMOTE_FS = DATA_LOCATIONS['OUTPUT_DIR']['is_mount'] or DATA_LOCATIONS['ARCHIVE_DIR']['is_mount']
|
||||
print(
|
||||
f'FS_ATOMIC={ENFORCE_ATOMIC_WRITES}',
|
||||
f'FS_ATOMIC={STORAGE_CONFIG.ENFORCE_ATOMIC_WRITES}',
|
||||
f'FS_REMOTE={OUTPUT_IS_REMOTE_FS}',
|
||||
f'FS_USER={PUID}:{PGID}',
|
||||
f'FS_PERMS={OUTPUT_PERMISSIONS}',
|
||||
f'FS_USER={SHELL_CONFIG.PUID}:{SHELL_CONFIG.PGID}',
|
||||
f'FS_PERMS={STORAGE_CONFIG.OUTPUT_PERMISSIONS}',
|
||||
)
|
||||
print(
|
||||
f'DEBUG={DEBUG}',
|
||||
f'IS_TTY={IS_TTY}',
|
||||
f'DEBUG={SHELL_CONFIG.DEBUG}',
|
||||
f'IS_TTY={SHELL_CONFIG.IS_TTY}',
|
||||
f'TZ={TIMEZONE}',
|
||||
f'SEARCH_BACKEND={SEARCH_BACKEND_ENGINE}',
|
||||
f'LDAP={LDAP}',
|
||||
f'SEARCH_BACKEND={SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE}',
|
||||
f'LDAP={LDAP_CONFIG.LDAP_ENABLED}',
|
||||
#f'DB=django.db.backends.sqlite3 (({CONFIG["SQLITE_JOURNAL_MODE"]})', # add this if we have more useful info to show eventually
|
||||
)
|
||||
print()
|
||||
|
||||
print('{white}[i] Dependency versions:{reset}'.format(**ANSI))
|
||||
print('{white}[i] Old dependency versions:{reset}'.format(**ANSI))
|
||||
for name, dependency in DEPENDENCIES.items():
|
||||
print(printable_dependency_version(name, dependency))
|
||||
|
||||
# add a newline between core dependencies and extractor dependencies for easier reading
|
||||
if name == 'ARCHIVEBOX_BINARY':
|
||||
print()
|
||||
|
||||
|
||||
print()
|
||||
print('{white}[i] New dependency versions:{reset}'.format(**ANSI))
|
||||
for name, binary in settings.BINARIES.items():
|
||||
loaded_bin = binary.load()
|
||||
print('', '√' if loaded_bin.is_valid else 'X', '', loaded_bin.name.ljust(21), str(loaded_bin.version).ljust(15), loaded_bin.abspath)
|
||||
|
||||
print()
|
||||
print('{white}[i] Source-code locations:{reset}'.format(**ANSI))
|
||||
for name, path in CODE_LOCATIONS.items():
|
||||
|
@ -431,10 +432,11 @@ def init(force: bool=False, quick: bool=False, setup: bool=False, out_dir: Path=
|
|||
print('\n{green}----------------------------------------------------------------------{reset}'.format(**ANSI))
|
||||
|
||||
from django.contrib.auth.models import User
|
||||
from plugins_sys.config.apps import SERVER_CONFIG
|
||||
|
||||
if (ADMIN_USERNAME and ADMIN_PASSWORD) and not User.objects.filter(username=ADMIN_USERNAME).exists():
|
||||
if (SERVER_CONFIG.ADMIN_USERNAME and SERVER_CONFIG.ADMIN_PASSWORD) and not User.objects.filter(username=SERVER_CONFIG.ADMIN_USERNAME).exists():
|
||||
print('{green}[+] Found ADMIN_USERNAME and ADMIN_PASSWORD configuration options, creating new admin user.{reset}'.format(**ANSI))
|
||||
User.objects.create_superuser(username=ADMIN_USERNAME, password=ADMIN_PASSWORD)
|
||||
User.objects.create_superuser(username=SERVER_CONFIG.ADMIN_USERNAME, password=SERVER_CONFIG.ADMIN_PASSWORD)
|
||||
|
||||
if existing_index:
|
||||
print('{green}[√] Done. Verified and updated the existing ArchiveBox collection.{reset}'.format(**ANSI))
|
||||
|
@ -693,8 +695,8 @@ def add(urls: Union[str, List[str]],
|
|||
|
||||
# tail_worker_logs(worker['stdout_logfile'])
|
||||
|
||||
if CAN_UPGRADE:
|
||||
hint(f"There's a new version of ArchiveBox available! Your current version is {VERSION}. You can upgrade to {VERSIONS_AVAILABLE['recommended_version']['tag_name']} ({VERSIONS_AVAILABLE['recommended_version']['html_url']}). For more on how to upgrade: https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives\n")
|
||||
# if CAN_UPGRADE:
|
||||
# hint(f"There's a new version of ArchiveBox available! Your current version is {VERSION}. You can upgrade to {VERSIONS_AVAILABLE['recommended_version']['tag_name']} ({VERSIONS_AVAILABLE['recommended_version']['html_url']}). For more on how to upgrade: https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives\n")
|
||||
|
||||
return new_links
|
||||
|
||||
|
@ -967,6 +969,8 @@ def list_folders(links: List[Link],
|
|||
def setup(out_dir: Path=OUTPUT_DIR) -> None:
|
||||
"""Automatically install all ArchiveBox dependencies and extras"""
|
||||
|
||||
|
||||
|
||||
if not (out_dir / ARCHIVE_DIR_NAME).exists():
|
||||
run_subcommand('init', stdin=None, pwd=out_dir)
|
||||
|
||||
|
@ -980,24 +984,26 @@ def setup(out_dir: Path=OUTPUT_DIR) -> None:
|
|||
|
||||
stderr('\n[+] Installing enabled ArchiveBox dependencies automatically...', color='green')
|
||||
|
||||
from plugins_pkg.pip.apps import PYTHON_BINARY
|
||||
|
||||
stderr('\n Installing YOUTUBEDL_BINARY automatically using pip...')
|
||||
if YOUTUBEDL_VERSION:
|
||||
print(f'{YOUTUBEDL_VERSION} is already installed', YOUTUBEDL_BINARY)
|
||||
else:
|
||||
try:
|
||||
run_shell([
|
||||
PYTHON_BINARY, '-m', 'pip',
|
||||
PYTHON_BINARY.load().abspath, '-m', 'pip',
|
||||
'install',
|
||||
'--upgrade',
|
||||
'--no-cache-dir',
|
||||
'--no-warn-script-location',
|
||||
'yt-dlp',
|
||||
], capture_output=False, cwd=out_dir)
|
||||
], capture_output=False, cwd=out_dir, text=True)
|
||||
pkg_path = run_shell([
|
||||
PYTHON_BINARY, '-m', 'pip',
|
||||
PYTHON_BINARY.load().abspath, '-m', 'pip',
|
||||
'show',
|
||||
'yt-dlp',
|
||||
], capture_output=True, text=True, cwd=out_dir).stdout.decode().split('Location: ')[-1].split('\n', 1)[0]
|
||||
], capture_output=True, text=True, cwd=out_dir).stdout.split('Location: ')[-1].split('\n', 1)[0]
|
||||
NEW_YOUTUBEDL_BINARY = Path(pkg_path) / 'yt-dlp' / '__main__.py'
|
||||
os.chmod(NEW_YOUTUBEDL_BINARY, 0o777)
|
||||
assert NEW_YOUTUBEDL_BINARY.exists(), f'yt-dlp must exist inside {pkg_path}'
|
||||
|
@ -1006,33 +1012,18 @@ def setup(out_dir: Path=OUTPUT_DIR) -> None:
|
|||
stderr(f'[X] Failed to install python packages: {e}', color='red')
|
||||
raise SystemExit(1)
|
||||
|
||||
if platform.machine() == 'armv7l':
|
||||
stderr('\n Skip the automatic installation of CHROME_BINARY because playwright is not available on armv7.')
|
||||
else:
|
||||
stderr('\n Installing CHROME_BINARY automatically using playwright...')
|
||||
if CHROME_VERSION:
|
||||
print(f'{CHROME_VERSION} is already installed', CHROME_BINARY)
|
||||
else:
|
||||
try:
|
||||
run_shell([
|
||||
PYTHON_BINARY, '-m', 'pip',
|
||||
'install',
|
||||
'--upgrade',
|
||||
'--no-cache-dir',
|
||||
'--no-warn-script-location',
|
||||
'playwright',
|
||||
], capture_output=False, cwd=out_dir)
|
||||
run_shell([PYTHON_BINARY, '-m', 'playwright', 'install', 'chromium'], capture_output=False, cwd=out_dir)
|
||||
proc = run_shell([PYTHON_BINARY, '-c', 'from playwright.sync_api import sync_playwright; print(sync_playwright().start().chromium.executable_path)'], capture_output=True, text=True, cwd=out_dir)
|
||||
NEW_CHROME_BINARY = proc.stdout.decode().strip() if isinstance(proc.stdout, bytes) else proc.stdout.strip()
|
||||
assert NEW_CHROME_BINARY and len(NEW_CHROME_BINARY), 'CHROME_BINARY must contain a path'
|
||||
config(f'CHROME_BINARY={NEW_CHROME_BINARY}', set=True, out_dir=out_dir)
|
||||
except BaseException as e: # lgtm [py/catch-base-exception]
|
||||
stderr(f'[X] Failed to install chromium using playwright: {e.__class__.__name__} {e}', color='red')
|
||||
raise SystemExit(1)
|
||||
|
||||
from plugins_extractor.chrome.apps import CHROME_BINARY
|
||||
|
||||
CHROME_BINARY.load_or_install()
|
||||
|
||||
from plugins_pkg.npm.apps import NPM_BINARY
|
||||
from plugins_extractor.singlefile.apps import SINGLEFILE_BINARY
|
||||
|
||||
SINGLEFILE_BINARY.load_or_install()
|
||||
|
||||
stderr('\n Installing SINGLEFILE_BINARY, READABILITY_BINARY, MERCURY_BINARY automatically using npm...')
|
||||
if not NODE_VERSION:
|
||||
if not NPM_BINARY.load().version:
|
||||
stderr('[X] You must first install node & npm using your system package manager', color='red')
|
||||
hint([
|
||||
'https://github.com/nodesource/distributions#table-of-contents',
|
||||
|
@ -1077,7 +1068,9 @@ def setup(out_dir: Path=OUTPUT_DIR) -> None:
|
|||
|
||||
stderr('\n[√] Set up ArchiveBox and its dependencies successfully.', color='green')
|
||||
|
||||
run_shell([PYTHON_BINARY, ARCHIVEBOX_BINARY, '--version'], capture_output=False, cwd=out_dir)
|
||||
from plugins_pkg.pip.apps import ARCHIVEBOX_BINARY
|
||||
|
||||
run_shell([ARCHIVEBOX_BINARY.load().abspath, '--version'], capture_output=False, cwd=out_dir)
|
||||
|
||||
@enforce_types
|
||||
def config(config_options_str: Optional[str]=None,
|
||||
|
@ -1192,6 +1185,8 @@ def schedule(add: bool=False,
|
|||
"""Set ArchiveBox to regularly import URLs at specific times using cron"""
|
||||
|
||||
check_data_folder(CONFIG)
|
||||
setup_django_minimal()
|
||||
from plugins_pkg.pip.apps import ARCHIVEBOX_BINARY
|
||||
|
||||
Path(LOGS_DIR).mkdir(exist_ok=True)
|
||||
|
||||
|
@ -1212,7 +1207,7 @@ def schedule(add: bool=False,
|
|||
'cd',
|
||||
quoted(out_dir),
|
||||
'&&',
|
||||
quoted(ARCHIVEBOX_BINARY),
|
||||
quoted(ARCHIVEBOX_BINARY.load().abspath),
|
||||
*([
|
||||
'add',
|
||||
*(['--overwrite'] if overwrite else []),
|
||||
|
@ -1300,8 +1295,8 @@ def schedule(add: bool=False,
|
|||
print('\n{green}[√] Stopped.{reset}'.format(**ANSI))
|
||||
raise SystemExit(1)
|
||||
|
||||
if CAN_UPGRADE:
|
||||
hint(f"There's a new version of ArchiveBox available! Your current version is {VERSION}. You can upgrade to {VERSIONS_AVAILABLE['recommended_version']['tag_name']} ({VERSIONS_AVAILABLE['recommended_version']['html_url']}). For more on how to upgrade: https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives\n")
|
||||
# if CAN_UPGRADE:
|
||||
# hint(f"There's a new version of ArchiveBox available! Your current version is {VERSION}. You can upgrade to {VERSIONS_AVAILABLE['recommended_version']['tag_name']} ({VERSIONS_AVAILABLE['recommended_version']['html_url']}). For more on how to upgrade: https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives\n")
|
||||
|
||||
|
||||
@enforce_types
|
||||
|
@ -1386,6 +1381,7 @@ def manage(args: Optional[List[str]]=None, out_dir: Path=OUTPUT_DIR) -> None:
|
|||
"""Run an ArchiveBox Django management command"""
|
||||
|
||||
check_data_folder(CONFIG)
|
||||
setup_django_minimal()
|
||||
from django.core.management import execute_from_command_line
|
||||
|
||||
if (args and "createsuperuser" in args) and (IN_DOCKER and not IS_TTY):
|
||||
|
@ -1393,7 +1389,9 @@ def manage(args: Optional[List[str]]=None, out_dir: Path=OUTPUT_DIR) -> None:
|
|||
stderr(' docker run -it archivebox manage {}'.format(' '.join(args or ['...'])), color='lightyellow')
|
||||
stderr('')
|
||||
|
||||
execute_from_command_line([f'{ARCHIVEBOX_BINARY} manage', *(args or ['help'])])
|
||||
from plugins_pkg.pip.apps import ARCHIVEBOX_BINARY
|
||||
|
||||
execute_from_command_line([ARCHIVEBOX_BINARY.load().abspath, 'manage', *(args or ['help'])])
|
||||
|
||||
|
||||
@enforce_types
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue