"""Wget plugin: config, binary, and the wget/warc extractors registered as plugin hooks."""

import sys
from typing import List, Optional
from pathlib import Path

from rich import print
from pydantic import InstanceOf, Field, model_validator
from pydantic_pkgr import BinProvider, BinName

from abx.archivebox.base_plugin import BasePlugin, BaseHook
from abx.archivebox.base_configset import BaseConfigSet
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
from abx.archivebox.base_extractor import BaseExtractor, ExtractorName

from archivebox.extractors.wget import wget_output_path


class WgetConfig(BaseConfigSet):
    """Configuration options for the wget and warc extractors."""

    SAVE_WGET: bool = True
    SAVE_WARC: bool = True

    # NOTE(review): pydantic stores a callable passed as `default=` as-is (only
    # `default_factory` is invoked). Presumably BaseConfigSet resolves this
    # lambda against the loaded config -- confirm; otherwise USE_WGET is a
    # function object and therefore always truthy.
    USE_WGET: bool = Field(default=lambda c: c.SAVE_WGET or c.SAVE_WARC)

    WGET_BINARY: str = Field(default='wget')
    WGET_ARGS: List[str] = [
        '--no-verbose',
        '--adjust-extension',
        '--convert-links',
        '--force-directories',
        '--backup-converted',
        '--span-hosts',
        '--no-parent',
        '-e', 'robots=off',
    ]
    WGET_EXTRA_ARGS: List[str] = []

    # The options below are aliased to their un-prefixed names
    # (presumably the shared config keys -- confirm against BaseConfigSet).
    WGET_AUTO_COMPRESSION: bool = Field(default=True)
    SAVE_WGET_REQUISITES: bool = Field(default=True)
    WGET_USER_AGENT: str = Field(default='', alias='USER_AGENT')
    WGET_TIMEOUT: int = Field(default=60, alias='TIMEOUT')
    WGET_CHECK_SSL_VALIDITY: bool = Field(default=True, alias='CHECK_SSL_VALIDITY')
    WGET_RESTRICT_FILE_NAMES: str = Field(default='windows', alias='RESTRICT_FILE_NAMES')
    WGET_COOKIES_FILE: Optional[Path] = Field(default=None, alias='COOKIES_FILE')

    @model_validator(mode='after')
    def validate_use_wget(self):
        """Warn on stderr when wget is enabled with a timeout below 10 seconds.

        This only prints a warning; the config object is returned unchanged.
        """
        if self.USE_WGET and self.WGET_TIMEOUT < 10:
            # NOTE(review): the check is `< 10` but the message says `~20 seconds`
            # -- confirm which value is intended.
            print(f'[red][!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={self.WGET_TIMEOUT} seconds)[/red]', file=sys.stderr)
            print(' wget will fail to archive any sites if set to less than ~20 seconds.', file=sys.stderr)
            print(' (Setting it somewhere over 60 seconds is recommended)', file=sys.stderr)
            print(file=sys.stderr)
            # The hint names the wget toggles; it previously pointed at the
            # unrelated SAVE_MEDIA option (copied from the ytdlp plugin).
            print(' If you want to disable wget archiving entirely, set SAVE_WGET=False and SAVE_WARC=False instead:', file=sys.stderr)
            print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#save_wget', file=sys.stderr)
            print(file=sys.stderr)
        return self


WGET_CONFIG = WgetConfig()


class WgetBinary(BaseBinary):
    """The wget binary, named by WGET_CONFIG.WGET_BINARY."""

    name: BinName = WGET_CONFIG.WGET_BINARY
    binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]


WGET_BINARY = WgetBinary()


class WgetExtractor(BaseExtractor):
    """Extractor hook named 'wget', bound to the wget binary."""

    name: ExtractorName = 'wget'
    binary: str = WGET_BINARY.name

    def get_output_path(self, snapshot) -> Path | None:
        """Return wget_output_path() for the snapshot's link as a Path, or None if it is falsy."""
        wget_index_path = wget_output_path(snapshot.as_link())
        return Path(wget_index_path) if wget_index_path else None


WGET_EXTRACTOR = WgetExtractor()


class WarcExtractor(BaseExtractor):
    """Extractor hook named 'warc', bound to the wget binary."""

    name: ExtractorName = 'warc'
    binary: str = WGET_BINARY.name

    def get_output_path(self, snapshot) -> Path | None:
        """Return the largest ``*.warc.gz`` under ``<link_dir>/warc``, or None if there is none."""
        warc_dir = snapshot.link_dir / 'warc'
        # glob() returns a generator, which is always truthy, so a plain
        # `if warc_files:` cannot detect "no matches". max(..., default=None)
        # handles the empty case without raising.
        return max(
            warc_dir.glob('*.warc.gz'),
            key=lambda warc: warc.stat().st_size,
            default=None,
        )


WARC_EXTRACTOR = WarcExtractor()


class WgetPlugin(BasePlugin):
    """Plugin that registers the wget config, binary, and both extractors as hooks."""

    app_label: str = 'wget'
    verbose_name: str = 'WGET'

    hooks: List[InstanceOf[BaseHook]] = [
        WGET_CONFIG,
        WGET_BINARY,
        WGET_EXTRACTOR,
        WARC_EXTRACTOR,
    ]


PLUGIN = WgetPlugin()

# NOTE(review): the same change set also touched
# archivebox/plugins_extractor/ytdlp/apps.py, imports only: it dropped
# `ClassVar` from the typing import and added `from rich import print`.