mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-13 14:44:29 -04:00
new vastly simplified plugin spec without pydantic
Some checks are pending
Build Debian package / build (push) Waiting to run
Build Docker image / buildx (push) Waiting to run
Build Homebrew package / build (push) Waiting to run
Run linters / lint (push) Waiting to run
Build Pip package / build (push) Waiting to run
Run tests / python_tests (ubuntu-22.04, 3.11) (push) Waiting to run
Run tests / docker_tests (push) Waiting to run
Some checks are pending
Build Debian package / build (push) Waiting to run
Build Docker image / buildx (push) Waiting to run
Build Homebrew package / build (push) Waiting to run
Run linters / lint (push) Waiting to run
Build Pip package / build (push) Waiting to run
Run tests / python_tests (ubuntu-22.04, 3.11) (push) Waiting to run
Run tests / docker_tests (push) Waiting to run
This commit is contained in:
parent
abf75f49f4
commit
01ba6d49d3
115 changed files with 2466 additions and 2301 deletions
37
archivebox/plugins_extractor/wget/extractors.py
Normal file
37
archivebox/plugins_extractor/wget/extractors.py
Normal file
|
@ -0,0 +1,37 @@
|
|||
# Explicit package name for this module; the relative imports below
# (``.binaries``, ``.wget_util``) are resolved against it.
__package__ = 'plugins_extractor.wget'
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from pydantic_pkgr import BinName
|
||||
|
||||
from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
|
||||
|
||||
from .binaries import WGET_BINARY
|
||||
from .wget_util import wget_output_path
|
||||
|
||||
class WgetExtractor(BaseExtractor):
    """Extractor registered under the name ``'wget'``, tied to the wget binary."""

    name: ExtractorName = 'wget'
    binary: BinName = WGET_BINARY.name

    def get_output_path(self, snapshot) -> Path | None:
        """Return the wget output location for *snapshot* as a ``Path``.

        The location is whatever ``wget_output_path`` reports for
        ``snapshot.as_link()``; a falsy result from that helper yields ``None``.
        """
        index_location = wget_output_path(snapshot.as_link())
        return Path(index_location) if index_location else None
|
||||
|
||||
# Module-level WgetExtractor instance, created once at import time.
WGET_EXTRACTOR = WgetExtractor()
|
||||
|
||||
|
||||
class WarcExtractor(BaseExtractor):
    """Extractor registered under the name ``'warc'``, tied to the wget binary."""

    name: ExtractorName = 'warc'
    binary: BinName = WGET_BINARY.name

    def get_output_path(self, snapshot) -> Path | None:
        """Return the largest ``*.warc.gz`` file in the snapshot's ``warc`` dir.

        Searches ``<snapshot.link_dir>/warc`` (non-recursively) and compares
        candidates by on-disk size. Returns ``None`` when nothing matches.
        """
        warc_dir = Path(snapshot.link_dir) / 'warc'
        # A single pass with max() replaces "sort descending, take [0]":
        # both pick the first largest file in glob order, and ``default``
        # covers the case where the glob yields nothing.
        return max(
            warc_dir.glob('*.warc.gz'),
            key=lambda warc_file: warc_file.stat().st_size,
            default=None,
        )
|
||||
|
||||
|
||||
# Module-level WarcExtractor instance, created once at import time.
WARC_EXTRACTOR = WarcExtractor()
|
||||
|
Loading…
Add table
Add a link
Reference in a new issue