mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-31 23:08:25 -04:00
wip
This commit is contained in:
parent
4b6f08b0fe
commit
5d9a32c364
178 changed files with 2982 additions and 1322 deletions
0
packages/abx-spec-extractor/README.md
Normal file
0
packages/abx-spec-extractor/README.md
Normal file
211
packages/abx-spec-extractor/abx_spec_extractor.py
Normal file
211
packages/abx-spec-extractor/abx_spec_extractor.py
Normal file
|
@ -0,0 +1,211 @@
|
|||
import os
|
||||
|
||||
from typing import Optional, List, Annotated, Tuple
|
||||
from pathlib import Path
|
||||
|
||||
from pydantic import AfterValidator
|
||||
from pydantic_pkgr import BinName
|
||||
|
||||
|
||||
import abx
|
||||
|
||||
|
||||
def assert_no_empty_args(args: List[str]) -> List[str]:
    """Check that no element of *args* is empty and return *args* unchanged.

    Used as a pydantic ``AfterValidator`` for command argument lists.

    Raises:
        AssertionError: if any element of *args* has length zero.
    """
    # Raise explicitly instead of using an `assert` statement: asserts are
    # stripped when Python runs with -O, which would silently disable the check.
    if not all(len(arg) for arg in args):
        raise AssertionError('command arguments must not contain empty values')
    return args
|
||||
|
||||
def _validate_extractor_name(name: str) -> str:
    """Return *name* unchanged if it is a valid Python identifier, else raise ValueError."""
    if not name.isidentifier():
        raise ValueError(f'extractor name must be a valid Python identifier, got {name!r}')
    return name


def _validate_handler_func_str(handler: str) -> str:
    """Return *handler* unchanged if it starts with ``self.``, else raise ValueError."""
    if not handler.startswith('self.'):
        raise ValueError(f"handler function string must start with 'self.', got {handler!r}")
    return handler


# An AfterValidator callable must return the validated value (and raise to
# reject it). Returning the bool result of the check would replace the string
# with True/False and never reject anything.
ExtractorName = Annotated[str, AfterValidator(_validate_extractor_name)]
HandlerFuncStr = Annotated[str, AfterValidator(_validate_handler_func_str)]
CmdArgsList = Annotated[List[str] | Tuple[str, ...], AfterValidator(assert_no_empty_args)]
|
||||
|
||||
|
||||
@abx.hookspec
@abx.hookimpl
def get_EXTRACTORS():
    """Hook spec and default implementation in one: contributes no extractors.

    Returns:
        An empty list.
    """
    no_extractors = []
    return no_extractors
|
||||
|
||||
@abx.hookspec
@abx.hookimpl
def extract(uri: str, config: dict | None=None):
    """Hook spec and default implementation in one: performs no extraction.

    Args:
        uri: the URI to extract (unused by this default implementation).
        config: optional configuration dict (unused by this default implementation).

    Returns:
        An empty dict.
    """
    empty_result = {}
    return empty_result
|
||||
|
||||
@abx.hookspec(firstresult=True)
@abx.hookimpl(trylast=True)
def should_extract(uri: str, extractor: str, config: dict | None=None):
    """Hook spec (``firstresult=True``) and fallback implementation (``trylast=True``).

    Args:
        uri: the URI being considered (unused by this fallback).
        extractor: name of the extractor being considered (unused by this fallback).
        config: optional configuration dict (unused by this fallback).

    Returns:
        Always False.
    """
    decision = False
    return decision
|
||||
|
||||
|
||||
class BaseExtractor:
    """Base class for extractors that run an external binary.

    Subclasses are expected to set ``name`` and ``binary``; neither has a
    default value here.
    """

    name: ExtractorName
    binary: BinName

    # NOTE: these list defaults live on the class and are shared by every
    # instance that does not override them; override rather than mutate in place.
    default_args: CmdArgsList = []
    extra_args: CmdArgsList = []

    def get_output_path(self, snapshot) -> Path:
        """Return a relative path named after the lowercased class name.

        ``snapshot`` is accepted but not used by this implementation.
        """
        return Path(self.__class__.__name__.lower())

    def should_extract(self, uri: str, config: dict | None=None) -> bool:
        """Return True if this extractor's binary is detected with a truthy version.

        Returns False when the binary cannot be detected/loaded or reports no
        version. ``uri`` and ``config`` are accepted but not used yet.

        NOTE: while the ``BINARY`` lookup is commented out, ``BINARY`` always
        raises, so this currently always returns False.
        """
        # function-scope import so this method does not depend on module-level imports
        import logging

        try:
            has_version = bool(self.detect_installed_binary().version)
        except Exception:
            # could not load binary
            logging.getLogger(__name__).debug(
                'Could not load binary for extractor %s',
                type(self).__name__,
                exc_info=True,
            )
            return False

        if not has_version:
            return False

        # output_dir = self.get_output_path(snapshot)
        # if output_dir.glob('*.*'):
        #     return False
        return True

    # @abx.hookimpl
    # def extract(self, snapshot_id: str) -> Dict[str, Any]:
    #     from core.models import Snapshot
    #     from archivebox import CONSTANTS

    #     snapshot = Snapshot.objects.get(id=snapshot_id)

    #     if not self.should_extract(snapshot.url):
    #         return {}

    #     status = 'failed'
    #     start_ts = timezone.now()
    #     uplink = self.detect_network_interface()
    #     installed_binary = self.detect_installed_binary()
    #     machine = installed_binary.machine
    #     assert uplink.machine == installed_binary.machine  # it would be *very* weird if this wasn't true

    #     output_dir = CONSTANTS.DATA_DIR / '.tmp' / 'extractors' / self.name / str(snapshot.abid)
    #     output_dir.mkdir(parents=True, exist_ok=True)

    #     # execute the extractor binary with the given args
    #     args = [snapshot.url, *self.args] if self.args is not None else [snapshot.url, *self.default_args, *self.extra_args]
    #     cmd = [str(installed_binary.abspath), *args]
    #     proc = self.exec(installed_binary=installed_binary, args=args, cwd=output_dir)

    #     # collect the output
    #     end_ts = timezone.now()
    #     output_files = list(str(path.relative_to(output_dir)) for path in output_dir.glob('**/*.*'))
    #     stdout = proc.stdout.strip()
    #     stderr = proc.stderr.strip()
    #     output_json = None
    #     output_text = stdout
    #     try:
    #         output_json = json.loads(stdout.strip())
    #         output_text = None
    #     except json.JSONDecodeError:
    #         pass

    #     errors = []
    #     if proc.returncode == 0:
    #         status = 'success'
    #     else:
    #         errors.append(f'{installed_binary.name} returned non-zero exit code: {proc.returncode}')

    #     # increment health stats counters
    #     if status == 'success':
    #         machine.record_health_success()
    #         uplink.record_health_success()
    #         installed_binary.record_health_success()
    #     else:
    #         machine.record_health_failure()
    #         uplink.record_health_failure()
    #         installed_binary.record_health_failure()

    #     return {
    #         'extractor': self.name,

    #         'snapshot': {
    #             'id': snapshot.id,
    #             'abid': snapshot.abid,
    #             'url': snapshot.url,
    #             'created_by_id': snapshot.created_by_id,
    #         },

    #         'machine': {
    #             'id': machine.id,
    #             'abid': machine.abid,
    #             'guid': machine.guid,
    #             'hostname': machine.hostname,
    #             'hw_in_docker': machine.hw_in_docker,
    #             'hw_in_vm': machine.hw_in_vm,
    #             'hw_manufacturer': machine.hw_manufacturer,
    #             'hw_product': machine.hw_product,
    #             'hw_uuid': machine.hw_uuid,
    #             'os_arch': machine.os_arch,
    #             'os_family': machine.os_family,
    #             'os_platform': machine.os_platform,
    #             'os_release': machine.os_release,
    #             'os_kernel': machine.os_kernel,
    #         },

    #         'uplink': {
    #             'id': uplink.id,
    #             'abid': uplink.abid,
    #             'mac_address': uplink.mac_address,
    #             'ip_public': uplink.ip_public,
    #             'ip_local': uplink.ip_local,
    #             'dns_server': uplink.dns_server,
    #             'hostname': uplink.hostname,
    #             'iface': uplink.iface,
    #             'isp': uplink.isp,
    #             'city': uplink.city,
    #             'region': uplink.region,
    #             'country': uplink.country,
    #         },

    #         'binary': {
    #             'id': installed_binary.id,
    #             'abid': installed_binary.abid,
    #             'name': installed_binary.name,
    #             'binprovider': installed_binary.binprovider,
    #             'abspath': installed_binary.abspath,
    #             'version': installed_binary.version,
    #             'sha256': installed_binary.sha256,
    #         },

    #         'cmd': cmd,
    #         'stdout': stdout,
    #         'stderr': stderr,
    #         'returncode': proc.returncode,
    #         'start_ts': start_ts,
    #         'end_ts': end_ts,

    #         'status': status,
    #         'errors': errors,
    #         'output_dir': str(output_dir.relative_to(CONSTANTS.DATA_DIR)),
    #         'output_files': output_files,
    #         'output_json': output_json or {},
    #         'output_text': output_text or '',
    #     }

    # TODO: move this to a hookimpl
    def exec(self, args: CmdArgsList=(), cwd: Optional[Path]=None, installed_binary=None):
        """Run this extractor's binary with *args* and return the result of ``binary.exec``.

        Args:
            args: command-line arguments passed to the binary as ``cmd``.
            cwd: working directory for the command; falls back to the current
                working directory when not given.
            installed_binary: optional already-detected binary record; when
                falsy it is detected via ``detect_installed_binary()``.
        """
        cwd = cwd or Path(os.getcwd())
        binary = self.load_binary(installed_binary=installed_binary)

        return binary.exec(cmd=args, cwd=cwd)

    # @cached_property
    @property
    def BINARY(self):
        """Binary definition for ``self.binary``.

        The lookup is currently commented out, so this always raises.

        Raises:
            ValueError: always, until the lookup below is restored.
        """
        # import abx.archivebox.reads
        # for binary in abx.archivebox.reads.get_BINARIES().values():
        #     if binary.name == self.binary:
        #         return binary
        raise ValueError(f'Binary {self.binary} not found')

    def detect_installed_binary(self):
        """Return the ``InstalledBinary`` record for ``self.BINARY``."""
        # imported lazily inside the method rather than at module import time
        from machine.models import InstalledBinary
        # hydrates binary from DB/cache if record of installed version is recent enough
        # otherwise it finds it from scratch by detecting installed version/abspath/sha256 on host
        return InstalledBinary.objects.get_from_db_or_cache(self.BINARY)

    def load_binary(self, installed_binary=None):
        """Return ``installed_binary.load_from_db()``, detecting the binary first if none is given."""
        installed_binary = installed_binary or self.detect_installed_binary()
        return installed_binary.load_from_db()

    # def detect_network_interface(self):
    #     from machine.models import NetworkInterface
    #     return NetworkInterface.objects.current()
|
18
packages/abx-spec-extractor/pyproject.toml
Normal file
18
packages/abx-spec-extractor/pyproject.toml
Normal file
|
@ -0,0 +1,18 @@
|
|||
[project]
name = "abx-spec-extractor"
version = "0.1.0"
description = "ABX plugin spec defining the extractor hooks and the BaseExtractor base class"
readme = "README.md"
requires-python = ">=3.10"
dependencies = [
    "abx>=0.1.0",
    "python-benedict>=0.26.0",
    "pydantic>=2.5.0",
    # abx_spec_extractor.py imports BinName from pydantic_pkgr
    "pydantic-pkgr",
]

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[project.entry-points.abx]
abx_spec_extractor = "abx_spec_extractor"
|
Loading…
Add table
Add a link
Reference in a new issue