# mirror of https://github.com/ArchiveBox/ArchiveBox.git
# synced 2025-05-13 06:34:25 -04:00
import hashlib
import mimetypes
import os
import subprocess

from datetime import timedelta
# BUGFIX: was `from zipfile import Path` — this module operates on real
# filesystem paths (OUTPUT_DIR.mkdir(), cwd.rglob(), os.chmod()), which
# zipfile.Path does not support; pathlib.Path is what is needed here.
from pathlib import Path
from typing import ClassVar

from django.utils import timezone

import abx
import archivebox
from archivebox.misc.hashing import get_dir_info
from core.models import ArchiveResult
|
|
class Extractor:
    """
    Runs a single extractor (e.g. yt-dlp) against a Snapshot's URL and records
    the outcome (cmd, stdout/stderr, status, output files) on its ArchiveResult.
    """

    # static class variables (shared by all instances of this extractor type)
    name: ClassVar[str] = 'ytdlp'
    verbose_name: ClassVar[str] = 'YT-DLP'
    binaries: ClassVar[tuple[str, ...]] = ()     # names of binaries this extractor requires
    daemons: ClassVar[tuple[str, ...]] = ()      # names of daemons this extractor requires
    timeout: ClassVar[int] = 60                  # max seconds the extractor subprocess may run

    # instance variables (resolved per-ArchiveResult in __init__)
    ARCHIVERESULT: ArchiveResult
    CONFIG: dict[str, object]
    BINARIES: dict[str, object]
    DAEMONS: dict[str, object]

    def __init__(self, archiveresult: ArchiveResult, extra_config: dict | None = None):
        """Bind to a saved ArchiveResult and resolve its scoped config, binaries, and daemons."""
        assert archiveresult.pk, 'ArchiveResult must be saved to DB before it can be extracted'
        self.archiveresult = self.ARCHIVERESULT = archiveresult
        self.CONFIG = archivebox.pm.hook.get_SCOPE_CONFIG(archiveresult=self.archiveresult, extra=extra_config)
        all_binaries = abx.as_dict(archivebox.pm.hook.get_BINARIES())
        all_daemons = abx.as_dict(archivebox.pm.hook.get_DAEMONS())
        # keep only the binaries/daemons this extractor declares it needs
        self.BINARIES = {
            binary_name: all_binaries[binary_name]
            for binary_name in self.binaries
        }
        self.DAEMONS = {
            daemon_name: all_daemons[daemon_name]
            for daemon_name in self.daemons
        }

    def extract(self, config: dict | None = None) -> 'ArchiveResult':
        """
        Orchestrate one extraction run for self.ARCHIVERESULT, responsible for:
        - making sure any binaries the extractor depends on are installed and loaded
        - creating a new temporary working directory under the snapshot dir to hold extractor output
        - setting up a timer signal to kill the extractor if it runs too long
        - passing the extractor the URLs, temporary working directory, and config dict of options
        - running the extractor in a shell subprocess and collecting stdout/stderr
        - capturing the extractor's exit code
        - if extractor exits with 29 (RetryError), it should set the status to 'BACKOFF' and set retry_at to a datetime in the future
        - if extractor exits with 50 (NotApplicable), it should set the status to 'SKIPPED', and set retry_at to None
        - setting the correct permissions and ownership on all the output files
        - generating the merkle tree of all the output files and their hashes
        - generating a thumbnail of the main output (or collecting one provided by the extractor)
        - detecting any special outputs files that need to be parsed for other parts of the system (content-types? )
            - metadata.json -> ArchiveResult.output_json
            - outlinks.jsonl -> ArchiveResult.output_links
            - search_texts.txt -> ArchiveResult.index_texts
            - .merkle.json -> ArchiveResult.output_files
            - videos.jsonl -> ArchiveResult.output_videos
            - audios.jsonl -> ArchiveResult.output_audios
            - images.jsonl -> ArchiveResult.output_images
            - htmls.jsonl -> ArchiveResult.output_htmls
        - saving all the result metadata to the ArchiveResult in the database
        """
        archiveresult = self.ARCHIVERESULT

        self.before_extract()

        # placeholder error is reported if run_cmd() raises before doing any work
        error: Exception | None = Exception('Failed to start extractor')
        try:
            # BUGFIX: the original called EXTRACTOR.spawn(...) with undefined locals
            # (binaries, daemons, cwd, config); resolve them from self here instead
            self.run_cmd(
                url=archiveresult.snapshot.url,
                binaries=self.load_binaries(),
                daemons=self.load_daemons(),
                cwd=self.OUTPUT_DIR,
                config={**self.CONFIG, **(config or {})},
            )
            error = None
        except Exception as err:
            error = err
        finally:
            self.after_extract(error=error)

        return archiveresult

    def should_extract(self) -> bool:
        """Return True if this extractor applies to the snapshot's URL."""
        return self.archiveresult.snapshot.url.startswith('https://youtube.com/')

    def load_binaries(self) -> dict:
        """Load (install/locate) every binary this extractor depends on."""
        return {
            bin_name: binary.load()
            for bin_name, binary in self.BINARIES.items()
        }

    def load_daemons(self) -> dict:
        """Start/connect every daemon this extractor depends on."""
        return {
            daemon_name: daemon.load()
            for daemon_name, daemon in self.DAEMONS.items()
        }

    def output_dir_name(self) -> str:
        """Name of the subdirectory under the snapshot dir that holds this extractor's output, e.g. 'ytdlp'."""
        return self.name

    @property
    def OUTPUT_DIR(self):
        """Absolute path of this extractor's output dir under the snapshot dir."""
        return self.archiveresult.snapshot_dir / self.output_dir_name()

    def before_extract(self):
        """Create the output dir and mark the ArchiveResult as STARTED in the DB."""
        # BUGFIX: Machine/NetworkInterface were referenced below without being imported;
        # import locally to avoid a circular import at module load time.
        # NOTE(review): assumed to live in machine.models — confirm against the project layout.
        from machine.models import Machine, NetworkInterface

        binaries = self.load_binaries()
        daemons = self.load_daemons()
        cmd = self.archiveresult.EXTRACTOR.get_cmd(binaries=binaries, daemons=daemons)
        cmd_version = self.archiveresult.EXTRACTOR.get_cmd_version(binaries=binaries, daemons=daemons)

        self.OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
        os.chmod(self.OUTPUT_DIR, 0o755)

        self.archiveresult.status = self.archiveresult.StatusChoices.STARTED
        # retry_at acts as a lease: if we die mid-run, the job becomes eligible again after timeout
        self.archiveresult.retry_at = timezone.now() + timedelta(seconds=self.timeout)
        self.archiveresult.start_ts = timezone.now()
        self.archiveresult.end_ts = None
        self.archiveresult.output = None
        self.archiveresult.output_path = str(self.OUTPUT_DIR.relative_to(self.archiveresult.snapshot_dir))
        self.archiveresult.cmd = cmd
        self.archiveresult.cmd_version = cmd_version
        self.archiveresult.machine = Machine.objects.get_current()
        self.archiveresult.iface = NetworkInterface.objects.get_current()
        self.archiveresult.save()
        self.archiveresult.write_indexes()

    def run_cmd(self, url: str, binaries: dict, daemons: dict, cwd: Path, config: dict):
        """Execute self.archiveresult.cmd in cwd and record stdout/stderr/returncode.

        BUGFIX: this was a second `def extract(...)` that silently shadowed the
        orchestrator above; renamed so both are callable.
        """
        # BUGFIX: the original passed env=os.environ.update(binaries), which returns None
        # (so no env was passed) AND mutated the global environment as a side effect.
        env = {**os.environ, **{key: str(value) for key, value in binaries.items()}}
        proc = subprocess.run(
            self.archiveresult.cmd,
            cwd=str(cwd),               # BUGFIX: was self.archiveresult.cwd; the cwd param was unused
            env=env,
            timeout=self.timeout,
            shell=True,                 # NOTE(review): confirm cmd never embeds unquoted untrusted URL text
            capture_output=True,
            text=True,
        )
        self.archiveresult.stdout = proc.stdout
        self.archiveresult.stderr = proc.stderr
        self.archiveresult.returncode = proc.returncode
        self.archiveresult.save()
        self.archiveresult.write_indexes()

    def determine_status(self):
        """Map the extractor's exit code to a (status, retry_at) pair.

        Per the extract() contract: 29 => BACKOFF (retry later), 50 => SKIPPED
        (not applicable, never retry), anything else => FAILED.
        """
        returncode = getattr(self.archiveresult, 'returncode', None)
        if returncode == 29:
            # RetryError: back off and retry after the timeout window
            return self.archiveresult.StatusChoices.BACKOFF, timezone.now() + timedelta(seconds=self.timeout)
        if returncode == 50:
            # NotApplicable: this extractor doesn't apply to the URL; never retry
            return self.archiveresult.StatusChoices.SKIPPED, None
        return self.archiveresult.StatusChoices.FAILED, None

    def collect_outputs(self, cwd: Path):
        """Walk cwd, normalize file permissions, and append output metadata entries.

        Appends one 'FILE' dict per file plus extra typed entries (FAVICON, PDF,
        SEARCHTEXT) for files other parts of the system need to find.
        """
        for file in cwd.rglob('*'):
            if not file.is_file():
                continue                        # skip directories/symlinked dirs in the walk
            relpath = file.relative_to(cwd)
            os.chmod(file, 0o644)
            # os.chown(file, ARCHIVEBOX_UID, ARCHIVEBOX_GID)  # TODO: restore once uid/gid config is wired up

            stat = file.stat()
            entry = {
                'type': 'FILE',
                'path': relpath,
                'size': stat.st_size,
                'ext': file.suffix,
                'mimetype': mimetypes.guess_type(str(file))[0],
                'sha256': hashlib.sha256(file.read_bytes()).hexdigest(),
                # NOTE(review): original also recorded 'blake3', but hashlib has no blake3
                # algorithm; use archivebox.misc.hashing helpers if a blake3 digest is needed
                'created_at': stat.st_ctime,
                'modified_at': stat.st_mtime,
            }
            self.archiveresult.outputs.append(entry)

            # TODO: outlink extraction — the original called an undefined parse_outlinks();
            # when implemented, append {'type': 'OUTLINK', 'url': ..., 'selector': ..., 'text': ...}

            # BUGFIX: the original called .endswith() on a Path object (no such method);
            # compare against the string form of the relative path instead
            name = str(relpath)
            if name.endswith('favicon.ico'):
                self.archiveresult.outputs.append({
                    'type': 'FAVICON',
                    'symlinks': {
                        # BUGFIX: original referenced an undefined output_file; use the entry just built
                        'favicon': entry['path'],
                        'favicon.ico': entry['path'],
                        'favicon.png': entry['path'].with_suffix('.png'),
                    },
                    'path': entry['path'],
                })
            if name.endswith('.pdf'):
                self.archiveresult.outputs.append({
                    'type': 'PDF',
                    'path': relpath,
                })
            # BUGFIX: `'text/plain' in mimetypes.guess_type(...)` tested membership in a
            # (type, encoding) tuple; compare the already-computed mimetype directly
            if entry['mimetype'] == 'text/plain':
                self.archiveresult.outputs.append({
                    'type': 'SEARCHTEXT',
                    'path': relpath,
                    'archiveresult_id': self.archiveresult.id,
                })

    def after_extract(self, error: Exception | None = None):
        """Finalize the ArchiveResult: set status/retry_at/end_ts/output and persist."""
        # BUGFIX: the original computed determine_status() and then discarded it,
        # always forcing FAILED/SUCCEEDED and retry_at=None, which contradicted the
        # documented 29=>BACKOFF / 50=>SKIPPED contract.
        if error is None and getattr(self.archiveresult, 'returncode', None) == 0:
            status, retry_at = self.archiveresult.StatusChoices.SUCCEEDED, None
        else:
            status, retry_at = self.determine_status()

        self.archiveresult.error = f'{type(error).__name__}: {error}' if error else None
        self.archiveresult.status = status
        self.archiveresult.retry_at = retry_at
        self.archiveresult.end_ts = timezone.now()
        outputs = getattr(self.archiveresult, 'outputs', None)
        if outputs:
            # BUGFIX: outputs entries are dicts (see collect_outputs), not objects with .path
            self.archiveresult.output = str(outputs[0]['path'])
        self.archiveresult.save()
        self.archiveresult.write_indexes()