mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-13 06:34:25 -04:00
add gallerydl plugin
This commit is contained in:
parent
3421833830
commit
b6520243bc
11 changed files with 3888 additions and 0 deletions
1
archivebox/plugins/gallerydl/__init__.py
Normal file
1
archivebox/plugins/gallerydl/__init__.py
Normal file
|
@ -0,0 +1 @@
|
||||||
|
# Dotted package path of this plugin. It has to match the directory the file
# lives in (archivebox/plugins/gallerydl); the previous value pointed at the
# replaywebpage plugin.
__package__ = 'archivebox.plugins.gallerydl'
|
8
archivebox/plugins/gallerydl/apps.py
Normal file
8
archivebox/plugins/gallerydl/apps.py
Normal file
|
@ -0,0 +1,8 @@
|
||||||
|
from django.apps import AppConfig
|
||||||
|
|
||||||
|
|
||||||
|
class ReplayWebPageConfig(AppConfig):
    """Django application config for the gallery-dl plugin.

    NOTE(review): the class name appears to be a leftover from the
    ReplayWeb.Page plugin; it is kept unchanged so that any existing reference
    to it keeps working. Consider renaming it in a follow-up.
    """

    # Django rejects app labels that are not valid Python identifiers
    # ("ReplayWeb.Page" contains a dot), so the label is a plain identifier and
    # the human-readable name goes into ``verbose_name``.
    label = "gallerydl"
    verbose_name = "gallery-dl"

    # presumably plugins are importable as ``plugin_<name>`` (the views of this
    # plugin use the same prefix for their template names) -- TODO confirm
    name = "plugin_gallerydl"

    default_auto_field = "django.db.models.BigAutoField"
|
50
archivebox/plugins/gallerydl/extractors.py
Normal file
50
archivebox/plugins/gallerydl/extractors.py
Normal file
|
@ -0,0 +1,50 @@
|
||||||
|
# browsertrix extractor
|
||||||
|
|
||||||
|
def save_browsertrix(link, out_dir, timeout, config):
    """Crawl ``link.url`` with a remote browsertrix-crawler and keep the WACZ.

    A short shell script is piped through ``nc`` to the ``browsertrix`` host on
    port 2222. The script resets the remote collections directory and runs a
    single-page crawl; the resulting ``<crawl_id>.wacz`` is then copied into
    ``out_dir / 'browsertrix'``.

    Args:
        link: object exposing ``url`` and ``timestamp``; the timestamp is used
            as the crawl / collection id.
        out_dir: ``pathlib.Path`` of the snapshot directory.
        timeout: seconds to wait for the remote command.
        config: not used by this function; kept for signature compatibility.

    Returns:
        dict with ``status`` ('succeeded' or 'failed'), ``output`` (path of the
        copied WACZ, or None on failure) and ``cmd_output`` (captured stdout
        plus the error message, if any).

    NOTE(review): ``run``, ``TimedProgress`` and ``copy_and_overwrite`` are
    expected to be provided by the surrounding module; they are not imported
    anywhere in the visible file.
    """
    # Imported locally because this module has no visible import block.
    import shlex
    from pathlib import Path

    browsertrix_dir = out_dir / 'browsertrix'
    browsertrix_dir.mkdir(exist_ok=True)

    crawl_id = str(link.timestamp)

    browsertrix_crawler_cmd = [
        'crawl',
        '--url', link.url,
        f'--collection={crawl_id}',
        '--scopeType=page',
        '--generateWACZ',
        '--text=final-to-warc',
        '--timeLimit=60',
    ]

    # The script is executed by a shell on the remote side, so every argument
    # (the URL in particular) is quoted before being embedded in it.
    remote_cmd = '\n'.join((
        'rm /tmp/dump.rdb;',
        'rm -rf /crawls/collections;',
        'mkdir /crawls/collections;',
        f'env CRAWL_ID={shlex.quote(crawl_id)} {shlex.join(browsertrix_crawler_cmd)}',
        '',
    ))

    local_cmd = ['nc', 'browsertrix', '2222']

    status, cmd_output, output_path = 'succeeded', '', None
    timer = TimedProgress(timeout, prefix=' ')
    try:
        # assumes run() behaves like subprocess.run and captures stdout as
        # bytes (the output is decoded below) -- TODO confirm
        result = run(local_cmd, cwd=str(out_dir), input=remote_cmd.encode(), timeout=timeout)
        cmd_output = (result.stdout or b'').decode()

        wacz_output_file = Path('/browsertrix/crawls') / crawl_id / f'{crawl_id}.wacz'
        output_path = browsertrix_dir / wacz_output_file.name
        copy_and_overwrite(wacz_output_file, output_path)
    except Exception as err:
        # Best-effort extractor: record the failure instead of aborting the caller.
        status, output_path = 'failed', None
        cmd_output = '\n'.join(filter(None, (cmd_output, f'{type(err).__name__}: {err}')))
    finally:
        timer.end()

    return {
        'status': status,
        'output': output_path,
        'cmd_output': cmd_output,
    }
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# Placeholder template: currently just a blank line between two newlines.
TEMPLATE = "\n\n"
|
||||||
|
|
||||||
|
# rm /tmp/dump.rdb;
|
||||||
|
# rm -rf /crawls/collections;
|
||||||
|
# mkdir /crawls/collections;
|
||||||
|
# env CRAWL_ID=tec2342 crawl --url 'https://example.com' --scopeType page --generateWACZ --collection tec2342 --text final-to-warc --timeLimit 60
|
182
archivebox/plugins/gallerydl/models.py
Normal file
182
archivebox/plugins/gallerydl/models.py
Normal file
|
@ -0,0 +1,182 @@
|
||||||
|
import re
from functools import cached_property
from pathlib import Path

from django.db import models, transaction
from solo.models import SingletonModel
|
||||||
|
|
||||||
|
|
||||||
|
class GalleryDLDependency(SingletonModel):
    """Singleton settings row describing the gallery-dl binary.

    NOTE(review): ``bin_path``, ``bin_version``, ``pretty_path`` and ``ANSI``
    used below are module-level helpers that are not imported in the visible
    file; they must be in scope for this class to work.
    """

    GALLERYDL_ENABLED = models.BooleanField(default=True)
    GALLERYDL_BINARY = models.CharField(max_length=255, default='gallery-dl')

    def __str__(self):
        return "GalleryDL Dependency Configuration"

    class Meta:
        verbose_name = "GalleryDL Dependency Configuration"

    def __json__(self):
        """Return the settings and the detected binary state as a plain dict."""
        return {
            'GALLERYDL_ENABLED': self.GALLERYDL_ENABLED,
            'GALLERYDL_BINARY': self.GALLERYDL_BINARY,
            'bin_path': str(self.bin_path) if self.bin_path else None,
            'bin_version': self.bin_version,
            'is_valid': self.is_valid,
            'enabled': self.enabled,
        }

    @cached_property
    def bin_path(self):
        """Resolved path of the configured binary."""
        # Inside a method body the bare name refers to the module-level
        # helper, not to this property.
        return bin_path(self.GALLERYDL_BINARY)

    @cached_property
    def bin_version(self):
        """Version reported for the resolved binary."""
        return bin_version(self.bin_path)

    @cached_property
    def is_valid(self):
        """True when both a binary path and a version could be determined."""
        return bool(self.bin_path and self.bin_version)

    @cached_property
    def enabled(self):
        """True when the dependency is switched on and the binary is usable."""
        return bool(self.GALLERYDL_ENABLED and self.is_valid)

    def pretty_version(self):
        """Return a one-line, ANSI-coloured status summary of the binary.

        The line reads "disabled" when the dependency is switched off, "valid"
        (with the parsed version number) when the binary is usable, and
        "invalid" when it is switched on but no usable binary was found.
        """
        # Branch on the raw setting rather than on ``enabled``: ``enabled`` is
        # already False for a broken binary, which used to hide the "invalid"
        # state behind "disabled".
        if not self.GALLERYDL_ENABLED:
            color, symbol, note, version = 'lightyellow', '-', 'disabled', '-'
        elif self.is_valid:
            color, symbol, note, version = 'green', '√', 'valid', ''
            parsed_version_num = re.search(r'[\d\.]+', str(self.bin_version))
            if parsed_version_num:
                version = f'v{parsed_version_num[0]}'
        else:
            color, symbol, note, version = 'red', 'X', 'invalid', '?'

        # The row is labelled with the configured binary name.
        name = str(self.GALLERYDL_BINARY)
        path = str(pretty_path(self.bin_path) or '')

        return ' '.join((
            ANSI[color],
            symbol,
            ANSI['reset'],
            name.ljust(21),
            version.ljust(14),
            ANSI[color],
            note.ljust(8),
            ANSI['reset'],
            path.ljust(76),
        ))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def _default_gallerydl_timeout():
    """Default for ``GALLERYDL_TIMEOUT``: the global timeout setting."""
    # Django calls callable field defaults with no arguments, and migrations
    # can only reference module-level functions (not lambdas).
    # assumes ``config`` exposes TIMEOUT the same way it exposes
    # CHECK_SSL_VALIDITY / HOSTNAME further down -- TODO confirm
    return config.TIMEOUT


class GalleryDLExtractor(SingletonModel):
    """Singleton settings row plus the logic to run gallery-dl on a URL.

    NOTE(review): ``config``, ``run``, ``TimedProgress``, ``ArchiveError``,
    ``ArchiveResult``, ``get_dir_size`` and ``emit_event`` are expected from
    the surrounding module; none of them is imported in the visible file.
    """

    GALLERYDL_EXTRACTOR_NAME = 'gallerydl'

    SAVE_GALLERYDL = models.BooleanField(default=True)

    # https://github.com/mikf/gallery-dl
    # Extra command-line arguments, stored as a JSON list of strings
    # (django.db.models has no CSVField; ``default=list`` avoids sharing one
    # mutable list between instances).
    GALLERYDL_ARGS = models.JSONField(default=list, blank=True)
    GALLERYDL_TIMEOUT = models.IntegerField(default=_default_gallerydl_timeout)
    GALLERYDL_USER_AGENT = models.CharField(max_length=255, default='{USER_AGENT}')
    GALLERYDL_COOKIES_TXT = models.CharField(max_length=255, default='{COOKIES_TXT}')

    # presumably alternate setting names accepted for SAVE_GALLERYDL -- verify
    # against whatever consumes ALIASES
    ALIASES = {
        'SAVE_GALLERYDL': ('USE_GALLERYDL', 'FETCH_GALLERYDL'),
    }

    @cached_property
    def GALLERYDL_DEPENDENCY(self):
        """The gallery-dl dependency settings row.

        Looked up on first access instead of in the class body, so that
        importing this module does not hit the database.
        """
        return GalleryDLDependency.get_solo()

    @cached_property
    def enabled(self):
        """True when saving is switched on and the binary is usable."""
        return bool(self.SAVE_GALLERYDL and self.GALLERYDL_DEPENDENCY.is_valid)

    def __str__(self):
        return "GalleryDL Extractor Configuration"

    class Meta:
        verbose_name = "GalleryDL Extractor Configuration"

    def __json__(self):
        """Return the extractor settings as a plain dict."""
        return {
            'SAVE_GALLERYDL': self.SAVE_GALLERYDL,
            # assumes the dependency model provides __json__() -- TODO confirm
            'GALLERYDL_DEPENDENCY': self.GALLERYDL_DEPENDENCY.__json__(),
            'GALLERYDL_ARGS': self.GALLERYDL_ARGS,
            'GALLERYDL_TIMEOUT': self.GALLERYDL_TIMEOUT,
            'GALLERYDL_USER_AGENT': self.GALLERYDL_USER_AGENT,
            'GALLERYDL_COOKIES_TXT': self.GALLERYDL_COOKIES_TXT,
        }

    def validate(self):
        """Check the settings before saving.

        Raises:
            AssertionError: if ``GALLERYDL_TIMEOUT`` is below 5 seconds.
        """
        # Raised explicitly so the check survives ``python -O``; a timeout of
        # exactly 5 is accepted, as the message promises.
        if self.GALLERYDL_TIMEOUT < 5:
            raise AssertionError('GALLERYDL_TIMEOUT must be at least 5 seconds')
        # TODO: validate that GALLERYDL_COOKIES_TXT exists
        # TODO: validate user agent with uaparser
        # TODO: validate args, cookies.txt?

    def save(self, *args, **kwargs):
        """Validate, save inside a transaction, and emit a save event."""
        self.validate()
        with transaction.atomic():
            result = super().save(*args, **kwargs)
            emit_event({'type': 'GalleryDLExtractor.save', 'diff': self.__json__(), 'kwargs': kwargs})
            # potential consumers of this event:
            #    - event logger: write to events.log
            #    - config file updater: writes to ArchiveBox.conf
            #    - supervisor: restarts relevant dependencies/extractors
            #    - etc...

        return result

    def create_extractor_directory(self, parent_dir: Path):
        """Create (if needed) and return this extractor's output directory."""
        subdir = parent_dir / self.GALLERYDL_EXTRACTOR_NAME
        subdir.mkdir(exist_ok=True)
        return subdir

    def should_extract(self, parent_dir: Path):
        """Return True when the extractor directory has no entries yet."""
        # glob() returns a generator, which is always truthy; any() actually
        # checks whether it yields something.
        return not any((parent_dir / self.GALLERYDL_EXTRACTOR_NAME).glob('*'))

    def extract(self, url: str, out_dir: Path):
        """Run gallery-dl on ``url`` and describe the outcome.

        Args:
            url: the URL to download.
            out_dir: snapshot directory; output goes into its ``gallerydl``
                subdirectory.

        Returns:
            An ``ArchiveResult``, or None when the extractor is not enabled.
        """
        if not self.enabled:
            return None

        extractor_dir = self.create_extractor_directory(out_dir)
        dependency = self.GALLERYDL_DEPENDENCY

        # Every argv entry must be a string.
        cmd = [
            str(dependency.bin_path),
            url,
            '--http-timeout', str(self.GALLERYDL_TIMEOUT),
        ]
        if self.GALLERYDL_COOKIES_TXT:
            cmd += ['--cookies', str(self.GALLERYDL_COOKIES_TXT)]
        if self.GALLERYDL_USER_AGENT:
            cmd += ['--user-agent', str(self.GALLERYDL_USER_AGENT)]
        if not config.CHECK_SSL_VALIDITY:
            # gallery-dl has no "--verify <bool>" flag; certificate checks are
            # on by default and switched off with this option.
            cmd.append('--no-check-certificate')
        cmd += [str(arg) for arg in (self.GALLERYDL_ARGS or ())]

        status, stdout, stderr, output_path = 'failed', '', '', None
        timer = TimedProgress(self.GALLERYDL_TIMEOUT, prefix=' ')
        try:
            proc = run(
                cmd,
                cwd=str(extractor_dir),
                timeout=self.GALLERYDL_TIMEOUT,
                capture_output=True,
                text=True,
            )
            stdout, stderr = proc.stdout or '', proc.stderr or ''

            if 'ERROR: Unsupported URL' in stderr:
                hints = ('gallery-dl doesnt support this type of url yet',)
                raise ArchiveError('Failed to save gallerydl', hints)

            # NOTE(review): the 'finished' marker and the index.html output
            # path look copied from another extractor -- confirm what
            # gallery-dl actually prints and writes on success.
            if proc.returncode == 0 and 'finished' in stdout:
                output_path = extractor_dir / 'index.html'
                status = 'succeeded'

        except Exception as err:
            # Keep the failure in the result instead of raising; the exception
            # has to be converted to text before it can be appended.
            stderr = '\n'.join(filter(None, (stderr, f'{type(err).__name__}: {err}')))
        finally:
            timer.end()

        num_bytes, num_dirs, num_files = get_dir_size(extractor_dir)

        return ArchiveResult(
            status=status,

            cmd=cmd,
            pwd=str(out_dir),
            cmd_version=dependency.bin_version,
            cmd_path=dependency.bin_path,
            cmd_hostname=config.HOSTNAME,

            output_path=output_path,
            stdout=stdout,
            stderr=stderr,

            num_bytes=num_bytes,
            num_files=num_files,
            num_dirs=num_dirs,
            **timer.stats,
        )
|
124
archivebox/plugins/gallerydl/static/sw.js
Normal file
124
archivebox/plugins/gallerydl/static/sw.js
Normal file
File diff suppressed because one or more lines are too long
1
archivebox/plugins/gallerydl/static/test.txt
Normal file
1
archivebox/plugins/gallerydl/static/test.txt
Normal file
|
@ -0,0 +1 @@
|
||||||
|
test content this should be visible
|
BIN
archivebox/plugins/gallerydl/static/test.wacz
Normal file
BIN
archivebox/plugins/gallerydl/static/test.wacz
Normal file
Binary file not shown.
3392
archivebox/plugins/gallerydl/static/ui.js
Normal file
3392
archivebox/plugins/gallerydl/static/ui.js
Normal file
File diff suppressed because one or more lines are too long
|
@ -0,0 +1,40 @@
|
||||||
|
{% load tz core_tags static %}
{# Page that embeds the <replay-web-page> component to replay a WACZ archive. #}
{# NOTE(review): source and url below are hard-coded to the bundled test archive. #}

<!DOCTYPE html>
<html lang="en">
    <head>
        <title>{{title}}</title>
        <meta charset="utf-8">
        <meta name="viewport" content="width=device-width, initial-scale=1">

        <style>
            html, body {
                width: 100%;
                height: 100%;
                background-color: #ddd;
            }
        </style>
    </head>
    <body>
        ReplayWeb.page for: {{snapshot.url}} ({{timestamp}}) /{{warc_filename}}

        {{snapshot}}

        <script>
            // https://cdn.jsdelivr.net/npm/replaywebpage@1.8.14/sw.min.js
            // https://cdn.jsdelivr.net/npm/replaywebpage@1.8.14/ui.min.js
        </script>

        <style>
        </style>
        <script src="/static/ui.js"></script>

        <replay-web-page
            style="height: 600px"
            embed="replay"
            replayBase="/static/"
            source="/static/test.wacz"
            url="https://example.com/">
        </replay-web-page>
    </body>
</html>
|
12
archivebox/plugins/gallerydl/urls.py
Normal file
12
archivebox/plugins/gallerydl/urls.py
Normal file
|
@ -0,0 +1,12 @@
|
||||||
|
from django.urls import path
|
||||||
|
|
||||||
|
from .views import GalleryDLIconView, GalleryDLEmbedView, GalleryDLOutputView, GalleryDLDependencyView, GalleryDLExtractorView
|
||||||
|
|
||||||
|
# Routes exposed by the gallery-dl plugin. Routes do not start with "/":
# Django matches them relative to the point where this URLconf is included.
# NOTE(review): GalleryDLDependencyView and GalleryDLExtractorView are imported
# from .views, but no classes with those names are visible in this plugin's
# views module -- confirm they exist before wiring this URLconf in.
urlpatterns = [
    path('plugins/gallerydl/icon/<path:path>', GalleryDLIconView.as_view(), name='gallerydl_icon'),
    path('plugins/gallerydl/embed/<path:path>', GalleryDLEmbedView.as_view(), name='gallerydl_embed'),
    path('plugins/gallerydl/output/<path:path>', GalleryDLOutputView.as_view(), name='gallerydl_output'),

    path('plugins/gallerydl/dependency/', GalleryDLDependencyView.as_view(), name='gallerydl_dependency'),
    path('plugins/gallerydl/extractor/', GalleryDLExtractorView.as_view(), name='gallerydl_extractor'),
]
|
78
archivebox/plugins/gallerydl/views.py
Normal file
78
archivebox/plugins/gallerydl/views.py
Normal file
|
@ -0,0 +1,78 @@
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from django.views import View
|
||||||
|
from django.shortcuts import render
|
||||||
|
from django.db.models import Q
|
||||||
|
|
||||||
|
from core.models import Snapshot
|
||||||
|
|
||||||
|
# from archivebox.config import PUBLIC_SNAPSHOTS
|
||||||
|
PUBLIC_SNAPSHOTS = True
|
||||||
|
|
||||||
|
|
||||||
|
class _GalleryDLTemplateView(View):
    """Shared behaviour of the gallery-dl plugin pages.

    Subclasses only set ``template_name``. A GET request renders that template
    with the dict returned by ``get_context_data``; anonymous users are sent
    to the admin login page when ``PUBLIC_SNAPSHOTS`` is off.
    """

    template_name = None

    def get_context_data(self, **kwargs):
        """Return the template context (currently empty).

        TODO: add VERSION, COMMIT_HASH and FOOTER_INFO once they are available
        to plugins.
        """
        return {}

    def get(self, request, path):
        """Render ``template_name`` for the requested ``path``."""
        if not request.user.is_authenticated and not PUBLIC_SNAPSHOTS:
            # Imported here because the module-level imports only bring in
            # ``render`` from django.shortcuts.
            from django.shortcuts import redirect
            return redirect(f'/admin/login/?next={request.path}')

        # ...
        context = self.get_context_data(path=path)
        return render(template_name=self.template_name, request=request, context=context)


class GalleryDLIconView(_GalleryDLTemplateView):
    """Page rendered from the plugin's icon template."""

    template_name = 'plugin_gallerydl__icon.html'


class GalleryDLEmbedView(_GalleryDLTemplateView):
    """Page rendered from the plugin's embed template."""

    template_name = 'plugin_gallerydl__embed.html'


class GalleryDLOutputView(_GalleryDLTemplateView):
    """Page rendered from the plugin's output template."""

    template_name = 'plugin_gallerydl__output.html'
|
Loading…
Add table
Add a link
Reference in a new issue