mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-14 23:24:30 -04:00
wip new plugins system with browsertrix
This commit is contained in:
parent
16adff46dd
commit
1d4ec6f825
15 changed files with 3714 additions and 3 deletions
1
archivebox/plugins/replaywebpage/__init__.py
Normal file
1
archivebox/plugins/replaywebpage/__init__.py
Normal file
|
@ -0,0 +1 @@
|
|||
# Pin the package name that relative imports inside this plugin resolve against.
__package__ = 'archivebox.plugins.replaywebpage'
|
8
archivebox/plugins/replaywebpage/apps.py
Normal file
8
archivebox/plugins/replaywebpage/apps.py
Normal file
|
@ -0,0 +1,8 @@
|
|||
from django.apps import AppConfig
|
||||
|
||||
|
||||
class ReplayWebPageConfig(AppConfig):
    """Django application config for the ReplayWeb.page viewer plugin."""

    # Import path Django uses to locate the application (unchanged).
    name = "plugin_replaywebpage"

    # App labels must be valid Python identifiers; a label containing a dot
    # makes Django raise ImproperlyConfigured while loading the app, so the
    # human-readable product name is exposed through ``verbose_name`` instead.
    label = "plugin_replaywebpage"
    verbose_name = "ReplayWeb.Page"

    default_auto_field = "django.db.models.BigAutoField"
|
50
archivebox/plugins/replaywebpage/extractors.py
Normal file
50
archivebox/plugins/replaywebpage/extractors.py
Normal file
|
@ -0,0 +1,50 @@
|
|||
# browsertrix extractor
|
||||
|
||||
def save_browsertrix(link, out_dir, timeout, config):
    """Archive ``link`` as a WACZ file via a remote browsertrix crawler.

    A short shell script is piped through ``nc`` to host ``browsertrix`` on
    port 2222. The script removes the previous crawl state and then runs a
    single-page crawl whose collection is named after the link timestamp.
    The resulting ``<crawl_id>.wacz`` is then copied into
    ``out_dir / 'browsertrix'``.

    Args:
        link: object exposing ``url`` and ``timestamp``; the timestamp is
            used as the crawl id and collection name.
        out_dir: ``pathlib.Path``-like output directory of the snapshot.
        timeout: time budget handed to both the progress timer and ``run``.
        config: currently unused.

    Raises:
        Any error raised by ``run`` or ``copy_and_overwrite`` propagates to
        the caller; nothing is swallowed here.

    NOTE(review): ``TimedProgress``, ``run`` and ``copy_and_overwrite`` must
    be importable at module level; this module does not import them yet.
    TODO: report status/output to the caller once the plugin result format
    is settled (nothing is returned for now).
    """
    # Function-scope stdlib imports: the module has no import section yet.
    from pathlib import Path
    from shlex import quote

    browsertrix_dir = out_dir / 'browsertrix'
    browsertrix_dir.mkdir(exist_ok=True)

    crawl_id = str(link.timestamp)

    browsertrix_crawler_cmd = [
        'crawl',
        '--url', link.url,
        f'--collection={crawl_id}',
        '--scopeType=page',
        '--generateWACZ',
        '--text=final-to-warc',
        '--timeLimit=60',
    ]

    # link.url is untrusted input, so every argument is shell-quoted before
    # being embedded in the script that the remote side executes.
    crawl_cmd = ' '.join(quote(str(arg)) for arg in browsertrix_crawler_cmd)
    remote_cmd = '\n'.join((
        '',
        'rm /tmp/dump.rdb;',
        'rm -rf /crawls/collections;',
        'mkdir /crawls/collections;',
        f'env CRAWL_ID={quote(crawl_id)} {crawl_cmd}',
        '',
    ))

    local_cmd = ['nc', 'browsertrix', '2222']

    timer = TimedProgress(timeout, prefix=' ')
    try:
        # NOTE(review): assumes `run` follows subprocess.run semantics in
        # binary mode, where `input` has to be bytes -- confirm.
        run(local_cmd, cwd=str(out_dir), input=remote_cmd.encode(), timeout=timeout)

        wacz_output_file = Path('/browsertrix/crawls') / crawl_id / f'{crawl_id}.wacz'
        copy_and_overwrite(wacz_output_file, browsertrix_dir / wacz_output_file.name)
    finally:
        # NOTE(review): assumes TimedProgress exposes end() to stop the
        # progress display -- confirm.
        timer.end()
|
||||
|
||||
|
||||
|
||||
# Template placeholder.
# NOTE(review): not referenced by the code visible here; confirm intended use.
TEMPLATE = """
|
||||
|
||||
"""
|
||||
|
||||
# rm /tmp/dump.rdb;
|
||||
# rm -rf /crawls/collections;
|
||||
# mkdir /crawls/collections;
|
||||
# env CRAWL_ID=tec2342 crawl --url 'https://example.com' --scopeType page --generateWACZ --collection tec2342 --text final-to-warc --timeLimit 60
|
124
archivebox/plugins/replaywebpage/static/sw.js
Normal file
124
archivebox/plugins/replaywebpage/static/sw.js
Normal file
File diff suppressed because one or more lines are too long
1
archivebox/plugins/replaywebpage/static/test.txt
Normal file
1
archivebox/plugins/replaywebpage/static/test.txt
Normal file
|
@ -0,0 +1 @@
|
|||
test content this should be visible
|
BIN
archivebox/plugins/replaywebpage/static/test.wacz
Normal file
BIN
archivebox/plugins/replaywebpage/static/test.wacz
Normal file
Binary file not shown.
3392
archivebox/plugins/replaywebpage/static/ui.js
Normal file
3392
archivebox/plugins/replaywebpage/static/ui.js
Normal file
File diff suppressed because one or more lines are too long
|
@ -0,0 +1,39 @@
|
|||
{% load tz core_tags static %}
{# Viewer page embedding the <replay-web-page> component for one snapshot. #}

<!DOCTYPE html>
<html lang="en">
    <head>
        <title>{{title}}</title>
        {# charset and viewport are separate declarations; one <meta> may not carry both. #}
        <meta charset="utf-8">
        <meta name="viewport" content="width=device-width, initial-scale=1">

        <style>
            html, body {
                width: 100%;
                height: 100%;
                background-color: #ddd;
            }

        </style>
    </head>
    <body>
        ReplayWeb.page for: {{snapshot.url}} ({{timestamp}}) /{{warc_filename}}

        {{snapshot}}

        <script>
            // https://cdn.jsdelivr.net/npm/replaywebpage@1.8.14/sw.min.js
            // https://cdn.jsdelivr.net/npm/replaywebpage@1.8.14/ui.min.js
        </script>

        <style>
            replay-web-page {
                width: 100%;
                height: 900px;
            }
        </style>
        <script src="/static/ui.js"></script>

        <replay-web-page style="height: 600px" embed="replay-with-info" replayBase="/static/" source="{% static 'test.wacz' %}" url="https://example.com/"></replay-web-page>
    </body>
</html>
|
7
archivebox/plugins/replaywebpage/urls.py
Normal file
7
archivebox/plugins/replaywebpage/urls.py
Normal file
|
@ -0,0 +1,7 @@
|
|||
from django.urls import path
|
||||
|
||||
from .views import ReplayWebPageViewer
|
||||
|
||||
# Every sub-path under this plugin's URL prefix is handed to the viewer,
# which receives the remainder of the URL as its ``path`` argument.
urlpatterns = [
    path(
        '<path:path>',
        ReplayWebPageViewer.as_view(),
        name='plugin_replaywebpage__viewer',
    ),
]
|
47
archivebox/plugins/replaywebpage/views.py
Normal file
47
archivebox/plugins/replaywebpage/views.py
Normal file
|
@ -0,0 +1,47 @@
|
|||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
from django.views import View
|
||||
from django.shortcuts import render, redirect, get_object_or_404
|
||||
from django.db.models import Q
|
||||
|
||||
from core.models import Snapshot
|
||||
|
||||
# from archivebox.config import PUBLIC_SNAPSHOTS
|
||||
# NOTE(review): hard-coded to True, so the login check in
# ReplayWebPageViewer.get never redirects anonymous users. The commented-out
# import above suggests the value should come from archivebox.config -- confirm.
PUBLIC_SNAPSHOTS = True
|
||||
|
||||
|
||||
class ReplayWebPageViewer(View):
    """Render the ReplayWeb.page viewer template for a single snapshot.

    The captured URL ``path`` has the form
    ``<timestamp-or-id-prefix>[/<warc_filename>]``.
    """

    template_name = 'plugin_replaywebpage__viewer.html'

    def get_context_data(self, **kwargs):
        """Return the base template context, which is currently empty."""
        return {
            # **super().get_context_data(**kwargs),
            # 'VERSION': VERSION,
            # 'COMMIT_HASH': COMMIT_HASH,
            # 'FOOTER_INFO': FOOTER_INFO,
        }

    def get(self, request, path):
        """Look up the snapshot addressed by ``path`` and render the viewer.

        Args:
            request: the incoming HTTP request.
            path: the URL remainder captured by the ``<path:path>`` route.

        Returns:
            A redirect to the admin login when the user is anonymous and
            ``PUBLIC_SNAPSHOTS`` is false, otherwise the rendered template.

        Raises:
            Http404: when no snapshot matches the timestamp / id prefix.
        """
        if not request.user.is_authenticated and not PUBLIC_SNAPSHOTS:
            return redirect(f'/admin/login/?next={request.path}')

        # partition() never raises: a path without '/' simply yields an
        # empty warc_filename.
        timestamp, _, warc_filename = path.partition('/')

        # An unknown snapshot becomes a 404 instead of an unhandled
        # DoesNotExist (HTTP 500).
        snapshot = get_object_or_404(
            Snapshot,
            Q(timestamp=timestamp) | Q(id__startswith=timestamp),
        )

        context = self.get_context_data()
        context.update({
            "snapshot": snapshot,
            "timestamp": timestamp,
            "warc_filename": warc_filename,
        })
        return render(template_name=self.template_name, request=self.request, context=context)
|
||||
|
Loading…
Add table
Add a link
Reference in a new issue