wip new plugins system with browsertrix

2025-05-14 23:24:30 -04:00 · 2024-01-17 05:14:43 -08:00 · 2024-01-17 05:14:43 -08:00 · 1d4ec6f825
commit 1d4ec6f825
parent 16adff46dd
15 changed files with 3714 additions and 3 deletions
--- a/archivebox/plugins/replaywebpage/__init__.py
+++ b/archivebox/plugins/replaywebpage/__init__.py
@ -0,0 +1 @@
+# Pin this module's package name explicitly to the plugin's dotted path.
+__package__ = 'archivebox.plugins.replaywebpage'
--- a/archivebox/plugins/replaywebpage/apps.py
+++ b/archivebox/plugins/replaywebpage/apps.py
@ -0,0 +1,8 @@
+from django.apps import AppConfig
+
+
+class ReplayWebPageConfig(AppConfig):
+    label = "ReplayWeb.Page"
+    name = "plugin_replaywebpage"
+    
+    default_auto_field = "django.db.models.BigAutoField"
--- a/archivebox/plugins/replaywebpage/extractors.py
+++ b/archivebox/plugins/replaywebpage/extractors.py
@ -0,0 +1,50 @@
+# browsertrix extractor
+
+def save_browsertrix(link, out_dir, timeout, config):
+
+
+	browsertrix_dir = out_dir / 'browsertrix'
+	browsertrix_dir.mkdir(exist_ok=True)
+
+	crawl_id = link.timestamp
+
+	browsertrix_crawler_cmd = [
+		'crawl',
+		f'--url', link.url,
+		f'--collection={crawl_id}',
+		'--scopeType=page',
+		'--generateWACZ',
+		'--text=final-to-warc',
+		'--timeLimit=60',
+	]
+
+	remote_cmd = """
+	rm /tmp/dump.rdb;
+	rm -rf /crawls/collections;
+	mkdir /crawls/collections;
+	env CRAWL_ID={crawl_id} 
+	"""
+
+	local_cmd = ['nc', 'browsertrix', '2222']
+
+	status = 'succeeded'
+    timer = TimedProgress(timeout, prefix='      ')
+    try:
+        result = run(local_cmd, cwd=str(out_dir), input=remote_cmd, timeout=timeout)
+		
+		cmd_output = result.stdout.decode()
+
+		wacz_output_file = Path('/browsertrix/crawls') / crawl_id / f'{crawl_id}'.wacz
+
+		copy_and_overwrite(wacz_output_file, browsertrix_dir / wacz_output_file.name)
+
+
+
+# Placeholder template string (currently only blank lines); nothing in the
+# visible code references it yet.
+TEMPLATE = """
+
+"""
+
+# rm /tmp/dump.rdb;
+# rm -rf /crawls/collections;
+# mkdir /crawls/collections;
+# env CRAWL_ID=tec2342 crawl --url 'https://example.com' --scopeType page --generateWACZ --collection tec2342 --text final-to-warc --timeLimit 60
--- a/archivebox/plugins/replaywebpage/static/sw.js
+++ b/archivebox/plugins/replaywebpage/static/sw.js
--- a/archivebox/plugins/replaywebpage/static/test.txt
+++ b/archivebox/plugins/replaywebpage/static/test.txt
@ -0,0 +1 @@
+test content this should be visible
--- a/archivebox/plugins/replaywebpage/static/test.wacz
+++ b/archivebox/plugins/replaywebpage/static/test.wacz
--- a/archivebox/plugins/replaywebpage/static/ui.js
+++ b/archivebox/plugins/replaywebpage/static/ui.js
--- a/archivebox/plugins/replaywebpage/templates/plugin_replaywebpage__viewer.html
+++ b/archivebox/plugins/replaywebpage/templates/plugin_replaywebpage__viewer.html
@ -0,0 +1,39 @@
+{% load tz core_tags static %}
+{# Viewer page: embeds the <replay-web-page> component for one snapshot. #}
+
+<!DOCTYPE html>
+<html lang="en">
+    <head>
+        <meta charset="utf-8">
+        <meta name="viewport" content="width=device-width, initial-scale=1">
+        <title>{{title}}</title>
+        <style>
+            html, body {
+                width: 100%;
+                height: 100%;
+                background-color: #ddd;
+            }
+        </style>
+    </head>
+    <body>
+        ReplayWeb.page for: {{snapshot.url}} ({{timestamp}}) /{{warc_filename}}
+
+        {{snapshot}}
+
+        <script>
+            // https://cdn.jsdelivr.net/npm/replaywebpage@1.8.14/sw.min.js
+            // https://cdn.jsdelivr.net/npm/replaywebpage@1.8.14/ui.min.js
+        </script>
+
+        <style>
+            replay-web-page {
+                width: 100%;
+                height: 900px;
+            }
+        </style>
+        <script src="{% static 'ui.js' %}"></script>
+
+        {# TODO: source/url are still hard-coded to the bundled test archive; the inline height overrides the 900px rule above. #}
+        <replay-web-page style="height: 600px" embed="replay-with-info" replayBase="/static/" source="{% static 'test.wacz' %}" url="https://example.com/"></replay-web-page>
+    </body>
+</html>
--- a/archivebox/plugins/replaywebpage/urls.py
+++ b/archivebox/plugins/replaywebpage/urls.py
@ -0,0 +1,7 @@
+from django.urls import path
+
+from .views import ReplayWebPageViewer
+
+urlpatterns = [
+	path('<path:path>', ReplayWebPageViewer.as_view(), name='plugin_replaywebpage__viewer'),
+]
--- a/archivebox/plugins/replaywebpage/views.py
+++ b/archivebox/plugins/replaywebpage/views.py
@ -0,0 +1,47 @@
+import os
+import sys
+from pathlib import Path
+
+from django.db.models import Q
+from django.http import Http404
+from django.shortcuts import redirect, render
+from django.views import View
+
+from core.models import Snapshot
+
+# from archivebox.config import PUBLIC_SNAPSHOTS
+# The config import above is disabled; the flag is hard-coded to True here,
+# so the login redirect in the viewer is never taken.
+PUBLIC_SNAPSHOTS = True
+
+
+class ReplayWebPageViewer(View):
+    template_name = 'plugin_replaywebpage__viewer.html'
+
+    # render static html index from filesystem archive/<timestamp>/index.html
+
+    def get_context_data(self, **kwargs):
+        return {
+            # **super().get_context_data(**kwargs),
+            # 'VERSION': VERSION,
+            # 'COMMIT_HASH': COMMIT_HASH,
+            # 'FOOTER_INFO': FOOTER_INFO,
+        }
+
+
+    def get(self, request, path):
+        if not request.user.is_authenticated and not PUBLIC_SNAPSHOTS:
+            return redirect(f'/admin/login/?next={request.path}')
+
+        try:
+            timestamp, warc_filename = path.split('/', 1)
+        except (IndexError, ValueError):
+            timestamp, warc_filename = path.split('/', 1)[0], ''
+
+        snapshot = Snapshot.objects.get(Q(timestamp=timestamp) | Q(id__startswith=timestamp))
+
+        context = self.get_context_data()
+        context.update({
+            "snapshot": snapshot,
+            "timestamp": timestamp,
+            "warc_filename": warc_filename,
+        })
+        return render(template_name=self.template_name, request=self.request, context=context)
+
				`@ -0,0 +1 @@`
				`__package__ = 'archivebox.plugins.replaywebpage'`