mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-14 07:04:27 -04:00
wip new plugins system with browsertrix
This commit is contained in:
parent
16adff46dd
commit
1d4ec6f825
15 changed files with 3714 additions and 3 deletions
|
@ -62,6 +62,12 @@ INSTALLED_APPS = [
|
||||||
|
|
||||||
'core',
|
'core',
|
||||||
|
|
||||||
|
# Plugins
|
||||||
|
'plugins.replaywebpage',
|
||||||
|
# ...
|
||||||
|
# someday we may have enough plugins to justify dynamic loading:
|
||||||
|
# *(f'plugins.{path.parent.name}' for path in (Path(PACKAGE_DIR) / 'plugins').glob('*/apps.py')),
|
||||||
|
|
||||||
'django_extensions',
|
'django_extensions',
|
||||||
]
|
]
|
||||||
|
|
||||||
|
@ -162,7 +168,7 @@ if DEBUG_TOOLBAR:
|
||||||
'debug_toolbar.panels.request.RequestPanel',
|
'debug_toolbar.panels.request.RequestPanel',
|
||||||
'debug_toolbar.panels.sql.SQLPanel',
|
'debug_toolbar.panels.sql.SQLPanel',
|
||||||
'debug_toolbar.panels.staticfiles.StaticFilesPanel',
|
'debug_toolbar.panels.staticfiles.StaticFilesPanel',
|
||||||
# 'debug_toolbar.panels.templates.TemplatesPanel',
|
# 'debug_toolbar.panels.templates.TemplatesPanel', # buggy/slow
|
||||||
'debug_toolbar.panels.cache.CachePanel',
|
'debug_toolbar.panels.cache.CachePanel',
|
||||||
'debug_toolbar.panels.signals.SignalsPanel',
|
'debug_toolbar.panels.signals.SignalsPanel',
|
||||||
'debug_toolbar.panels.logging.LoggingPanel',
|
'debug_toolbar.panels.logging.LoggingPanel',
|
||||||
|
@ -178,16 +184,35 @@ if DEBUG_TOOLBAR:
|
||||||
|
|
||||||
STATIC_URL = '/static/'
|
STATIC_URL = '/static/'
|
||||||
|
|
||||||
|
STATIC_ROOT = Path(PACKAGE_DIR) / 'collected_static'
|
||||||
|
|
||||||
STATICFILES_DIRS = [
|
STATICFILES_DIRS = [
|
||||||
*([str(CUSTOM_TEMPLATES_DIR / 'static')] if CUSTOM_TEMPLATES_DIR else []),
|
*([str(CUSTOM_TEMPLATES_DIR / 'static')] if CUSTOM_TEMPLATES_DIR else []),
|
||||||
str(Path(PACKAGE_DIR) / TEMPLATES_DIR_NAME / 'static'),
|
str(Path(PACKAGE_DIR) / TEMPLATES_DIR_NAME / 'static'),
|
||||||
|
|
||||||
|
# Plugins
|
||||||
|
str(Path(PACKAGE_DIR) / 'plugins/replaywebpage/static'),
|
||||||
|
# ...
|
||||||
|
# someday if there are many more plugins / user-addable plugins:
|
||||||
|
# *(str(path) for path in (Path(PACKAGE_DIR) / 'plugins').glob('*/static')),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
MEDIA_URL = '/archive/'
|
||||||
|
MEDIA_ROOT = OUTPUT_DIR / 'archive'
|
||||||
|
|
||||||
|
|
||||||
TEMPLATE_DIRS = [
|
TEMPLATE_DIRS = [
|
||||||
*([str(CUSTOM_TEMPLATES_DIR)] if CUSTOM_TEMPLATES_DIR else []),
|
*([str(CUSTOM_TEMPLATES_DIR)] if CUSTOM_TEMPLATES_DIR else []),
|
||||||
str(Path(PACKAGE_DIR) / TEMPLATES_DIR_NAME / 'core'),
|
str(Path(PACKAGE_DIR) / TEMPLATES_DIR_NAME / 'core'),
|
||||||
str(Path(PACKAGE_DIR) / TEMPLATES_DIR_NAME / 'admin'),
|
str(Path(PACKAGE_DIR) / TEMPLATES_DIR_NAME / 'admin'),
|
||||||
str(Path(PACKAGE_DIR) / TEMPLATES_DIR_NAME),
|
str(Path(PACKAGE_DIR) / TEMPLATES_DIR_NAME),
|
||||||
|
|
||||||
|
# Plugins
|
||||||
|
str(Path(PACKAGE_DIR) / 'plugins/replaywebpage/templates')
|
||||||
|
# ...
|
||||||
|
#
|
||||||
|
# someday if there are many more plugins / user-addable plugins:
|
||||||
|
# *(str(path) for path in (Path(PACKAGE_DIR) / 'plugins').glob('*/templates')),
|
||||||
]
|
]
|
||||||
|
|
||||||
TEMPLATES = [
|
TEMPLATES = [
|
||||||
|
|
|
@ -8,6 +8,7 @@ from django.views.generic.base import RedirectView
|
||||||
|
|
||||||
from core.views import HomepageView, SnapshotView, PublicIndexView, AddView, HealthCheckView
|
from core.views import HomepageView, SnapshotView, PublicIndexView, AddView, HealthCheckView
|
||||||
|
|
||||||
|
|
||||||
# GLOBAL_CONTEXT doesn't work as-is, disabled for now: https://github.com/ArchiveBox/ArchiveBox/discussions/1306
|
# GLOBAL_CONTEXT doesn't work as-is, disabled for now: https://github.com/ArchiveBox/ArchiveBox/discussions/1306
|
||||||
# from config import VERSION, VERSIONS_AVAILABLE, CAN_UPGRADE
|
# from config import VERSION, VERSIONS_AVAILABLE, CAN_UPGRADE
|
||||||
# GLOBAL_CONTEXT = {'VERSION': VERSION, 'VERSIONS_AVAILABLE': VERSIONS_AVAILABLE, 'CAN_UPGRADE': CAN_UPGRADE}
|
# GLOBAL_CONTEXT = {'VERSION': VERSION, 'VERSIONS_AVAILABLE': VERSIONS_AVAILABLE, 'CAN_UPGRADE': CAN_UPGRADE}
|
||||||
|
@ -26,6 +27,9 @@ urlpatterns = [
|
||||||
path('archive/', RedirectView.as_view(url='/')),
|
path('archive/', RedirectView.as_view(url='/')),
|
||||||
path('archive/<path:path>', SnapshotView.as_view(), name='Snapshot'),
|
path('archive/<path:path>', SnapshotView.as_view(), name='Snapshot'),
|
||||||
|
|
||||||
|
path('plugins/replaywebpage/', include('plugins.replaywebpage.urls')),
|
||||||
|
# ... dynamic load these someday if there are more of them
|
||||||
|
|
||||||
path('admin/core/snapshot/add/', RedirectView.as_view(url='/add/')),
|
path('admin/core/snapshot/add/', RedirectView.as_view(url='/add/')),
|
||||||
path('add/', AddView.as_view(), name='add'),
|
path('add/', AddView.as_view(), name='add'),
|
||||||
|
|
||||||
|
|
|
@ -7,7 +7,7 @@ if __name__ == '__main__':
|
||||||
# versions of ./manage.py commands whenever possible. When that's not possible
|
# versions of ./manage.py commands whenever possible. When that's not possible
|
||||||
# (e.g. makemigrations), you can comment out this check temporarily
|
# (e.g. makemigrations), you can comment out this check temporarily
|
||||||
|
|
||||||
if not ('makemigrations' in sys.argv or 'migrate' in sys.argv):
|
if not ('makemigrations' in sys.argv or 'migrate' in sys.argv or 'collectstatic' in sys.argv):
|
||||||
print("[X] Don't run ./manage.py directly (unless you are a developer running makemigrations):")
|
print("[X] Don't run ./manage.py directly (unless you are a developer running makemigrations):")
|
||||||
print()
|
print()
|
||||||
print(' Hint: Use these archivebox CLI commands instead of the ./manage.py equivalents:')
|
print(' Hint: Use these archivebox CLI commands instead of the ./manage.py equivalents:')
|
||||||
|
|
3
archivebox/plugins/__init__.py
Normal file
3
archivebox/plugins/__init__.py
Normal file
|
@ -0,0 +1,3 @@
|
||||||
|
__package__ = 'archivebox.plugins'
|
||||||
|
|
||||||
|
|
1
archivebox/plugins/replaywebpage/__init__.py
Normal file
1
archivebox/plugins/replaywebpage/__init__.py
Normal file
|
@ -0,0 +1 @@
|
||||||
|
__package__ = 'archivebox.plugins.replaywebpage'
|
8
archivebox/plugins/replaywebpage/apps.py
Normal file
8
archivebox/plugins/replaywebpage/apps.py
Normal file
|
@ -0,0 +1,8 @@
|
||||||
|
from django.apps import AppConfig
|
||||||
|
|
||||||
|
|
||||||
|
class ReplayWebPageConfig(AppConfig):
    """Django app configuration for the ReplayWeb.Page viewer plugin."""

    # Full importable dotted path of the app package. Django imports this
    # string, so it has to match the 'plugins.replaywebpage' module path that
    # INSTALLED_APPS and the URL include() refer to.
    name = "plugins.replaywebpage"

    # Short unique app label. Django requires a valid Python identifier here,
    # so the dotted display name cannot be used as the label.
    label = "plugin_replaywebpage"

    # Human-readable name; free-form text is allowed here.
    verbose_name = "ReplayWeb.Page"

    default_auto_field = "django.db.models.BigAutoField"
|
50
archivebox/plugins/replaywebpage/extractors.py
Normal file
50
archivebox/plugins/replaywebpage/extractors.py
Normal file
|
@ -0,0 +1,50 @@
|
||||||
|
# browsertrix extractor
|
||||||
|
|
||||||
|
def save_browsertrix(link, out_dir, timeout, config):
    """Crawl ``link.url`` with browsertrix-crawler and copy the WACZ into the snapshot dir.

    The crawl does not run in this process: a short shell script is piped
    through ``nc`` to the ``browsertrix`` host on port 2222. The script resets
    the collections folder and then invokes the crawler for this one page.

    NOTE(review): ``run``, ``TimedProgress`` and ``copy_and_overwrite`` are not
    defined in this block; they are expected to be provided at module level.
    ``run`` is assumed to behave like ``subprocess.run`` with captured output,
    and ``TimedProgress`` is assumed to expose ``end()`` -- confirm both.

    Args:
        link: object exposing ``timestamp`` (used as crawl/collection id) and ``url``.
        out_dir: snapshot output directory; a ``browsertrix`` subfolder is created in it.
        timeout: seconds, passed to both the progress timer and the command runner.
        config: currently unused.

    Returns:
        Path of the WACZ file copied into ``out_dir / 'browsertrix'``.

    Raises:
        FileNotFoundError: if the expected WACZ file is missing after the crawl.
    """
    import shlex
    from pathlib import Path

    browsertrix_dir = Path(out_dir) / 'browsertrix'
    browsertrix_dir.mkdir(exist_ok=True)

    # str() so the id can be used both as a path component and in the script
    crawl_id = str(link.timestamp)

    browsertrix_crawler_cmd = [
        'crawl',
        '--url', link.url,
        f'--collection={crawl_id}',
        '--scopeType=page',
        '--generateWACZ',
        '--text=final-to-warc',
        '--timeLimit=60',
    ]

    # Script executed on the browsertrix side. The crawler argv is shell-quoted
    # because link.url is arbitrary user-supplied text.
    remote_cmd = '\n'.join((
        'rm /tmp/dump.rdb;',
        'rm -rf /crawls/collections;',
        'mkdir /crawls/collections;',
        f'env CRAWL_ID={shlex.quote(crawl_id)} {shlex.join(browsertrix_crawler_cmd)}',
        '',
    ))

    local_cmd = ['nc', 'browsertrix', '2222']

    timer = TimedProgress(timeout, prefix=' ')
    try:
        # stdout is decoded below, i.e. the runner works in bytes mode, so the
        # script has to be passed as bytes as well.
        result = run(local_cmd, cwd=str(out_dir), input=remote_cmd.encode(), timeout=timeout)

        cmd_output = (result.stdout or b'').decode()

        # NOTE(review): the script above writes under /crawls/collections, and
        # docker-compose mounts the same host folder at /browsertrix here, so
        # this may need to be /browsertrix/collections/... -- confirm.
        wacz_output_file = Path('/browsertrix/crawls') / crawl_id / f'{crawl_id}.wacz'

        if not wacz_output_file.exists():
            raise FileNotFoundError(
                f'browsertrix did not produce {wacz_output_file}:\n{cmd_output}'
            )

        wacz_copy = browsertrix_dir / wacz_output_file.name
        copy_and_overwrite(wacz_output_file, wacz_copy)
    finally:
        # always stop the progress timer, even when the crawl fails
        timer.end()

    return wacz_copy
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
TEMPLATE = """
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
# rm /tmp/dump.rdb;
|
||||||
|
# rm -rf /crawls/collections;
|
||||||
|
# mkdir /crawls/collections;
|
||||||
|
# env CRAWL_ID=tec2342 crawl --url 'https://example.com' --scopeType page --generateWACZ --collection tec2342 --text final-to-warc --timeLimit 60
|
124
archivebox/plugins/replaywebpage/static/sw.js
Normal file
124
archivebox/plugins/replaywebpage/static/sw.js
Normal file
File diff suppressed because one or more lines are too long
1
archivebox/plugins/replaywebpage/static/test.txt
Normal file
1
archivebox/plugins/replaywebpage/static/test.txt
Normal file
|
@ -0,0 +1 @@
|
||||||
|
test content this should be visible
|
BIN
archivebox/plugins/replaywebpage/static/test.wacz
Normal file
BIN
archivebox/plugins/replaywebpage/static/test.wacz
Normal file
Binary file not shown.
3392
archivebox/plugins/replaywebpage/static/ui.js
Normal file
3392
archivebox/plugins/replaywebpage/static/ui.js
Normal file
File diff suppressed because one or more lines are too long
|
@ -0,0 +1,39 @@
|
||||||
|
{% load tz core_tags static %}
{# ReplayWeb.Page viewer. Context used below: title, snapshot, timestamp, warc_filename. #}

<!DOCTYPE html>
<html lang="en">
    <head>
        <title>{{title}}</title>
        <meta charset="utf-8">
        <meta name="viewport" content="width=device-width, initial-scale=1">

        <style>
            html, body {
                width: 100%;
                height: 100%;
                background-color: #ddd;
            }
        </style>
    </head>
    <body>
        ReplayWeb.page for: {{snapshot.url}} ({{timestamp}}) /{{warc_filename}}

        {{snapshot}}

        <script>
            // https://cdn.jsdelivr.net/npm/replaywebpage@1.8.14/sw.min.js
            // https://cdn.jsdelivr.net/npm/replaywebpage@1.8.14/ui.min.js
        </script>

        <style>
            replay-web-page {
                width: 100%;
                height: 900px;
            }
        </style>
        <script src="{% static 'ui.js' %}"></script>

        <replay-web-page style="height: 600px" embed="replay-with-info" replayBase="/static/" source="{% static 'test.wacz' %}" url="https://example.com/"></replay-web-page>
    </body>
</html>
|
7
archivebox/plugins/replaywebpage/urls.py
Normal file
7
archivebox/plugins/replaywebpage/urls.py
Normal file
|
@ -0,0 +1,7 @@
|
||||||
|
from django.urls import path
|
||||||
|
|
||||||
|
from .views import ReplayWebPageViewer
|
||||||
|
|
||||||
|
# The <path:path> converter captures the remainder of the URL (slashes
# included) and hands it to the viewer's get() as the `path` argument.
urlpatterns = [
    path(
        route='<path:path>',
        view=ReplayWebPageViewer.as_view(),
        name='plugin_replaywebpage__viewer',
    ),
]
|
47
archivebox/plugins/replaywebpage/views.py
Normal file
47
archivebox/plugins/replaywebpage/views.py
Normal file
|
@ -0,0 +1,47 @@
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from django.views import View
|
||||||
|
from django.shortcuts import render
|
||||||
|
from django.db.models import Q
|
||||||
|
|
||||||
|
from core.models import Snapshot
|
||||||
|
|
||||||
|
# from archivebox.config import PUBLIC_SNAPSHOTS
|
||||||
|
PUBLIC_SNAPSHOTS = True
|
||||||
|
|
||||||
|
|
||||||
|
class ReplayWebPageViewer(View):
    """Render the ReplayWeb.Page viewer template for a single snapshot.

    The URL remainder is expected to look like ``<timestamp-or-id>/<warc_filename>``;
    the filename part is optional.
    """

    template_name = 'plugin_replaywebpage__viewer.html'

    def get_context_data(self, **kwargs):
        """Return the base template context (currently empty)."""
        return {
            # **super().get_context_data(**kwargs),
            # 'VERSION': VERSION,
            # 'COMMIT_HASH': COMMIT_HASH,
            # 'FOOTER_INFO': FOOTER_INFO,
        }

    def get(self, request, path):
        """Look up the snapshot named by ``path`` and render the viewer.

        Args:
            request: the incoming HTTP request.
            path: URL remainder, ``<timestamp-or-id>[/<warc_filename>]``.

        Returns:
            The rendered viewer page, or a redirect to the admin login when
            the user is anonymous and PUBLIC_SNAPSHOTS is disabled.

        Raises:
            Http404: if no snapshot, or more than one snapshot, matches.
        """
        # Imported locally so this view does not depend on the module's
        # import block, which only brings in `render`.
        from django.http import Http404
        from django.shortcuts import redirect

        if not request.user.is_authenticated and not PUBLIC_SNAPSHOTS:
            return redirect(f'/admin/login/?next={request.path}')

        # partition() never raises: without a '/', warc_filename is just ''
        timestamp, _, warc_filename = path.partition('/')

        # An empty prefix would make id__startswith match every snapshot.
        if not timestamp:
            raise Http404('No snapshot timestamp or ID given')

        try:
            snapshot = Snapshot.objects.get(Q(timestamp=timestamp) | Q(id__startswith=timestamp))
        except Snapshot.DoesNotExist:
            raise Http404(f'No snapshot found matching "{timestamp}"') from None
        except Snapshot.MultipleObjectsReturned:
            raise Http404(
                f'Multiple snapshots match "{timestamp}", use a longer timestamp or ID'
            ) from None

        context = self.get_context_data()
        context.update({
            "snapshot": snapshot,
            "timestamp": timestamp,
            "warc_filename": warc_filename,
        })
        return render(template_name=self.template_name, request=request, context=context)
|
||||||
|
|
|
@ -13,12 +13,14 @@ version: '3.9'
|
||||||
services:
|
services:
|
||||||
archivebox:
|
archivebox:
|
||||||
#image: ${DOCKER_IMAGE:-archivebox/archivebox:dev}
|
#image: ${DOCKER_IMAGE:-archivebox/archivebox:dev}
|
||||||
image: archivebox/archivebox:dev
|
image: archivebox:test
|
||||||
|
# image: archivebox/archivebox:dev
|
||||||
command: server --quick-init 0.0.0.0:8000
|
command: server --quick-init 0.0.0.0:8000
|
||||||
ports:
|
ports:
|
||||||
- 8000:8000
|
- 8000:8000
|
||||||
volumes:
|
volumes:
|
||||||
- ./data:/data
|
- ./data:/data
|
||||||
|
- /Volumes/OPT/browsertrix:/browsertrix:z
|
||||||
# - ./etc/crontabs:/var/spool/cron/crontabs # uncomment this and archivebox_scheduler below to set up automatic recurring archive jobs
|
# - ./etc/crontabs:/var/spool/cron/crontabs # uncomment this and archivebox_scheduler below to set up automatic recurring archive jobs
|
||||||
# - ./archivebox:/app/archivebox # uncomment this to mount the ArchiveBox source code at runtime (for developers working on archivebox)
|
# - ./archivebox:/app/archivebox # uncomment this to mount the ArchiveBox source code at runtime (for developers working on archivebox)
|
||||||
# build: . # uncomment this to build the image from source code at buildtime (for developers working on archivebox)
|
# build: . # uncomment this to build the image from source code at buildtime (for developers working on archivebox)
|
||||||
|
@ -48,6 +50,14 @@ services:
|
||||||
# dns:
|
# dns:
|
||||||
# - 172.20.0.53
|
# - 172.20.0.53
|
||||||
|
|
||||||
|
browsertrix:
|
||||||
|
image: webrecorder/browsertrix-crawler:latest
|
||||||
|
command: /bin/docker_ipc_listener.py
|
||||||
|
expose:
|
||||||
|
- 2222
|
||||||
|
volumes:
|
||||||
|
- /Volumes/OPT/browsertrix:/crawls:z
|
||||||
|
- ./bin/docker_ipc_listener.py:/bin/docker_ipc_listener.py
|
||||||
|
|
||||||
######## Optional Addons: tweak examples below as needed for your specific use case ########
|
######## Optional Addons: tweak examples below as needed for your specific use case ########
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue