add new live django template for snapshot detail page

2025-05-15 07:34:27 -04:00 · 2024-05-11 22:33:02 -07:00 · 2024-05-11 22:33:02 -07:00 · 8841e8b181
commit 8841e8b181
parent 457c42bf84
2 changed files with 630 additions and 2 deletions
--- a/archivebox/core/views.py
+++ b/archivebox/core/views.py
@ -3,6 +3,7 @@ __package__ = 'archivebox.core'
 from typing import Callable

 from io import StringIO
+from pathlib import Path
 from contextlib import redirect_stdout

 from django.shortcuts import render, redirect
@ -36,10 +37,14 @@ from ..config import (
    CONFIG_SCHEMA,
    DYNAMIC_CONFIG_SCHEMA,
    USER_CONFIG,
+    SAVE_ARCHIVE_DOT_ORG,
+    PREVIEW_ORIGINALS,
 )
+from ..logging_util import printable_filesize
 from ..main import add
-from ..util import base_url, ansi_to_html
+from ..util import base_url, ansi_to_html, htmlencode, urldecode, urlencode, ts_to_date_str
 from ..search import query_search_index
+from ..extractors.wget import wget_output_path


 class HomepageView(View):
@ -56,10 +61,85 @@ class HomepageView(View):
 class SnapshotView(View):
    # render static html index from filesystem archive/<timestamp>/index.html

+    @staticmethod
+    def render_live_index(request, snapshot):
+        """Render the snapshot detail page from the live ``core/snapshot_live.html`` template.
+
+        Collects the snapshot's succeeded, non-hidden archive results whose
+        embed path exists on disk, picks the most preferred one as
+        ``best_result``, and builds the template context from
+        ``snapshot.as_link()``.
+
+        Args:
+            request: the current request, passed straight through to ``render()``.
+            snapshot: object providing ``archiveresult_set``, ``as_link()`` and
+                ``link_dir`` (the Snapshot looked up by the calling view).
+
+        Returns:
+            The response produced by ``render()`` for ``core/snapshot_live.html``.
+        """
+        TITLE_LOADING_MSG = 'Not yet archived...'
+        # extractors that are never listed as embeddable results on the page
+        HIDDEN_RESULTS = ('favicon', 'headers', 'title', 'htmltotext', 'warc', 'archive_org')
+        # display order, and also the order tried when choosing best_result
+        preferred_types = ('singlefile', 'wget', 'screenshot', 'dom', 'media', 'pdf', 'readability', 'mercury')
+
+        archiveresults = {}
+        for result in snapshot.archiveresult_set.all():
+            # cheap attribute checks first, before asking for the embed path or touching the filesystem
+            if result.status != 'succeeded' or result.extractor in HIDDEN_RESULTS:
+                continue
+
+            embed_path = result.embed_path()
+            if not embed_path:
+                continue
+
+            abs_path = result.snapshot_dir / embed_path
+            if not abs_path.exists():
+                continue
+            # skip output directories that contain no entries matching '*.*'
+            if abs_path.is_dir() and not any(abs_path.glob('*.*')):
+                continue
+
+            # keyed by extractor name, so a later result for the same extractor replaces an earlier one
+            archiveresults[result.extractor] = {
+                'name': result.extractor,
+                'path': embed_path,
+                'ts': ts_to_date_str(result.end_ts),
+            }
+
+        # preferred types first, then any remaining extractors in the order they were encountered
+        all_types = preferred_types + tuple(
+            result_type for result_type in archiveresults if result_type not in preferred_types
+        )
+        # precomputed rank table so the sort key below is a dict lookup instead of a tuple scan
+        type_rank = {result_type: rank for rank, result_type in enumerate(all_types)}
+
+        # first preferred type that actually has a result; placeholder path otherwise
+        best_result = next(
+            (archiveresults[result_type] for result_type in preferred_types if result_type in archiveresults),
+            {'path': 'None'},
+        )
+
+        link = snapshot.as_link()
+        link_info = link._asdict(extended=True)
+
+        # link to the first WARC file found, or to the bare warc/ directory when there is none
+        first_warc = next(Path(snapshot.link_dir).glob('warc/*.warc.*'), None)
+        warc_path = 'warc/' + (first_warc.name if first_warc is not None else '')
+
+        context = {
+            **link_info,
+            **link_info['canonical'],
+            'title': htmlencode(
+                link.title
+                or (link.base_url if link.is_archived else TITLE_LOADING_MSG)
+            ),
+            'url_str': htmlencode(urldecode(link.base_url)),
+            'archive_url': urlencode(
+                wget_output_path(link)
+                or (link.domain if link.is_archived else '')
+            ) or 'about:blank',
+            'extension': link.extension or 'html',
+            'tags': link.tags or 'untagged',
+            'size': printable_filesize(link.archive_size) if link.archive_size else 'pending',
+            'status': 'archived' if link.is_archived else 'not yet archived',
+            'status_color': 'success' if link.is_archived else 'danger',
+            'oldest_archive_date': ts_to_date_str(link.oldest_archive_date),
+            'warc_path': warc_path,
+            'SAVE_ARCHIVE_DOT_ORG': SAVE_ARCHIVE_DOT_ORG,
+            'PREVIEW_ORIGINALS': PREVIEW_ORIGINALS,
+            'archiveresults': sorted(archiveresults.values(), key=lambda r: type_rank[r['name']]),
+            'best_result': best_result,
+        }
+        return render(template_name='core/snapshot_live.html', request=request, context=context)
+
+
    def get(self, request, path):
        if not request.user.is_authenticated and not PUBLIC_SNAPSHOTS:
            return redirect(f'/admin/login/?next={request.path}')

+        snapshot = None
+
        try:
            slug, archivefile = path.split('/', 1)
        except (IndexError, ValueError):
@ -75,7 +155,11 @@ class SnapshotView(View):
            try:
                try:
                    snapshot = Snapshot.objects.get(Q(timestamp=slug) | Q(id__startswith=slug))
-                    response = static.serve(request, archivefile, document_root=snapshot.link_dir, show_indexes=True)
+                    if archivefile == 'index.html':
+                        # if they requested snapshot index, serve live rendered template instead of static html
+                        response = self.render_live_index(request, snapshot)
+                    else:
+                        response = static.serve(request, archivefile, document_root=snapshot.link_dir, show_indexes=True)
                    response["Link"] = f'<{snapshot.url}>; rel="canonical"'
                    return response
                except Snapshot.DoesNotExist: