diff --git a/archivebox/core/views.py b/archivebox/core/views.py index e0a58ed7..c52903f0 100644 --- a/archivebox/core/views.py +++ b/archivebox/core/views.py @@ -3,6 +3,7 @@ __package__ = 'archivebox.core' from typing import Callable from io import StringIO +from pathlib import Path from contextlib import redirect_stdout from django.shortcuts import render, redirect @@ -36,10 +37,14 @@ from ..config import ( CONFIG_SCHEMA, DYNAMIC_CONFIG_SCHEMA, USER_CONFIG, + SAVE_ARCHIVE_DOT_ORG, + PREVIEW_ORIGINALS, ) +from ..logging_util import printable_filesize from ..main import add -from ..util import base_url, ansi_to_html +from ..util import base_url, ansi_to_html, htmlencode, urldecode, urlencode, ts_to_date_str from ..search import query_search_index +from ..extractors.wget import wget_output_path class HomepageView(View): @@ -56,10 +61,85 @@ class HomepageView(View): class SnapshotView(View): # render static html index from filesystem archive//index.html + @staticmethod + def render_live_index(request, snapshot): + TITLE_LOADING_MSG = 'Not yet archived...' + HIDDEN_RESULTS = ('favicon', 'headers', 'title', 'htmltotext', 'warc', 'archive_org') + + archiveresults = {} + + results = snapshot.archiveresult_set.all() + + for result in results: + embed_path = result.embed_path() + abs_path = result.snapshot_dir / (embed_path or 'None') + + if (result.status == 'succeeded' + and (result.extractor not in HIDDEN_RESULTS) + and embed_path + and abs_path.exists()): + if abs_path.is_dir() and not any(abs_path.glob('*.*')): + continue + + result_info = { + 'name': result.extractor, + 'path': embed_path, + 'ts': ts_to_date_str(result.end_ts), + } + archiveresults[result.extractor] = result_info + + preferred_types = ('singlefile', 'wget', 'screenshot', 'dom', 'media', 'pdf', 'readability', 'mercury') + all_types = preferred_types + tuple(result_type for result_type in archiveresults.keys() if result_type not in preferred_types) + + best_result = {'path': 'None'} + for result_type in preferred_types: + if result_type in archiveresults: + best_result = archiveresults[result_type] + break + + link = snapshot.as_link() + + link_info = link._asdict(extended=True) + + try: + warc_path = 'warc/' + list(Path(snapshot.link_dir).glob('warc/*.warc.*'))[0].name + except IndexError: + warc_path = 'warc/' + + context = { + **link_info, + **link_info['canonical'], + 'title': htmlencode( + link.title + or (link.base_url if link.is_archived else TITLE_LOADING_MSG) + ), + 'url_str': htmlencode(urldecode(link.base_url)), + 'archive_url': urlencode( + wget_output_path(link) + or (link.domain if link.is_archived else '') + ) or 'about:blank', + 'extension': link.extension or 'html', + 'tags': link.tags or 'untagged', + 'size': printable_filesize(link.archive_size) if link.archive_size else 'pending', + 'status': 'archived' if link.is_archived else 'not yet archived', + 'status_color': 'success' if link.is_archived else 'danger', + 'oldest_archive_date': ts_to_date_str(link.oldest_archive_date), + 'warc_path': warc_path, + 'SAVE_ARCHIVE_DOT_ORG': SAVE_ARCHIVE_DOT_ORG, + 'PREVIEW_ORIGINALS': PREVIEW_ORIGINALS, + 'archiveresults': sorted(archiveresults.values(), key=lambda r: all_types.index(r['name'])), + 'best_result': best_result, + # 'tags_str': 'somealskejrewlkrjwer,werlmwrwlekrjewlkrjwer324m532l,4m32,23m324234', + } + return render(template_name='core/snapshot_live.html', request=request, context=context) + + def get(self, request, path): if not request.user.is_authenticated and not PUBLIC_SNAPSHOTS: return redirect(f'/admin/login/?next={request.path}') + snapshot = None + try: slug, archivefile = path.split('/', 1) except (IndexError, ValueError): @@ -75,7 +155,11 @@ class SnapshotView(View): try: try: snapshot = Snapshot.objects.get(Q(timestamp=slug) | Q(id__startswith=slug)) - response = static.serve(request, archivefile, document_root=snapshot.link_dir, show_indexes=True) + if archivefile == 'index.html': + # if they requested snapshot index, serve live rendered template instead of static html + response = self.render_live_index(request, snapshot) + else: + response = static.serve(request, archivefile, document_root=snapshot.link_dir, show_indexes=True) response["Link"] = f'<{snapshot.url}>; rel="canonical"' return response except Snapshot.DoesNotExist: diff --git a/archivebox/templates/core/snapshot_live.html b/archivebox/templates/core/snapshot_live.html new file mode 100644 index 00000000..08c608f9 --- /dev/null +++ b/archivebox/templates/core/snapshot_live.html @@ -0,0 +1,544 @@ +{% load static tz core_tags %} + + + + + {{title}} + + + + + +
+
+ +
+
+
+ {% for result in archiveresults %} +
+ +
+ {% endfor %} + +
+
+
+ + + + + + +