From f75ae805f82c50c3ba0d49c8a62b15e43cb40c10 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 14 Oct 2024 15:41:58 -0700 Subject: [PATCH] comment out Crawl api methods temporarily --- archivebox/api/admin.py | 33 +++ archivebox/api/v1_core.py | 74 ++++- archivebox/core/admin_archiveresults.py | 198 +++++++++++++ archivebox/core/admin_snapshots.py | 368 ++++++++++++++++++++++++ archivebox/core/admin_tags.py | 81 ++++++ archivebox/core/admin_users.py | 91 ++++++ archivebox/crawls/admin.py | 29 +- archivebox/machine/admin.py | 94 ++++++ archivebox/misc/paginators.py | 30 ++ archivebox/queues/admin.py | 26 ++ archivebox/search/admin.py | 23 ++ 11 files changed, 1038 insertions(+), 9 deletions(-) create mode 100644 archivebox/api/admin.py create mode 100644 archivebox/core/admin_archiveresults.py create mode 100644 archivebox/core/admin_snapshots.py create mode 100644 archivebox/core/admin_tags.py create mode 100644 archivebox/core/admin_users.py create mode 100644 archivebox/machine/admin.py create mode 100644 archivebox/misc/paginators.py create mode 100644 archivebox/queues/admin.py create mode 100644 archivebox/search/admin.py diff --git a/archivebox/api/admin.py b/archivebox/api/admin.py new file mode 100644 index 00000000..49114936 --- /dev/null +++ b/archivebox/api/admin.py @@ -0,0 +1,33 @@ +# __package__ = 'archivebox.api' + +# import abx + +# from signal_webhooks.admin import WebhookAdmin +# from signal_webhooks.utils import get_webhook_model + +# from abid_utils.admin import ABIDModelAdmin + +# from .models import APIToken + + +# class APITokenAdmin(ABIDModelAdmin): +# list_display = ('created_at', 'abid', 'created_by', 'token_redacted', 'expires') +# sort_fields = ('abid', 'created_at', 'created_by', 'expires') +# readonly_fields = ('created_at', 'modified_at', 'abid_info') +# search_fields = ('id', 'abid', 'created_by__username', 'token') +# fields = ('created_by', 'token', 'expires', *readonly_fields) + +# list_filter = ('created_by',) +# ordering = ['-created_at'] +# list_per_page = 100 + +# class CustomWebhookAdmin(WebhookAdmin, ABIDModelAdmin): +# list_display = ('created_at', 'created_by', 'abid', *WebhookAdmin.list_display) +# sort_fields = ('created_at', 'created_by', 'abid', 'referenced_model', 'endpoint', 'last_success', 'last_error') +# readonly_fields = ('created_at', 'modified_at', 'abid_info', *WebhookAdmin.readonly_fields) + + +# @abx.hookimpl +# def register_admin(admin_site): +# admin_site.register(APIToken, APITokenAdmin) +# admin_site.register(get_webhook_model(), CustomWebhookAdmin) diff --git a/archivebox/api/v1_core.py b/archivebox/api/v1_core.py index 9676b0d9..bcc957ee 100644 --- a/archivebox/api/v1_core.py +++ b/archivebox/api/v1_core.py @@ -15,7 +15,6 @@ from ninja.errors import HttpError from core.models import Snapshot, ArchiveResult, Tag from api.models import APIToken, OutboundWebhook -from abid_utils.abid import ABID from .auth import API_AUTH_METHODS @@ -396,11 +395,70 @@ def get_tag(request, tag_id: str, with_snapshots: bool=True): +# class CrawlSchema(Schema): +# TYPE: str = 'core.models.Crawl' + +# id: UUID +# abid: str + +# modified_at: datetime +# created_at: datetime +# created_by_id: str +# created_by_username: str + +# urls: str +# depth: int +# parser: str + +# # snapshots: List[SnapshotSchema] + +# @staticmethod +# def resolve_created_by_id(obj): +# return str(obj.created_by_id) + +# @staticmethod +# def resolve_created_by_username(obj): +# User = get_user_model() +# return User.objects.get(id=obj.created_by_id).username + +# 
@staticmethod +# def resolve_snapshots(obj, context): +# if context['request'].with_snapshots: +# return obj.snapshot_set.all().distinct() +# return Snapshot.objects.none() + + +# @router.get("/crawl/{crawl_id}", response=CrawlSchema, url_name="get_crawl") +# def get_crawl(request, crawl_id: str, with_snapshots: bool=False, with_archiveresults: bool=False): +# """Get a specific Crawl by id or abid.""" +# crawl = None +# request.with_snapshots = with_snapshots +# request.with_archiveresults = with_archiveresults + +# try: +# crawl = Crawl.objects.get(abid__icontains=crawl_id) +# except Exception: +# pass + +# try: +# crawl = crawl or Crawl.objects.get(id__icontains=crawl_id) +# except Exception: +# pass +# return crawl + + +# [..., CrawlSchema] @router.get("/any/{abid}", response=Union[SnapshotSchema, ArchiveResultSchema, TagSchema], url_name="get_any") def get_any(request, abid: str): request.with_snapshots = False request.with_archiveresults = False + if abid.startswith(APIToken.abid_prefix): + raise HttpError(403, 'APIToken objects are not accessible via REST API') + + if abid.startswith(OutboundWebhook.abid_prefix): + raise HttpError(403, 'OutboundWebhook objects are not accessible via REST API') + response = None try: response = response or get_snapshot(request, abid) @@ -416,11 +474,13 @@ def get_any(request, abid: str): response = response or get_tag(request, abid) except Exception: pass - - if abid.startswith(APIToken.abid_prefix): - raise HttpError(403, 'APIToken objects are not accessible via REST API') - if abid.startswith(OutboundWebhook.abid_prefix): - raise HttpError(403, 'OutboundWebhook objects are not accessible via REST API') + # try: + # response = response or get_crawl(request, abid) + # except Exception: + # pass - raise HttpError(404, 'Object with given ABID not found') + if not response: + raise HttpError(404, 'Object with given ABID not found') + + return response diff --git a/archivebox/core/admin_archiveresults.py b/archivebox/core/admin_archiveresults.py new file mode 100644 index 00000000..e9645b03 --- /dev/null +++ b/archivebox/core/admin_archiveresults.py @@ -0,0 +1,198 @@ +__package__ = 'archivebox.core' + +import os +from pathlib import Path + +from django.contrib import admin +from django.utils.html import format_html, mark_safe +from django.core.exceptions import ValidationError +from django.urls import reverse, resolve +from django.utils import timezone +from django.forms import forms + +from huey_monitor.admin import TaskModel + +import abx + +from archivebox.config import DATA_DIR +from archivebox.config.common import SERVER_CONFIG +from archivebox.misc.paginators import AccelleratedPaginator +from archivebox.abid_utils.admin import ABIDModelAdmin + +from .models import ArchiveResult, Snapshot + + + + +def result_url(result: TaskModel) -> str: + url = reverse("admin:huey_monitor_taskmodel_change", args=[str(result.id)]) + return format_html('See progress...'.format(url=url)) + + + +class ArchiveResultInline(admin.TabularInline): + name = 'Archive Results Log' + model = ArchiveResult + parent_model = Snapshot + # fk_name = 'snapshot' + extra = 0 + sort_fields = ('end_ts', 'extractor', 'output', 'status', 'cmd_version') + readonly_fields = ('id', 'result_id', 'completed', 'command', 'version') + fields = ('start_ts', 'end_ts', *readonly_fields, 'extractor', 'cmd', 'cmd_version', 'pwd', 'created_by', 'status', 'output') + # exclude = ('id',) + ordering = ('end_ts',) + show_change_link = True + # # classes = ['collapse'] + # # list_display_links = ['abid'] + 
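+    # the parent Snapshot can be referenced in the admin URL either by primary
+    # key or by ABID, so the PK lookup below falls back to ABID resolution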
+    def get_parent_object_from_request(self, request):
+        resolved = resolve(request.path_info)
+        try:
+            return self.parent_model.objects.get(pk=resolved.kwargs['object_id'])
+        except (self.parent_model.DoesNotExist, ValidationError):
+            return self.parent_model.objects.get(pk=self.parent_model.id_from_abid(resolved.kwargs['object_id']))
+
+    @admin.display(
+        description='Completed',
+        ordering='end_ts',
+    )
+    def completed(self, obj):
+        return format_html('<p style="white-space: nowrap">{}</p>', obj.end_ts.strftime('%Y-%m-%d %H:%M:%S'))
+
+    def result_id(self, obj):
+        return format_html('<a href="{}"><code>[{}]</code></a>', reverse('admin:core_archiveresult_change', args=(obj.id,)), obj.abid)
+
+    def command(self, obj):
+        return format_html('<small><code>{}</code></small>', " ".join(obj.cmd or []))
+
+    def version(self, obj):
+        return format_html('<small><code>{}</code></small>', obj.cmd_version or '-')
+
+    def get_formset(self, request, obj=None, **kwargs):
+        formset = super().get_formset(request, obj, **kwargs)
+        snapshot = self.get_parent_object_from_request(request)
+
+        # import ipdb; ipdb.set_trace()
+        # formset.form.base_fields['id'].widget = formset.form.base_fields['id'].hidden_widget()
+
+        # default values for new entries
+        formset.form.base_fields['status'].initial = 'succeeded'
+        formset.form.base_fields['start_ts'].initial = timezone.now()
+        formset.form.base_fields['end_ts'].initial = timezone.now()
+        formset.form.base_fields['cmd_version'].initial = '-'
+        formset.form.base_fields['pwd'].initial = str(snapshot.link_dir)
+        formset.form.base_fields['created_by'].initial = request.user
+        formset.form.base_fields['cmd'] = forms.JSONField(initial=['-'])
+        formset.form.base_fields['output'].initial = 'Manually recorded cmd output...'
+
+        if obj is not None:
+            # hidden values for existing entries and new entries
+            formset.form.base_fields['start_ts'].widget = formset.form.base_fields['start_ts'].hidden_widget()
+            formset.form.base_fields['end_ts'].widget = formset.form.base_fields['end_ts'].hidden_widget()
+            formset.form.base_fields['cmd'].widget = formset.form.base_fields['cmd'].hidden_widget()
+            formset.form.base_fields['pwd'].widget = formset.form.base_fields['pwd'].hidden_widget()
+            formset.form.base_fields['created_by'].widget = formset.form.base_fields['created_by'].hidden_widget()
+            formset.form.base_fields['cmd_version'].widget = formset.form.base_fields['cmd_version'].hidden_widget()
+        return formset
+
+    def get_readonly_fields(self, request, obj=None):
+        if obj is not None:
+            return self.readonly_fields
+        else:
+            return []
+
+
+
+class ArchiveResultAdmin(ABIDModelAdmin):
+    list_display = ('start_ts', 'snapshot_info', 'tags_str', 'extractor', 'cmd_str', 'status', 'output_str')
+    sort_fields = ('start_ts', 'extractor', 'status')
+    readonly_fields = ('cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'abid_info', 'output_summary')
+    search_fields = ('id', 'abid', 'snapshot__url', 'extractor', 'output', 'cmd_version', 'cmd', 'snapshot__timestamp')
+    fields = ('snapshot', 'extractor', 'status', 'output', 'pwd', 'start_ts', 'end_ts', 'created_by', 'cmd_version', 'cmd', *readonly_fields)
+    autocomplete_fields = ['snapshot']
+
+    list_filter = ('status', 'extractor', 'start_ts', 'cmd_version')
+    ordering = ['-start_ts']
+    list_per_page = SERVER_CONFIG.SNAPSHOTS_PER_PAGE
+
+    paginator = AccelleratedPaginator
+    save_on_top = True
+
+    actions = ['delete_selected']
+
+    class Meta:
+        verbose_name = 'Archive Result'
+        verbose_name_plural = 'Archive Results'
+
+    def change_view(self, request, object_id, form_url="", extra_context=None):
+        self.request = request
+        return super().change_view(request, object_id, form_url, extra_context)
+
+    @admin.display(
+        description='Snapshot Info'
+    )
+    def snapshot_info(self, result):
+        return format_html(
+            '<a href="/archive/{}/index.html"><b><code>[{}]</code></b> &nbsp; {} &nbsp; {}</a><br/>',
+            result.snapshot.timestamp,
+            result.snapshot.abid,
+            result.snapshot.bookmarked_at.strftime('%Y-%m-%d %H:%M'),
+            result.snapshot.url[:128],
+        )
+
+
+    @admin.display(
+        description='Snapshot Tags'
+    )
+    def tags_str(self, result):
+        return result.snapshot.tags_str()
+
+    def cmd_str(self, result):
+        return format_html(
+            '<pre>{}</pre>',
+            ' '.join(result.cmd) if isinstance(result.cmd, list) else str(result.cmd),
+        )
+
+    def output_str(self, result):
+        return format_html(
+            '<a href="/archive/{}/{}" class="output-link">↗ī¸</a><pre>{}</pre>',
+            result.snapshot.timestamp,
+            result.output if (result.status == 'succeeded') and result.extractor not in ('title', 'archive_org') else 'index.html',
+            result.output,
+        )
+
+    def output_summary(self, result):
+        snapshot_dir = Path(DATA_DIR) / str(result.pwd).split('data/', 1)[-1]
+        output_str = format_html(
+            '<pre style="display: inline-block">{}</pre><br/>',
+            result.output,
+        )
+        output_str += format_html('<a href="/archive/{}/index.html#all">See result files ...</a><br/><pre><code>', str(result.snapshot.timestamp))
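+        # if the extractor's reported output path is readable, only that subtree
+        # is listed below; otherwise fall back to walking the entire snapshot dir
+        # (two directory levels deep at most)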
+        path_from_output_str = (snapshot_dir / result.output)
+        output_str += format_html('<i>{}</i>/<i>{}</i><br/><hr/>', str(snapshot_dir), str(result.output))
+        if os.access(path_from_output_str, os.R_OK):
+            root_dir = str(path_from_output_str)
+        else:
+            root_dir = str(snapshot_dir)
+
+        # print(root_dir, str(list(os.walk(root_dir))))
+
+        for root, dirs, files in os.walk(root_dir):
+            depth = root.replace(root_dir, '').count(os.sep) + 1
+            if depth > 2:
+                continue
+            indent = ' ' * 4 * (depth)
+            output_str += format_html('<b>{}{}/</b><br/>', indent, os.path.basename(root))
+            indentation_str = ' ' * 4 * (depth + 1)
+            for filename in sorted(files):
+                is_hidden = filename.startswith('.')
+                output_str += format_html('<span style="opacity: {}.2">{}{}</span><br/>', int(not is_hidden), indentation_str, filename.strip())
+
+        return output_str + format_html('</code></pre>
') + + + + +@abx.hookimpl +def register_admin(admin_site): + admin_site.register(ArchiveResult, ArchiveResultAdmin) diff --git a/archivebox/core/admin_snapshots.py b/archivebox/core/admin_snapshots.py new file mode 100644 index 00000000..60d194f5 --- /dev/null +++ b/archivebox/core/admin_snapshots.py @@ -0,0 +1,368 @@ + +__package__ = 'archivebox.core' + +import os +from pathlib import Path + +from django.contrib import admin, messages +from django.urls import path +from django.utils.html import format_html, mark_safe +from django.utils import timezone +from django.forms import forms +from django.template import Template, RequestContext +from django.contrib.admin.helpers import ActionForm +from django.contrib.admin.widgets import FilteredSelectMultiple + + + +import abx + +from archivebox.config import DATA_DIR, VERSION +from archivebox.config.common import SERVER_CONFIG +from archivebox.misc.util import htmldecode, urldecode +from archivebox.misc.paginators import AccelleratedPaginator +from archivebox.abid_utils.admin import ABIDModelAdmin +from archivebox.search.admin import SearchResultsAdminMixin + +from archivebox.logging_util import printable_filesize +from archivebox.index.html import snapshot_icons +from archivebox.extractors import archive_links +from archivebox.main import remove + +from archivebox.queues.tasks import bg_archive_links, bg_add + + +from .models import Snapshot +from .admin_archiveresults import ArchiveResultInline, result_url +from .admin_tags import TagInline + + +GLOBAL_CONTEXT = {'VERSION': VERSION, 'VERSIONS_AVAILABLE': [], 'CAN_UPGRADE': False} + + + +class SnapshotActionForm(ActionForm): + tags = forms.ModelMultipleChoiceField( + label='Edit tags', + queryset=Tag.objects.all(), + required=False, + widget=FilteredSelectMultiple( + 'core_tag__name', + False, + ), + ) + + # TODO: allow selecting actions for specific extractors? is this useful? 
+ # extractor = forms.ChoiceField( + # choices=ArchiveResult.EXTRACTOR_CHOICES, + # required=False, + # widget=forms.MultileChoiceField(attrs={'class': "form-control"}) + # ) + + +class SnapshotAdmin(SearchResultsAdminMixin, ABIDModelAdmin): + list_display = ('created_at', 'title_str', 'files', 'size', 'url_str') + sort_fields = ('title_str', 'url_str', 'created_at') + readonly_fields = ('admin_actions', 'status_info', 'tags_str', 'imported_timestamp', 'created_at', 'modified_at', 'downloaded_at', 'abid_info', 'link_dir') + search_fields = ('id', 'url', 'abid', 'timestamp', 'title', 'tags__name') + list_filter = ('created_at', 'downloaded_at', 'archiveresult__status', 'created_by', 'tags__name') + fields = ('url', 'title', 'created_by', 'bookmarked_at', *readonly_fields) + ordering = ['-created_at'] + actions = ['add_tags', 'remove_tags', 'update_titles', 'update_snapshots', 'resnapshot_snapshot', 'overwrite_snapshots', 'delete_snapshots'] + inlines = [TagInline, ArchiveResultInline] + list_per_page = min(max(5, SERVER_CONFIG.SNAPSHOTS_PER_PAGE), 5000) + + action_form = SnapshotActionForm + paginator = AccelleratedPaginator + + save_on_top = True + show_full_result_count = False + + def changelist_view(self, request, extra_context=None): + self.request = request + extra_context = extra_context or {} + try: + return super().changelist_view(request, extra_context | GLOBAL_CONTEXT) + except Exception as e: + self.message_user(request, f'Error occurred while loading the page: {str(e)} {request.GET} {request.POST}') + return super().changelist_view(request, GLOBAL_CONTEXT) + + + def get_urls(self): + urls = super().get_urls() + custom_urls = [ + path('grid/', self.admin_site.admin_view(self.grid_view), name='grid') + ] + return custom_urls + urls + + # def get_queryset(self, request): + # # tags_qs = SnapshotTag.objects.all().select_related('tag') + # # prefetch = Prefetch('snapshottag_set', queryset=tags_qs) + + # self.request = request + # return super().get_queryset(request).prefetch_related('archiveresult_set').distinct() # .annotate(archiveresult_count=Count('archiveresult')) + + @admin.action( + description="Imported Timestamp" + ) + def imported_timestamp(self, obj): + context = RequestContext(self.request, { + 'bookmarked_date': obj.bookmarked, + 'timestamp': obj.timestamp, + }) + + html = Template("""{{bookmarked_date}} ({{timestamp}})""") + return mark_safe(html.render(context)) + + # pretty_time = obj.bookmarked.strftime('%Y-%m-%d %H:%M:%S') + # return f'{pretty_time} ({obj.timestamp})' + + # TODO: figure out a different way to do this, you cant nest forms so this doenst work + # def action(self, obj): + # # csrfmiddlewaretoken: Wa8UcQ4fD3FJibzxqHN3IYrrjLo4VguWynmbzzcPYoebfVUnDovon7GEMYFRgsh0 + # # action: update_snapshots + # # select_across: 0 + # # _selected_action: 76d29b26-2a88-439e-877c-a7cca1b72bb3 + # return format_html( + # ''' + #
+    #         <form action="/admin/core/snapshot/" method="post">
+    #             <input type="hidden" name="csrfmiddlewaretoken" value="{}">
+    #             <input type="hidden" name="action" value="update_snapshots">
+    #             <input type="hidden" name="select_across" value="0">
+    #             <input type="hidden" name="_selected_action" value="{}">
+    #             <button type="submit">Pull</button>
+    #         </form>
+    #
+    #         ''',
+    #         csrf.get_token(self.request),
+    #         obj.pk,
+    #     )
+
+    def admin_actions(self, obj):
+        return format_html(
+            # URL Hash: <code style="font-size: 10px; user-select: all">{}</code><br/>
+            '''
+            <a class="btn" href="/archive/{}">Summary page âžĄī¸</a> &nbsp; &nbsp;
+            <a class="btn" href="/archive/{}/index.html#all">Result files 📑</a> &nbsp; &nbsp;
+            <a class="btn" href="/admin/core/snapshot/{}/change">Admin actions âš™ī¸</a>
+            ''',
+            obj.timestamp,
+            obj.timestamp,
+            obj.pk,
+        )
+
+    def status_info(self, obj):
+        return format_html(
+            # URL Hash: <code style="font-size: 10px; user-select: all">{}</code><br/>
+            '''
+            Archived: {} ({} files {}) &nbsp; &nbsp;
+            Favicon: <img src="{}" style="height: 20px"/> &nbsp; &nbsp;
+            Status code: {} &nbsp; &nbsp; <br/>
+            Server: {} &nbsp; &nbsp;
+            Content type: {} &nbsp; &nbsp;
+            Extension: {} &nbsp; &nbsp;
+            ''',
+            '✅' if obj.is_archived else '❌',
+            obj.num_outputs,
+            self.size(obj) or '0kb',
+            f'/archive/{obj.timestamp}/favicon.ico',
+            obj.status_code or '-',
+            obj.headers and obj.headers.get('Server') or '-',
+            obj.headers and obj.headers.get('Content-Type') or '-',
+            obj.extension or '-',
+        )
+
+    @admin.display(
+        description='Title',
+        ordering='title',
+    )
+    def title_str(self, obj):
+        tags = ''.join(
+            format_html('<a href="/admin/core/snapshot/?tags__id__exact={}">{}</a> ', tag.pk, tag.name)
+            for tag in obj.tags.all()
+            if str(tag.name).strip()
+        )
+        return format_html(
+            '<a href="/{}">'
+            '<img src="/{}/favicon.ico" class="favicon">'
+            '</a>'
+            '<a href="/{}/index.html">'
+            '<b class="status-{}">{}</b>'
+            '</a>',
+            obj.archive_path,
+            obj.archive_path,
+            obj.archive_path,
+            'fetched' if obj.latest_title or obj.title else 'pending',
+            urldecode(htmldecode(obj.latest_title or obj.title or ''))[:128] or 'Pending...'
+        ) + mark_safe(f' {tags}')
+
+    @admin.display(
+        description='Files Saved',
+        # ordering='archiveresult_count',
+    )
+    def files(self, obj):
+        # return '-'
+        return snapshot_icons(obj)
+
+
+    @admin.display(
+        # ordering='archiveresult_count'
+    )
+    def size(self, obj):
+        archive_size = os.access(Path(obj.link_dir) / 'index.html', os.F_OK) and obj.archive_size
+        if archive_size:
+            size_txt = printable_filesize(archive_size)
+            if archive_size > 52428800:
+                size_txt = mark_safe(f'<b>{size_txt}</b>')
+        else:
+            size_txt = mark_safe('<span style="opacity: 0.3">...</span>')
+        return format_html(
+            '<a href="/{}">{}</a>',
+            obj.archive_path,
+            size_txt,
+        )
+
+
+    @admin.display(
+        description='Original URL',
+        ordering='url',
+    )
+    def url_str(self, obj):
+        return format_html(
+            '<a href="{}"><code>{}</code></a>',
+            obj.url,
+            obj.url[:128],
+        )
+
+    def grid_view(self, request, extra_context=None):
+
+        # cl = self.get_changelist_instance(request)
+
+        # Save before monkey patching to restore for changelist list view
+        saved_change_list_template = self.change_list_template
+        saved_list_per_page = self.list_per_page
+        saved_list_max_show_all = self.list_max_show_all
+
+        # Monkey patch here plus core_tags.py
+        self.change_list_template = 'private_index_grid.html'
+        self.list_per_page = SERVER_CONFIG.SNAPSHOTS_PER_PAGE
+        self.list_max_show_all = self.list_per_page
+
+        # Call monkey patched view
+        rendered_response = self.changelist_view(request, extra_context=extra_context)
+
+        # Restore values
+        self.change_list_template = saved_change_list_template
+        self.list_per_page = saved_list_per_page
+        self.list_max_show_all = saved_list_max_show_all
+
+        return rendered_response
+
+    # for debugging, uncomment this to print all requests:
+    # def changelist_view(self, request, extra_context=None):
+    #     print('[*] Got request', request.method, request.POST)
+    #     return super().changelist_view(request, extra_context=None)
+
+    @admin.action(
+        description="â„šī¸ Get Title"
+    )
+    def update_titles(self, request, queryset):
+        links = [snapshot.as_link() for snapshot in queryset]
+        if len(links) < 3:
+            # run synchronously if there are only 1 or 2 links
+            archive_links(links, overwrite=True, methods=('title','favicon'), out_dir=DATA_DIR)
+            messages.success(request, f"Title and favicon have been fetched and saved for {len(links)} URLs.")
+        else:
+            # otherwise run in a background worker
+            result = bg_archive_links((links,), kwargs={"overwrite": True, "methods": ["title", "favicon"], "out_dir": DATA_DIR})
+            messages.success(
+                request,
+                mark_safe(f"Title and favicon are updating in the background for {len(links)} URLs. <br/>
{result_url(result)}"), + ) + + @admin.action( + description="âŦ‡ī¸ Get Missing" + ) + def update_snapshots(self, request, queryset): + links = [snapshot.as_link() for snapshot in queryset] + + result = bg_archive_links((links,), kwargs={"overwrite": False, "out_dir": DATA_DIR}) + + messages.success( + request, + mark_safe(f"Re-trying any previously failed methods for {len(links)} URLs in the background. {result_url(result)}"), + ) + + + @admin.action( + description="🆕 Archive Again" + ) + def resnapshot_snapshot(self, request, queryset): + for snapshot in queryset: + timestamp = timezone.now().isoformat('T', 'seconds') + new_url = snapshot.url.split('#')[0] + f'#{timestamp}' + + result = bg_add({'urls': new_url, 'tag': snapshot.tags_str()}) + + messages.success( + request, + mark_safe(f"Creating new fresh snapshots for {queryset.count()} URLs in the background. {result_url(result)}"), + ) + + @admin.action( + description="🔄 Redo" + ) + def overwrite_snapshots(self, request, queryset): + links = [snapshot.as_link() for snapshot in queryset] + + result = bg_archive_links((links,), kwargs={"overwrite": True, "out_dir": DATA_DIR}) + + messages.success( + request, + mark_safe(f"Clearing all previous results and re-downloading {len(links)} URLs in the background. {result_url(result)}"), + ) + + @admin.action( + description="â˜ ī¸ Delete" + ) + def delete_snapshots(self, request, queryset): + remove(snapshots=queryset, yes=True, delete=True, out_dir=DATA_DIR) + messages.success( + request, + mark_safe(f"Succesfully deleted {queryset.count()} Snapshots. Don't forget to scrub URLs from import logs (data/sources) and error logs (data/logs) if needed."), + ) + + + @admin.action( + description="+" + ) + def add_tags(self, request, queryset): + tags = request.POST.getlist('tags') + print('[+] Adding tags', tags, 'to Snapshots', queryset) + for obj in queryset: + obj.tags.add(*tags) + messages.success( + request, + f"Added {len(tags)} tags to {queryset.count()} Snapshots.", + ) + + + @admin.action( + description="–" + ) + def remove_tags(self, request, queryset): + tags = request.POST.getlist('tags') + print('[-] Removing tags', tags, 'to Snapshots', queryset) + for obj in queryset: + obj.tags.remove(*tags) + messages.success( + request, + f"Removed {len(tags)} tags from {queryset.count()} Snapshots.", + ) + + + +@abx.hookimpl +def register_admin(admin_site): + admin_site.register(Snapshot, SnapshotAdmin) diff --git a/archivebox/core/admin_tags.py b/archivebox/core/admin_tags.py new file mode 100644 index 00000000..8d2d28c8 --- /dev/null +++ b/archivebox/core/admin_tags.py @@ -0,0 +1,81 @@ +__package__ = 'archivebox.core' + +from django.contrib import admin +from django.utils.html import format_html, mark_safe + +import abx + +from archivebox.abid_utils.admin import ABIDModelAdmin +from archivebox.misc.paginators import AccelleratedPaginator + + +class TagInline(admin.TabularInline): + model = Tag.snapshot_set.through # type: ignore + # fk_name = 'snapshot' + fields = ('id', 'tag') + extra = 1 + # min_num = 1 + max_num = 1000 + autocomplete_fields = ( + 'tag', + ) + + +# class AutocompleteTags: +# model = Tag +# search_fields = ['name'] +# name = 'name' +# # source_field = 'name' +# remote_field = Tag._meta.get_field('name') + +# class AutocompleteTagsAdminStub: +# name = 'admin' + +class TagAdmin(ABIDModelAdmin): + list_display = ('created_at', 'created_by', 'abid', 'name', 'num_snapshots', 'snapshots') + list_filter = ('created_at', 'created_by') + sort_fields = ('name', 'slug', 'abid', 
                   'created_by', 'created_at')
+    readonly_fields = ('slug', 'abid', 'created_at', 'modified_at', 'abid_info', 'snapshots')
+    search_fields = ('abid', 'name', 'slug')
+    fields = ('name', 'created_by', *readonly_fields)
+    actions = ['delete_selected']
+    ordering = ['-created_at']
+
+    paginator = AccelleratedPaginator
+
+
+    def num_snapshots(self, tag):
+        return format_html(
+            '<a href="/admin/core/snapshot/?tags__id__exact={}">{} total</a>',
+            tag.id,
+            tag.snapshot_set.count(),
+        )
+
+    def snapshots(self, tag):
+        total_count = tag.snapshot_set.count()
+        return mark_safe('<br/>'.join(
+            format_html(
+                '<a href="/admin/core/snapshot/{}/change">[{}]</a> {}',
+                snap.pk,
+                snap.downloaded_at.strftime('%Y-%m-%d %H:%M') if snap.downloaded_at else 'pending...',
+                snap.url[:64],
+            )
+            for snap in tag.snapshot_set.order_by('-downloaded_at')[:10]
+        ) + (f'<br/>
{total_count} total snapshots...')) + + + +# @admin.register(SnapshotTag, site=archivebox_admin) +# class SnapshotTagAdmin(ABIDModelAdmin): +# list_display = ('id', 'snapshot', 'tag') +# sort_fields = ('id', 'snapshot', 'tag') +# search_fields = ('id', 'snapshot_id', 'tag_id') +# fields = ('snapshot', 'id') +# actions = ['delete_selected'] +# ordering = ['-id'] + + +@abx.hookimpl +def register_admin(admin_site): + admin_site.register(Tag, TagAdmin) + diff --git a/archivebox/core/admin_users.py b/archivebox/core/admin_users.py new file mode 100644 index 00000000..259d2daf --- /dev/null +++ b/archivebox/core/admin_users.py @@ -0,0 +1,91 @@ +__package__ = 'archivebox.core' + +from django.contrib import admin +from django.contrib.auth.admin import UserAdmin +from django.utils.html import format_html, mark_safe +from django.contrib.auth import get_user_model + +import abx + + +class CustomUserAdmin(UserAdmin): + sort_fields = ['id', 'email', 'username', 'is_superuser', 'last_login', 'date_joined'] + list_display = ['username', 'id', 'email', 'is_superuser', 'last_login', 'date_joined'] + readonly_fields = ('snapshot_set', 'archiveresult_set', 'tag_set', 'apitoken_set', 'outboundwebhook_set') + fieldsets = [*UserAdmin.fieldsets, ('Data', {'fields': readonly_fields})] + + @admin.display(description='Snapshots') + def snapshot_set(self, obj): + total_count = obj.snapshot_set.count() + return mark_safe('
<br/>'.join(
+            format_html(
+                '<a href="/admin/core/snapshot/{}/change"><code>[{}]</code></a> 📅 {} {}',
+                snap.pk,
+                snap.abid,
+                snap.downloaded_at.strftime('%Y-%m-%d %H:%M') if snap.downloaded_at else 'pending...',
+                snap.url[:64],
+            )
+            for snap in obj.snapshot_set.order_by('-modified_at')[:10]
+        ) + f'<br/>{total_count} total records...')
+
+    @admin.display(description='Archive Result Logs')
+    def archiveresult_set(self, obj):
+        total_count = obj.archiveresult_set.count()
+        return mark_safe('<br/>'.join(
+            format_html(
+                '<a href="/admin/core/archiveresult/{}/change"><code>[{}]</code></a> 📅 {} 📄 {} {}',
+                result.pk,
+                result.abid,
+                result.snapshot.downloaded_at.strftime('%Y-%m-%d %H:%M') if result.snapshot.downloaded_at else 'pending...',
+                result.extractor,
+                result.snapshot.url[:64],
+            )
+            for result in obj.archiveresult_set.order_by('-modified_at')[:10]
+        ) + f'<br/>{total_count} total records...')
+
+    @admin.display(description='Tags')
+    def tag_set(self, obj):
+        total_count = obj.tag_set.count()
+        return mark_safe(', '.join(
+            format_html(
+                '<a href="/admin/core/tag/{}/change"><code>{}</code></a>',
+                tag.pk,
+                tag.name,
+            )
+            for tag in obj.tag_set.order_by('-modified_at')[:10]
+        ) + f'<br/>{total_count} total records...')
+
+    @admin.display(description='API Tokens')
+    def apitoken_set(self, obj):
+        total_count = obj.apitoken_set.count()
+        return mark_safe('<br/>'.join(
+            format_html(
+                '<a href="/admin/api/apitoken/{}/change"><code>[{}]</code></a> {} (expires {})',
+                apitoken.pk,
+                apitoken.abid,
+                apitoken.token_redacted[:64],
+                apitoken.expires,
+            )
+            for apitoken in obj.apitoken_set.order_by('-modified_at')[:10]
+        ) + f'<br/>{total_count} total records...')
+
+    @admin.display(description='API Outbound Webhooks')
+    def outboundwebhook_set(self, obj):
+        total_count = obj.outboundwebhook_set.count()
+        return mark_safe('<br/>'.join(
+            format_html(
+                '<a href="/admin/api/outboundwebhook/{}/change"><code>[{}]</code></a> {} -> {}',
+                outboundwebhook.pk,
+                outboundwebhook.abid,
+                outboundwebhook.referenced_model,
+                outboundwebhook.endpoint,
+            )
+            for outboundwebhook in obj.outboundwebhook_set.order_by('-modified_at')[:10]
+        ) + f'<br/>
{total_count} total records...') + + + + +@abx.hookimpl +def register_admin(admin_site): + admin_site.register(get_user_model(), CustomUserAdmin) diff --git a/archivebox/crawls/admin.py b/archivebox/crawls/admin.py index 8c38f3f3..fc52d9a3 100644 --- a/archivebox/crawls/admin.py +++ b/archivebox/crawls/admin.py @@ -1,3 +1,28 @@ -from django.contrib import admin +# __package__ = 'archivebox.crawls' -# Register your models here. +# import abx + +# from abid_utils.admin import ABIDModelAdmin + +# from .models import Crawl + + + +# class CrawlAdmin(ABIDModelAdmin): +# list_display = ('abid', 'created_at', 'created_by', 'depth', 'parser', 'urls') +# sort_fields = ('abid', 'created_at', 'created_by', 'depth', 'parser', 'urls') +# search_fields = ('abid', 'created_by__username', 'depth', 'parser', 'urls') + +# readonly_fields = ('created_at', 'modified_at', 'abid_info') +# fields = ('urls', 'depth', 'parser', 'created_by', *readonly_fields) + +# list_filter = ('depth', 'parser', 'created_by') +# ordering = ['-created_at'] +# list_per_page = 100 +# actions = ["delete_selected"] + + + +# @abx.hookimpl +# def register_admin(admin_site): +# admin_site.register(Crawl, CrawlAdmin) diff --git a/archivebox/machine/admin.py b/archivebox/machine/admin.py new file mode 100644 index 00000000..97fa3b19 --- /dev/null +++ b/archivebox/machine/admin.py @@ -0,0 +1,94 @@ +# __package__ = 'archivebox.machine' + +# import abx + +# from django.contrib import admin +# from django.utils.html import format_html + +# from abid_utils.admin import ABIDModelAdmin + +# from .models import Machine, NetworkInterface, InstalledBinary + + + +# class MachineAdmin(ABIDModelAdmin): +# list_display = ('abid', 'created_at', 'hostname', 'ips', 'os_platform', 'hw_in_docker', 'hw_in_vm', 'hw_manufacturer', 'hw_product', 'os_arch', 'os_family', 'os_release', 'hw_uuid', 'health') +# sort_fields = ('abid', 'created_at', 'hostname', 'ips', 'os_platform', 'hw_in_docker', 'hw_in_vm', 'hw_manufacturer', 'hw_product', 'os_arch', 'os_family', 'os_release', 'hw_uuid') +# # search_fields = ('id', 'abid', 'guid', 'hostname', 'hw_manufacturer', 'hw_product', 'hw_uuid', 'os_arch', 'os_family', 'os_platform', 'os_kernel', 'os_release') + +# readonly_fields = ('guid', 'created_at', 'modified_at', 'abid_info', 'ips') +# fields = (*readonly_fields, 'hostname', 'hw_in_docker', 'hw_in_vm', 'hw_manufacturer', 'hw_product', 'hw_uuid', 'os_arch', 'os_family', 'os_platform', 'os_kernel', 'os_release', 'stats', 'num_uses_succeeded', 'num_uses_failed') + +# list_filter = ('hw_in_docker', 'hw_in_vm', 'os_arch', 'os_family', 'os_platform') +# ordering = ['-created_at'] +# list_per_page = 100 +# actions = ["delete_selected"] + +# @admin.display( +# description='Public IP', +# ordering='networkinterface__ip_public', +# ) +# def ips(self, machine): +# return format_html( +# '{}', +# machine.abid, +# ', '.join(machine.networkinterface_set.values_list('ip_public', flat=True)), +# ) + +# class NetworkInterfaceAdmin(ABIDModelAdmin): +# list_display = ('abid', 'created_at', 'machine_info', 'ip_public', 'dns_server', 'isp', 'country', 'region', 'city', 'iface', 'ip_local', 'mac_address', 'health') +# sort_fields = ('abid', 'created_at', 'machine_info', 'ip_public', 'dns_server', 'isp', 'country', 'region', 'city', 'iface', 'ip_local', 'mac_address') +# search_fields = ('abid', 'machine__abid', 'iface', 'ip_public', 'ip_local', 'mac_address', 'dns_server', 'hostname', 'isp', 'city', 'region', 'country') + +# readonly_fields = ('machine', 'created_at', 'modified_at', 
'abid_info', 'mac_address', 'ip_public', 'ip_local', 'dns_server') +# fields = (*readonly_fields, 'iface', 'hostname', 'isp', 'city', 'region', 'country', 'num_uses_succeeded', 'num_uses_failed') + +# list_filter = ('isp', 'country', 'region') +# ordering = ['-created_at'] +# list_per_page = 100 +# actions = ["delete_selected"] + +# @admin.display( +# description='Machine', +# ordering='machine__abid', +# ) +# def machine_info(self, iface): +# return format_html( +# '[{}]   {}', +# iface.machine.id, +# iface.machine.abid, +# iface.machine.hostname, +# ) + +# class InstalledBinaryAdmin(ABIDModelAdmin): +# list_display = ('abid', 'created_at', 'machine_info', 'name', 'binprovider', 'version', 'abspath', 'sha256', 'health') +# sort_fields = ('abid', 'created_at', 'machine_info', 'name', 'binprovider', 'version', 'abspath', 'sha256') +# search_fields = ('abid', 'machine__abid', 'name', 'binprovider', 'version', 'abspath', 'sha256') + +# readonly_fields = ('created_at', 'modified_at', 'abid_info') +# fields = ('machine', 'name', 'binprovider', 'abspath', 'version', 'sha256', *readonly_fields, 'num_uses_succeeded', 'num_uses_failed') + +# list_filter = ('name', 'binprovider', 'machine_id') +# ordering = ['-created_at'] +# list_per_page = 100 +# actions = ["delete_selected"] + +# @admin.display( +# description='Machine', +# ordering='machine__abid', +# ) +# def machine_info(self, installed_binary): +# return format_html( +# '[{}]   {}', +# installed_binary.machine.id, +# installed_binary.machine.abid, +# installed_binary.machine.hostname, +# ) + + + +# @abx.hookimpl +# def register_admin(admin_site): +# admin_site.register(Machine, MachineAdmin) +# admin_site.register(NetworkInterface, NetworkInterfaceAdmin) +# admin_site.register(InstalledBinary, InstalledBinaryAdmin) diff --git a/archivebox/misc/paginators.py b/archivebox/misc/paginators.py new file mode 100644 index 00000000..2e623a65 --- /dev/null +++ b/archivebox/misc/paginators.py @@ -0,0 +1,30 @@ +__package__ = 'archivebox.misc' + +from django.core.paginator import Paginator +from django.utils.functional import cached_property + + +class AccelleratedPaginator(Paginator): + """ + Accellerated Pagniator ignores DISTINCT when counting total number of rows. + Speeds up SELECT Count(*) on Admin views by >20x. 
+ https://hakibenita.com/optimizing-the-django-admin-paginator + """ + + @cached_property + def count(self): + if self.object_list._has_filters(): # type: ignore + # fallback to normal count method on filtered queryset + return super().count + else: + # otherwise count total rows in a separate fast query + return self.object_list.model.objects.count() + + # Alternative approach for PostgreSQL: fallback count takes > 200ms + # from django.db import connection, transaction, OperationalError + # with transaction.atomic(), connection.cursor() as cursor: + # cursor.execute('SET LOCAL statement_timeout TO 200;') + # try: + # return super().count + # except OperationalError: + # return 9999999999999 diff --git a/archivebox/queues/admin.py b/archivebox/queues/admin.py new file mode 100644 index 00000000..aee5788b --- /dev/null +++ b/archivebox/queues/admin.py @@ -0,0 +1,26 @@ +__package__ = 'archivebox.queues' + +import abx + +from django.contrib.auth import get_permission_codename + +from huey_monitor.apps import HueyMonitorConfig +from huey_monitor.admin import TaskModel, TaskModelAdmin, SignalInfoModel, SignalInfoModelAdmin + + +HueyMonitorConfig.verbose_name = 'Background Workers' + + +class CustomTaskModelAdmin(TaskModelAdmin): + actions = ["delete_selected"] + + def has_delete_permission(self, request, obj=None): + codename = get_permission_codename("delete", self.opts) + return request.user.has_perm("%s.%s" % (self.opts.app_label, codename)) + + + +@abx.hookimpl +def register_admin(admin_site): + admin_site.register(TaskModel, CustomTaskModelAdmin) + admin_site.register(SignalInfoModel, SignalInfoModelAdmin) diff --git a/archivebox/search/admin.py b/archivebox/search/admin.py new file mode 100644 index 00000000..42aadf6f --- /dev/null +++ b/archivebox/search/admin.py @@ -0,0 +1,23 @@ +__package__ = 'archivebox.search' + +from django.contrib import messages + +from archivebox.search import query_search_index + +class SearchResultsAdminMixin: + def get_search_results(self, request, queryset, search_term: str): + """Enhances the search queryset with results from the search backend""" + + qs, use_distinct = super().get_search_results(request, queryset, search_term) + + search_term = search_term.strip() + if not search_term: + return qs.distinct(), use_distinct + try: + qsearch = query_search_index(search_term) + qs = qs | qsearch + except Exception as err: + print(f'[!] Error while using search backend: {err.__class__.__name__} {err}') + messages.add_message(request, messages.WARNING, f'Error from the search backend, only showing results from default admin search fields - Error: {err}') + + return qs.distinct(), use_distinct
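
Usage note: the SearchResultsAdminMixin and AccelleratedPaginator above are meant to be combined on a ModelAdmin, the same way SnapshotAdmin does in archivebox/core/admin_snapshots.py. A minimal sketch (ExampleAdmin and its field names are hypothetical, for illustration only):

    from django.contrib import admin

    from archivebox.misc.paginators import AccelleratedPaginator
    from archivebox.search.admin import SearchResultsAdminMixin

    class ExampleAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
        search_fields = ('url', 'title')       # default admin fields, merged with search-backend hits by the mixin
        paginator = AccelleratedPaginator      # unfiltered changelists count rows without the slow DISTINCT
        show_full_result_count = False         # also skip Django's second exact count query

The mixin falls back to plain field search (with a warning message) whenever the search backend raises, so the admin stays usable even if the index is down.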