from pathlib import Path

from django.utils.html import format_html
from django.utils.safestring import mark_safe

from core.models import Snapshot, EXTRACTORS


def get_icons(snapshot: Snapshot) -> str:
    """Render the row of per-extractor icons for ``snapshot``.

    One fragment is produced for every extractor listed in ``EXTRACTORS``
    (except those in the local exclude list), flagged with whether the
    snapshot has a succeeded ``ArchiveResult`` for it.  WARC is not an
    entry of its own in ``EXTRACTORS``, so a WARC fragment is appended
    right after ``wget`` when a ``*.warc.gz`` file is found on disk.

    Every interpolated value is HTML-escaped by ``format_html``; the
    returned string is marked safe for direct use in templates.

    A failure while rendering one extractor (e.g. a missing
    ``<extractor>_path`` key in the canonical outputs) is printed and that
    extractor is skipped; the remaining icons are still rendered.
    """
    archive_results = snapshot.archiveresult_set
    link = snapshot.as_link()
    canon = link.canonical_outputs()
    path = link.archive_path

    # Positional args: archive path, output path, exists flag, extractor name, icon.
    # NOTE(review): this template has a single placeholder, so only the first
    # argument is rendered and the other four are ignored by str.format --
    # confirm the template was not truncated.
    output_template = '{} '
    icons = {
        "singlefile": "❶",
        "wget": "🆆",
        "dom": "🅷",
        "pdf": "📄",
        "screenshot": "💻",
        "media": "📼",
        "git": "🅶",
        "archive_org": "🏛",
        "readability": "🆁",
        "mercury": "🅼",
        "warc": "📦",
    }
    exclude = ["favicon"]
    # Missing specific entry for WARC

    fragments = []
    for extractor, _ in EXTRACTORS:
        succeeded = archive_results.filter(extractor=extractor, status="succeeded").exists()

        # Best effort: one broken extractor must not prevent the others from rendering.
        try:
            if extractor not in exclude:
                fragments.append(format_html(
                    output_template,
                    path,
                    canon[f"{extractor}_path"],
                    succeeded,
                    extractor,
                    icons.get(extractor, "?"),
                ))

            if extractor == "wget":
                # warc isn't technically its own extractor, so we have to add it after wget
                # NOTE(review): this glob is resolved relative to the current working
                # directory unless archive_path is absolute; the legacy version below
                # globbed under link.link_dir -- confirm which is intended.
                warc_file = next((Path(path) / canon["warc_path"]).glob("*.warc.gz"), None)
                if warc_file is not None:
                    fragments.append(format_html(
                        output_template,
                        str(warc_file),
                        "",
                        True,
                        "warc",
                        icons.get("warc", "?"),
                    ))
        except Exception as e:
            print(e)

    # Each fragment was already escaped by format_html, so the joined result is
    # safe.  (Feeding the pre-built string back through format_html() as a
    # format string would choke on literal braces and skip escaping.)
    return mark_safe("".join(fragments))


# Legacy implementation, kept for reference:
#
#def get_icons(snapshot: Snapshot) -> str:
#    link = snapshot.as_link()
#    canon = link.canonical_outputs()
#    out_dir = Path(link.link_dir)
#
#    # slow version: highlights icons based on whether files exist or not for that output
#    # link_tuple = lambda link, method: (link.archive_path, canon[method] or '', canon[method] and (out_dir / (canon[method] or 'notdone')).exists())
#    # fast version: all icons are highlighted without checking for outputs in filesystem
#    link_tuple = lambda link, method: (link.archive_path, canon[method] or '', canon[method] and (out_dir / (canon[method] or 'notdone')).exists())
#
#    return format_html(
#        ''
#        '❶ '
#        '🆆 '
#        '🅷 '
#        '📄 '
#        '💻 '
#        '📦 '
#        '📼 '
#        '🅶 '
#        '🏛 '
#        '',
#        *link_tuple(link, 'singlefile_path'),
#        *link_tuple(link, 'wget_path')[:2], any((out_dir / link.domain).glob('*')),
#        *link_tuple(link, 'pdf_path'),
#        *link_tuple(link, 'screenshot_path'),
#        *link_tuple(link, 'dom_path'),
#        *link_tuple(link, 'warc_path')[:2], any((out_dir / canon['warc_path']).glob('*.warc.gz')),
#        *link_tuple(link, 'media_path')[:2], any((out_dir / canon['media_path']).glob('*')),
#        *link_tuple(link, 'git_path')[:2], any((out_dir / canon['git_path']).glob('*')),
#        canon['archive_org_path'], (out_dir / 'archive.org.txt').exists(),
#    )
#