import logging
from pathlib import Path

from django.utils.html import format_html
from django.utils.safestring import mark_safe

from core.models import Snapshot, EXTRACTORS
def get_icons(snapshot: Snapshot) -> str:
    """Render the strip of per-extractor icons for *snapshot*.

    Walks ``EXTRACTORS`` in order and, for every extractor not listed in
    ``exclude``, renders ``output_template`` with: the link's archive path,
    the extractor's canonical output path, whether a ``succeeded`` archive
    result exists for it, the extractor name, and its icon (``"?"`` when the
    extractor has no icon registered).

    Right after ``wget`` an additional ``warc`` entry is rendered when at
    least one ``*.warc.gz`` file is found under the canonical WARC directory.

    Rendering is best-effort: an error while handling one extractor (for
    example a missing ``<extractor>_path`` key in the canonical outputs) is
    logged and that extractor is skipped; the remaining ones still render.

    Returns:
        The concatenated fragments, marked safe for template output. Each
        fragment is produced by ``format_html`` so interpolated values are
        HTML-escaped.
    """
    archive_results = snapshot.archiveresult_set
    link = snapshot.as_link()
    canon = link.canonical_outputs()
    archive_path = link.archive_path

    # NOTE(review): the template has a single placeholder, so only the first
    # argument passed below is actually rendered; the other arguments are
    # accepted and ignored by str.format. Confirm whether markup was lost here.
    output_template = '{} '
    icons = {
        "singlefile": "❶",
        "wget": "🆆",
        "dom": "🅷",
        "pdf": "📄",
        "screenshot": "💻",
        "media": "📼",
        "git": "🅶",
        "archive_org": "🏛",
        "readability": "🆁",
        "mercury": "🅼",
        "warc": "📦",
    }
    exclude = {"favicon"}

    fragments = []
    for extractor, _ in EXTRACTORS:
        # The DB lookup stays outside the try block: database errors are not
        # part of the best-effort handling and should surface to the caller.
        succeeded = archive_results.filter(
            extractor=extractor, status="succeeded"
        ).exists()
        try:
            if extractor not in exclude:
                fragments.append(format_html(
                    output_template,
                    archive_path,
                    canon[f"{extractor}_path"],
                    succeeded,
                    extractor,
                    icons.get(extractor, "?"),
                ))
            if extractor == "wget":
                # warc isn't technically its own extractor, so its entry is
                # added right after wget, based on files present on disk.
                warc_dir = Path(archive_path) / canon["warc_path"]
                first_warc = next(warc_dir.glob("*.warc.gz"), None)
                if first_warc is not None:
                    fragments.append(format_html(
                        output_template,
                        first_warc,
                        "",
                        True,
                        "warc",
                        icons.get("warc", "?"),
                    ))
        except Exception as err:
            # Deliberately broad: one misbehaving extractor must not break
            # the whole icon strip. Report it and move on to the next one.
            logging.getLogger(__name__).warning(
                "Could not render icon for extractor %r: %s", extractor, err
            )

    # Every fragment is already escaped by format_html, so the joined string
    # can be marked safe directly. Passing it back through format_html as a
    # format string would re-interpret any literal braces it contains.
    return mark_safe("".join(fragments))
#def get_icons(snapshot: Snapshot) -> str:
# link = snapshot.as_link()
# canon = link.canonical_outputs()
# out_dir = Path(link.link_dir)
#
# # slow version: highlights icons based on whether files exist or not for that output
# # link_tuple = lambda link, method: (link.archive_path, canon[method] or '', canon[method] and (out_dir / (canon[method] or 'notdone')).exists())
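# # fast version: intended to highlight all icons without checking for outputs in the filesystem
# # (NOTE: the lambda below is identical to the slow version above and still calls .exists())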
# link_tuple = lambda link, method: (link.archive_path, canon[method] or '', canon[method] and (out_dir / (canon[method] or 'notdone')).exists())
#
# return format_html(
# ''
# '❶ '
# '🆆 '
# '🅷 '
# '📄 '
# '💻 '
# '📦 '
# '📼 '
# '🅶 '
# '🏛 '
# '',
# *link_tuple(link, 'singlefile_path'),
# *link_tuple(link, 'wget_path')[:2], any((out_dir / link.domain).glob('*')),
# *link_tuple(link, 'pdf_path'),
# *link_tuple(link, 'screenshot_path'),
# *link_tuple(link, 'dom_path'),
# *link_tuple(link, 'warc_path')[:2], any((out_dir / canon['warc_path']).glob('*.warc.gz')),
# *link_tuple(link, 'media_path')[:2], any((out_dir / canon['media_path']).glob('*')),
# *link_tuple(link, 'git_path')[:2], any((out_dir / canon['git_path']).glob('*')),
# canon['archive_org_path'], (out_dir / 'archive.org.txt').exists(),
# )
#