mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-13 14:44:29 -04:00
feat: initial functional version with icons calculated based on archive results
This commit is contained in:
parent
309a87e8fe
commit
b3e0400bc0
3 changed files with 104 additions and 29 deletions
|
@ -1,8 +1,43 @@
|
||||||
# Generated by Django 3.0.8 on 2020-11-04 12:25
|
# Generated by Django 3.0.8 on 2020-11-04 12:25
|
||||||
|
|
||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
from django.db import migrations, models
|
from django.db import migrations, models
|
||||||
import django.db.models.deletion
|
import django.db.models.deletion
|
||||||
|
|
||||||
|
from config import CONFIG
|
||||||
|
|
||||||
|
|
||||||
|
def forwards_func(apps, schema_editor):
    """Backfill ArchiveResult rows from each snapshot's on-disk ``index.json``.

    For every Snapshot, read ``<ARCHIVE_DIR>/<snapshot.timestamp>/index.json``
    and create one ArchiveResult per entry listed under its ``"history"`` key
    (a mapping of extractor name -> list of result dicts).  Snapshots whose
    index file is missing, unreadable, not valid JSON, or carries no history
    are skipped so that one bad folder cannot abort the whole migration.

    Args:
        apps: Migration-state app registry; models are looked up through it
            rather than imported directly.
        schema_editor: Unused; part of the RunPython callable signature.
    """
    Snapshot = apps.get_model("core", "Snapshot")
    ArchiveResult = apps.get_model("core", "ArchiveResult")
    archive_dir = Path(CONFIG['ARCHIVE_DIR'])

    for snapshot in Snapshot.objects.all():
        index_path = archive_dir / snapshot.timestamp / "index.json"

        try:
            with open(index_path, "r") as f:
                fs_index = json.load(f)
        except (OSError, ValueError):
            # OSError: file missing/unreadable.  ValueError covers both
            # json.JSONDecodeError and UnicodeDecodeError.  Best-effort
            # backfill: skip this snapshot and keep going.
            continue

        # An index without a usable "history" mapping has nothing to import.
        history = fs_index.get("history") if isinstance(fs_index, dict) else None
        if not history:
            continue

        for extractor, results in history.items():
            for result in results:
                ArchiveResult.objects.create(
                    extractor=extractor,
                    snapshot=snapshot,
                    # cmd is stored as a JSON-encoded string in a CharField.
                    cmd=json.dumps(result["cmd"]),
                    cmd_version=result["cmd_version"],
                    start_ts=result["start_ts"],
                    end_ts=result["end_ts"],
                    status=result["status"],
                    pwd=result["pwd"],
                    output=result["output"],
                )
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def reverse_func(apps, schema_editor):
    """Undo the backfill by deleting every ArchiveResult row.

    Args:
        apps: Migration-state app registry used to resolve the model.
        schema_editor: Unused; part of the RunPython callable signature.
    """
    archive_result_model = apps.get_model("core", "ArchiveResult")
    every_result = archive_result_model.objects.all()
    every_result.delete()
|
||||||
|
|
||||||
|
|
||||||
class Migration(migrations.Migration):
|
class Migration(migrations.Migration):
|
||||||
|
|
||||||
|
@ -18,6 +53,7 @@ class Migration(migrations.Migration):
|
||||||
('cmd', models.CharField(default='', max_length=500)),
|
('cmd', models.CharField(default='', max_length=500)),
|
||||||
('pwd', models.CharField(default='', max_length=200)),
|
('pwd', models.CharField(default='', max_length=200)),
|
||||||
('cmd_version', models.CharField(default='', max_length=20)),
|
('cmd_version', models.CharField(default='', max_length=20)),
|
||||||
|
('status', models.CharField(max_length=10)),
|
||||||
('output', models.CharField(default='', max_length=500)),
|
('output', models.CharField(default='', max_length=500)),
|
||||||
('start_ts', models.DateTimeField()),
|
('start_ts', models.DateTimeField()),
|
||||||
('end_ts', models.DateTimeField()),
|
('end_ts', models.DateTimeField()),
|
||||||
|
@ -25,4 +61,5 @@ class Migration(migrations.Migration):
|
||||||
('snapshot', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='core.Snapshot')),
|
('snapshot', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='core.Snapshot')),
|
||||||
],
|
],
|
||||||
),
|
),
|
||||||
|
migrations.RunPython(forwards_func, reverse_func),
|
||||||
]
|
]
|
||||||
|
|
|
@ -161,4 +161,8 @@ class ArchiveResult(models.Model):
|
||||||
output = models.CharField(max_length=500, default="")
|
output = models.CharField(max_length=500, default="")
|
||||||
start_ts = models.DateTimeField()
|
start_ts = models.DateTimeField()
|
||||||
end_ts = models.DateTimeField()
|
end_ts = models.DateTimeField()
|
||||||
|
status = models.CharField(max_length=10)
|
||||||
extractor = models.CharField(choices=EXTRACTORS, blank=False, max_length=20)
|
extractor = models.CharField(choices=EXTRACTORS, blank=False, max_length=20)
|
||||||
|
|
||||||
|
def __str__(self):
    """Return this result's extractor name as its display label."""
    extractor_name = self.extractor
    return extractor_name
|
||||||
|
|
|
@ -2,38 +2,72 @@ from pathlib import Path
|
||||||
|
|
||||||
from django.utils.html import format_html
|
from django.utils.html import format_html
|
||||||
|
|
||||||
from core.models import Snapshot
|
from core.models import Snapshot, ArchiveResult, EXTRACTORS
|
||||||
|
|
||||||
|
|
||||||
def get_icons(snapshot: Snapshot) -> str:
    """Render the row of per-extractor icon links for *snapshot* as safe HTML.

    One ``<a>`` is emitted for each entry in ``EXTRACTORS`` (except those in
    ``exclude``).  Its ``href`` points at the extractor's canonical output
    path inside the snapshot's archive folder, and its ``exists-True`` /
    ``exists-False`` CSS class reflects whether the snapshot has any
    ArchiveResult row for that extractor (a database lookup, not a
    filesystem check).

    Args:
        snapshot: The Snapshot whose archive results should be summarised.

    Returns:
        A safe HTML string: a ``<span class="files-icons">`` wrapping the
        icon links.
    """
    # Function-scope import so this block does not depend on the file's
    # top-level import list being changed.
    from django.utils.safestring import mark_safe

    archive_results = snapshot.archiveresult_set
    link = snapshot.as_link()
    canon = link.canonical_outputs()
    anchor_template = '<a href="/{}/{}" class="exists-{}" title="{}">{} </a>'
    icons = {
        "singlefile": "❶",
        "wget": "🆆",
        "dom": "🅷",
        "pdf": "📄",
        "screenshot": "💻",
        "media": "📼",
        "git": "🅶",
        "archive_org": "🏛",
        "readability": "🆁",
        "mercury": "🅼",
    }
    exclude = ("favicon",)
    # Missing specific entry for WARC

    anchors = []
    for extractor in EXTRACTORS:
        # EXTRACTORS entries are indexable; element 0 is the extractor name.
        name = extractor[0]
        if name in exclude:
            continue

        try:
            path = canon[f"{name}_path"]
        except KeyError:
            # No canonical output registered for this extractor: no icon.
            continue

        # format_html escapes every argument, so paths and names coming from
        # archived data cannot inject markup, and literal braces in them
        # cannot break the formatting.
        anchors.append(format_html(
            anchor_template,
            link.archive_path,
            path or "",  # avoid rendering a literal "None" in the href
            archive_results.filter(extractor=name).exists(),
            name,
            icons.get(name, "?"),
        ))

    # Each anchor is already escaped, so the joined string is safe to mark.
    return format_html(
        '<span class="files-icons" style="font-size: 1.2em; opacity: 0.8">{}</span>',
        mark_safe("".join(anchors)),
    )
|
Loading…
Add table
Add a link
Reference in a new issue