Mirror of https://github.com/ArchiveBox/ArchiveBox.git (synced 2025-05-23 11:17:02 -04:00)

Merge pull request #525 from cdvv7788/archive-result
commit be6dcfad4a

9 changed files with 180 additions and 40 deletions
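In short: this merge adds a core.ArchiveResult model recording each extractor run per Snapshot, a data migration that backfills it from every snapshot's index.json, admin icons rendered from those DB rows instead of filesystem checks, an archive_link() hook that writes results as they happen, and a Django 3.0.8 → 3.1.3 upgrade.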
(file header lost in capture; the hunk below edits a plain listing of repo file paths)

@@ -58,6 +58,7 @@ archivebox/core/migrations/0003_auto_20200630_1034.py
 archivebox/core/migrations/0004_auto_20200713_1552.py
 archivebox/core/migrations/0005_auto_20200728_0326.py
 archivebox/core/migrations/0006_auto_20201012_1520.py
+archivebox/core/migrations/0007_archiveresult.py
 archivebox/core/migrations/__init__.py
 archivebox/extractors/__init__.py
 archivebox/extractors/archive_org.py
(likely requirements.txt; file header lost in capture)

@@ -1,7 +1,7 @@
 requests==2.24.0
 atomicwrites==1.4.0
 mypy-extensions==0.4.3
-django==3.0.8
+django==3.1.3
 django-extensions==3.0.3
 dateparser
 ipython
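The Django bump from 3.0.8 to 3.1.3 is not incidental: models.JSONField only became available on all database backends, including SQLite, in Django 3.1, and the new ArchiveResult.cmd field below depends on it.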
archivebox/core/migrations/0007_archiveresult.py (new file, 91 lines)

@@ -0,0 +1,91 @@
# Generated by Django 3.0.8 on 2020-11-04 12:25

import json
from pathlib import Path

from django.db import migrations, models
import django.db.models.deletion

from config import CONFIG
from index.json import to_json


def forwards_func(apps, schema_editor):
    from core.models import EXTRACTORS

    Snapshot = apps.get_model("core", "Snapshot")
    ArchiveResult = apps.get_model("core", "ArchiveResult")

    snapshots = Snapshot.objects.all()
    for snapshot in snapshots:
        out_dir = Path(CONFIG['ARCHIVE_DIR']) / snapshot.timestamp

        try:
            with open(out_dir / "index.json", "r") as f:
                fs_index = json.load(f)
        except Exception as e:
            continue

        history = fs_index["history"]

        for extractor in history:
            for result in history[extractor]:
                ArchiveResult.objects.create(extractor=extractor, snapshot=snapshot, cmd=result["cmd"], cmd_version=result["cmd_version"],
                                             start_ts=result["start_ts"], end_ts=result["end_ts"], status=result["status"], pwd=result["pwd"], output=result["output"])


def verify_json_index_integrity(snapshot):
    results = snapshot.archiveresult_set.all()
    out_dir = Path(CONFIG['ARCHIVE_DIR']) / snapshot.timestamp
    with open(out_dir / "index.json", "r") as f:
        index = json.load(f)

    history = index["history"]
    index_results = [result for extractor in history for result in history[extractor]]
    flattened_results = [result["start_ts"] for result in index_results]

    missing_results = [result for result in results if result.start_ts.isoformat() not in flattened_results]

    for missing in missing_results:
        index["history"][missing.extractor].append({"cmd": missing.cmd, "cmd_version": missing.cmd_version, "end_ts": missing.end_ts.isoformat(),
                                                    "start_ts": missing.start_ts.isoformat(), "pwd": missing.pwd, "output": missing.output,
                                                    "schema": "ArchiveResult", "status": missing.status})

    json_index = to_json(index)
    with open(out_dir / "index.json", "w") as f:
        f.write(json_index)


def reverse_func(apps, schema_editor):
    Snapshot = apps.get_model("core", "Snapshot")
    ArchiveResult = apps.get_model("core", "ArchiveResult")
    for snapshot in Snapshot.objects.all():
        verify_json_index_integrity(snapshot)

    ArchiveResult.objects.all().delete()


class Migration(migrations.Migration):

    dependencies = [
        ('core', '0006_auto_20201012_1520'),
    ]

    operations = [
        migrations.CreateModel(
            name='ArchiveResult',
            fields=[
                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('cmd', models.JSONField()),
                ('pwd', models.CharField(max_length=256)),
                ('cmd_version', models.CharField(max_length=32)),
                ('status', models.CharField(choices=[('succeeded', 'succeeded'), ('failed', 'failed'), ('skipped', 'skipped')], max_length=16)),
                ('output', models.CharField(max_length=512)),
                ('start_ts', models.DateTimeField()),
                ('end_ts', models.DateTimeField()),
                ('extractor', models.CharField(choices=[('title', 'title'), ('favicon', 'favicon'), ('wget', 'wget'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('headers', 'headers'), ('archive_org', 'archive_org')], max_length=32)),
                ('snapshot', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='core.Snapshot')),
            ],
        ),
        migrations.RunPython(forwards_func, reverse_func),
    ]
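For orientation, a minimal sketch (all values invented) of the per-snapshot index.json "history" structure that forwards_func() walks, and how it flattens into one ArchiveResult row per entry:

    # Hypothetical index.json contents for one snapshot; the keys are the ones
    # forwards_func() actually reads, the values are made up for illustration.
    example_index = {
        "history": {
            "wget": [{
                "cmd": ["wget", "--mirror", "https://example.com"],
                "cmd_version": "GNU Wget 1.20.3",
                "start_ts": "2020-11-04T12:00:00",
                "end_ts": "2020-11-04T12:00:05",
                "status": "succeeded",
                "pwd": "/data/archive/1604491200.0",
                "output": "example.com/index.html",
            }],
        },
    }

    # Each (extractor, result) pair becomes one ArchiveResult row:
    for extractor, results in example_index["history"].items():
        for result in results:
            print(extractor, result["status"])  # -> wget succeeded

Note that reverse_func() is not a pure inverse: before deleting the rows, verify_json_index_integrity() writes any ArchiveResult missing from index.json back into it, so the filesystem index stays complete after a rollback.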
(likely archivebox/core/models.py; file header lost in capture)

@@ -8,6 +8,14 @@ from django.utils.text import slugify

 from ..util import parse_date
 from ..index.schema import Link
+from ..extractors import get_default_archive_methods
+
+EXTRACTORS = [(extractor[0], extractor[0]) for extractor in get_default_archive_methods()]
+STATUS_CHOICES = [
+    ("succeeded", "succeeded"),
+    ("failed", "failed"),
+    ("skipped", "skipped")
+]


 class Tag(models.Model):

@@ -148,3 +156,18 @@ class Snapshot(models.Model):
         tags_id.append(Tag.objects.get_or_create(name=tag)[0].id)
         self.tags.clear()
         self.tags.add(*tags_id)
+
+
+class ArchiveResult(models.Model):
+    snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE)
+    cmd = models.JSONField()
+    pwd = models.CharField(max_length=256)
+    cmd_version = models.CharField(max_length=32)
+    output = models.CharField(max_length=512)
+    start_ts = models.DateTimeField()
+    end_ts = models.DateTimeField()
+    status = models.CharField(max_length=16, choices=STATUS_CHOICES)
+    extractor = models.CharField(choices=EXTRACTORS, max_length=32)
+
+    def __str__(self):
+        return self.extractor
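A quick usage sketch of the new model (assuming at least one Snapshot exists in the DB; archiveresult_set is Django's default reverse accessor for the ForeignKey):

    from core.models import Snapshot

    snapshot = Snapshot.objects.first()
    for result in snapshot.archiveresult_set.filter(status="succeeded"):
        print(result.extractor, result.cmd_version, result.output)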
(likely archivebox/core/utils.py; file header lost in capture; get_icons() is rewritten to read ArchiveResult rows instead of globbing the filesystem)

@@ -1,39 +1,54 @@
-from pathlib import Path
-
 from django.utils.html import format_html
+from collections import defaultdict
 
-from core.models import Snapshot
+from core.models import Snapshot, EXTRACTORS
+from pathlib import Path
 
 
 def get_icons(snapshot: Snapshot) -> str:
+    archive_results = snapshot.archiveresult_set.filter(status="succeeded")
     link = snapshot.as_link()
+    path = link.archive_path
     canon = link.canonical_outputs()
-    out_dir = Path(link.link_dir)
-
-    # slow version: highlights icons based on whether files exist or not for that output
-    # link_tuple = lambda link, method: (link.archive_path, canon[method] or '', canon[method] and (out_dir / (canon[method] or 'notdone')).exists())
-
-    # fast version: all icons are highlighted without checking for outputs in filesystem
-    link_tuple = lambda link, method: (link.archive_path, canon[method] or '', canon[method] and (out_dir / (canon[method] or 'notdone')).exists())
-
-    return format_html(
-        '<span class="files-icons" style="font-size: 1.2em; opacity: 0.8">'
-        '<a href="/{}/{}" class="exists-{}" title="SingleFile">❶ </a>'
-        '<a href="/{}/{}" class="exists-{}" title="Wget clone">🆆 </a> '
-        '<a href="/{}/{}" class="exists-{}" title="HTML dump">🅷 </a> '
-        '<a href="/{}/{}" class="exists-{}" title="PDF">📄 </a> '
-        '<a href="/{}/{}" class="exists-{}" title="Screenshot">💻 </a> '
-        '<a href="/{}/{}" class="exists-{}" title="WARC">📦 </a> '
-        '<a href="/{}/{}/" class="exists-{}" title="Media files">📼 </a> '
-        '<a href="/{}/{}/" class="exists-{}" title="Git repos">🅶 </a> '
-        '<a href="{}" class="exists-{}" title="Archive.org snapshot">🏛 </a> '
-        '</span>',
-        *link_tuple(link, 'singlefile_path'),
-        *link_tuple(link, 'wget_path')[:2], any((out_dir / link.domain).glob('*')),
-        *link_tuple(link, 'dom_path'),
-        *link_tuple(link, 'pdf_path'),
-        *link_tuple(link, 'screenshot_path'),
-        *link_tuple(link, 'warc_path')[:2], any((out_dir / canon['warc_path']).glob('*.warc.gz')),
-        *link_tuple(link, 'media_path')[:2], any((out_dir / canon['media_path']).glob('*')),
-        *link_tuple(link, 'git_path')[:2], any((out_dir / canon['git_path']).glob('*')),
-        canon['archive_org_path'], (out_dir / 'archive.org.txt').exists(),
-    )
+    output = ""
+    output_template = '<a href="/{}/{}" class="exists-{}" title="{}">{} </a>'
+    icons = {
+        "singlefile": "❶",
+        "wget": "🆆",
+        "dom": "🅷",
+        "pdf": "📄",
+        "screenshot": "💻",
+        "media": "📼",
+        "git": "🅶",
+        "archive_org": "🏛",
+        "readability": "🆁",
+        "mercury": "🅼",
+        "warc": "📦"
+    }
+    exclude = ["favicon", "title", "headers", "archive_org"]
+    # Missing specific entry for WARC
+
+    extractor_items = defaultdict(lambda: None)
+    for extractor, _ in EXTRACTORS:
+        for result in archive_results:
+            if result.extractor == extractor:
+                extractor_items[extractor] = result
+
+    for extractor, _ in EXTRACTORS:
+        if extractor not in exclude:
+            exists = extractor_items[extractor] is not None
+            output += output_template.format(path, canon[f"{extractor}_path"], str(exists),
+                                             extractor, icons.get(extractor, "?"))
+        if extractor == "wget":
+            # warc isn't technically it's own extractor, so we have to add it after wget
+            exists = list((Path(path) / canon["warc_path"]).glob("*.warc.gz"))
+            output += output_template.format(exists[0] if exists else '#', canon["warc_path"], str(bool(exists)), "warc", icons.get("warc", "?"))
+
+        if extractor == "archive_org":
+            # The check for archive_org is different, so it has to be handled separately
+            target_path = Path(path) / "archive.org.txt"
+            exists = target_path.exists()
+            output += '<a href="{}" class="exists-{}" title="{}">{} </a>'.format(canon["archive_org_path"], str(exists),
+                                                                                 "archive_org", icons.get("archive_org", "?"))
+
+    return format_html(f'<span class="files-icons" style="font-size: 1.2em; opacity: 0.8">{output}<span>')
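The rewrite swaps per-request filesystem globbing for a single DB query: defaultdict(lambda: None) lets every extractor without a succeeded ArchiveResult fall through to None rather than raising KeyError. A minimal sketch of that pattern:

    from collections import defaultdict

    extractor_items = defaultdict(lambda: None)
    extractor_items["wget"] = object()      # stand-in for a succeeded ArchiveResult
    print(extractor_items["pdf"] is None)   # True, so that icon renders as exists-False

(Worth flagging: the final return line closes the span with `<span>` rather than `</span>`, exactly as committed.)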
archivebox/extractors/__init__.py

@@ -8,6 +8,7 @@ from datetime import datetime
 from django.db.models import QuerySet
 
 from ..index.schema import Link
+from ..index.sql import write_link_to_sql_index
 from ..index import (
     load_link_details,
     write_link_details,

@@ -65,6 +66,14 @@ def ignore_methods(to_ignore: List[str]):
 def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None, skip_index: bool=False) -> Link:
     """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""
 
+    # TODO: Remove when the input is changed to be a snapshot. Suboptimal approach.
+    if not skip_index:
+        from core.models import Snapshot, ArchiveResult
+        try:
+            snapshot = Snapshot.objects.get(url=link.url)  # TODO: This will be unnecessary once everything is a snapshot
+        except Snapshot.DoesNotExist:
+            snapshot = write_link_to_sql_index(link)
+
     ARCHIVE_METHODS = get_default_archive_methods()
 
     if methods:

@@ -99,6 +108,10 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
 
             stats[result.status] += 1
             log_archive_method_finished(result)
+            if not skip_index:
+                ArchiveResult.objects.create(snapshot=snapshot, extractor=method_name, cmd=result.cmd, cmd_version=result.cmd_version,
+                                             output=result.output, pwd=result.pwd, start_ts=result.start_ts, end_ts=result.end_ts, status=result.status)
+
         else:
             # print('{black} X {}{reset}'.format(method_name, **ANSI))
             stats['skipped'] += 1
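A usage sketch (the import follows the codebase's own absolute-import style, which assumes the archivebox/ source dir is on sys.path; `link` is assumed to be a Link already loaded from the index, e.g. via load_link_details):

    from extractors import archive_link

    # With skip_index left at its default of False, every method that runs is
    # also written to the ArchiveResult table as it finishes.
    archive_link(link, methods=['title', 'wget'])
    archive_link(link, methods=['title'], skip_index=True)  # filesystem/index.json only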
(a theme template's inline CSS; file header lost in capture)

@@ -224,12 +224,10 @@
     color: black;
 }
 
-tr td a.exists-True {
-    opacity: 1;
-}
-tr td a.exists-False {
+.exists-False {
     opacity: 0.1;
     filter: grayscale(100%);
+    pointer-events: none;
 }
 </style>
 <link rel="stylesheet" href="{% static 'bootstrap.min.css' %}">
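The net effect: the old exists-True rule was redundant (full opacity is already the default), and links for missing outputs are now not just dimmed and greyscaled but, with pointer-events: none, no longer clickable at all.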
setup.py (3 changed lines)

@@ -51,9 +51,8 @@ setuptools.setup(
         "requests==2.24.0",
         "atomicwrites==1.4.0",
         "mypy-extensions==0.4.3",
-        "django==3.0.8",
+        "django==3.1.3",
         "django-extensions==3.0.3",
-
         "dateparser",
         "ipython",
         "youtube-dl",
(a test file; file header lost in capture)

@@ -6,7 +6,7 @@ def test_update_status_invalid(tmp_path, process, disable_extractors_dict):
     subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True, env=disable_extractors_dict)
     assert list((tmp_path / "archive").iterdir()) != []
 
-    subprocess.run(['archivebox', 'remove', 'http://127.0.0.1:8080/static/example.com.html', '--yes'], capture_output=True)
+    a_process = subprocess.run(['archivebox', 'remove', 'http://127.0.0.1:8080/static/example.com.html', '--yes'], capture_output=True)
 
     conn = sqlite3.connect(str(tmp_path / "index.sqlite3"))
     c = conn.cursor()
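Binding the CompletedProcess to a_process presumably sets up a later assertion on the remove command's result, along the lines of (hypothetical, not part of this diff):

    assert a_process.returncode == 0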