mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-14 07:04:27 -04:00
feat: Finish reversal. Add ArchiveResults that are not found in the index.json
This commit is contained in:
parent
f7f0bebdcc
commit
b237e412df
2 changed files with 14 additions and 8 deletions
|
@ -7,6 +7,7 @@ from django.db import migrations, models
|
||||||
import django.db.models.deletion
|
import django.db.models.deletion
|
||||||
|
|
||||||
from config import CONFIG
|
from config import CONFIG
|
||||||
|
from index.json import to_json
|
||||||
|
|
||||||
|
|
||||||
def forwards_func(apps, schema_editor):
|
def forwards_func(apps, schema_editor):
|
||||||
|
@ -33,26 +34,31 @@ def forwards_func(apps, schema_editor):
|
||||||
start_ts=result["start_ts"], end_ts=result["end_ts"], status=result["status"], pwd=result["pwd"], output=result["output"])
|
start_ts=result["start_ts"], end_ts=result["end_ts"], status=result["status"], pwd=result["pwd"], output=result["output"])
|
||||||
|
|
||||||
|
|
||||||
def verify_json_index_integrity(results):
|
def verify_json_index_integrity(snapshot):
|
||||||
results = snapshot.archiveresult_set.all()
|
results = snapshot.archiveresult_set.all()
|
||||||
out_dir = Path(CONFIG['ARCHIVE_DIR']) / snapshot.timestamp
|
out_dir = Path(CONFIG['ARCHIVE_DIR']) / snapshot.timestamp
|
||||||
with open(out_dir / "index.json", "r") as f:
|
with open(out_dir / "index.json", "r") as f:
|
||||||
index = json.load(f)
|
index = json.load(f)
|
||||||
|
|
||||||
history = index["history"]
|
history = index["history"]
|
||||||
extractors = [extractor for extractor in history]
|
index_results = [result for extractor in history for result in history[extractor]]
|
||||||
index_results = [(result, extractor) for result in history[extractor]]
|
flattened_results = [result["start_ts"] for result in index_results]
|
||||||
flattened_results = [(result["start_ts"], extractor) for result, extractor in index_results]
|
|
||||||
|
|
||||||
missing = [result for result in results if result.start_ts not in flattened_results]
|
missing_results = [result for result in results if result.start_ts.isoformat() not in flattened_results]
|
||||||
|
|
||||||
#process missing elements here. Re-add to the index.json
|
|
||||||
|
|
||||||
|
for missing in missing_results:
|
||||||
|
index["history"][missing.extractor].append({"cmd": missing.cmd, "cmd_version": missing.cmd_version, "end_ts": missing.end_ts.isoformat(),
|
||||||
|
"start_ts": missing.start_ts.isoformat(), "pwd": missing.pwd, "output": missing.output,
|
||||||
|
"schema": "ArchiveResult", "status": missing.status})
|
||||||
|
|
||||||
|
json_index = to_json(index)
|
||||||
|
with open(out_dir / "index.json", "w") as f:
|
||||||
|
f.write(json_index)
|
||||||
|
|
||||||
|
|
||||||
def reverse_func(apps, schema_editor):
|
def reverse_func(apps, schema_editor):
|
||||||
Snapshot = apps.get_model("core", "Snapshot")
|
Snapshot = apps.get_model("core", "Snapshot")
|
||||||
|
ArchiveResult = apps.get_model("core", "ArchiveResult")
|
||||||
for snapshot in Snapshot.objects.all():
|
for snapshot in Snapshot.objects.all():
|
||||||
verify_json_index_integrity(snapshot)
|
verify_json_index_integrity(snapshot)
|
||||||
|
|
||||||
|
|
|
@ -35,7 +35,7 @@ def get_icons(snapshot: Snapshot) -> str:
|
||||||
if extractor == "wget":
|
if extractor == "wget":
|
||||||
# warc isn't technically it's own extractor, so we have to add it after wget
|
# warc isn't technically it's own extractor, so we have to add it after wget
|
||||||
|
|
||||||
output += output_template.format(path, canon[f"warc_path"],
|
output += output_template.format(path, canon["warc_path"],
|
||||||
exists, "warc", icons.get("warc", "?"))
|
exists, "warc", icons.get("warc", "?"))
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue