From 534ead2440a3ecbe5ea44a81bcde850a50e9822f Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 1 Feb 2021 02:18:13 -0500 Subject: [PATCH] use the db exclusively for icons instead of hammering filesystem --- archivebox/index/html.py | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/archivebox/index/html.py b/archivebox/index/html.py index d97c6595..ebfe7d78 100644 --- a/archivebox/index/html.py +++ b/archivebox/index/html.py @@ -150,24 +150,33 @@ def snapshot_icons(snapshot) -> str: for extractor, _ in EXTRACTORS: if extractor not in exclude: - outpath = extractor_outputs[extractor] and extractor_outputs[extractor].output - if outpath: - outpath = (Path(path) / outpath) - if outpath.is_file(): - exists = True - elif outpath.is_dir(): - exists = any(outpath.glob('*.*')) - output += format_html(output_template, path, canon[f"{extractor}_path"], str(bool(outpath)), + existing = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output + # Check filesystem to see if anything is actually present (too slow, needs optimization/caching) + # if existing: + # existing = (Path(path) / existing) + # if existing.is_file(): + # existing = True + # elif existing.is_dir(): + # existing = any(existing.glob('*.*')) + output += format_html(output_template, path, canon[f"{extractor}_path"], str(bool(existing)), extractor, icons.get(extractor, "?")) if extractor == "wget": # warc isn't technically it's own extractor, so we have to add it after wget - exists = list((Path(path) / canon["warc_path"]).glob("*.warc.gz")) - output += format_html(output_template, exists[0] if exists else '#', canon["warc_path"], str(bool(exists)), "warc", icons.get("warc", "?")) + + # get from db (faster but less truthful) + exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output + # get from filesystem 
(slower but more accurate) + # exists = list((Path(path) / canon["warc_path"]).glob("*.warc.gz")) + output += format_html(output_template, 'warc/', canon["warc_path"], str(bool(exists)), "warc", icons.get("warc", "?")) if extractor == "archive_org": # The check for archive_org is different, so it has to be handled separately - target_path = Path(path) / "archive.org.txt" - exists = target_path.exists() + + # get from db (faster) + exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output + # get from filesystem (slower) + # target_path = Path(path) / "archive.org.txt" + # exists = target_path.exists() output += '{} '.format(canon["archive_org_path"], str(exists), "archive_org", icons.get("archive_org", "?"))