diff --git a/archivebox/index/html.py b/archivebox/index/html.py index c8b9d07e..5eba0959 100644 --- a/archivebox/index/html.py +++ b/archivebox/index/html.py @@ -140,22 +140,22 @@ def snapshot_icons(snapshot) -> str: exclude = ["favicon", "title", "headers", "archive_org"] # Missing specific entry for WARC - extractor_items = defaultdict(lambda: None) + extractor_outputs = defaultdict(lambda: None) for extractor, _ in EXTRACTORS: for result in archive_results: - if result.extractor == extractor: - extractor_items[extractor] = result + if result.extractor == extractor and result: + extractor_outputs[extractor] = result for extractor, _ in EXTRACTORS: if extractor not in exclude: - exists = False - if extractor_items[extractor] is not None: - outpath = (Path(path) / canon[f"{extractor}_path"]) - if outpath.is_dir(): + outpath = extractor_outputs[extractor] and extractor_outputs[extractor].output + if outpath: + outpath = (Path(path) / outpath) + if outpath.is_file(): + exists = True + elif outpath.is_dir(): exists = any(outpath.glob('*.*')) - elif outpath.is_file(): - exists = outpath.stat().st_size > 100 - output += format_html(output_template, path, canon[f"{extractor}_path"], str(exists), + output += format_html(output_template, path, canon[f"{extractor}_path"], str(bool(outpath)), extractor, icons.get(extractor, "?")) if extractor == "wget": # warc isn't technically it's own extractor, so we have to add it after wget