Merge branch 'dev' into link-removal2

2025-05-19 17:35:09 -04:00 · 2021-01-30 05:51:39 -05:00 · 2021-01-30 05:51:39 -05:00 · b7273a07e5
commit b7273a07e5
parent 1fe95474c2 cc80ceb0a2
39 changed files with 273 additions and 276 deletions
--- a/archivebox/index/html.py
+++ b/archivebox/index/html.py
@ -26,9 +26,9 @@ from ..config import (
    HTML_INDEX_FILENAME,
 )

-MAIN_INDEX_TEMPLATE = 'main_index.html'
-MINIMAL_INDEX_TEMPLATE = 'main_index_minimal.html'
-LINK_DETAILS_TEMPLATE = 'link_details.html'
+MAIN_INDEX_TEMPLATE = 'static_index.html'
+MINIMAL_INDEX_TEMPLATE = 'minimal_index.html'
+LINK_DETAILS_TEMPLATE = 'snapshot.html'
 TITLE_LOADING_MSG = 'Not yet archived...'


@ -144,9 +144,15 @@ def snapshot_icons(snapshot) -> str:

    for extractor, _ in EXTRACTORS:
        if extractor not in exclude:
-            exists = extractor_items[extractor] is not None
+            exists = False
+            if extractor_items[extractor] is not None:
+                outpath = (Path(path) / canon[f"{extractor}_path"])
+                if outpath.is_dir():
+                    exists = any(outpath.glob('*.*'))
+                elif outpath.is_file():
+                    exists = outpath.stat().st_size > 100
            output += format_html(output_template, path, canon[f"{extractor}_path"], str(exists),
-                                             extractor, icons.get(extractor, "?"))
+                                         extractor, icons.get(extractor, "?"))
        if extractor == "wget":
            # warc isn't technically its own extractor, so we have to add it after wget
            exists = list((Path(path) / canon["warc_path"]).glob("*.warc.gz"))
--- a/archivebox/index/schema.py
+++ b/archivebox/index/schema.py
@ -409,4 +409,41 @@ class Link:



+    def canonical_outputs(self) -> Dict[str, Optional[str]]:
+        """predict the expected output paths that should be present after archiving"""
+
+        from ..extractors.wget import wget_output_path
+        canonical = {
+            'index_path': 'index.html',
+            'favicon_path': 'favicon.ico',
+            'google_favicon_path': 'https://www.google.com/s2/favicons?domain={}'.format(self.domain),
+            'wget_path': wget_output_path(self),
+            'warc_path': 'warc/',
+            'singlefile_path': 'singlefile.html',
+            'readability_path': 'readability/content.html',
+            'mercury_path': 'mercury/content.html',
+            'pdf_path': 'output.pdf',
+            'screenshot_path': 'screenshot.png',
+            'dom_path': 'output.html',
+            'archive_org_path': 'https://web.archive.org/web/{}'.format(self.base_url),
+            'git_path': 'git/',
+            'media_path': 'media/',
+        }
+        if self.is_static:
+            # static binary files like PDF and images are handled slightly differently.
+            # they're just downloaded once and aren't archived separately multiple times, 
+            # so the wget, screenshot, & pdf urls should all point to the same file
+
+            static_path = wget_output_path(self)
+            canonical.update({
+                'title': self.basename,
+                'wget_path': static_path,
+                'pdf_path': static_path,
+                'screenshot_path': static_path,
+                'dom_path': static_path,
+                'singlefile_path': static_path,
+                'readability_path': static_path,
+                'mercury_path': static_path,
+            })
+        return canonical