remove derived link info in favor of derived Link properties

2025-05-14 15:14:31 -04:00 · 2019-04-16 23:21:24 -04:00 · 2019-04-16 23:21:24 -04:00 · 717e390ef6
commit 717e390ef6
parent 50d368b1bc
4 changed files with 40 additions and 34 deletions
--- a/archivebox/legacy/schema.py
+++ b/archivebox/legacy/schema.py
@@ -142,19 +142,27 @@ class Link:
            info.update({
                'link_dir': self.link_dir,
                'archive_path': self.archive_path,
-                'bookmarked_date': self.bookmarked_date,
-                'updated_date': self.updated_date,
+                
+                'hash': self.url_hash,
+                'base_url': self.base_url,
+                'scheme': self.scheme,
                'domain': self.domain,
                'path': self.path,
                'basename': self.basename,
                'extension': self.extension,
-                'base_url': self.base_url,
                'is_static': self.is_static,
+
+                'bookmarked_date': self.bookmarked_date,
+                'updated_date': self.updated_date,
+                'oldest_archive_date': self.oldest_archive_date,
+                'newest_archive_date': self.newest_archive_date,
+        
                'is_archived': self.is_archived,
                'num_outputs': self.num_outputs,
                'num_failures': self.num_failures,
-                'oldest_archive_date': self.oldest_archive_date,
-                'newest_archive_date': self.newest_archive_date,
+                
+                'latest': self.latest_outputs(),
+                'canonical': self.canonical_outputs(),
            })
        return info

@@ -211,11 +219,16 @@ class Link:
    
    ### URL Helpers
    @property
-    def urlhash(self):
+    def url_hash(self):
        from .util import hashurl

        return hashurl(self.url)

+    @property
+    def scheme(self) -> str:
+        from .util import scheme
+        return scheme(self.url)
+
    @property
    def extension(self) -> str:
        from .util import extension
@@ -319,32 +332,35 @@ class Link:

        return latest

+
    def canonical_outputs(self) -> Dict[str, Optional[str]]:
+        """predict the expected output paths that should be present after archiving"""
+
        from .util import wget_output_path
        canonical = {
-            'index_url': 'index.html',
-            'favicon_url': 'favicon.ico',
-            'google_favicon_url': 'https://www.google.com/s2/favicons?domain={}'.format(self.domain),
-            'archive_url': wget_output_path(self),
-            'warc_url': 'warc',
-            'pdf_url': 'output.pdf',
-            'screenshot_url': 'screenshot.png',
-            'dom_url': 'output.html',
-            'archive_org_url': 'https://web.archive.org/web/{}'.format(self.base_url),
-            'git_url': 'git',
-            'media_url': 'media',
+            'index_path': 'index.html',
+            'favicon_path': 'favicon.ico',
+            'google_favicon_path': 'https://www.google.com/s2/favicons?domain={}'.format(self.domain),
+            'wget_path': wget_output_path(self),
+            'warc_path': 'warc',
+            'pdf_path': 'output.pdf',
+            'screenshot_path': 'screenshot.png',
+            'dom_path': 'output.html',
+            'archive_org_path': 'https://web.archive.org/web/{}'.format(self.base_url),
+            'git_path': 'git',
+            'media_path': 'media',
        }
        if self.is_static:
            # static binary files like PDF and images are handled slightly differently.
            # they're just downloaded once and aren't archived separately multiple times, 
            # so the wget, screenshot, & pdf urls should all point to the same file

-            static_url = wget_output_path(self)
+            static_path = wget_output_path(self)
            canonical.update({
                'title': self.basename,
-                'archive_url': static_url,
-                'pdf_url': static_url,
-                'screenshot_url': static_url,
-                'dom_url': static_url,
+                'wget_path': static_path,
+                'pdf_path': static_path,
+                'screenshot_path': static_path,
+                'dom_path': static_path,
            })
        return canonical