diff --git a/archivebox/legacy/index.py b/archivebox/legacy/index.py index c76da968..9574c1bf 100644 --- a/archivebox/legacy/index.py +++ b/archivebox/legacy/index.py @@ -47,16 +47,6 @@ TITLE_LOADING_MSG = 'Not yet archived...' ### Link filtering and checking -@enforce_types -def derived_link_info(link: Link) -> dict: - """extend link info with the archive urls and other derived data""" - - info = link._asdict(extended=True) - info.update(link.canonical_outputs()) - - return info - - @enforce_types def merge_links(a: Link, b: Link) -> Link: """deterministially merge two links, favoring longer field values over shorter, diff --git a/archivebox/legacy/schema.py b/archivebox/legacy/schema.py index d139353e..743f3a14 100644 --- a/archivebox/legacy/schema.py +++ b/archivebox/legacy/schema.py @@ -142,19 +142,27 @@ class Link: info.update({ 'link_dir': self.link_dir, 'archive_path': self.archive_path, - 'bookmarked_date': self.bookmarked_date, - 'updated_date': self.updated_date, + + 'hash': self.url_hash, + 'base_url': self.base_url, + 'scheme': self.scheme, 'domain': self.domain, 'path': self.path, 'basename': self.basename, 'extension': self.extension, - 'base_url': self.base_url, 'is_static': self.is_static, + + 'bookmarked_date': self.bookmarked_date, + 'updated_date': self.updated_date, + 'oldest_archive_date': self.oldest_archive_date, + 'newest_archive_date': self.newest_archive_date, + 'is_archived': self.is_archived, 'num_outputs': self.num_outputs, 'num_failures': self.num_failures, - 'oldest_archive_date': self.oldest_archive_date, - 'newest_archive_date': self.newest_archive_date, + + 'latest': self.latest_outputs(), + 'canonical': self.canonical_outputs(), }) return info @@ -211,11 +219,16 @@ class Link: ### URL Helpers @property - def urlhash(self): + def url_hash(self): from .util import hashurl return hashurl(self.url) + @property + def scheme(self) -> str: + from .util import scheme + return scheme(self.url) + @property def extension(self) -> str: from .util import extension @@ -319,32 +332,35 @@ class Link: return latest + def canonical_outputs(self) -> Dict[str, Optional[str]]: + """predict the expected output paths that should be present after archiving""" + from .util import wget_output_path canonical = { - 'index_url': 'index.html', - 'favicon_url': 'favicon.ico', - 'google_favicon_url': 'https://www.google.com/s2/favicons?domain={}'.format(self.domain), - 'archive_url': wget_output_path(self), - 'warc_url': 'warc', - 'pdf_url': 'output.pdf', - 'screenshot_url': 'screenshot.png', - 'dom_url': 'output.html', - 'archive_org_url': 'https://web.archive.org/web/{}'.format(self.base_url), - 'git_url': 'git', - 'media_url': 'media', + 'index_path': 'index.html', + 'favicon_path': 'favicon.ico', + 'google_favicon_path': 'https://www.google.com/s2/favicons?domain={}'.format(self.domain), + 'wget_path': wget_output_path(self), + 'warc_path': 'warc', + 'pdf_path': 'output.pdf', + 'screenshot_path': 'screenshot.png', + 'dom_path': 'output.html', + 'archive_org_path': 'https://web.archive.org/web/{}'.format(self.base_url), + 'git_path': 'git', + 'media_path': 'media', } if self.is_static: # static binary files like PDF and images are handled slightly differently. # they're just downloaded once and aren't archived separately multiple times, # so the wget, screenshot, & pdf urls should all point to the same file - static_url = wget_output_path(self) + static_path = wget_output_path(self) canonical.update({ 'title': self.basename, - 'archive_url': static_url, - 'pdf_url': static_url, - 'screenshot_url': static_url, - 'dom_url': static_url, + 'wget_path': static_path, + 'pdf_path': static_path, + 'screenshot_path': static_path, + 'dom_path': static_path, }) return canonical diff --git a/archivebox/legacy/templates/index_row.html b/archivebox/legacy/templates/index_row.html index ffda7a19..48f22802 100644 --- a/archivebox/legacy/templates/index_row.html +++ b/archivebox/legacy/templates/index_row.html @@ -2,7 +2,7 @@