remove derived link info in favor of derived Link properties

This commit is contained in:
Nick Sweeting 2019-04-16 23:21:24 -04:00
parent 50d368b1bc
commit 717e390ef6
4 changed files with 40 additions and 34 deletions

View file

@ -142,19 +142,27 @@ class Link:
info.update({
'link_dir': self.link_dir,
'archive_path': self.archive_path,
'bookmarked_date': self.bookmarked_date,
'updated_date': self.updated_date,
'hash': self.url_hash,
'base_url': self.base_url,
'scheme': self.scheme,
'domain': self.domain,
'path': self.path,
'basename': self.basename,
'extension': self.extension,
'base_url': self.base_url,
'is_static': self.is_static,
'bookmarked_date': self.bookmarked_date,
'updated_date': self.updated_date,
'oldest_archive_date': self.oldest_archive_date,
'newest_archive_date': self.newest_archive_date,
'is_archived': self.is_archived,
'num_outputs': self.num_outputs,
'num_failures': self.num_failures,
'oldest_archive_date': self.oldest_archive_date,
'newest_archive_date': self.newest_archive_date,
'latest': self.latest_outputs(),
'canonical': self.canonical_outputs(),
})
return info
@ -211,11 +219,16 @@ class Link:
### URL Helpers
@property
def urlhash(self):
def url_hash(self):
from .util import hashurl
return hashurl(self.url)
@property
def scheme(self) -> str:
from .util import scheme
return scheme(self.url)
@property
def extension(self) -> str:
from .util import extension
@ -319,32 +332,35 @@ class Link:
return latest
def canonical_outputs(self) -> Dict[str, Optional[str]]:
"""predict the expected output paths that should be present after archiving"""
from .util import wget_output_path
canonical = {
'index_url': 'index.html',
'favicon_url': 'favicon.ico',
'google_favicon_url': 'https://www.google.com/s2/favicons?domain={}'.format(self.domain),
'archive_url': wget_output_path(self),
'warc_url': 'warc',
'pdf_url': 'output.pdf',
'screenshot_url': 'screenshot.png',
'dom_url': 'output.html',
'archive_org_url': 'https://web.archive.org/web/{}'.format(self.base_url),
'git_url': 'git',
'media_url': 'media',
'index_path': 'index.html',
'favicon_path': 'favicon.ico',
'google_favicon_path': 'https://www.google.com/s2/favicons?domain={}'.format(self.domain),
'wget_path': wget_output_path(self),
'warc_path': 'warc',
'pdf_path': 'output.pdf',
'screenshot_path': 'screenshot.png',
'dom_path': 'output.html',
'archive_org_path': 'https://web.archive.org/web/{}'.format(self.base_url),
'git_path': 'git',
'media_path': 'media',
}
if self.is_static:
# static binary files like PDF and images are handled slightly differently.
# they're just downloaded once and aren't archived separately multiple times,
# so the wget, screenshot, & pdf urls should all point to the same file
static_url = wget_output_path(self)
static_path = wget_output_path(self)
canonical.update({
'title': self.basename,
'archive_url': static_url,
'pdf_url': static_url,
'screenshot_url': static_url,
'dom_url': static_url,
'wget_path': static_path,
'pdf_path': static_path,
'screenshot_path': static_path,
'dom_path': static_path,
})
return canonical