Mirror of https://github.com/ArchiveBox/ArchiveBox.git, synced 2025-05-14 15:14:31 -04:00

commit 717e390ef6 (parent 50d368b1bc)

    remove derived link info in favor of derived Link properties

4 changed files with 40 additions and 34 deletions
```diff
@@ -47,16 +47,6 @@ TITLE_LOADING_MSG = 'Not yet archived...'
 
 ### Link filtering and checking
 
-@enforce_types
-def derived_link_info(link: Link) -> dict:
-    """extend link info with the archive urls and other derived data"""
-
-    info = link._asdict(extended=True)
-    info.update(link.canonical_outputs())
-
-    return info
-
-
 @enforce_types
 def merge_links(a: Link, b: Link) -> Link:
     """deterministially merge two links, favoring longer field values over shorter,
```
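For reference, what the deleted helper did can still be expressed against the Link object itself, since the derived fields now live on the schema as properties. A minimal sketch (assuming a Link instance built elsewhere; this is not code from the commit):

```python
# What the deleted derived_link_info() did, expressed against the Link object
# itself. A sketch, assuming a Link instance `link` built elsewhere.
def template_context(link) -> dict:
    info = link._asdict(extended=True)      # extended dict includes the derived fields
    info.update(link.canonical_outputs())   # merge predicted output paths explicitly
    return info
```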
```diff
@@ -142,19 +142,27 @@ class Link:
         info.update({
             'link_dir': self.link_dir,
             'archive_path': self.archive_path,
-            'bookmarked_date': self.bookmarked_date,
-            'updated_date': self.updated_date,
+
+            'hash': self.url_hash,
+            'base_url': self.base_url,
+            'scheme': self.scheme,
             'domain': self.domain,
             'path': self.path,
             'basename': self.basename,
             'extension': self.extension,
-            'base_url': self.base_url,
             'is_static': self.is_static,
+
+            'bookmarked_date': self.bookmarked_date,
+            'updated_date': self.updated_date,
+            'oldest_archive_date': self.oldest_archive_date,
+            'newest_archive_date': self.newest_archive_date,
+
             'is_archived': self.is_archived,
             'num_outputs': self.num_outputs,
             'num_failures': self.num_failures,
-            'oldest_archive_date': self.oldest_archive_date,
-            'newest_archive_date': self.newest_archive_date,
+
+            'latest': self.latest_outputs(),
+            'canonical': self.canonical_outputs(),
         })
         return info
 
```
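The extended dict now mirrors the renamed and newly exposed properties directly. A hypothetical sanity check (not part of this commit), assuming a Link instance `link`:

```python
# Hypothetical check that _asdict(extended=True) exposes the new keys.
def check_extended_dict(link) -> None:
    info = link._asdict(extended=True)
    assert info['hash'] == link.url_hash                  # renamed from urlhash
    assert info['scheme'] == link.scheme                  # newly exposed property
    assert info['latest'] == link.latest_outputs()        # latest per-method results
    assert info['canonical'] == link.canonical_outputs()  # predicted output paths
```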
```diff
@@ -211,11 +219,16 @@ class Link:
 
     ### URL Helpers
     @property
-    def urlhash(self):
+    def url_hash(self):
         from .util import hashurl
 
         return hashurl(self.url)
 
+    @property
+    def scheme(self) -> str:
+        from .util import scheme
+        return scheme(self.url)
+
     @property
     def extension(self) -> str:
         from .util import extension
```
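The new property imports `scheme()` from `.util`, but the helper's body isn't shown in this diff. A plausible stand-in using the standard library (an assumption, not the actual helper):

```python
# Plausible stand-in for the scheme() helper imported from .util
# (an assumption; the real implementation is not part of this diff).
from urllib.parse import urlparse

def scheme(url: str) -> str:
    return urlparse(url).scheme  # 'https' for 'https://example.com/page'

print(scheme('https://example.com/page'))  # https
```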
```diff
@@ -319,32 +332,35 @@ class Link:
 
         return latest
 
     def canonical_outputs(self) -> Dict[str, Optional[str]]:
+        """predict the expected output paths that should be present after archiving"""
+
         from .util import wget_output_path
         canonical = {
-            'index_url': 'index.html',
-            'favicon_url': 'favicon.ico',
-            'google_favicon_url': 'https://www.google.com/s2/favicons?domain={}'.format(self.domain),
-            'archive_url': wget_output_path(self),
-            'warc_url': 'warc',
-            'pdf_url': 'output.pdf',
-            'screenshot_url': 'screenshot.png',
-            'dom_url': 'output.html',
-            'archive_org_url': 'https://web.archive.org/web/{}'.format(self.base_url),
-            'git_url': 'git',
-            'media_url': 'media',
+            'index_path': 'index.html',
+            'favicon_path': 'favicon.ico',
+            'google_favicon_path': 'https://www.google.com/s2/favicons?domain={}'.format(self.domain),
+            'wget_path': wget_output_path(self),
+            'warc_path': 'warc',
+            'pdf_path': 'output.pdf',
+            'screenshot_path': 'screenshot.png',
+            'dom_path': 'output.html',
+            'archive_org_path': 'https://web.archive.org/web/{}'.format(self.base_url),
+            'git_path': 'git',
+            'media_path': 'media',
         }
+
         if self.is_static:
             # static binary files like PDF and images are handled slightly differently.
             # they're just downloaded once and aren't archived separately multiple times,
             # so the wget, screenshot, & pdf urls should all point to the same file
 
-            static_url = wget_output_path(self)
+            static_path = wget_output_path(self)
             canonical.update({
                 'title': self.basename,
-                'archive_url': static_url,
-                'pdf_url': static_url,
-                'screenshot_url': static_url,
-                'dom_url': static_url,
+                'wget_path': static_path,
+                'pdf_path': static_path,
+                'screenshot_path': static_path,
+                'dom_path': static_path,
             })
         return canonical
```
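After this rename, anything reading the canonical output map must use the `*_path` keys instead of the old `*_url` keys. A hypothetical caller sketch, assuming a Link instance `link`:

```python
# Hypothetical caller updated for the renamed canonical output keys.
def resolve_outputs(link) -> tuple:
    canonical = link.canonical_outputs()
    wget_file = canonical['wget_path']         # was canonical['archive_url']
    screenshot = canonical['screenshot_path']  # was canonical['screenshot_url']
    return wget_file, screenshot
```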
```diff
@@ -2,7 +2,7 @@
 <td title="$timestamp">$bookmarked_date</td>
 <td class="title-col">
     <a href="$archive_path/$index_url"><img src="$favicon_url" class="link-favicon" decoding="async"></a>
-    <a href="$archive_path/$archive_url" title="$title">
+    <a href="$archive_path/$wget_url" title="$title">
         <span data-title-for="$url" data-archived="$is_archived">$title</span>
         <small style="float:right">$tags</small>
     </a>
```
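A sketch of how this row template gets filled. The `$variable` syntax matches Python's `string.Template`; whether ArchiveBox's legacy renderer actually uses `string.Template` is an assumption here, but the substitution behaves the same way:

```python
# Filling the link-row template variables from the diff above.
# All example values are hypothetical.
from string import Template

row = Template('<a href="$archive_path/$wget_url" title="$title">')
print(row.safe_substitute(
    archive_path='archive/1554615879',
    wget_url='example.com/index.html',
    title='Example Domain',
))
```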
```diff
@@ -60,7 +60,6 @@ base_url = lambda url: without_scheme(url) # uniq base url used to dedupe links
 
 without_www = lambda url: url.replace('://www.', '://', 1)
 without_trailing_slash = lambda url: url[:-1] if url[-1] == '/' else url.replace('/?', '?')
-fuzzy_url = lambda url: without_trailing_slash(without_www(without_scheme(url.lower())))
 hashurl = lambda url: base32_encode(int(sha256(base_url(url).encode('utf-8')).hexdigest(), 16))[:20]
 
 urlencode = lambda s: s and quote(s, encoding='utf-8', errors='replace')
```
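How the `hashurl` lambda works, as a runnable sketch: it hashes the deduplicated base URL with SHA-256, re-encodes the digest in base32, and keeps the first 20 characters. `base32_encode` and `without_scheme` are defined elsewhere in util.py; simplified stand-ins are used here so the example is self-contained:

```python
from hashlib import sha256

def base32_encode(n: int) -> str:
    # stand-in for the helper defined elsewhere in util.py
    digits = '0123456789abcdefghijklmnopqrstuv'  # 32-char alphabet
    out = ''
    while n:
        n, rem = divmod(n, 32)
        out = digits[rem] + out
    return out or '0'

base_url = lambda url: url.split('://', 1)[-1]  # simplified without_scheme()
hashurl = lambda url: base32_encode(int(sha256(base_url(url).encode('utf-8')).hexdigest(), 16))[:20]

print(hashurl('https://example.com'))  # stable 20-char id used to dedupe links
```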
```diff
@@ -393,6 +392,7 @@ def parse_date(date: Any) -> Optional[datetime]:
             pass
 
     if '-' in date:
+        # 2019-04-07T05:44:39.227520
         try:
             return datetime.fromisoformat(date)
         except Exception:
```
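The new comment documents the timestamp format this branch parses; `datetime.fromisoformat` (Python 3.7+) handles it directly:

```python
from datetime import datetime

dt = datetime.fromisoformat('2019-04-07T05:44:39.227520')
print(dt.isoformat())  # 2019-04-07T05:44:39.227520
```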