mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-14 07:04:27 -04:00
fix pdf and screenshot links
This commit is contained in:
parent
7b72156afd
commit
f33330ebbf
1 changed file with 9 additions and 11 deletions
20
archive.py
20
archive.py
|
@@ -168,7 +168,7 @@ def fetch_wget(out_dir, link, overwrite=False):
|
|||
if not os.path.exists('{}/{}'.format(out_dir, domain)) or overwrite:
|
||||
print(' - Downloading Full Site')
|
||||
CMD = [
|
||||
*'wget --timestamping --adjust-extension --no-parent'.split(' '),
|
||||
*'wget --timestamping --adjust-extension --no-parent'.split(' '), # Docs: https://www.gnu.org/software/wget/manual/wget.html
|
||||
*(('--page-requisites', '--convert-links') if FETCH_WGET_REQUISITES else ()),
|
||||
link['url'],
|
||||
]
|
||||
|
@@ -327,8 +327,9 @@ def valid_links(links):
|
|||
return (link for link in links if link['url'].startswith('http') or link['url'].startswith('ftp'))
|
||||
|
||||
def calculate_archive_url(link):
|
||||
"""calculate the path to the wgetted html file, since wget may adjust some paths
|
||||
to be different than the base_url path
|
||||
"""calculate the path to the wgetted html file, since wget may
|
||||
adjust some paths to be different than the base_url path.
|
||||
|
||||
See docs on wget --adjust-extension."""
|
||||
|
||||
split_url = link['url'].split('#', 1)
|
||||
|
@@ -370,21 +371,18 @@ def dump_index(links, service):
|
|||
# since we don't screenshot or PDF links that are images or PDFs, change those links to point to the wget'ed file
|
||||
link_info = {**link}
|
||||
|
||||
# append .html to archive links that dont have it, since wget appends .html to everything
|
||||
link_info['archive_url'] = calculate_archive_url(link)
|
||||
|
||||
# add link type to title
|
||||
if link['type']:
|
||||
link_info.update({'title': '{title} ({type})'.format(**link)})
|
||||
|
||||
# PDF and images link to wgetted version, since we dont re-screenshot/pdf them
|
||||
# PDF and images are handled slightly differently
|
||||
# wget, screenshot, & pdf urls all point to the same file
|
||||
if link['type'] in ('PDF', 'image'):
|
||||
link_info.update({
|
||||
'archive_url': 'archive/{timestamp}/{base_url}'.format(**link),
|
||||
'pdf_link': 'archive/{timestamp}/{base_url}'.format(**link),
|
||||
'screenshot_link': 'archive/{timestamp}/{base_url}'.format(**link),
|
||||
'title': '{title} ({type})'.format(**link),
|
||||
})
|
||||
else:
|
||||
link_info.update({
|
||||
'archive_url': calculate_archive_url(link),
|
||||
'pdf_link': 'archive/{timestamp}/output.pdf'.format(**link),
|
||||
'screenshot_link': 'archive/{timestamp}/screenshot.png'.format(**link)
|
||||
})
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue