Mirror of https://github.com/ArchiveBox/ArchiveBox.git, synced 2025-05-13 14:44:29 -04:00
fix nested archive index page and improve wget output finding
commit 61ec4971e9
parent eb5cc8078a

1 changed file with 24 additions and 5 deletions
@@ -263,16 +263,29 @@ def wget_output_path(link):
     # Since the wget algorithm to for -E (appending .html) is incredibly complex
     # instead of trying to emulate it here, we just look in the output folder
     # to see what html file wget actually created as the output
-    wget_folder = base_url(link['url']).rsplit('/', 1)[0].split('/')
-    look_in = os.path.join(ARCHIVE_DIR, link['timestamp'], *wget_folder)
+    url_path = without_fragment(without_query(path(link['url']))).strip('/')
+    html_parent_folder = (domain(link['url']), *url_path.rsplit('/', 1)[0].split('/'))
+    look_in = os.path.join(ARCHIVE_DIR, link['timestamp'], *html_parent_folder)
 
-    if look_in and os.path.exists(look_in):
+    # look inside innermost path folder for an html file
+    if os.path.exists(look_in):
         html_files = [
             f for f in os.listdir(look_in)
             if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", f, re.I | re.M)
         ]
         if html_files:
-            return urlencode(os.path.join(*wget_folder, html_files[0]))
+            return urlencode(os.path.join(*html_parent_folder, html_files[0]))
 
+    # Look one level up in case last path fragment was a file and not a folder
+    look_in = look_in.rsplit('/', 1)[0]
+    html_parent_folder = html_parent_folder[:-1]
+    if os.path.exists(look_in):
+        html_files = [
+            f for f in os.listdir(look_in)
+            if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", f, re.I | re.M)
+        ]
+        if html_files:
+            return urlencode(os.path.join(*html_parent_folder, html_files[0]))
+
     return None
 
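Note: the hunk above replaces the old base_url()-derived folder guess with a two-pass search. It first looks for an .html file inside the innermost folder of the URL path, then retries one level up, since the last path fragment may have been a file rather than a folder. A minimal runnable sketch of that search order, where find_wget_html() and its signature are hypothetical stand-ins, not part of the commit:

    import os
    import re

    def find_wget_html(archive_dir, timestamp, url_folders):
        # Sketch only: mirrors the innermost-folder-then-parent search order
        # of the diff above; names and arguments are illustrative assumptions.
        html_re = re.compile(r'.+\.[Hh][Tt][Mm][Ll]?$')
        for folders in (url_folders, url_folders[:-1]):
            look_in = os.path.join(archive_dir, timestamp, *folders)
            if os.path.exists(look_in):
                html_files = [f for f in os.listdir(look_in) if html_re.search(f)]
                if html_files:
                    return os.path.join(*folders, html_files[0])
        return None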
@@ -397,6 +410,12 @@ def derived_link_info(link):
         'path': path(url),
         'basename': basename(url),
         'base_url': base_url(url),
+        'is_archived': os.path.exists(os.path.join(
+            ARCHIVE_DIR,
+            link['timestamp'],
+            wget_output_path(link) or domain(url)
+        )),
+        'num_outputs': len([entry for entry in link['latest'].values() if entry]) if 'latest' in link else 0,
     }
 
     # Archive Method Output URLs
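Note: the two new derived fields make archive state explicit for the index template: is_archived checks whether the wget output (or at least the domain folder) exists on disk, and num_outputs counts how many archive methods have produced something. A hypothetical illustration of what num_outputs evaluates to; the link dict below is an assumed example shaped after the diff, not data from the repo:

    link = {
        'url': 'https://example.com/some/page',
        'timestamp': '1543345096',
        'latest': {'wget': 'example.com/some/page.html', 'pdf': None, 'screenshot': None},
    }
    num_outputs = len([entry for entry in link['latest'].values() if entry]) if 'latest' in link else 0
    print(num_outputs)  # -> 1, since only the wget entry is non-empty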
@@ -405,7 +424,7 @@ def derived_link_info(link):
         'index_url': 'index.html',
         'favicon_url': 'favicon.ico',
         'google_favicon_url': 'https://www.google.com/s2/favicons?domain={domain}'.format(**extended_info),
-        'archive_url': wget_output_path(link) or 'index.html',
+        'archive_url': wget_output_path(link),
         'warc_url': 'warc',
         'pdf_url': 'output.pdf',
         'screenshot_url': 'screenshot.png',
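Note: dropping the or 'index.html' fallback means archive_url is now None whenever wget_output_path() finds nothing, instead of silently linking a nested index page (the bug named in the commit title). A sketch of the behavioral difference, with wget_result standing in for the function's return value:

    wget_result = None  # assume the wget output was not found

    old_archive_url = wget_result or 'index.html'  # always truthy: masked the failure
    new_archive_url = wget_result                  # None, so templates can detect it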