Mirror of https://github.com/ArchiveBox/ArchiveBox.git (synced 2025-05-14 15:14:31 -04:00)
fix handling of wget file paths
Commit b926b1affc (parent dbe4660da3)

3 changed files with 30 additions and 27 deletions
@@ -70,7 +70,7 @@ def archive_links(archive_path, links, source=None, resume=None):
         raise SystemExit(1)
 
 
-def archive_link(link_dir, link, overwrite=False):
+def archive_link(link_dir, link, overwrite=True):
     """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""
 
     update_existing = os.path.exists(link_dir)
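Presumably this default flip works in tandem with the fetch_wget change below: with overwrite defaulting to True, each archive method runs on every pass and relies on its own output check (like the existing_file guard in the next hunk) to decide whether the work can be skipped, instead of archive_link skipping the whole link up front.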
@@ -170,7 +170,7 @@ def attach_result_to_link(method):
             history_entry.update(result or {})
             link['history'][method].append(history_entry)
             link['latest'][method] = result['output']
 
             _RESULTS_TOTALS[history_entry['status']] += 1
 
             return link
@@ -183,8 +183,9 @@ def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, timeout=TIMEOUT
     """download full site using wget"""
 
     domain_dir = os.path.join(link_dir, link['domain'])
-    if os.path.exists(domain_dir):
-        return {'output': wget_output_path(link, look_in=domain_dir), 'status': 'skipped'}
+    existing_file = wget_output_path(link)
+    if os.path.exists(domain_dir) and existing_file:
+        return {'output': existing_file, 'status': 'skipped'}
 
     CMD = [
         # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
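This hunk fixes the skip check itself: previously the mere existence of the domain folder caused a skip, even if wget's actual HTML output was never written, so the index could be handed a path that did not exist. Now a link is only skipped when a real output file is located. Condensed into a standalone guard (the helper name should_skip_wget is hypothetical; it assumes wget_output_path returns None when nothing is found on disk, per the util.py hunks below):

    import os

    def should_skip_wget(link_dir, link):
        # Skip re-downloading only when the domain folder exists AND the
        # actual output file can be located; either one alone is not enough.
        domain_dir = os.path.join(link_dir, link['domain'])
        existing_file = wget_output_path(link)  # None if no file was found
        return os.path.exists(domain_dir) and existing_file is not None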
links.py (4 changes)

@@ -63,10 +63,10 @@ def validate_links(links):
             link['latest']['wget'] = wget_output_path(link)
 
         if not link['latest'].get('pdf'):
-            link['latest']['pdf'] = wget_output_path(link)
+            link['latest']['pdf'] = None
 
         if not link['latest'].get('screenshot'):
-            link['latest']['screenshot'] = wget_output_path(link)
+            link['latest']['screenshot'] = None
 
     return list(links)
 
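The links.py change stops validate_links from backfilling missing pdf and screenshot entries with the wget HTML path, which had pointed the index at the wrong artifact entirely; None now correctly signals "not archived yet". An illustrative shape of the latest dict after validation (the values here are made up):

    link['latest'] = {
        'wget': 'example.com/page/index.html',  # real file located by wget_output_path
        'pdf': None,                            # no PDF generated yet, so no path is claimed
        'screenshot': None,                     # likewise None, not the unrelated wget path
    }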
util.py (44 changes)
@@ -411,8 +411,8 @@ def wget_output_path(link, look_in=None):
     # Since the wget algorithm to for -E (appending .html) is incredibly complex
     # instead of trying to emulate it here, we just look in the output folder
     # to see what html file wget actually created as the output
-    wget_folder = link['base_url'].rsplit('/', 1)[0]
-    look_in = look_in or os.path.join(HTML_FOLDER, 'archive', link['timestamp'], wget_folder)
+    wget_folder = link['base_url'].rsplit('/', 1)[0].split('/')
+    look_in = os.path.join(HTML_FOLDER, 'archive', link['timestamp'], *wget_folder)
 
     if look_in and os.path.exists(look_in):
         html_files = [
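Splitting the folder prefix on '/' hands os.path.join one component per path segment instead of a single string with embedded URL separators, so the joined path mirrors the directory tree wget creates. A quick worked example (the URL, timestamp, and the 'html' value standing in for HTML_FOLDER are hypothetical):

    >>> import os
    >>> base_url = 'example.com/docs/intro'            # stand-in for link['base_url']
    >>> wget_folder = base_url.rsplit('/', 1)[0].split('/')
    >>> wget_folder
    ['example.com', 'docs']
    >>> os.path.join('html', 'archive', '1500000000', *wget_folder)
    'html/archive/1500000000/example.com/docs'

Note the rewrite also stops honoring the look_in argument, which is why the fetch_wget call site above no longer passes look_in=domain_dir.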
@@ -420,29 +420,31 @@ def wget_output_path(link, look_in=None):
             if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", f, re.I | re.M)
         ]
         if html_files:
-            return urlencode(os.path.join(wget_folder, html_files[0]))
+            return urlencode(os.path.join(*wget_folder, html_files[0]))
 
+    return None
+
     # If finding the actual output file didn't work, fall back to the buggy
     # implementation of the wget .html appending algorithm
-    split_url = link['url'].split('#', 1)
-    query = ('%3F' + link['url'].split('?', 1)[-1]) if '?' in link['url'] else ''
+    # split_url = link['url'].split('#', 1)
+    # query = ('%3F' + link['url'].split('?', 1)[-1]) if '?' in link['url'] else ''
 
-    if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", split_url[0], re.I | re.M):
-        # already ends in .html
-        return urlencode(link['base_url'])
-    else:
-        # .html needs to be appended
-        without_scheme = split_url[0].split('://', 1)[-1].split('?', 1)[0]
-        if without_scheme.endswith('/'):
-            if query:
-                return urlencode('#'.join([without_scheme + 'index.html' + query + '.html', *split_url[1:]]))
-            return urlencode('#'.join([without_scheme + 'index.html', *split_url[1:]]))
-        else:
-            if query:
-                return urlencode('#'.join([without_scheme + '/index.html' + query + '.html', *split_url[1:]]))
-            elif '/' in without_scheme:
-                return urlencode('#'.join([without_scheme + '.html', *split_url[1:]]))
-            return urlencode(link['base_url'] + '/index.html')
+    # if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", split_url[0], re.I | re.M):
+    #     # already ends in .html
+    #     return urlencode(link['base_url'])
+    # else:
+    #     # .html needs to be appended
+    #     without_scheme = split_url[0].split('://', 1)[-1].split('?', 1)[0]
+    #     if without_scheme.endswith('/'):
+    #         if query:
+    #             return urlencode('#'.join([without_scheme + 'index.html' + query + '.html', *split_url[1:]]))
+    #         return urlencode('#'.join([without_scheme + 'index.html', *split_url[1:]]))
+    #     else:
+    #         if query:
+    #             return urlencode('#'.join([without_scheme + '/index.html' + query + '.html', *split_url[1:]]))
+    #         elif '/' in without_scheme:
+    #             return urlencode('#'.join([without_scheme + '.html', *split_url[1:]]))
+    #         return urlencode(link['base_url'] + '/index.html')
 
 
 def derived_link_info(link):
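With the fallback both commented out and cut off by the early return None, the live part of wget_output_path after this commit reduces to the sketch below. This is a reconstruction, not the verbatim file: the os.listdir(look_in) source of the list comprehension is not visible in the hunk and is an assumption consistent with the surrounding lines, and HTML_FOLDER and urlencode are taken to come from elsewhere in util.py:

    import os
    import re

    def wget_output_path(link, look_in=None):
        # Look at what wget actually wrote instead of emulating its
        # -E/--adjust-extension renaming rules.
        wget_folder = link['base_url'].rsplit('/', 1)[0].split('/')
        look_in = os.path.join(HTML_FOLDER, 'archive', link['timestamp'], *wget_folder)

        if os.path.exists(look_in):
            html_files = [
                f for f in os.listdir(look_in)  # assumed source of candidate filenames
                if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", f, re.I | re.M)
            ]
            if html_files:
                return urlencode(os.path.join(*wget_folder, html_files[0]))

        return None  # callers treat None as "no wget output found"

Incidentally, the [Hh][Tt][Mm][Ll] character classes are redundant under re.I; the pattern simply matches filenames ending in .htm or .html.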