diff --git a/archive_methods.py b/archive_methods.py
index d2a567b3..e1f5b4ed 100644
--- a/archive_methods.py
+++ b/archive_methods.py
@@ -70,7 +70,7 @@ def archive_links(archive_path, links, source=None, resume=None):
         raise SystemExit(1)


-def archive_link(link_dir, link, overwrite=False):
+def archive_link(link_dir, link, overwrite=True):
     """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""

     update_existing = os.path.exists(link_dir)
@@ -170,7 +170,7 @@ def attach_result_to_link(method):
            history_entry.update(result or {})
            link['history'][method].append(history_entry)
            link['latest'][method] = result['output']
-        
+
        _RESULTS_TOTALS[history_entry['status']] += 1

        return link
@@ -183,8 +183,9 @@ def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, timeout=TIMEOUT
     """download full site using wget"""

     domain_dir = os.path.join(link_dir, link['domain'])
-    if os.path.exists(domain_dir):
-        return {'output': wget_output_path(link, look_in=domain_dir), 'status': 'skipped'}
+    existing_file = wget_output_path(link)
+    if os.path.exists(domain_dir) and existing_file:
+        return {'output': existing_file, 'status': 'skipped'}

     CMD = [
         # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
diff --git a/links.py b/links.py
index 5d977ab0..61d968e9 100644
--- a/links.py
+++ b/links.py
@@ -63,10 +63,10 @@ def validate_links(links):
        if not link['latest'].get('wget'):
            link['latest']['wget'] = wget_output_path(link)

        if not link['latest'].get('pdf'):
-            link['latest']['pdf'] = wget_output_path(link)
+            link['latest']['pdf'] = None

        if not link['latest'].get('screenshot'):
-            link['latest']['screenshot'] = wget_output_path(link)
+            link['latest']['screenshot'] = None

    return list(links)
diff --git a/util.py b/util.py
index f7c88ae4..4b1a6b29 100644
--- a/util.py
+++ b/util.py
@@ -411,8 +411,8 @@ def wget_output_path(link, look_in=None):
     # Since the wget algorithm to for -E (appending .html) is incredibly complex
     # instead of trying to emulate it here, we just look in the output folder
     # to see what html file wget actually created as the output
-    wget_folder = link['base_url'].rsplit('/', 1)[0]
-    look_in = look_in or os.path.join(HTML_FOLDER, 'archive', link['timestamp'], wget_folder)
+    wget_folder = link['base_url'].rsplit('/', 1)[0].split('/')
+    look_in = os.path.join(HTML_FOLDER, 'archive', link['timestamp'], *wget_folder)

     if look_in and os.path.exists(look_in):
         html_files = [
@@ -420,29 +420,31 @@
            f for f in os.listdir(look_in)
            if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", f, re.I | re.M)
        ]
        if html_files:
-            return urlencode(os.path.join(wget_folder, html_files[0]))
+            return urlencode(os.path.join(*wget_folder, html_files[0]))
+
+    return None

     # If finding the actual output file didn't work, fall back to the buggy
     # implementation of the wget .html appending algorithm
-    split_url = link['url'].split('#', 1)
-    query = ('%3F' + link['url'].split('?', 1)[-1]) if '?' in link['url'] else ''
+    # split_url = link['url'].split('#', 1)
+    # query = ('%3F' + link['url'].split('?', 1)[-1]) if '?' in link['url'] else ''
-    if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", split_url[0], re.I | re.M):
-        # already ends in .html
-        return urlencode(link['base_url'])
-    else:
-        # .html needs to be appended
-        without_scheme = split_url[0].split('://', 1)[-1].split('?', 1)[0]
-        if without_scheme.endswith('/'):
-            if query:
-                return urlencode('#'.join([without_scheme + 'index.html' + query + '.html', *split_url[1:]]))
-            return urlencode('#'.join([without_scheme + 'index.html', *split_url[1:]]))
-        else:
-            if query:
-                return urlencode('#'.join([without_scheme + '/index.html' + query + '.html', *split_url[1:]]))
-            elif '/' in without_scheme:
-                return urlencode('#'.join([without_scheme + '.html', *split_url[1:]]))
-            return urlencode(link['base_url'] + '/index.html')
+    # if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", split_url[0], re.I | re.M):
+    #     # already ends in .html
+    #     return urlencode(link['base_url'])
+    # else:
+    #     # .html needs to be appended
+    #     without_scheme = split_url[0].split('://', 1)[-1].split('?', 1)[0]
+    #     if without_scheme.endswith('/'):
+    #         if query:
+    #             return urlencode('#'.join([without_scheme + 'index.html' + query + '.html', *split_url[1:]]))
+    #         return urlencode('#'.join([without_scheme + 'index.html', *split_url[1:]]))
+    #     else:
+    #         if query:
+    #             return urlencode('#'.join([without_scheme + '/index.html' + query + '.html', *split_url[1:]]))
+    #         elif '/' in without_scheme:
+    #             return urlencode('#'.join([without_scheme + '.html', *split_url[1:]]))
+    #         return urlencode(link['base_url'] + '/index.html')


 def derived_link_info(link):
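
Reviewer note: the sketch below shows the patched wget_output_path behavior in isolation. It mirrors the new logic in this diff (split base_url into path segments, look at what wget actually wrote on disk, return None on a miss) so the interaction with fetch_wget and validate_links is easier to follow. HTML_FOLDER and the identity urlencode are stand-ins assumed for the example, not the project's real config or helper, and wget_output_path_sketch is a hypothetical name.

import os
import re

# Stand-ins, assumed for this sketch only (the real values live in the project):
HTML_FOLDER = 'html'

def urlencode(s):
    return s  # the real helper percent-encodes the path; identity is enough here

def wget_output_path_sketch(link):
    """Mirror of the patched wget_output_path: find the HTML file wget actually
    wrote, or return None so callers treat the link as not yet archived."""
    # 'example.com/a/b' -> ['example.com', 'a'], so os.path.join gets real segments
    # instead of one string containing slashes
    wget_folder = link['base_url'].rsplit('/', 1)[0].split('/')
    look_in = os.path.join(HTML_FOLDER, 'archive', link['timestamp'], *wget_folder)

    if os.path.exists(look_in):
        html_files = [
            f for f in os.listdir(look_in)
            if re.search(r'.+\.[Hh][Tt][Mm][Ll]?$', f)
        ]
        if html_files:
            return urlencode(os.path.join(*wget_folder, html_files[0]))

    # No best-guess fallback anymore: a miss means "re-fetch", which is exactly
    # what the new checks in fetch_wget and validate_links rely on.
    return None

if __name__ == '__main__':
    link = {'base_url': 'example.com/blog/post', 'timestamp': '1524244522'}
    target = os.path.join(HTML_FOLDER, 'archive', link['timestamp'], 'example.com', 'blog')
    os.makedirs(target, exist_ok=True)
    open(os.path.join(target, 'post.html'), 'w').close()
    print(wget_output_path_sketch(link))   # example.com/blog/post.html

Returning None instead of a guessed path is the design choice the rest of the diff leans on: fetch_wget now only reports 'skipped' when a real output file resolves, and validate_links stops stamping the wget path into the unrelated pdf and screenshot fields.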