diff --git a/archivebox/archive.py b/archivebox/archive.py
index 9ebe1e88..0e572f81 100755
--- a/archivebox/archive.py
+++ b/archivebox/archive.py
@@ -30,7 +30,6 @@ from util import (
     save_remote_source,
     save_stdin_source,
     pretty_path,
-    migrate_data,
     check_links_structure,
 )
 
@@ -159,8 +158,6 @@ if __name__ == '__main__':
         print_help()
         raise SystemExit(0)
 
-    migrate_data()
-
     source = sys.argv[1] if argc > 1 else None  # path of links file to import
     resume = sys.argv[2] if argc > 2 else None  # timestamp to resume dowloading from
 
diff --git a/archivebox/util.py b/archivebox/util.py
index 0ba8d9dc..0cf9ce0a 100644
--- a/archivebox/util.py
+++ b/archivebox/util.py
@@ -2,22 +2,18 @@ import os
 import re
 import sys
 import time
-import json
 
-from urllib.request import Request, urlopen
-from urllib.parse import urlparse
+from urllib.request import Request, urlopen
+from urllib.parse import urlparse, quote
 from decimal import Decimal
-from urllib.parse import quote
 from datetime import datetime
-from subprocess import TimeoutExpired, Popen, PIPE, DEVNULL, CompletedProcess, CalledProcessError
 from multiprocessing import Process
+from subprocess import TimeoutExpired, Popen, PIPE, DEVNULL, CompletedProcess, CalledProcessError
 
 from config import (
     ANSI,
-    IS_TTY,
     TERM_WIDTH,
     REPO_DIR,
-    OUTPUT_DIR,
     SOURCES_DIR,
     ARCHIVE_DIR,
     OUTPUT_PERMISSIONS,
@@ -42,7 +38,9 @@ from config import (
     SUBMIT_ARCHIVE_DOT_ORG,
 )
 
-# URL helpers: https://docs.python.org/3/library/urllib.parse.html#url-parsing
+### Parsing Helpers
+
+# Url Parsing: https://docs.python.org/3/library/urllib.parse.html#url-parsing
 scheme = lambda url: urlparse(url).scheme
 without_scheme = lambda url: urlparse(url)._replace(scheme='').geturl().strip('//')
 without_query = lambda url: urlparse(url)._replace(query='').geturl().strip('//')
@@ -72,6 +70,20 @@ HTML_TITLE_REGEX = re.compile(
     re.IGNORECASE,
 )
 
+### Checks & Tests
+
+def check_link_structure(link):
+    """basic sanity check invariants to make sure the data is valid"""
+    assert isinstance(link, dict)
+    assert isinstance(link.get('url'), str)
+    assert len(link['url']) > 2
+    assert len(re.findall(URL_REGEX, link['url'])) == 1
+
+def check_links_structure(links):
+    """basic sanity check invariants to make sure the data is valid"""
+    assert isinstance(links, list)
+    if links:
+        check_link_structure(links[0])
 
 def check_dependencies():
     """Check that all necessary dependencies are installed, and have valid versions"""
@@ -134,7 +146,6 @@ def check_dependencies():
         print('    See https://github.com/pirate/ArchiveBox for help.')
         raise SystemExit(1)
 
-
 def check_url_parsing():
     """Check that plain text regex URL parsing works as expected"""
     test_urls = '''
@@ -159,6 +170,148 @@ def check_url_parsing():
     assert len(re.findall(URL_REGEX, test_urls)) == 12
 
 
+### Random Helpers
+
+def save_stdin_source(raw_text):
+    if not os.path.exists(SOURCES_DIR):
+        os.makedirs(SOURCES_DIR)
+
+    ts = str(datetime.now().timestamp()).split('.', 1)[0]
+
+    source_path = os.path.join(SOURCES_DIR, '{}-{}.txt'.format('stdin', ts))
+
+    with open(source_path, 'w', encoding='utf-8') as f:
+        f.write(raw_text)
+
+    return source_path
+
+def save_remote_source(url, timeout=TIMEOUT):
+    """download a given url's content into output/sources/domain-<timestamp>.txt"""
+
+    if not os.path.exists(SOURCES_DIR):
+        os.makedirs(SOURCES_DIR)
+
+    ts = str(datetime.now().timestamp()).split('.', 1)[0]
+
+    source_path = os.path.join(SOURCES_DIR, '{}-{}.txt'.format(domain(url), ts))
+
+    print('{}[*] [{}] Downloading {}{}'.format(
+        ANSI['green'],
+        datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
+        url,
+        ANSI['reset'],
+    ))
+    end = progress(TIMEOUT, prefix=' ')
+    try:
+        downloaded_xml = download_url(url, timeout=timeout)
+        end()
+    except Exception as e:
+        end()
+        print('{}[!] Failed to download {}{}\n'.format(
+            ANSI['red'],
+            url,
+            ANSI['reset'],
+        ))
+        print(' ', e)
+        raise SystemExit(1)
+
+    with open(source_path, 'w', encoding='utf-8') as f:
+        f.write(downloaded_xml)
+
+    print(' > {}'.format(pretty_path(source_path)))
+
+    return source_path
+
+def fetch_page_title(url, timeout=10, progress=SHOW_PROGRESS):
+    """Attempt to guess a page's title by downloading the html"""
+    if not FETCH_TITLE:
+        return None
+
+    try:
+        if progress:
+            sys.stdout.write('.')
+            sys.stdout.flush()
+
+        html = download_url(url, timeout=timeout)
+
+        match = re.search(HTML_TITLE_REGEX, html)
+        return match.group(1).strip() if match else None
+    except Exception as err:
+        # print('[!] Failed to fetch title because of {}: {}'.format(
+        #     err.__class__.__name__,
+        #     err,
+        # ))
+        return None
+
+def wget_output_path(link, look_in=None):
+    """calculate the path to the wgetted .html file, since wget may
+    adjust some paths to be different than the base_url path.
+
+    See docs on wget --adjust-extension (-E)
+    """
+
+    # if we have it stored, always prefer the actual output path to computed one
+    if link.get('latest', {}).get('wget'):
+        return link['latest']['wget']
+
+    urlencode = lambda s: quote(s, encoding='utf-8', errors='replace')
+
+    if link['type'] in ('PDF', 'image'):
+        return urlencode(base_url(link['url']))
+
+    # Since the wget algorithm for -E (appending .html) is incredibly complex,
+    # instead of trying to emulate it here, we just look in the output folder
+    # to see what html file wget actually created as the output
+    wget_folder = base_url(link['url']).rsplit('/', 1)[0].split('/')
+    look_in = os.path.join(ARCHIVE_DIR, link['timestamp'], *wget_folder)
+
+    if look_in and os.path.exists(look_in):
+        html_files = [
+            f for f in os.listdir(look_in)
+            if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", f, re.I | re.M)
+        ]
+        if html_files:
+            return urlencode(os.path.join(*wget_folder, html_files[0]))
+
+    return None
+
+    # If finding the actual output file didn't work, fall back to the buggy
+    # implementation of the wget .html appending algorithm
+    # split_url = link['url'].split('#', 1)
+    # query = ('%3F' + link['url'].split('?', 1)[-1]) if '?' in link['url'] else ''
+
+    # if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", split_url[0], re.I | re.M):
+    #     # already ends in .html
+    #     return urlencode(base_url(link['url']))
+    # else:
+    #     # .html needs to be appended
+    #     without_scheme = split_url[0].split('://', 1)[-1].split('?', 1)[0]
+    #     if without_scheme.endswith('/'):
+    #         if query:
+    #             return urlencode('#'.join([without_scheme + 'index.html' + query + '.html', *split_url[1:]]))
+    #         return urlencode('#'.join([without_scheme + 'index.html', *split_url[1:]]))
+    #     else:
+    #         if query:
+    #             return urlencode('#'.join([without_scheme + '/index.html' + query + '.html', *split_url[1:]]))
+    #         elif '/' in without_scheme:
+    #             return urlencode('#'.join([without_scheme + '.html', *split_url[1:]]))
+    #         return urlencode(base_url(link['url']) + '/index.html')
+
+### String Manipulation & Logging Helpers
+
+def str_between(string, start, end=None):
+    """(<abc>12345</def>, <abc>, </def>)  ->  12345"""
+
+    content = string.split(start, 1)[-1]
+    if end is not None:
+        content = content.rsplit(end, 1)[0]
+
+    return content
+
+def pretty_path(path):
+    """convert paths like .../ArchiveBox/archivebox/../output/abc into output/abc"""
+    return path.replace(REPO_DIR + '/', '')
+
 def print_error_hints(cmd, pwd, err=None, hints=None, prefix='    '):
     """quote the argument with whitespace in a command so the user can
     copy-paste the outputted string directly to run the cmd
@@ -184,17 +337,98 @@ def print_error_hints(cmd, pwd, err=None, hints=None, prefix='    '):
     )
 
 
-def chmod_file(path, cwd='.', permissions=OUTPUT_PERMISSIONS, timeout=30):
-    """chmod -R <permissions>/<path>"""
+### Link Helpers
 
-    if not os.path.exists(os.path.join(cwd, path)):
-        raise Exception('Failed to chmod: {} does not exist (did the previous step fail?)'.format(path))
+def merge_links(a, b):
+    """deterministically merge two links, favoring longer field values over shorter,
+    and "cleaner" values over worse ones.
+    """
+    longer = lambda key: (a[key] if len(a[key]) > len(b[key]) else b[key]) if (a[key] and b[key]) else (a[key] or b[key])
+    earlier = lambda key: a[key] if a[key] < b[key] else b[key]
+
+    url = longer('url')
+    longest_title = longer('title')
+    cleanest_title = a['title'] if '://' not in (a['title'] or '') else b['title']
+    link = {
+        'timestamp': earlier('timestamp'),
+        'url': url,
+        'domain': domain(url),
+        'base_url': base_url(url),
+        'tags': longer('tags'),
+        'title': longest_title if '://' not in (longest_title or '') else cleanest_title,
+        'sources': list(set(a.get('sources', []) + b.get('sources', []))),
+    }
+    link['type'] = get_link_type(link)
+    return link
 
-    chmod_result = run(['chmod', '-R', permissions, path], cwd=cwd, stdout=DEVNULL, stderr=PIPE, timeout=timeout)
-    if chmod_result.returncode == 1:
-        print(' ', chmod_result.stderr.decode())
-        raise Exception('Failed to chmod {}/{}'.format(cwd, path))
+def get_link_type(link):
+    """Certain types of links need to be handled specially, this figures out when that's the case"""
+    if extension(link['url']) == 'pdf':
+        return 'PDF'
+    elif extension(link['url']) in ('pdf', 'png', 'jpg', 'jpeg', 'svg', 'bmp', 'gif', 'tiff', 'webp'):
+        return 'image'
+    elif 'wikipedia.org' in domain(link['url']).lower():
+        return 'wiki'
+    elif 'youtube.com' in domain(link['url']).lower():
+        return 'youtube'
+    elif 'soundcloud.com' in domain(link['url']).lower():
+        return 'soundcloud'
+    elif 'youku.com' in domain(link['url']).lower():
+        return 'youku'
+    elif 'vimeo.com' in domain(link['url']).lower():
+        return 'vimeo'
+    return None
+
+def derived_link_info(link):
+    """extend link info with the archive urls and other derived data"""
+
+    url = link['url']
+
+    to_date_str = lambda ts: datetime.fromtimestamp(Decimal(ts)).strftime('%Y-%m-%d %H:%M')
+
+    extended_info = {
+        **link,
+        'bookmarked_date': to_date_str(link['timestamp']),
+        'updated_date': to_date_str(link['updated']) if 'updated' in link else None,
+        'domain': domain(url),
+        'path': path(url),
+        'basename': basename(url),
+        'base_url': base_url(url),
+    }
+
+    # Archive Method Output URLs
+    extended_info = {
+        **extended_info,
+        'favicon_url': 'archive/{timestamp}/favicon.ico'.format(**extended_info),
+        'google_favicon_url': 'https://www.google.com/s2/favicons?domain={domain}'.format(**extended_info),
+        'files_url': 'archive/{timestamp}/index.html'.format(**extended_info),
+        'archive_url': wget_output_path(link) or 'archive/{}/index.html'.format(link['timestamp']),
+        'warc_url': 'archive/{timestamp}/warc'.format(**extended_info),
+        'pdf_url': 'archive/{timestamp}/output.pdf'.format(**extended_info),
+        'screenshot_url': 'archive/{timestamp}/screenshot.png'.format(**extended_info),
+        'dom_url': 'archive/{timestamp}/output.html'.format(**extended_info),
+        'archive_org_url': 'https://web.archive.org/web/{base_url}'.format(**extended_info),
+        'git_url': 'archive/{timestamp}/git'.format(**extended_info),
+        'media_url': 'archive/{timestamp}/media'.format(**extended_info),
+
+    }
+
+    # PDF and images are handled slightly differently
+    # wget, screenshot, & pdf urls all point to the same file
+    if link['type'] in ('PDF', 'image'):
+        extended_info.update({
+            'title': basename(link['url']),
+            'archive_url': 'archive/{timestamp}/{base_url}'.format(**extended_info),
+            'pdf_url': 'archive/{timestamp}/{base_url}'.format(**extended_info),
+            'screenshot_url': 'archive/{timestamp}/{base_url}'.format(**extended_info),
+            'dom_url': 'archive/{timestamp}/{base_url}'.format(**extended_info),
+        })
+
+    return extended_info
+
+
+### Python / System Helpers
 
 def progress(seconds=TIMEOUT, prefix=''):
     """Show a (subprocess-controlled) progress bar with a timeout,
@@ -260,26 +494,7 @@ def progress(seconds=TIMEOUT, prefix=''):
     return end
 
-def pretty_path(path):
-    """convert paths like .../ArchiveBox/archivebox/../output/abc into output/abc"""
-    return path.replace(REPO_DIR + '/', '')
-
-
-def save_stdin_source(raw_text):
-    if not os.path.exists(SOURCES_DIR):
-        os.makedirs(SOURCES_DIR)
-
-    ts = str(datetime.now().timestamp()).split('.', 1)[0]
-
-    source_path = os.path.join(SOURCES_DIR, '{}-{}.txt'.format('stdin', ts))
-
-    with open(source_path, 'w', encoding='utf-8') as f:
-        f.write(raw_text)
-
-    return source_path
-
-
-def fetch_page_content(url, timeout=TIMEOUT):
+def download_url(url, timeout=TIMEOUT):
     req = Request(url, headers={'User-Agent': WGET_USER_AGENT})
 
     if CHECK_SSL_VALIDITY:
@@ -292,380 +507,16 @@ def fetch_page_content(url, timeout=TIMEOUT):
     encoding = resp.headers.get_content_charset() or 'utf-8'
     return resp.read().decode(encoding)
 
+def chmod_file(path, cwd='.', permissions=OUTPUT_PERMISSIONS, timeout=30):
+    """chmod -R <permissions>/<path>"""
 
-def save_remote_source(url, timeout=TIMEOUT):
-    """download a given url's content into downloads/domain.txt"""
-
-    if not os.path.exists(SOURCES_DIR):
-        os.makedirs(SOURCES_DIR)
-
-    ts = str(datetime.now().timestamp()).split('.', 1)[0]
-
-    source_path = os.path.join(SOURCES_DIR, '{}-{}.txt'.format(domain(url), ts))
-
-    print('{}[*] [{}] Downloading {}{}'.format(
-        ANSI['green'],
-        datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
-        url,
-        ANSI['reset'],
-    ))
-    end = progress(TIMEOUT, prefix=' ')
-    try:
-        downloaded_xml = fetch_page_content(url, timeout=timeout)
-        end()
-    except Exception as e:
-        end()
-        print('{}[!] Failed to download {}{}\n'.format(
-            ANSI['red'],
-            url,
-            ANSI['reset'],
-        ))
-        print(' ', e)
-        raise SystemExit(1)
-
-    with open(source_path, 'w', encoding='utf-8') as f:
-        f.write(downloaded_xml)
-
-    print(' > {}'.format(pretty_path(source_path)))
-
-    return source_path
-
-
-def fetch_page_title(url, timeout=10, progress=SHOW_PROGRESS):
-    """Attempt to guess a page's title by downloading the html"""
-    if not FETCH_TITLE:
-        return None
-
-    try:
-        if progress:
-            sys.stdout.write('.')
-            sys.stdout.flush()
-
-        html = fetch_page_content(url, timeout=timeout)
-
-        match = re.search(HTML_TITLE_REGEX, html)
-        return match.group(1).strip() if match else None
-    except Exception as err:
-        # print('[!] Failed to fetch title because of {}: {}'.format(
-        #     err.__class__.__name__,
-        #     err,
-        # ))
-        return None
-
-
-def str_between(string, start, end=None):
-    """(<abc>12345</def>, <abc>, </def>)  ->  12345"""
-
-    content = string.split(start, 1)[-1]
-    if end is not None:
-        content = content.rsplit(end, 1)[0]
-
-    return content
-
-def get_link_type(link):
-    """Certain types of links need to be handled specially, this figures out when that's the case"""
-
-    if extension(link['url']) == 'pdf':
-        return 'PDF'
-    elif extension(link['url']) in ('pdf', 'png', 'jpg', 'jpeg', 'svg', 'bmp', 'gif', 'tiff', 'webp'):
-        return 'image'
-    elif 'wikipedia.org' in domain(link['url']).lower():
-        return 'wiki'
-    elif 'youtube.com' in domain(link['url']).lower():
-        return 'youtube'
-    elif 'soundcloud.com' in domain(link['url']).lower():
-        return 'soundcloud'
-    elif 'youku.com' in domain(link['url']).lower():
-        return 'youku'
-    elif 'vimeo.com' in domain(link['url']).lower():
-        return 'vimeo'
-    return None
-
-def merge_links(a, b):
-    """deterministially merge two links, favoring longer field values over shorter,
-    and "cleaner" values over worse ones.
-    """
-    longer = lambda key: (a[key] if len(a[key]) > len(b[key]) else b[key]) if (a[key] and b[key]) else (a[key] or b[key])
-    earlier = lambda key: a[key] if a[key] < b[key] else b[key]
-
-    url = longer('url')
-    longest_title = longer('title')
-    cleanest_title = a['title'] if '://' not in (a['title'] or '') else b['title']
-    link = {
-        'timestamp': earlier('timestamp'),
-        'url': url,
-        'domain': domain(url),
-        'base_url': base_url(url),
-        'tags': longer('tags'),
-        'title': longest_title if '://' not in (longest_title or '') else cleanest_title,
-        'sources': list(set(a.get('sources', []) + b.get('sources', []))),
-    }
-    link['type'] = get_link_type(link)
-    return link
-
-def find_link(folder, links):
-    """for a given archive folder, find the corresponding link object in links"""
-    url = parse_url(folder)
-    if url:
-        for link in links:
-            if (base_url(link['url']) in url) or (url in link['url']):
-                return link
-
-    timestamp = folder.split('.')[0]
-    for link in links:
-        if link['timestamp'].startswith(timestamp):
-            if domain(link['url']) in os.listdir(os.path.join(ARCHIVE_DIR, folder)):
-                return link  # careful now, this isn't safe for most ppl
-            if domain(link['url']) in parse_url(folder):
-                return link
-    return None
-
-
-def parse_url(folder):
-    """for a given archive folder, figure out what url it's for"""
-    link_json = os.path.join(ARCHIVE_DIR, folder, 'index.json')
-    if os.path.exists(link_json):
-        with open(link_json, 'r') as f:
-            try:
-                link_json = f.read().strip()
-                if link_json:
-                    link = json.loads(link_json)
-                    return base_url(link['url'])
-            except ValueError:
-                print('File contains invalid JSON: {}!'.format(link_json))
-
-    archive_org_txt = os.path.join(ARCHIVE_DIR, folder, 'archive.org.txt')
-    if os.path.exists(archive_org_txt):
-        with open(archive_org_txt, 'r') as f:
-            original_link = f.read().strip().split('/http', 1)[-1]
-            with_scheme = 'http{}'.format(original_link)
-            return with_scheme
-
-    return ''
-
-def manually_merge_folders(source, target):
-    """prompt for user input to resolve a conflict between two archive folders"""
-
-    if not IS_TTY:
-        return
-
-    fname = lambda path: path.split('/')[-1]
-
-    print(' {} and {} have conflicting files, which do you want to keep?'.format(fname(source), fname(target)))
-    print(' - [enter]: do nothing (keep both)')
-    print(' - a: prefer files from {}'.format(source))
-    print(' - b: prefer files from {}'.format(target))
-    print(' - q: quit and resolve the conflict manually')
-    try:
-        answer = input('> ').strip().lower()
-    except KeyboardInterrupt:
-        answer = 'q'
-
-    assert answer in ('', 'a', 'b', 'q'), 'Invalid choice.'
-
-    if answer == 'q':
-        print('\nJust run ArchiveBox again to pick up where you left off.')
-        raise SystemExit(0)
-    elif answer == '':
-        return
-
-    files_in_source = set(os.listdir(source))
-    files_in_target = set(os.listdir(target))
-    for file in files_in_source:
-        if file in files_in_target:
-            to_delete = target if answer == 'a' else source
-            run(['rm', '-Rf', os.path.join(to_delete, file)])
-        run(['mv', os.path.join(source, file), os.path.join(target, file)])
-
-    if not set(os.listdir(source)):
-        run(['rm', '-Rf', source])
-
-def fix_folder_path(archive_path, link_folder, link):
-    """given a folder, merge it to the canonical 'correct' path for the given link object"""
-    source = os.path.join(archive_path, link_folder)
-    target = os.path.join(archive_path, link['timestamp'])
-
-    url_in_folder = parse_url(source)
-    if not (url_in_folder in base_url(link['url'])
-            or base_url(link['url']) in url_in_folder):
-        raise ValueError('The link does not match the url for this folder.')
-
-    if not os.path.exists(target):
-        # target doesn't exist so nothing needs merging, simply move A to B
-        run(['mv', source, target])
-    else:
-        # target folder exists, check for conflicting files and attempt manual merge
-        files_in_source = set(os.listdir(source))
-        files_in_target = set(os.listdir(target))
-        conflicting_files = files_in_source & files_in_target
-
-        if not conflicting_files:
-            for file in files_in_source:
-                run(['mv', os.path.join(source, file), os.path.join(target, file)])
-
-    if os.path.exists(source):
-        files_in_source = set(os.listdir(source))
-        if files_in_source:
-            manually_merge_folders(source, target)
-        else:
-            run(['rm', '-R', source])
-
-
-def migrate_data():
-    # migrate old folder to new OUTPUT folder
-    old_dir = os.path.join(REPO_DIR, 'html')
-    if os.path.exists(old_dir):
-        print('[!] WARNING: Moved old output folder "html" to new location: {}'.format(OUTPUT_DIR))
-        run(['mv', old_dir, OUTPUT_DIR], timeout=10)
-
-
-def cleanup_archive(archive_path, links):
-    """move any incorrectly named folders to their canonical locations"""
-
-    # for each folder that exists, see if we can match it up with a known good link
-    # if we can, then merge the two folders (TODO: if not, move it to lost & found)
-
-    unmatched = []
-    bad_folders = []
-
-    if not os.path.exists(archive_path):
-        return
-
-    for folder in os.listdir(archive_path):
-        try:
-            files = os.listdir(os.path.join(archive_path, folder))
-        except NotADirectoryError:
-            continue
-
-        if files:
-            link = find_link(folder, links)
-            if link is None:
-                unmatched.append(folder)
-                continue
-
-            if folder != link['timestamp']:
-                bad_folders.append((folder, link))
-        else:
-            # delete empty folders
-            run(['rm', '-R', os.path.join(archive_path, folder)])
-
-    if bad_folders and IS_TTY and input('[!] Cleanup archive? y/[n]: ') == 'y':
-        print('[!] Fixing {} improperly named folders in archive...'.format(len(bad_folders)))
-        for folder, link in bad_folders:
-            fix_folder_path(archive_path, folder, link)
-    elif bad_folders:
-        print('[!] Warning! {} folders need to be merged, fix by running ArchiveBox.'.format(len(bad_folders)))
-
-    if unmatched:
-        print('[!] Warning! {} unrecognized folders in html/archive/'.format(len(unmatched)))
-        print(' '+ '\n '.join(unmatched))
-
-
-def wget_output_path(link, look_in=None):
-    """calculate the path to the wgetted .html file, since wget may
-    adjust some paths to be different than the base_url path.
-
-    See docs on wget --adjust-extension (-E)
-    """
-
-    # if we have it stored, always prefer the actual output path to computed one
-    if link.get('latest', {}).get('wget'):
-        return link['latest']['wget']
-
-    urlencode = lambda s: quote(s, encoding='utf-8', errors='replace')
-
-    if link['type'] in ('PDF', 'image'):
-        return urlencode(base_url(link['url']))
-
-    # Since the wget algorithm to for -E (appending .html) is incredibly complex
-    # instead of trying to emulate it here, we just look in the output folder
-    # to see what html file wget actually created as the output
-    wget_folder = base_url(link['url']).rsplit('/', 1)[0].split('/')
-    look_in = os.path.join(ARCHIVE_DIR, link['timestamp'], *wget_folder)
-
-    if look_in and os.path.exists(look_in):
-        html_files = [
-            f for f in os.listdir(look_in)
-            if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", f, re.I | re.M)
-        ]
-        if html_files:
-            return urlencode(os.path.join(*wget_folder, html_files[0]))
-
-    return None
-
-    # If finding the actual output file didn't work, fall back to the buggy
-    # implementation of the wget .html appending algorithm
-    # split_url = link['url'].split('#', 1)
-    # query = ('%3F' + link['url'].split('?', 1)[-1]) if '?' in link['url'] else ''
-
-    # if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", split_url[0], re.I | re.M):
-    #     # already ends in .html
-    #     return urlencode(base_url(link['url']))
-    # else:
-    #     # .html needs to be appended
-    #     without_scheme = split_url[0].split('://', 1)[-1].split('?', 1)[0]
-    #     if without_scheme.endswith('/'):
-    #         if query:
-    #             return urlencode('#'.join([without_scheme + 'index.html' + query + '.html', *split_url[1:]]))
-    #         return urlencode('#'.join([without_scheme + 'index.html', *split_url[1:]]))
-    #     else:
-    #         if query:
-    #             return urlencode('#'.join([without_scheme + '/index.html' + query + '.html', *split_url[1:]]))
-    #         elif '/' in without_scheme:
-    #             return urlencode('#'.join([without_scheme + '.html', *split_url[1:]]))
-    #         return urlencode(base_url(link['url']) + '/index.html')
-
-
-def derived_link_info(link):
-    """extend link info with the archive urls and other derived data"""
-
-    url = link['url']
-
-    to_date_str = lambda ts: datetime.fromtimestamp(Decimal(ts)).strftime('%Y-%m-%d %H:%M')
-
-    extended_info = {
-        **link,
-        'title': link['title'] or base_url(url),
-        'date': to_date_str(link['timestamp']),
-        'updated_date': to_date_str(link['updated']) if 'updated' in link else None,
-        'base_url': base_url(url),
-        'domain': domain(url),
-        'basename': basename(url),
-        'path': path(url),
-        'type': link['type'] or 'website',
-        'tags': link['tags'] or 'untagged',
-    }
-
-    # Archive Method Output URLs
-    extended_info = {
-        **extended_info,
-        'favicon_url': 'archive/{timestamp}/favicon.ico'.format(**extended_info),
-        'google_favicon_url': 'https://www.google.com/s2/favicons?domain={domain}'.format(**extended_info),
-        'files_url': 'archive/{timestamp}/index.html'.format(**extended_info),
-        'archive_url': wget_output_path(link) or 'archive/{}/index.html'.format(link['timestamp']),
-        'warc_url': 'archive/{timestamp}/warc'.format(**extended_info),
-        'pdf_url': 'archive/{timestamp}/output.pdf'.format(**extended_info),
-        'screenshot_url': 'archive/{timestamp}/screenshot.png'.format(**extended_info),
-        'dom_url': 'archive/{timestamp}/output.html'.format(**extended_info),
-        'archive_org_url': 'https://web.archive.org/web/{base_url}'.format(**extended_info),
-        'git_url': 'archive/{timestamp}/git'.format(**extended_info),
-        'media_url': 'archive/{timestamp}/media'.format(**extended_info),
-
-    }
-
-    # PDF and images are handled slightly differently
-    # wget, screenshot, & pdf urls all point to the same file
-    if link['type'] in ('PDF', 'image'):
-        extended_info.update({
-            'title': basename(link['url']),
-            'archive_url': 'archive/{timestamp}/{base_url}'.format(**extended_info),
-            'pdf_url': 'archive/{timestamp}/{base_url}'.format(**extended_info),
-            'screenshot_url': 'archive/{timestamp}/{base_url}'.format(**extended_info),
-            'dom_url': 'archive/{timestamp}/{base_url}'.format(**extended_info),
-        })
-
-    return extended_info
 
+    if not os.path.exists(os.path.join(cwd, path)):
+        raise Exception('Failed to chmod: {} does not exist (did the previous step fail?)'.format(path))
 
+    chmod_result = run(['chmod', '-R', permissions, path], cwd=cwd, stdout=DEVNULL, stderr=PIPE, timeout=timeout)
+    if chmod_result.returncode == 1:
+        print(' ', chmod_result.stderr.decode())
+        raise Exception('Failed to chmod {}/{}'.format(cwd, path))
 
 def run(*popenargs, input=None, capture_output=False, timeout=None, check=False, **kwargs):
     """Patched of subprocess.run to fix blocking io making timeout=innefective"""
 
@@ -701,16 +552,3 @@ def run(*popenargs, input=None, capture_output=False, timeout=None, check=False, **kwargs):
             raise CalledProcessError(retcode, process.args, output=stdout, stderr=stderr)
 
     return CompletedProcess(process.args, retcode, stdout, stderr)
-
-
-def check_link_structure(link):
-    assert isinstance(link, dict)
-    assert isinstance(link.get('url'), str)
-    assert len(link['url']) > 2
-    assert len(re.findall(URL_REGEX, link['url'])) == 1
-
-
-def check_links_structure(links):
-    assert isinstance(links, list)
-    if links:
-        check_link_structure(links[0])
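
Usage note (not part of the patch): a minimal sketch of the relocated link-structure checks, assuming it is run from the archivebox/ directory so that util.py and its config imports resolve; the link dict below is made-up example data, not anything from the repo.

# hypothetical example data, not taken from the repo
from util import check_link_structure, check_links_structure

links = [{
    'url': 'https://example.com/some/page',
    'timestamp': '1545000000',
    'title': 'Example page',
    'tags': '',
    'sources': ['stdin'],
}]

check_links_structure(links)       # validates the outer list
check_link_structure(links[0])     # validates a single link dict
print('link structure OK')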
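Another illustrative sketch (not part of the patch): how the relocated merge_links()/get_link_type() helpers behave when the same URL is imported twice. The two dicts are hypothetical; the keys mirror the link fields that merge_links reads (url, timestamp, title, tags, sources).

from util import merge_links

a = {'url': 'https://example.com/paper.pdf', 'timestamp': '1545000000.0',
     'title': None, 'tags': '', 'sources': ['ril_export.html']}
b = {'url': 'https://example.com/paper.pdf', 'timestamp': '1545000123.0',
     'title': 'Example Paper', 'tags': 'papers', 'sources': ['bookmarks.html']}

merged = merge_links(a, b)
print(merged['timestamp'])         # keeps the earlier timestamp ('1545000000.0')
print(merged['title'])             # keeps the longer, non-URL title ('Example Paper')
print(merged['type'])              # 'PDF', set via get_link_type() from the .pdf extension
print(sorted(merged['sources']))   # union of both source files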
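A third sketch (also not part of the patch) of derived_link_info() for a plain web page, under the assumption that ARCHIVE_DIR does not yet contain this timestamp, so wget_output_path() returns None and archive_url falls back to the index.html path. The link dict is hypothetical.

from util import derived_link_info

link = {
    'url': 'https://example.com/blog/post?id=1',
    'timestamp': '1545000000.0',
    'title': 'Example post',
    'tags': '',
    'type': None,                  # anything that isn't 'PDF' or 'image'
}

info = derived_link_info(link)
print(info['bookmarked_date'])     # formatted from the timestamp, in local time
print(info['domain'])              # 'example.com'
print(info['archive_url'])         # 'archive/1545000000.0/index.html' fallback
print(info['archive_org_url'])     # built from the scheme-less base_url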
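Finally, a quick sanity check (not part of the patch) of the patched run() wrapper, which mirrors the subprocess.run interface (CompletedProcess return value, check= and timeout= keywords) while addressing the blocking-IO timeout issue its docstring mentions. Assumes a POSIX system where echo is available.

from subprocess import PIPE
from util import run

result = run(['echo', 'hello'], stdout=PIPE, timeout=5)
print(result.returncode)           # 0
print(result.stdout.decode())      # 'hello\n'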