diff --git a/index.py b/index.py
index 29bfabb1..7c5d2cfe 100644
--- a/index.py
+++ b/index.py
@@ -1,5 +1,4 @@
 import os
-import re
 import json
 
 from datetime import datetime
@@ -14,20 +13,15 @@ from config import (
     ANSI,
     GIT_SHA,
 )
-from util import chmod_file
+from util import (
+    chmod_file,
+    html_appended_url,
+    derived_link_info,
+)
 
 
 ### Homepage index for all the links
 
-def parse_json_links_index(out_dir):
-    """load the index in a given directory and merge it with the given link"""
-    index_path = os.path.join(out_dir, 'index.json')
-    if os.path.exists(index_path):
-        with open(index_path, 'r', encoding='utf-8') as f:
-            return json.load(f)['links']
-
-    return []
-
 def write_links_index(out_dir, links):
     """create index.html file for a given list of links"""
 
@@ -44,8 +38,6 @@ def write_links_index(out_dir, links):
     write_json_links_index(out_dir, links)
     write_html_links_index(out_dir, links)
 
-    chmod_file(out_dir, permissions=ARCHIVE_PERMISSIONS)
-
 
 def write_json_links_index(out_dir, links):
     """write the json link index to a given path"""
@@ -65,6 +57,15 @@ def write_json_links_index(out_dir, links):
 
     chmod_file(path)
 
+def parse_json_links_index(out_dir):
+    """load the index in a given directory and merge it with the given link"""
+    index_path = os.path.join(out_dir, 'index.json')
+    if os.path.exists(index_path):
+        with open(index_path, 'r', encoding='utf-8') as f:
+            return json.load(f)['links']
+
+    return []
+
 def write_html_links_index(out_dir, links):
     """write the html link index to a given path"""
 
@@ -91,17 +92,11 @@ def write_html_links_index(out_dir, links):
     with open(path, 'w', encoding='utf-8') as f:
         f.write(Template(index_html).substitute(**template_vars))
 
+    chmod_file(path)
+
 
 ### Individual link index
 
-def parse_json_link_index(out_dir):
-    """load the index in a given directory and merge it with the given link"""
-    existing_index = os.path.join(out_dir, 'index.json')
-    if os.path.exists(existing_index):
-        with open(existing_index, 'r', encoding='utf-8') as f:
-            return json.load(f)
-    return {}
-
 def write_link_index(out_dir, link):
     link['updated'] = str(datetime.now().timestamp())
     write_json_link_index(out_dir, link)
@@ -112,85 +107,39 @@ def write_json_link_index(out_dir, link):
 
     path = os.path.join(out_dir, 'index.json')
 
+    print(' √ Updating: index.json')
+
     with open(path, 'w', encoding='utf-8') as f:
         json.dump(link, f, indent=4, default=str)
 
     chmod_file(path)
 
+def parse_json_link_index(out_dir):
+    """load the json link index from a given directory"""
+    existing_index = os.path.join(out_dir, 'index.json')
+    if os.path.exists(existing_index):
+        with open(existing_index, 'r', encoding='utf-8') as f:
+            return json.load(f)
+    return {}
+
 def write_html_link_index(out_dir, link):
     with open(LINK_INDEX_TEMPLATE, 'r', encoding='utf-8') as f:
         link_html = f.read()
 
     path = os.path.join(out_dir, 'index.html')
 
+    print(' √ Updating: index.html')
+
     with open(path, 'w', encoding='utf-8') as f:
         f.write(Template(link_html).substitute({
             **link,
-            **link['methods'],
+            **link['latest'],
             'type': link['type'] or 'website',
-            'tags': link['tags'] or '',
+            'tags': link['tags'] or 'untagged',
             'bookmarked': datetime.fromtimestamp(float(link['timestamp'])).strftime('%Y-%m-%d %H:%M'),
             'updated': datetime.fromtimestamp(float(link['updated'])).strftime('%Y-%m-%d %H:%M'),
-            'archive_org': link['methods']['archive_org'] or 'https://web.archive.org/save/{}'.format(link['url']),
-            'wget': link['methods']['wget'] or link['domain'],
+            'archive_org': link['latest']['archive_org'] or 'https://web.archive.org/save/{}'.format(link['url']),
+            'wget': link['latest']['wget'] or link['domain'],
         }))
 
     chmod_file(path)
-
-
-
-def html_appended_url(link):
-    """calculate the path to the wgetted .html file, since wget may
-    adjust some paths to be different than the base_url path.
-
-    See docs on wget --adjust-extension."""
-
-    if link['type'] in ('PDF', 'image'):
-        return link['base_url']
-
-    split_url = link['url'].split('#', 1)
-    query = ('%3F' + link['url'].split('?', 1)[-1]) if '?' in link['url'] else ''
-
-    if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", split_url[0], re.I | re.M):
-        # already ends in .html
-        return link['base_url']
-    else:
-        # .html needs to be appended
-        without_scheme = split_url[0].split('://', 1)[-1].split('?', 1)[0]
-        if without_scheme.endswith('/'):
-            if query:
-                return '#'.join([without_scheme + 'index.html' + query + '.html', *split_url[1:]])
-            return '#'.join([without_scheme + 'index.html', *split_url[1:]])
-        else:
-            if query:
-                return '#'.join([without_scheme + '/index.html' + query + '.html', *split_url[1:]])
-            elif '/' in without_scheme:
-                return '#'.join([without_scheme + '.html', *split_url[1:]])
-            return link['base_url'] + '/index.html'
-
-
-def derived_link_info(link):
-    """extend link info with the archive urls and other derived data"""
-
-    link_info = {
-        **link,
-        'date': datetime.fromtimestamp(float(link['timestamp'])).strftime('%Y-%m-%d %H:%M'),
-        'google_favicon_url': 'https://www.google.com/s2/favicons?domain={domain}'.format(**link),
-        'favicon_url': 'archive/{timestamp}/favicon.ico'.format(**link),
-        'files_url': 'archive/{timestamp}/'.format(**link),
-        'archive_url': 'archive/{}/{}'.format(link['timestamp'], html_appended_url(link)),
-        'pdf_link': 'archive/{timestamp}/output.pdf'.format(**link),
-        'screenshot_link': 'archive/{timestamp}/screenshot.png'.format(**link),
-        'archive_org_url': 'https://web.archive.org/web/{base_url}'.format(**link),
-    }
-
-    # PDF and images are handled slightly differently
-    # wget, screenshot, & pdf urls all point to the same file
-    if link['type'] in ('PDF', 'image'):
-        link_info.update({
-            'archive_url': 'archive/{timestamp}/{base_url}'.format(**link),
-            'pdf_link': 'archive/{timestamp}/{base_url}'.format(**link),
-            'screenshot_link': 'archive/{timestamp}/{base_url}'.format(**link),
-            'title': '{title} ({type})'.format(**link),
-        })
-    return link_info
diff --git a/links.py b/links.py
index 9eb3cfa6..22242d17 100644
--- a/links.py
+++ b/links.py
@@ -1,18 +1,11 @@
-from util import (
-    domain,
-    base_url,
-    get_str_between,
-    get_link_type,
-)
-
 """
 In Bookmark Archiver, a Link represents a single entry that we track in the
 json index.  All links pass through all archiver functions and the latest,
-most up-to-date canonical output for each is stored in "latest_archives".
-.
+most up-to-date canonical output for each is stored in "latest".
+
 Link {
-    timestamp: float,   (how we uniquely id links)    _ _ _ _ ___
+    timestamp: str,     (how we uniquely id links)    _ _ _ _ ___
     url: str,                                         | \ / \ |\| ' |
     base_url: str,                                    |_/ \_/ | | |
     domain: str,                                      _ _ _ _ _ _
@@ -20,7 +13,7 @@ Link {
     type: str,                                        | /"| | | | \_,
     title: str,                                       ,-'"`-.
     sources: [str],                                   /// / @ @ \ \\\\
-    latest_archives: {                                :=| ,._,. |=: /
+    latest: {                                         \ :=| ,._,. |=: /
         ...,                                          || ,\ \_../ /. ||
         pdf: 'output.pdf',                            ||','`-._))'`.`||
         wget: 'example.com/1234/index.html'           `-' (/ `-'
@@ -39,10 +32,18 @@ Link {
 """
 
+from util import (
+    domain,
+    base_url,
+    get_str_between,
+    get_link_type,
+)
+
+
 def validate_links(links):
-    links = valid_links(links)       # remove chrome://, about:, mailto: etc.
-    links = uniquefied_links(links)  # fix duplicate timestamps, returns sorted list
-    links = sorted_links(links)      # deterministically sort the links
+    links = archivable_links(links)  # remove chrome://, about:, mailto: etc.
+    links = uniquefied_links(links)  # merge/dedupe duplicate timestamps & urls
+    links = sorted_links(links)      # deterministically sort the links based on timstamp, url
 
     if not links:
         print('[X] No links found :(')
@@ -50,34 +51,14 @@ def validate_links(links):
 
     return list(links)
 
-def sorted_links(links):
-    return sorted(
-        links,
-        key=lambda link: (link['timestamp'], link['url']),
-        reverse=True,
-    )
 
-def merge_links(link1, link2):
-    """deterministially merge two links, favoring longer field values over shorter,
-    and "cleaner" values over worse ones.
-    """
-    longer = lambda a, b, key: a[key] if len(a[key]) > len(b[key]) else b[key]
-    earlier = lambda a, b, key: a[key] if a[key] < b[key] else b[key]
-
-    url = longer(link1, link2, 'url')
-    longest_title = longer(link1, link2, 'title')
-    cleanest_title = link1['title'] if '://' not in link1['title'] else link2['title']
-    link = {
-        'url': url,
-        'domain': domain(url),
-        'base_url': base_url(url),
-        'timestamp': earlier(link1, link2, 'timestamp'),
-        'tags': longer(link1, link2, 'tags'),
-        'title': longest_title if '://' not in longest_title else cleanest_title,
-        'sources': list(set(link1['sources'] + link2['sources'])),
-    }
-    link['type'] = get_link_type(link)
-    return link
+def archivable_links(links):
+    """remove chrome://, about:// or other schemed links that cant be archived"""
+    return (
+        link
+        for link in links
+        if any(link['url'].startswith(s) for s in ('http://', 'https://', 'ftp://'))
+    )
 
 def uniquefied_links(sorted_links):
     """
@@ -104,13 +85,33 @@ def uniquefied_links(sorted_links):
 
     return unique_timestamps.values()
 
-def valid_links(links):
-    """remove chrome://, about:// or other schemed links that cant be archived"""
-    return (
-        link
-        for link in links
-        if any(link['url'].startswith(s) for s in ('http://', 'https://', 'ftp://'))
-    )
+def sorted_links(links):
+    sort_func = lambda link: (link['timestamp'], link['url'])
+    return sorted(links, key=sort_func, reverse=True)
+
+
+
+def merge_links(a, b):
+    """deterministially merge two links, favoring longer field values over shorter,
+    and "cleaner" values over worse ones.
+    """
+    longer = lambda key: a[key] if len(a[key]) > len(b[key]) else b[key]
+    earlier = lambda key: a[key] if a[key] < b[key] else b[key]
+
+    url = longer('url')
+    longest_title = longer('title')
+    cleanest_title = a['title'] if '://' not in a['title'] else b['title']
+    link = {
+        'timestamp': earlier('timestamp'),
+        'url': url,
+        'domain': domain(url),
+        'base_url': base_url(url),
+        'tags': longer('tags'),
+        'title': longest_title if '://' not in longest_title else cleanest_title,
+        'sources': list(set(a.get('sources', []) + b.get('sources', []))),
+    }
+    link['type'] = get_link_type(link)
+    return link
 
 def links_after_timestamp(links, timestamp=None):
     if not timestamp:
diff --git a/parse.py b/parse.py
index c1d23a5b..0a203fe2 100644
--- a/parse.py
+++ b/parse.py
@@ -1,32 +1,36 @@
+"""
+Everything related to parsing links from bookmark services.
+
+For a list of supported services, see the README.md.
+For examples of supported files see examples/.
+
+Parsed link schema: {
+    'url': 'https://example.com/example/?abc=123&xyc=345#lmnop',
+    'domain': 'example.com',
+    'base_url': 'example.com/example/',
+    'timestamp': '15442123124234',
+    'tags': 'abc,def',
+    'title': 'Example.com Page Title',
+    'sources': ['ril_export.html', 'downloads/getpocket.com.txt'],
+}
+"""
+
 import re
 import json
+
 from datetime import datetime
 
 from util import (
     domain,
     base_url,
-    get_str_between,
+    str_between,
     get_link_type,
 )
 
 
-def parse_export(path):
-    """parse a list of links dictionaries from a bookmark export file"""
-
-    links = []
-    with open(path, 'r', encoding='utf-8') as file:
-        for service, parser_func in get_parsers().items():
-            # otherwise try all parsers until one works
-            try:
-                links += list(parser_func(file))
-                if links:
-                    break
-            except Exception as e:
-                pass
+def get_parsers(file):
+    """return all parsers that work on a given file, defaults to all of them"""
 
-    return links
-
-def get_parsers():
     return {
         'pocket': parse_pocket_export,
         'pinboard': parse_json_export,
@@ -34,12 +38,32 @@
         'bookmarks': parse_bookmarks_export,
         'rss': parse_rss_export,
     }
 
+def parse_links(path):
+    """parse a list of links dictionaries from a bookmark export file"""
+
+    links = []
+    with open(path, 'r', encoding='utf-8') as file:
+        for parser_func in get_parsers(file).values():
+            # otherwise try all parsers until one works
+            try:
+                links += list(parser_func(file))
+                if links:
+                    break
+            except (ValueError, TypeError):
+                # parser not supported on this file
+                pass
+
+    return links
+
+
 def parse_pocket_export(html_file):
     """Parse Pocket-format bookmarks export files (produced by getpocket.com/export/)"""
 
     html_file.seek(0)
-    pattern = re.compile("^\\s*<li><a href=\"(.+)\" time_added=\"(\\d+)\" tags=\"(.*)\">(.+)</a></li>", re.UNICODE)  # see sample input in ./example_ril_export.html
+    pattern = re.compile("^\\s*<li><a href=\"(.+)\" time_added=\"(\\d+)\" tags=\"(.*)\">(.+)</a></li>", re.UNICODE)
     for line in html_file:
+        # example line
+        # <li><a href="http://example.com/ time_added="1478739709" tags="tag1,tag2">example title</a></li>
         match = pattern.search(line)
         if match:
             fixed_url = match.group(1).replace('http://www.readability.com/read?url=', '')  # remove old readability prefixes to get original url
@@ -62,6 +86,8 @@ def parse_json_export(json_file):
     json_file.seek(0)
     json_content = json.load(json_file)
     for line in json_content:
+        # example line
+        # {"href":"http:\/\/www.reddit.com\/r\/example","description":"title here","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android"}]
         if line:
             erg = line
             time = datetime.strptime(erg['time'].split(',', 1)[0], '%Y-%m-%dT%H:%M:%SZ')
@@ -96,11 +122,12 @@ def parse_rss_export(rss_file):
         leading_removed = trailing_removed.split('<item>', 1)[-1]
         rows = leading_removed.split('\n')
 
-        row = lambda key: [r for r in rows if r.startswith('<{}>'.format(key))][0]
+        def get_row(key):
+            return [r for r in rows if r.startswith('<{}>'.format(key))][0]
 
-        title = get_str_between(row('title'), '<![CDATA[', ']]')
-        url = get_str_between(row('link'), '<link>', '</link>')
-        ts_str = get_str_between(row('pubDate'), '<pubDate>', '</pubDate>')
+        title = str_between(get_row('title'), '<![CDATA[', ']]')
+        url = str_between(get_row('link'), '<link>', '</link>')
+        ts_str = str_between(get_row('pubDate'), '<pubDate>', '</pubDate>')
         time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %z")
 
         info = {
@@ -112,17 +139,20 @@ def parse_rss_export(rss_file):
             'title': title,
             'sources': [rss_file.name],
         }
-        info['type'] = get_link_type(info)
-        # import ipdb; ipdb.set_trace()
+
         yield info
 
 
 def parse_bookmarks_export(html_file):
     """Parse netscape-format bookmarks export files (produced by all browsers)"""
 
+    html_file.seek(0)
     pattern = re.compile("<a href=\"(.+?)\" add_date=\"(\\d+)\"[^>]*>(.+)</a>", re.UNICODE | re.IGNORECASE)
     for line in html_file:
+        # example line
+        # <DT><A HREF="https://example.com/?q=1+2" ADD_DATE="1497562974" LAST_MODIFIED="1497562974" ICON_URI="https://example.com/favicon.ico" ICON="data:image/png;base64,...">example bookmark title</A>
+
         match = pattern.search(line)
         if match:
             url = match.group(1)
@@ -137,6 +167,6 @@ def parse_bookmarks_export(html_file):
                 'title': match.group(3),
                 'sources': [html_file.name],
             }
-            info['type'] = get_link_type(info)
+
             yield info
 
diff --git a/templates/index.html b/templates/index.html
index 2b125790..3b997de6 100644
--- a/templates/index.html
+++ b/templates/index.html
@@ -68,7 +68,7 @@
             <img src="https://nicksweeting.com/images/archive.png" height="36px">
             Archived Sites <img src="https://getpocket.com/favicon.ico" height="36px"> <br/>
             <small>
-                Archived with: <a href="https://github.com/pirate/bookmark-archiver">Bookmark Archiver</a> on $date_updated
+                <a href="https://github.com/pirate/bookmark-archiver">Bookmark Archiver</a>
             </small>
         </h1>
     </header>
diff --git a/templates/index_row.html b/templates/index_row.html
index 8ea4a4e9..508b29f0 100644
--- a/templates/index_row.html
+++ b/templates/index_row.html
@@ -4,7 +4,7 @@
         <img src="$favicon_url">
         $title <small style="background-color: #eee;border-radius:4px; float:right">$tags</small>
     </td>
-    <td style="text-align:center"><a href="$files_url/index.html" title="Files">📂</a></td>
+    <td style="text-align:center"><a href="$files_url" title="Files">📂</a></td>
     <td style="text-align:center"><a href="$pdf_link" title="PDF">📄</a></td>
     <td style="text-align:center"><a href="$screenshot_link" title="Screenshot">🖼</a></td>
     <td style="text-align:center"><a href="$archive_org_url" title="Archive.org">🏛</a></td>
diff --git a/templates/link_index.html b/templates/link_index.html
index 9f6a5d10..a4eb35df 100644
--- a/templates/link_index.html
+++ b/templates/link_index.html
@@ -140,7 +140,7 @@
                 <a href="#" class="collapse-icon" title="Collapse Navbar">
                     [-]
                 </a>
-                <a href="../../../index.html" class="nav-icon" title="Archived Sites">
+                <a href="./../../index.html" class="nav-icon" title="Archived Sites">
                     <img src="https://nicksweeting.com/images/archive.png" alt="Archive Icon">
                 </a>
                 $title<br/>
@@ -221,6 +221,7 @@
     </body>
     <script>
 
+        // show selected file in iframe when preview card is clicked
        jQuery('.card').on('click', function(e) {
            jQuery('.selected-card').removeClass('selected-card')
            jQuery(e.target).closest('.card').addClass('selected-card')
@@ -233,12 +234,16 @@
            }
            return true
        })
+
+        // un-sandbox iframes showing pdfs (required to display pdf viewer)
        jQuery('iframe').map(function() {
            if (this.src.endsWith('.pdf')) {
                this.removeAttribute('sandbox')
                this.src = this.src
            }
        })
+
+        // hide header when collapse icon is clicked
        jQuery('.collapse-icon').on('click', function() {
            if (jQuery('.collapse-icon').text().includes('[-]')) {
                jQuery('.collapse-icon').text('[+]')
@@ -251,6 +256,8 @@
            }
            return true
        })
+
+        // hide all preview iframes on small screens
        if (window.innerWidth < 1091) {
            jQuery('.card a[target=preview]').attr('target', '_self')
        }
diff --git a/util.py b/util.py
index 19966a10..98be2978 100644
--- a/util.py
+++ b/util.py
@@ -1,6 +1,8 @@
 import os
+import re
 import sys
 import time
+import json
 import requests
 
 from datetime import datetime
@@ -24,6 +26,17 @@ from config import (
     SUBMIT_ARCHIVE_DOT_ORG,
 )
 
+# URL helpers
+without_scheme = lambda url: url.replace('http://', '').replace('https://', '').replace('ftp://', '')
+without_query = lambda url: url.split('?', 1)[0]
+without_hash = lambda url: url.split('#', 1)[0]
+without_path = lambda url: url.split('/', 1)[0]
+domain = lambda url: without_hash(without_query(without_path(without_scheme(url))))
+base_url = lambda url: without_query(without_scheme(url))
+
+short_ts = lambda ts: ts.split('.')[0]
+
+
 def check_dependencies():
     """Check that all necessary dependencies are installed, and have valid versions"""
 
@@ -149,11 +162,15 @@ def progress(seconds=TIMEOUT, prefix=''):
 
 
 def download_url(url):
-    if not os.path.exists(os.path.join(ARCHIVE_DIR, 'downloads')):
-        os.makedirs(os.path.join(ARCHIVE_DIR, 'downloads'))
+    """download a given url's content into downloads/domain.txt"""
+
+    download_dir = os.path.join(ARCHIVE_DIR, 'downloads')
+
+    if not os.path.exists(download_dir):
+        os.makedirs(download_dir)
 
     url_domain = url.split('/', 3)[2]
-    output_path = os.path.join(ARCHIVE_DIR, 'downloads', '{}.txt'.format(url_domain))
+    output_path = os.path.join(download_dir, '{}.txt'.format(url_domain))
 
     print('[*] [{}] Downloading {} > {}'.format(
         datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
@@ -172,10 +189,10 @@ def download_url(url):
 
     with open(output_path, 'w', encoding='utf-8') as f:
         f.write(downloaded_xml)
 
+    return output_path
 
-
-def get_str_between(string, start, end=None):
+def str_between(string, start, end=None):
     """(<abc>12345</def>, <abc>, </def>) -> 12345"""
 
     content = string.split(start, 1)[-1]
@@ -184,9 +201,6 @@
 
     return content
 
-
-
-
 def get_link_type(link):
     """Certain types of links need to be handled specially, this figures out when that's the case"""
 
@@ -207,10 +221,130 @@
 
     return None
 
-# URL helpers
-without_scheme = lambda url: url.replace('http://', '').replace('https://', '').replace('ftp://', '')
-without_query = lambda url: url.split('?', 1)[0]
-without_hash = lambda url: url.split('#', 1)[0]
-without_path = lambda url: url.split('/', 1)[0]
-domain = lambda url: without_hash(without_query(without_path(without_scheme(url))))
-base_url = lambda url: without_query(without_scheme(url))
+def find_link(folder, links):
+    """for a given archive folder, find the corresponding link object in links"""
+    url = parse_url(folder)
+    if url:
+        for link in links:
+            if (link['base_url'] in url) or (url in link['url']):
+                return link
+
+    timestamp = folder.split('.')[0]
+    for link in links:
+        if link['timestamp'].startswith(timestamp):
+            if link['domain'] in os.listdir('./html/archive/' + folder):
+                return link  # careful now, this isn't safe for most ppl
+            if link['domain'] in parse_url(folder):
+                return link
+    return None
+
+
+def parse_url(folder):
+    """for a given archive folder, figure out what url it's for"""
+    link_json = os.path.join('./html/archive/' + folder, 'index.json')
+    if os.path.exists(link_json):
+        with open(link_json, 'r') as f:
+            link = json.load(f)
+            return link['base_url']
+
+    archive_org_txt = os.path.join('./html/archive/' + folder, 'archive.org.txt')
+    if os.path.exists(archive_org_txt):
+        with open(archive_org_txt, 'r') as f:
+            original_link = f.read().strip().split('/http', 1)[-1]
+            with_scheme = 'http{}'.format(original_link)
+            return with_scheme
+
+    return ''
+
+
+def merge_folders(folder, link):
+    """given a folder, merge it to the canonical 'correct' path for the given link object"""
+    base_url = parse_url(folder)
+    if not (base_url in link['base_url']
+            or link['base_url'] in base_url):
+        print(base_url, link['base_url'])
+        assert False
+    print('{} > {}'.format(folder, link['timestamp']))
+
+
+def cleanup_archive(path, links):
+    """move any incorrectly named folders to their canonical locations"""
+
+    # for each folder that exists, see if we can match it up with a known good link
+    # if we can, then merge the two folders, if not, move it to lost & found
+
+    # for each timestamp, find similar timestamped folders
+    # check each folder for a "domain.com" folder or
+
+    unmatched = []
+
+    for folder in os.listdir(path):
+        link = find_link(folder, links)
+        if link is None:
+            unmatched.append(folder)
+            continue
+
+        if folder != link['timestamp']:
+            merge_folders(folder, link)
+
+    if unmatched:
+        print('[!] Warning! {} unrecognized folders in html/archive/'.format(len(unmatched)))
+        print('\n '.join(unmatched))
+
+
+def html_appended_url(link):
+    """calculate the path to the wgetted .html file, since wget may
+    adjust some paths to be different than the base_url path.
+
+    See docs on wget --adjust-extension.
+    """
+
+    if link['type'] in ('PDF', 'image'):
+        return link['base_url']
+
+    split_url = link['url'].split('#', 1)
+    query = ('%3F' + link['url'].split('?', 1)[-1]) if '?' in link['url'] else ''
+
+    if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", split_url[0], re.I | re.M):
+        # already ends in .html
+        return link['base_url']
+    else:
+        # .html needs to be appended
+        without_scheme = split_url[0].split('://', 1)[-1].split('?', 1)[0]
+        if without_scheme.endswith('/'):
+            if query:
+                return '#'.join([without_scheme + 'index.html' + query + '.html', *split_url[1:]])
+            return '#'.join([without_scheme + 'index.html', *split_url[1:]])
+        else:
+            if query:
+                return '#'.join([without_scheme + '/index.html' + query + '.html', *split_url[1:]])
+            elif '/' in without_scheme:
+                return '#'.join([without_scheme + '.html', *split_url[1:]])
+            return link['base_url'] + '/index.html'
+
+
+def derived_link_info(link):
+    """extend link info with the archive urls and other derived data"""
+
+    link_info = {
+        **link,
+        'date': datetime.fromtimestamp(float(link['timestamp'])).strftime('%Y-%m-%d %H:%M'),
+        'google_favicon_url': 'https://www.google.com/s2/favicons?domain={domain}'.format(**link),
+        'favicon_url': './archive/{timestamp}/favicon.ico'.format(**link),
+        'files_url': './archive/{timestamp}/index.html'.format(**link),
+        'archive_url': './archive/{}/{}'.format(link['timestamp'], html_appended_url(link)),
+        'pdf_link': './archive/{timestamp}/output.pdf'.format(**link),
+        'screenshot_link': './archive/{timestamp}/screenshot.png'.format(**link),
+        'archive_org_url': 'https://web.archive.org/web/{base_url}'.format(**link),
+    }
+
+    # PDF and images are handled slightly differently
+    # wget, screenshot, & pdf urls all point to the same file
+    if link['type'] in ('PDF', 'image'):
+        link_info.update({
+            'archive_url': 'archive/{timestamp}/{base_url}'.format(**link),
+            'pdf_link': 'archive/{timestamp}/{base_url}'.format(**link),
+            'screenshot_link': 'archive/{timestamp}/{base_url}'.format(**link),
+            'title': '{title} ({type})'.format(**link),
+        })
    return link_info