diff --git a/examples/firefox_export.html b/examples/firefox_export.html index 56ffd77d..349ffe10 100644 --- a/examples/firefox_export.html +++ b/examples/firefox_export.html @@ -22,6 +22,7 @@
archive firefox bookmarks at DuckDuckGo
nodiscc (nodiscc) · GitHub
pirate/bookmark-archiver · Github +
Phonotactic Reconstruction of Encrypted VoIP Conversations
Firefox Bookmarks Archiver - gHacks Tech News

Bookmarks Toolbar

diff --git a/fetch.py b/fetch.py index 3a13fd74..32116408 100644 --- a/fetch.py +++ b/fetch.py @@ -32,7 +32,7 @@ def fetch_wget(out_dir, link, overwrite=False, requisites=True, timeout=60): """download full site using wget""" domain = link['base_url'].split('/', 1)[0] - if not os.path.exists('{}/{}'.format(out_dir, domain)) or overwrite: + if not os.path.exists(os.path.join(out_dir, domain)) or overwrite: print(' - Downloading Full Site') CMD = [ *'wget --timestamping --adjust-extension --no-parent'.split(' '), # Docs: https://www.gnu.org/software/wget/manual/wget.html @@ -54,7 +54,9 @@ def fetch_wget(out_dir, link, overwrite=False, requisites=True, timeout=60): def fetch_pdf(out_dir, link, overwrite=False, timeout=60, chrome_binary='chromium-browser'): """print PDF of site to file using chrome --headless""" - if (not os.path.exists('{}/output.pdf'.format(out_dir)) or overwrite) and link['type'] not in ('PDF', 'image'): + path = os.path.join(out_dir, 'output.pdf') + + if (not os.path.exists(path) or overwrite) and link['type'] not in ('PDF', 'image'): print(' - Printing PDF') CMD = [ chrome_binary, @@ -76,7 +78,9 @@ def fetch_pdf(out_dir, link, overwrite=False, timeout=60, chrome_binary='chromiu def fetch_screenshot(out_dir, link, overwrite=False, timeout=60, chrome_binary='chromium-browser', resolution='1440,900'): """take screenshot of site using chrome --headless""" - if (not os.path.exists('{}/screenshot.png'.format(out_dir)) or overwrite) and link['type'] not in ('PDF', 'image'): + path = os.path.join(out_dir, 'screenshot.png') + + if (not os.path.exists(path) or overwrite) and link['type'] not in ('PDF', 'image'): print(' - Snapping Screenshot') CMD = [ chrome_binary, @@ -98,7 +102,10 @@ def fetch_screenshot(out_dir, link, overwrite=False, timeout=60, chrome_binary=' def archive_dot_org(out_dir, link, overwrite=False, timeout=60): """submit site to archive.org for archiving via their service, save returned archive url""" - if (not 
os.path.exists('{}/archive.org.txt'.format(out_dir)) or overwrite): + + path = os.path.join(out_dir, 'archive.org.txt') + + if not os.path.exists(path) or overwrite: print(' - Submitting to archive.org') submit_url = 'https://web.archive.org/save/{}'.format(link['url'].split('?', 1)[0]) @@ -129,7 +136,9 @@ def archive_dot_org(out_dir, link, overwrite=False, timeout=60): def fetch_favicon(out_dir, link, overwrite=False, timeout=60): """download site favicon from google's favicon api""" - if not os.path.exists('{}/favicon.ico'.format(out_dir)) or overwrite: + path = os.path.join(out_dir, 'favicon.ico') + + if not os.path.exists(path) or overwrite: print(' - Fetching Favicon') CMD = 'curl https://www.google.com/s2/favicons?domain={domain}'.format(**link).split(' ') fout = open('{}/favicon.ico'.format(out_dir), 'w') @@ -149,7 +158,9 @@ def fetch_audio(out_dir, link, overwrite=False, timeout=60): if link['type'] not in ('soundcloud',): return - if (not os.path.exists('{}/audio'.format(out_dir)) or overwrite): + path = os.path.join(out_dir, 'audio') + + if not os.path.exists(path) or overwrite: print(' - Downloading audio') CMD = [ "youtube-dl -x --audio-format mp3 --audio-quality 0 -o '%(title)s.%(ext)s'", @@ -173,8 +184,9 @@ def fetch_video(out_dir, link, overwrite=False, timeout=60): if link['type'] not in ('youtube', 'youku', 'vimeo'): return + path = os.path.join(out_dir, 'video') - if (not os.path.exists('{}/video'.format(out_dir)) or overwrite): + if not os.path.exists(path) or overwrite: print(' - Downloading video') CMD = [ "youtube-dl -x --audio-format mp3 --audio-quality 0 -o '%(title)s.%(ext)s'", diff --git a/index.py b/index.py index 19af455e..92787481 100644 --- a/index.py +++ b/index.py @@ -19,7 +19,12 @@ def dump_index(links, service): link_html.format(**derived_link_info(link)) for link in links ) - template_vars = (datetime.now().strftime('%Y-%m-%d %H:%M'), article_rows) + template_vars = { + 'num_links': len(links), + 'date_updated': 
datetime.now().strftime('%Y-%m-%d'), + 'time_updated': datetime.now().strftime('%Y-%m-%d %H:%M'), + 'rows': article_rows, + } with open(os.path.join(service, 'index.html'), 'w', encoding='utf-8') as f: - f.write(index_html.format(*template_vars)) + f.write(index_html.format(**template_vars)) diff --git a/parse.py b/parse.py index 6e9e7b50..d130b298 100644 --- a/parse.py +++ b/parse.py @@ -168,8 +168,8 @@ def valid_links(links): return (link for link in links if link['url'].startswith('http') or link['url'].startswith('ftp')) -def calculate_archive_url(link): - """calculate the path to the wgetted html file, since wget may +def html_appended_url(link): + """calculate the path to the wgetted .html file, since wget may adjust some paths to be different than the base_url path. See docs on wget --adjust-extension.""" @@ -190,7 +190,13 @@ def calculate_archive_url(link): def derived_link_info(link): """extend link info with the archive urls and other derived data""" - link_info = {**link} + link_info = { + **link, + 'files_url': 'archive/{timestamp}/'.format(**link), + 'archive_org_url': 'https://web.archive.org/web/{base_url}'.format(**link), + 'favicon_url': 'archive/{timestamp}/favicon.ico'.format(**link), + 'google_favicon_url': 'https://www.google.com/s2/favicons?domain={domain}'.format(**link), + } # PDF and images are handled slightly differently # wget, screenshot, & pdf urls all point to the same file @@ -203,7 +209,7 @@ def derived_link_info(link): }) else: link_info.update({ - 'archive_url': calculate_archive_url(link), + 'archive_url': 'archive/{}/{}'.format(link['timestamp'], html_appended_url(link)), 'pdf_link': 'archive/{timestamp}/output.pdf'.format(**link), 'screenshot_link': 'archive/{timestamp}/screenshot.png'.format(**link) }) diff --git a/templates/index.html b/templates/index.html index 594a09c7..f6287e5d 100644 --- a/templates/index.html +++ b/templates/index.html @@ -64,11 +64,11 @@
-

+

Archived Sites
- Exported with: Bookmark Archiver + Archived with: Bookmark Archiver on {date_updated}

@@ -76,7 +76,7 @@ Starred - Saved Article + Saved Articles ({num_links}) Files PDF Screenshot @@ -84,7 +84,7 @@ Original URL - {} + {rows} diff --git a/templates/index_row.html b/templates/index_row.html index 553c4d57..ffe4bb9e 100644 --- a/templates/index_row.html +++ b/templates/index_row.html @@ -1,12 +1,12 @@ {time} - - + + {title} {tags} - 📂 + 📂 📄 🖼 - 🏛 - 🔗 {url} + 🏛 + 🔗 {url}