diff --git a/examples/firefox_export.html b/examples/firefox_export.html
index 56ffd77d..349ffe10 100644
--- a/examples/firefox_export.html
+++ b/examples/firefox_export.html
@@ -22,6 +22,7 @@
archive firefox bookmarks at DuckDuckGo
nodiscc (nodiscc) · GitHub
pirate/bookmark-archiver · Github
+ Phonotactic Reconstruction of Encrypted VoIP Conversations
Firefox Bookmarks Archiver - gHacks Tech News
Bookmarks Toolbar
diff --git a/fetch.py b/fetch.py
index 3a13fd74..32116408 100644
--- a/fetch.py
+++ b/fetch.py
@@ -32,7 +32,7 @@ def fetch_wget(out_dir, link, overwrite=False, requisites=True, timeout=60):
"""download full site using wget"""
domain = link['base_url'].split('/', 1)[0]
- if not os.path.exists('{}/{}'.format(out_dir, domain)) or overwrite:
+ if not os.path.exists(os.path.join(out_dir, domain)) or overwrite:
print(' - Downloading Full Site')
CMD = [
*'wget --timestamping --adjust-extension --no-parent'.split(' '), # Docs: https://www.gnu.org/software/wget/manual/wget.html
@@ -54,7 +54,9 @@ def fetch_wget(out_dir, link, overwrite=False, requisites=True, timeout=60):
def fetch_pdf(out_dir, link, overwrite=False, timeout=60, chrome_binary='chromium-browser'):
"""print PDF of site to file using chrome --headless"""
- if (not os.path.exists('{}/output.pdf'.format(out_dir)) or overwrite) and link['type'] not in ('PDF', 'image'):
+ path = os.path.join(out_dir, 'output.pdf')
+
+ if (not os.path.exists(path) or overwrite) and link['type'] not in ('PDF', 'image'):
print(' - Printing PDF')
CMD = [
chrome_binary,
@@ -76,7 +78,9 @@ def fetch_pdf(out_dir, link, overwrite=False, timeout=60, chrome_binary='chromiu
def fetch_screenshot(out_dir, link, overwrite=False, timeout=60, chrome_binary='chromium-browser', resolution='1440,900'):
"""take screenshot of site using chrome --headless"""
- if (not os.path.exists('{}/screenshot.png'.format(out_dir)) or overwrite) and link['type'] not in ('PDF', 'image'):
+ path = os.path.join(out_dir, 'screenshot.png')
+
+ if (not os.path.exists(path) or overwrite) and link['type'] not in ('PDF', 'image'):
print(' - Snapping Screenshot')
CMD = [
chrome_binary,
@@ -98,7 +102,10 @@ def fetch_screenshot(out_dir, link, overwrite=False, timeout=60, chrome_binary='
def archive_dot_org(out_dir, link, overwrite=False, timeout=60):
"""submit site to archive.org for archiving via their service, save returned archive url"""
- if (not os.path.exists('{}/archive.org.txt'.format(out_dir)) or overwrite):
+
+ path = os.path.join(out_dir, 'archive.org.txt')
+
+ if not os.path.exists(path) or overwrite:
print(' - Submitting to archive.org')
submit_url = 'https://web.archive.org/save/{}'.format(link['url'].split('?', 1)[0])
@@ -129,7 +136,9 @@ def archive_dot_org(out_dir, link, overwrite=False, timeout=60):
def fetch_favicon(out_dir, link, overwrite=False, timeout=60):
"""download site favicon from google's favicon api"""
- if not os.path.exists('{}/favicon.ico'.format(out_dir)) or overwrite:
+ path = os.path.join(out_dir, 'favicon.ico')
+
+ if not os.path.exists(path) or overwrite:
print(' - Fetching Favicon')
CMD = 'curl https://www.google.com/s2/favicons?domain={domain}'.format(**link).split(' ')
fout = open('{}/favicon.ico'.format(out_dir), 'w')
@@ -149,7 +158,9 @@ def fetch_audio(out_dir, link, overwrite=False, timeout=60):
if link['type'] not in ('soundcloud',):
return
- if (not os.path.exists('{}/audio'.format(out_dir)) or overwrite):
+ path = os.path.join(out_dir, 'audio')
+
+ if not os.path.exists(path) or overwrite:
print(' - Downloading audio')
CMD = [
"youtube-dl -x --audio-format mp3 --audio-quality 0 -o '%(title)s.%(ext)s'",
@@ -173,8 +184,9 @@ def fetch_video(out_dir, link, overwrite=False, timeout=60):
if link['type'] not in ('youtube', 'youku', 'vimeo'):
return
+ path = os.path.join(out_dir, 'video')
- if (not os.path.exists('{}/video'.format(out_dir)) or overwrite):
+ if not os.path.exists(path) or overwrite:
print(' - Downloading video')
CMD = [
"youtube-dl -x --audio-format mp3 --audio-quality 0 -o '%(title)s.%(ext)s'",
diff --git a/index.py b/index.py
index 19af455e..92787481 100644
--- a/index.py
+++ b/index.py
@@ -19,7 +19,12 @@ def dump_index(links, service):
link_html.format(**derived_link_info(link)) for link in links
)
- template_vars = (datetime.now().strftime('%Y-%m-%d %H:%M'), article_rows)
+ template_vars = {
+ 'num_links': len(links),
+ 'date_updated': datetime.now().strftime('%Y-%m-%d'),
+ 'time_updated': datetime.now().strftime('%Y-%m-%d %H:%M'),
+ 'rows': article_rows,
+ }
with open(os.path.join(service, 'index.html'), 'w', encoding='utf-8') as f:
- f.write(index_html.format(*template_vars))
+ f.write(index_html.format(**template_vars))
diff --git a/parse.py b/parse.py
index 6e9e7b50..d130b298 100644
--- a/parse.py
+++ b/parse.py
@@ -168,8 +168,8 @@ def valid_links(links):
return (link for link in links if link['url'].startswith('http') or link['url'].startswith('ftp'))
-def calculate_archive_url(link):
- """calculate the path to the wgetted html file, since wget may
+def html_appended_url(link):
+ """calculate the path to the wgetted .html file, since wget may
adjust some paths to be different than the base_url path.
See docs on wget --adjust-extension."""
@@ -190,7 +190,13 @@ def calculate_archive_url(link):
def derived_link_info(link):
"""extend link info with the archive urls and other derived data"""
- link_info = {**link}
+ link_info = {
+ **link,
+ 'files_url': 'archive/{timestamp}/'.format(**link),
+ 'archive_org_url': 'https://web.archive.org/web/{base_url}'.format(**link),
+ 'favicon_url': 'archive/{timestamp}/favicon.ico'.format(**link),
+ 'google_favicon_url': 'https://www.google.com/s2/favicons?domain={domain}'.format(**link),
+ }
# PDF and images are handled slightly differently
# wget, screenshot, & pdf urls all point to the same file
@@ -203,7 +209,7 @@ def derived_link_info(link):
})
else:
link_info.update({
- 'archive_url': calculate_archive_url(link),
+ 'archive_url': 'archive/{}/{}'.format(link['timestamp'], html_appended_url(link)),
'pdf_link': 'archive/{timestamp}/output.pdf'.format(**link),
'screenshot_link': 'archive/{timestamp}/screenshot.png'.format(**link)
})
diff --git a/templates/index.html b/templates/index.html
index 594a09c7..f6287e5d 100644
--- a/templates/index.html
+++ b/templates/index.html
@@ -64,11 +64,11 @@
@@ -76,7 +76,7 @@
Starred |
- Saved Article |
+ Saved Articles ({num_links}) |
Files |
PDF |
Screenshot |
@@ -84,7 +84,7 @@
Original URL |
- {}
+ {rows}