Mirror of https://github.com/ArchiveBox/ArchiveBox.git, synced 2025-05-14 07:04:27 -04:00
refactoring and fancy new link index
parent 1249493fcd
commit a95912679e
7 changed files with 295 additions and 174 deletions
index.py (115 lines changed)
@@ -1,5 +1,4 @@
 import os
-import re
 import json
 
 from datetime import datetime
@@ -14,20 +13,15 @@ from config import (
     ANSI,
     GIT_SHA,
 )
-from util import chmod_file
+from util import (
+    chmod_file,
+    html_appended_url,
+    derived_link_info,
+)
 
 
 ### Homepage index for all the links
 
-def parse_json_links_index(out_dir):
-    """load the index in a given directory and merge it with the given link"""
-    index_path = os.path.join(out_dir, 'index.json')
-    if os.path.exists(index_path):
-        with open(index_path, 'r', encoding='utf-8') as f:
-            return json.load(f)['links']
-
-    return []
-
 def write_links_index(out_dir, links):
     """create index.html file for a given list of links"""
 
@@ -44,8 +38,6 @@ def write_links_index(out_dir, links):
     write_json_links_index(out_dir, links)
     write_html_links_index(out_dir, links)
 
-    chmod_file(out_dir, permissions=ARCHIVE_PERMISSIONS)
-
 def write_json_links_index(out_dir, links):
     """write the json link index to a given path"""
 
@@ -65,6 +57,15 @@ def write_json_links_index(out_dir, links):
 
     chmod_file(path)
 
+def parse_json_links_index(out_dir):
+    """load the index in a given directory and merge it with the given link"""
+    index_path = os.path.join(out_dir, 'index.json')
+    if os.path.exists(index_path):
+        with open(index_path, 'r', encoding='utf-8') as f:
+            return json.load(f)['links']
+
+    return []
+
 def write_html_links_index(out_dir, links):
     """write the html link index to a given path"""
 
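The relocated parse_json_links_index() above returns json.load(f)['links'], so the archive-wide index.json is expected to wrap its entries under a top-level 'links' key. A minimal round-trip sketch of that layout; the out_dir and the entry's field values are invented for illustration:

import json
import os

out_dir = 'html'                                    # hypothetical output directory
index_path = os.path.join(out_dir, 'index.json')

example_index = {
    'links': [                                      # key required by parse_json_links_index()
        {
            'timestamp': '1544212312.4234',         # illustrative values only
            'url': 'https://example.com/example/',
            'title': 'Example.com Page Title',
        },
    ],
}

os.makedirs(out_dir, exist_ok=True)
with open(index_path, 'w', encoding='utf-8') as f:
    json.dump(example_index, f, indent=4)

with open(index_path, 'r', encoding='utf-8') as f:
    print(len(json.load(f)['links']))               # -> 1, same access pattern as above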
@@ -91,17 +92,11 @@ def write_html_links_index(out_dir, links):
     with open(path, 'w', encoding='utf-8') as f:
         f.write(Template(index_html).substitute(**template_vars))
 
+    chmod_file(path)
+
 
 ### Individual link index
 
-def parse_json_link_index(out_dir):
-    """load the index in a given directory and merge it with the given link"""
-    existing_index = os.path.join(out_dir, 'index.json')
-    if os.path.exists(existing_index):
-        with open(existing_index, 'r', encoding='utf-8') as f:
-            return json.load(f)
-    return {}
-
 def write_link_index(out_dir, link):
     link['updated'] = str(datetime.now().timestamp())
     write_json_link_index(out_dir, link)
@@ -112,85 +107,39 @@ def write_json_link_index(out_dir, link):
 
     path = os.path.join(out_dir, 'index.json')
 
+    print(' √ Updating: index.json')
+
     with open(path, 'w', encoding='utf-8') as f:
         json.dump(link, f, indent=4, default=str)
 
     chmod_file(path)
 
+def parse_json_link_index(out_dir):
+    """load the json link index from a given directory"""
+    existing_index = os.path.join(out_dir, 'index.json')
+    if os.path.exists(existing_index):
+        with open(existing_index, 'r', encoding='utf-8') as f:
+            return json.load(f)
+    return {}
+
 def write_html_link_index(out_dir, link):
     with open(LINK_INDEX_TEMPLATE, 'r', encoding='utf-8') as f:
         link_html = f.read()
 
     path = os.path.join(out_dir, 'index.html')
 
+    print(' √ Updating: index.html')
+
     with open(path, 'w', encoding='utf-8') as f:
         f.write(Template(link_html).substitute({
             **link,
-            **link['methods'],
+            **link['latest'],
             'type': link['type'] or 'website',
-            'tags': link['tags'] or '',
+            'tags': link['tags'] or 'untagged',
             'bookmarked': datetime.fromtimestamp(float(link['timestamp'])).strftime('%Y-%m-%d %H:%M'),
             'updated': datetime.fromtimestamp(float(link['updated'])).strftime('%Y-%m-%d %H:%M'),
-            'archive_org': link['methods']['archive_org'] or 'https://web.archive.org/save/{}'.format(link['url']),
-            'wget': link['methods']['wget'] or link['domain'],
+            'archive_org': link['latest']['archive_org'] or 'https://web.archive.org/save/{}'.format(link['url']),
+            'wget': link['latest']['wget'] or link['domain'],
         }))
 
     chmod_file(path)
-
-
-def html_appended_url(link):
-    """calculate the path to the wgetted .html file, since wget may
-    adjust some paths to be different than the base_url path.
-
-    See docs on wget --adjust-extension."""
-
-    if link['type'] in ('PDF', 'image'):
-        return link['base_url']
-
-    split_url = link['url'].split('#', 1)
-    query = ('%3F' + link['url'].split('?', 1)[-1]) if '?' in link['url'] else ''
-
-    if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", split_url[0], re.I | re.M):
-        # already ends in .html
-        return link['base_url']
-    else:
-        # .html needs to be appended
-        without_scheme = split_url[0].split('://', 1)[-1].split('?', 1)[0]
-        if without_scheme.endswith('/'):
-            if query:
-                return '#'.join([without_scheme + 'index.html' + query + '.html', *split_url[1:]])
-            return '#'.join([without_scheme + 'index.html', *split_url[1:]])
-        else:
-            if query:
-                return '#'.join([without_scheme + '/index.html' + query + '.html', *split_url[1:]])
-            elif '/' in without_scheme:
-                return '#'.join([without_scheme + '.html', *split_url[1:]])
-            return link['base_url'] + '/index.html'
-
-
-def derived_link_info(link):
-    """extend link info with the archive urls and other derived data"""
-
-    link_info = {
-        **link,
-        'date': datetime.fromtimestamp(float(link['timestamp'])).strftime('%Y-%m-%d %H:%M'),
-        'google_favicon_url': 'https://www.google.com/s2/favicons?domain={domain}'.format(**link),
-        'favicon_url': 'archive/{timestamp}/favicon.ico'.format(**link),
-        'files_url': 'archive/{timestamp}/'.format(**link),
-        'archive_url': 'archive/{}/{}'.format(link['timestamp'], html_appended_url(link)),
-        'pdf_link': 'archive/{timestamp}/output.pdf'.format(**link),
-        'screenshot_link': 'archive/{timestamp}/screenshot.png'.format(**link),
-        'archive_org_url': 'https://web.archive.org/web/{base_url}'.format(**link),
-    }
-
-    # PDF and images are handled slightly differently
-    # wget, screenshot, & pdf urls all point to the same file
-    if link['type'] in ('PDF', 'image'):
-        link_info.update({
-            'archive_url': 'archive/{timestamp}/{base_url}'.format(**link),
-            'pdf_link': 'archive/{timestamp}/{base_url}'.format(**link),
-            'screenshot_link': 'archive/{timestamp}/{base_url}'.format(**link),
-            'title': '{title} ({type})'.format(**link),
-        })
-    return link_info
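write_html_link_index() above now pulls the per-method outputs from link['latest'] (renamed from link['methods'] in this commit). Based on the Link schema documented in links.py below, a record passed into it would look roughly like this; every value here is illustrative:

# Illustrative Link record as consumed by write_html_link_index() above.
# The key names follow the Link schema in links.py; the values are made up.
example_link = {
    'timestamp': '1544212312.4234',
    'url': 'https://example.com/example/',
    'base_url': 'example.com/example/',
    'domain': 'example.com',
    'type': None,                       # falls back to 'website' in the template vars
    'title': 'Example.com Page Title',
    'tags': '',                         # falls back to 'untagged' in the template vars
    'sources': ['ril_export.html'],
    'updated': '1544212399.0',
    'latest': {                         # renamed from 'methods' in this commit
        'wget': 'example.com/example/index.html',
        'pdf': 'output.pdf',
        'screenshot': 'screenshot.png',
        'archive_org': None,            # triggers the web.archive.org/save/ fallback
    },
}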
links.py (97 lines changed)
@@ -1,18 +1,11 @@
-from util import (
-    domain,
-    base_url,
-    get_str_between,
-    get_link_type,
-)
-
 """
 In Bookmark Archiver, a Link represents a single entry that we track in the
 json index. All links pass through all archiver functions and the latest,
-most up-to-date canonical output for each is stored in "latest_archives".
-.
+most up-to-date canonical output for each is stored in "latest".
+
 
 Link {
-    timestamp: float,       (how we uniquely id links)    _   _  _ _  ___
+    timestamp: str,         (how we uniquely id links)    _   _  _ _  ___
     url: str,                                            | \ / \ |\| ' |
     base_url: str,                                       |_/ \_/ | |  |
     domain: str,                                          _  _ _ _ _  _
@@ -20,7 +13,7 @@ Link {
     type: str,                                           | /"| | |  | \_,
     title: str,                                            ,-'"`-.
     sources: [str],                                      /// /  @ @  \ \\\\
-    latest_archives: {                                     :=|  ,._,.  |=: /
+    latest: {                                            \ :=|  ,._,.  |=: /
         ...,                                               || ,\  \_../ /. ||
         pdf: 'output.pdf',                                 ||','`-._))'`.`||
         wget: 'example.com/1234/index.html'                 `-'     (/    `-'
@@ -39,10 +32,18 @@ Link {
 
 """
 
+from util import (
+    domain,
+    base_url,
+    get_str_between,
+    get_link_type,
+)
+
+
 def validate_links(links):
-    links = valid_links(links)          # remove chrome://, about:, mailto: etc.
-    links = uniquefied_links(links)     # fix duplicate timestamps, returns sorted list
-    links = sorted_links(links)         # deterministically sort the links
+    links = archivable_links(links)     # remove chrome://, about:, mailto: etc.
+    links = uniquefied_links(links)     # merge/dedupe duplicate timestamps & urls
+    links = sorted_links(links)         # deterministically sort the links based on timstamp, url
 
     if not links:
         print('[X] No links found :(')
@@ -50,35 +51,15 @@ def validate_links(links):
 
     return list(links)
 
-def sorted_links(links):
-    return sorted(
-        links,
-        key=lambda link: (link['timestamp'], link['url']),
-        reverse=True,
-    )
-
-def merge_links(link1, link2):
-    """deterministially merge two links, favoring longer field values over shorter,
-    and "cleaner" values over worse ones.
-    """
-    longer = lambda a, b, key: a[key] if len(a[key]) > len(b[key]) else b[key]
-    earlier = lambda a, b, key: a[key] if a[key] < b[key] else b[key]
-
-    url = longer(link1, link2, 'url')
-    longest_title = longer(link1, link2, 'title')
-    cleanest_title = link1['title'] if '://' not in link1['title'] else link2['title']
-    link = {
-        'url': url,
-        'domain': domain(url),
-        'base_url': base_url(url),
-        'timestamp': earlier(link1, link2, 'timestamp'),
-        'tags': longer(link1, link2, 'tags'),
-        'title': longest_title if '://' not in longest_title else cleanest_title,
-        'sources': list(set(link1['sources'] + link2['sources'])),
-    }
-    link['type'] = get_link_type(link)
-    return link
+def archivable_links(links):
+    """remove chrome://, about:// or other schemed links that cant be archived"""
+    return (
+        link
+        for link in links
+        if any(link['url'].startswith(s) for s in ('http://', 'https://', 'ftp://'))
+    )
+
 
 def uniquefied_links(sorted_links):
     """
     ensures that all non-duplicate links have monotonically increasing timestamps
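archivable_links() is a plain generator over the scheme check shown above; a self-contained check of that filter, with the function body copied from this hunk and invented sample links:

def archivable_links(links):
    """remove chrome://, about:// or other schemed links that cant be archived"""
    return (
        link
        for link in links
        if any(link['url'].startswith(s) for s in ('http://', 'https://', 'ftp://'))
    )

sample = [
    {'url': 'https://example.com/example/'},
    {'url': 'chrome://settings'},
    {'url': 'about:blank'},
    {'url': 'ftp://ftp.example.com/pub/file.txt'},
]

print([link['url'] for link in archivable_links(sample)])
# -> ['https://example.com/example/', 'ftp://ftp.example.com/pub/file.txt']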
@@ -104,13 +85,33 @@ def uniquefied_links(sorted_links):
 
     return unique_timestamps.values()
 
-def valid_links(links):
-    """remove chrome://, about:// or other schemed links that cant be archived"""
-    return (
-        link
-        for link in links
-        if any(link['url'].startswith(s) for s in ('http://', 'https://', 'ftp://'))
-    )
+def sorted_links(links):
+    sort_func = lambda link: (link['timestamp'], link['url'])
+    return sorted(links, key=sort_func, reverse=True)
+
+
+def merge_links(a, b):
+    """deterministially merge two links, favoring longer field values over shorter,
+    and "cleaner" values over worse ones.
+    """
+    longer = lambda key: a[key] if len(a[key]) > len(b[key]) else b[key]
+    earlier = lambda key: a[key] if a[key] < b[key] else b[key]
+
+    url = longer('url')
+    longest_title = longer('title')
+    cleanest_title = a['title'] if '://' not in a['title'] else b['title']
+    link = {
+        'timestamp': earlier('timestamp'),
+        'url': url,
+        'domain': domain(url),
+        'base_url': base_url(url),
+        'tags': longer('tags'),
+        'title': longest_title if '://' not in longest_title else cleanest_title,
+        'sources': list(set(a.get('sources', []) + b.get('sources', []))),
+    }
+    link['type'] = get_link_type(link)
+    return link
+
 
 def links_after_timestamp(links, timestamp=None):
     if not timestamp:
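A worked illustration of the selection rules inside the new merge_links(a, b): longer() keeps the longer string per field and earlier() keeps the smaller (older) timestamp. domain(), base_url() and get_link_type() come from util and are omitted here; both sample records are invented:

a = {'timestamp': '1544212312.4234', 'url': 'https://example.com/example/',
     'title': 'Example', 'tags': '', 'sources': ['ril_export.html']}
b = {'timestamp': '1544212399.0000', 'url': 'https://example.com/example/?utm_source=feed',
     'title': 'Example.com Page Title', 'tags': 'abc,def', 'sources': ['feed.rss']}

longer = lambda key: a[key] if len(a[key]) > len(b[key]) else b[key]
earlier = lambda key: a[key] if a[key] < b[key] else b[key]

print(earlier('timestamp'))                    # '1544212312.4234'  (older timestamp wins)
print(longer('url'))                           # the longer url from b wins
print(longer('tags'))                          # 'abc,def'
print(longer('title'))                         # 'Example.com Page Title'
print(list(set(a['sources'] + b['sources']))) # union of sources (order not guaranteed)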
parse.py (80 lines changed)
@@ -1,32 +1,36 @@
+"""
+Everything related to parsing links from bookmark services.
+
+For a list of supported services, see the README.md.
+For examples of supported files see examples/.
+
+Parsed link schema: {
+    'url': 'https://example.com/example/?abc=123&xyc=345#lmnop',
+    'domain': 'example.com',
+    'base_url': 'example.com/example/',
+    'timestamp': '15442123124234',
+    'tags': 'abc,def',
+    'title': 'Example.com Page Title',
+    'sources': ['ril_export.html', 'downloads/getpocket.com.txt'],
+}
+"""
+
 import re
 import json
 
 from datetime import datetime
 
 from util import (
     domain,
     base_url,
-    get_str_between,
+    str_between,
     get_link_type,
 )
 
 
-def parse_export(path):
-    """parse a list of links dictionaries from a bookmark export file"""
-
-    links = []
-    with open(path, 'r', encoding='utf-8') as file:
-        for service, parser_func in get_parsers().items():
-            # otherwise try all parsers until one works
-            try:
-                links += list(parser_func(file))
-                if links:
-                    break
-            except Exception as e:
-                pass
-
-    return links
-
-def get_parsers():
+def get_parsers(file):
+    """return all parsers that work on a given file, defaults to all of them"""
     return {
         'pocket': parse_pocket_export,
         'pinboard': parse_json_export,
@@ -34,12 +38,32 @@ def get_parsers():
         'rss': parse_rss_export,
     }
 
+def parse_links(path):
+    """parse a list of links dictionaries from a bookmark export file"""
+
+    links = []
+    with open(path, 'r', encoding='utf-8') as file:
+        for parser_func in get_parsers(file).values():
+            # otherwise try all parsers until one works
+            try:
+                links += list(parser_func(file))
+                if links:
+                    break
+            except (ValueError, TypeError):
+                # parser not supported on this file
+                pass
+
+    return links
+
 
 def parse_pocket_export(html_file):
     """Parse Pocket-format bookmarks export files (produced by getpocket.com/export/)"""
 
     html_file.seek(0)
-    pattern = re.compile("^\\s*<li><a href=\"(.+)\" time_added=\"(\\d+)\" tags=\"(.*)\">(.+)</a></li>", re.UNICODE)  # see sample input in ./example_ril_export.html
+    pattern = re.compile("^\\s*<li><a href=\"(.+)\" time_added=\"(\\d+)\" tags=\"(.*)\">(.+)</a></li>", re.UNICODE)
     for line in html_file:
+        # example line
+        # <li><a href="http://example.com/ time_added="1478739709" tags="tag1,tag2">example title</a></li>
         match = pattern.search(line)
         if match:
             fixed_url = match.group(1).replace('http://www.readability.com/read?url=', '')  # remove old readability prefixes to get original url
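A minimal usage sketch for the new parse_links() entry point defined above, assuming the repo's modules are importable and that a bookmark export exists at the (hypothetical) path below:

from parse import parse_links

links = parse_links('downloads/getpocket.com.txt')   # hypothetical export file path
for link in links:
    # each dict follows the "Parsed link schema" documented at the top of parse.py
    print(link['timestamp'], link['url'], link['tags'])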
@@ -62,6 +86,8 @@ def parse_json_export(json_file):
     json_file.seek(0)
     json_content = json.load(json_file)
     for line in json_content:
+        # example line
+        # {"href":"http:\/\/www.reddit.com\/r\/example","description":"title here","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android"}]
         if line:
             erg = line
             time = datetime.strptime(erg['time'].split(',', 1)[0], '%Y-%m-%dT%H:%M:%SZ')
@@ -96,11 +122,12 @@ def parse_rss_export(rss_file):
         leading_removed = trailing_removed.split('<item>', 1)[-1]
         rows = leading_removed.split('\n')
 
-        row = lambda key: [r for r in rows if r.startswith('<{}>'.format(key))][0]
+        def get_row(key):
+            return [r for r in rows if r.startswith('<{}>'.format(key))][0]
 
-        title = get_str_between(row('title'), '<![CDATA[', ']]')
-        url = get_str_between(row('link'), '<link>', '</link>')
-        ts_str = get_str_between(row('pubDate'), '<pubDate>', '</pubDate>')
+        title = str_between(get_row('title'), '<![CDATA[', ']]')
+        url = str_between(get_row('link'), '<link>', '</link>')
+        ts_str = str_between(get_row('pubDate'), '<pubDate>', '</pubDate>')
         time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %z")
 
         info = {
|
@ -112,17 +139,20 @@ def parse_rss_export(rss_file):
|
||||||
'title': title,
|
'title': title,
|
||||||
'sources': [rss_file.name],
|
'sources': [rss_file.name],
|
||||||
}
|
}
|
||||||
|
|
||||||
info['type'] = get_link_type(info)
|
info['type'] = get_link_type(info)
|
||||||
# import ipdb; ipdb.set_trace()
|
|
||||||
yield info
|
yield info
|
||||||
|
|
||||||
def parse_bookmarks_export(html_file):
|
def parse_bookmarks_export(html_file):
|
||||||
"""Parse netscape-format bookmarks export files (produced by all browsers)"""
|
"""Parse netscape-format bookmarks export files (produced by all browsers)"""
|
||||||
|
|
||||||
|
|
||||||
html_file.seek(0)
|
html_file.seek(0)
|
||||||
pattern = re.compile("<a href=\"(.+?)\" add_date=\"(\\d+)\"[^>]*>(.+)</a>", re.UNICODE | re.IGNORECASE)
|
pattern = re.compile("<a href=\"(.+?)\" add_date=\"(\\d+)\"[^>]*>(.+)</a>", re.UNICODE | re.IGNORECASE)
|
||||||
for line in html_file:
|
for line in html_file:
|
||||||
|
# example line
|
||||||
|
# <DT><A HREF="https://example.com/?q=1+2" ADD_DATE="1497562974" LAST_MODIFIED="1497562974" ICON_URI="https://example.com/favicon.ico" ICON="data:image/png;base64,...">example bookmark title</A>
|
||||||
|
|
||||||
match = pattern.search(line)
|
match = pattern.search(line)
|
||||||
if match:
|
if match:
|
||||||
url = match.group(1)
|
url = match.group(1)
|
||||||
|
@@ -137,6 +167,6 @@ def parse_bookmarks_export(html_file):
             'title': match.group(3),
             'sources': [html_file.name],
         }
 
         info['type'] = get_link_type(info)
         yield info
@@ -68,7 +68,7 @@
         <img src="https://nicksweeting.com/images/archive.png" height="36px">
         Archived Sites <img src="https://getpocket.com/favicon.ico" height="36px"> <br/>
         <small>
-            Archived with: <a href="https://github.com/pirate/bookmark-archiver">Bookmark Archiver</a> on $date_updated
+            <a href="https://github.com/pirate/bookmark-archiver">Bookmark Archiver</a>
         </small>
     </h1>
 </header>
@@ -4,7 +4,7 @@
     <img src="$favicon_url">
     $title <small style="background-color: #eee;border-radius:4px; float:right">$tags</small>
 </td>
-<td style="text-align:center"><a href="$files_url/index.html" title="Files">📂</a></td>
+<td style="text-align:center"><a href="$files_url" title="Files">📂</a></td>
 <td style="text-align:center"><a href="$pdf_link" title="PDF">📄</a></td>
 <td style="text-align:center"><a href="$screenshot_link" title="Screenshot">🖼</a></td>
 <td style="text-align:center"><a href="$archive_org_url" title="Archive.org">🏛</a></td>
@@ -140,7 +140,7 @@
     <a href="#" class="collapse-icon" title="Collapse Navbar">
         [-]
     </a>
-    <a href="../../../index.html" class="nav-icon" title="Archived Sites">
+    <a href="./../../index.html" class="nav-icon" title="Archived Sites">
         <img src="https://nicksweeting.com/images/archive.png" alt="Archive Icon">
     </a>
     $title<br/>
@@ -221,6 +221,7 @@
 </body>
 
 <script>
+    // show selected file in iframe when preview card is clicked
     jQuery('.card').on('click', function(e) {
         jQuery('.selected-card').removeClass('selected-card')
         jQuery(e.target).closest('.card').addClass('selected-card')
@@ -233,12 +234,16 @@
         }
         return true
     })
+
+    // un-sandbox iframes showing pdfs (required to display pdf viewer)
     jQuery('iframe').map(function() {
         if (this.src.endsWith('.pdf')) {
             this.removeAttribute('sandbox')
             this.src = this.src
         }
     })
+
+    // hide header when collapse icon is clicked
     jQuery('.collapse-icon').on('click', function() {
         if (jQuery('.collapse-icon').text().includes('[-]')) {
             jQuery('.collapse-icon').text('[+]')
@@ -251,6 +256,8 @@
         }
         return true
     })
+
+    // hide all preview iframes on small screens
     if (window.innerWidth < 1091) {
         jQuery('.card a[target=preview]').attr('target', '_self')
     }
util.py (164 lines changed)
@@ -1,6 +1,8 @@
 import os
+import re
 import sys
 import time
+import json
 import requests
 
 from datetime import datetime
@@ -24,6 +26,17 @@ from config import (
     SUBMIT_ARCHIVE_DOT_ORG,
 )
 
+# URL helpers
+without_scheme = lambda url: url.replace('http://', '').replace('https://', '').replace('ftp://', '')
+without_query = lambda url: url.split('?', 1)[0]
+without_hash = lambda url: url.split('#', 1)[0]
+without_path = lambda url: url.split('/', 1)[0]
+domain = lambda url: without_hash(without_query(without_path(without_scheme(url))))
+base_url = lambda url: without_query(without_scheme(url))
+
+short_ts = lambda ts: ts.split('.')[0]
+
+
 def check_dependencies():
     """Check that all necessary dependencies are installed, and have valid versions"""
 
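The URL helpers are plain string operations, so their output is easy to check; the lambdas below are copied from the hunk above and the sample URL is the one used in parse.py's Parsed link schema:

without_scheme = lambda url: url.replace('http://', '').replace('https://', '').replace('ftp://', '')
without_query = lambda url: url.split('?', 1)[0]
without_hash = lambda url: url.split('#', 1)[0]
without_path = lambda url: url.split('/', 1)[0]
domain = lambda url: without_hash(without_query(without_path(without_scheme(url))))
base_url = lambda url: without_query(without_scheme(url))

url = 'https://example.com/example/?abc=123&xyc=345#lmnop'
print(domain(url))      # -> example.com
print(base_url(url))    # -> example.com/example/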
@@ -149,11 +162,15 @@ def progress(seconds=TIMEOUT, prefix=''):
 
 
 def download_url(url):
-    if not os.path.exists(os.path.join(ARCHIVE_DIR, 'downloads')):
-        os.makedirs(os.path.join(ARCHIVE_DIR, 'downloads'))
+    """download a given url's content into downloads/domain.txt"""
+
+    download_dir = os.path.join(ARCHIVE_DIR, 'downloads')
+
+    if not os.path.exists(download_dir):
+        os.makedirs(download_dir)
 
     url_domain = url.split('/', 3)[2]
-    output_path = os.path.join(ARCHIVE_DIR, 'downloads', '{}.txt'.format(url_domain))
+    output_path = os.path.join(download_dir, '{}.txt'.format(url_domain))
 
     print('[*] [{}] Downloading {} > {}'.format(
         datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
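download_url() names its output file after the URL's host, taken with url.split('/', 3)[2]. A quick illustration of the resulting path; the ARCHIVE_DIR value here is an assumption, it is really set in config:

import os

ARCHIVE_DIR = 'html'                     # assumed value for illustration
url = 'https://getpocket.com/users/example/feed/all'

url_domain = url.split('/', 3)[2]
output_path = os.path.join(ARCHIVE_DIR, 'downloads', '{}.txt'.format(url_domain))
print(output_path)                       # -> html/downloads/getpocket.com.txt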
@@ -172,10 +189,10 @@ def download_url(url):
 
     with open(output_path, 'w', encoding='utf-8') as f:
         f.write(downloaded_xml)
 
     return output_path
 
-def get_str_between(string, start, end=None):
+def str_between(string, start, end=None):
     """(<abc>12345</def>, <abc>, </def>) -> 12345"""
 
     content = string.split(start, 1)[-1]
@@ -184,9 +201,6 @@ def get_str_between(string, start, end=None):
 
     return content
 
-
-
-
 def get_link_type(link):
     """Certain types of links need to be handled specially, this figures out when that's the case"""
 
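The rename from get_str_between() to str_between() keeps the docstring behaviour: slice out the text between two markers. An illustrative stand-in is sketched below; the start-marker handling matches the visible lines, while the end-marker handling is an assumption based on the docstring, not code shown in this diff:

def str_between(string, start, end=None):
    """(<abc>12345</def>, <abc>, </def>) -> 12345"""
    content = string.split(start, 1)[-1]
    if end is not None:
        # assumed: trim everything from the end marker onwards
        content = content.rsplit(end, 1)[0]
    return content

print(str_between('<abc>12345</def>', '<abc>', '</def>'))   # -> 12345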
@@ -207,10 +221,130 @@ def get_link_type(link):
     return None
 
 
-# URL helpers
-without_scheme = lambda url: url.replace('http://', '').replace('https://', '').replace('ftp://', '')
-without_query = lambda url: url.split('?', 1)[0]
-without_hash = lambda url: url.split('#', 1)[0]
-without_path = lambda url: url.split('/', 1)[0]
-domain = lambda url: without_hash(without_query(without_path(without_scheme(url))))
-base_url = lambda url: without_query(without_scheme(url))
+def find_link(folder, links):
+    """for a given archive folder, find the corresponding link object in links"""
+    url = parse_url(folder)
+    if url:
+        for link in links:
+            if (link['base_url'] in url) or (url in link['url']):
+                return link
+
+    timestamp = folder.split('.')[0]
+    for link in links:
+        if link['timestamp'].startswith(timestamp):
+            if link['domain'] in os.listdir('./html/archive/' + folder):
+                return link      # careful now, this isn't safe for most ppl
+            if link['domain'] in parse_url(folder):
+                return link
+    return None
+
+
+def parse_url(folder):
+    """for a given archive folder, figure out what url it's for"""
+    link_json = os.path.join('./html/archive/' + folder, 'index.json')
+    if os.path.exists(link_json):
+        with open(link_json, 'r') as f:
+            link = json.load(f)
+            return link['base_url']
+
+    archive_org_txt = os.path.join('./html/archive/' + folder, 'archive.org.txt')
+    if os.path.exists(archive_org_txt):
+        with open(archive_org_txt, 'r') as f:
+            original_link = f.read().strip().split('/http', 1)[-1]
+            with_scheme = 'http{}'.format(original_link)
+            return with_scheme
+
+    return ''
+
+
+def merge_folders(folder, link):
+    """given a folder, merge it to the canonical 'correct' path for the given link object"""
+    base_url = parse_url(folder)
+    if not (base_url in link['base_url']
+            or link['base_url'] in base_url):
+        print(base_url, link['base_url'])
+        assert False
+    print('{} > {}'.format(folder, link['timestamp']))
+
+
+def cleanup_archive(path, links):
+    """move any incorrectly named folders to their canonical locations"""
+
+    # for each folder that exists, see if we can match it up with a known good link
+    # if we can, then merge the two folders, if not, move it to lost & found
+
+    # for each timestamp, find similar timestamped folders
+    # check each folder for a "domain.com" folder or
+
+    unmatched = []
+
+    for folder in os.listdir(path):
+        link = find_link(folder, links)
+        if link is None:
+            unmatched.append(folder)
+            continue
+
+        if folder != link['timestamp']:
+            merge_folders(folder, link)
+
+    if unmatched:
+        print('[!] Warning! {} unrecognized folders in html/archive/'.format(len(unmatched)))
+        print('\n    '.join(unmatched))
+
+
+def html_appended_url(link):
+    """calculate the path to the wgetted .html file, since wget may
+    adjust some paths to be different than the base_url path.
+
+    See docs on wget --adjust-extension.
+    """
+
+    if link['type'] in ('PDF', 'image'):
+        return link['base_url']
+
+    split_url = link['url'].split('#', 1)
+    query = ('%3F' + link['url'].split('?', 1)[-1]) if '?' in link['url'] else ''
+
+    if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", split_url[0], re.I | re.M):
+        # already ends in .html
+        return link['base_url']
+    else:
+        # .html needs to be appended
+        without_scheme = split_url[0].split('://', 1)[-1].split('?', 1)[0]
+        if without_scheme.endswith('/'):
+            if query:
+                return '#'.join([without_scheme + 'index.html' + query + '.html', *split_url[1:]])
+            return '#'.join([without_scheme + 'index.html', *split_url[1:]])
+        else:
+            if query:
+                return '#'.join([without_scheme + '/index.html' + query + '.html', *split_url[1:]])
+            elif '/' in without_scheme:
+                return '#'.join([without_scheme + '.html', *split_url[1:]])
+            return link['base_url'] + '/index.html'
+
+
+def derived_link_info(link):
+    """extend link info with the archive urls and other derived data"""
+
+    link_info = {
+        **link,
+        'date': datetime.fromtimestamp(float(link['timestamp'])).strftime('%Y-%m-%d %H:%M'),
+        'google_favicon_url': 'https://www.google.com/s2/favicons?domain={domain}'.format(**link),
+        'favicon_url': './archive/{timestamp}/favicon.ico'.format(**link),
+        'files_url': './archive/{timestamp}/index.html'.format(**link),
+        'archive_url': './archive/{}/{}'.format(link['timestamp'], html_appended_url(link)),
+        'pdf_link': './archive/{timestamp}/output.pdf'.format(**link),
+        'screenshot_link': './archive/{timestamp}/screenshot.png'.format(**link),
+        'archive_org_url': 'https://web.archive.org/web/{base_url}'.format(**link),
+    }
+
+    # PDF and images are handled slightly differently
+    # wget, screenshot, & pdf urls all point to the same file
+    if link['type'] in ('PDF', 'image'):
+        link_info.update({
+            'archive_url': 'archive/{timestamp}/{base_url}'.format(**link),
+            'pdf_link': 'archive/{timestamp}/{base_url}'.format(**link),
+            'screenshot_link': 'archive/{timestamp}/{base_url}'.format(**link),
+            'title': '{title} ({type})'.format(**link),
+        })
+    return link_info
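For reference, a few example outputs of the relocated html_appended_url(), assuming util.py and its config are importable; the link dicts are invented and carry only the fields the function reads:

from util import html_appended_url

print(html_appended_url({'type': None,  'url': 'https://example.com/page',   'base_url': 'example.com/page'}))    # example.com/page.html
print(html_appended_url({'type': None,  'url': 'https://example.com/docs/',  'base_url': 'example.com/docs/'}))   # example.com/docs/index.html
print(html_appended_url({'type': None,  'url': 'https://example.com',        'base_url': 'example.com'}))         # example.com/index.html
print(html_appended_url({'type': 'PDF', 'url': 'https://example.com/a.pdf',  'base_url': 'example.com/a.pdf'}))   # example.com/a.pdf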