Mirror of https://github.com/ArchiveBox/ArchiveBox.git, synced 2025-05-15 07:34:27 -04:00
remove flawed link_type concept in favor of simpler staticfile detection

parent c79e1df8b2
commit 5ee1c39720

4 changed files with 107 additions and 79 deletions
@@ -224,6 +224,7 @@ def write_html_link_index(out_dir, link):
wget_output_path(link)
or (link['domain'] if link['is_archived'] else 'about:blank')
),
'extension': link['extension'] or 'HTML',
}))

chmod_file(path)
@@ -10,8 +10,8 @@ Parsed link schema: {
'url': 'https://example.com/example/?abc=123&xyc=345#lmnop',
'timestamp': '15442123124234',
'title': 'Example.com Page Title',
'tags': 'abc,def',
'sources': ['ril_export.html', 'downloads/getpocket.com.txt'],
'tags': 'abc,def',
}
"""

@@ -25,7 +25,6 @@ import xml.etree.ElementTree as etree
from config import ANSI
from util import (
str_between,
get_link_type,
URL_REGEX,
check_url_parsing,
)
@@ -69,17 +68,18 @@ def parse_pocket_html_export(html_file):
# <li><a href="http://example.com/ time_added="1478739709" tags="tag1,tag2">example title</a></li>
match = pattern.search(line)
if match:
fixed_url = match.group(1).replace('http://www.readability.com/read?url=', '') # remove old readability prefixes to get original url
url = match.group(1).replace('http://www.readability.com/read?url=', '') # remove old readability prefixes to get original url
time = datetime.fromtimestamp(float(match.group(2)))
info = {
'url': fixed_url,
tags = match.group(3)
title = match.group(4).replace(' — Readability', '').replace('http://www.readability.com/read?url=', '')

yield {
'url': url,
'timestamp': str(time.timestamp()),
'tags': match.group(3),
'title': match.group(4).replace(' — Readability', '').replace('http://www.readability.com/read?url=', '') or None,
'title': title or None,
'tags': tags or '',
'sources': [html_file.name],
}
info['type'] = get_link_type(info)
yield info

def parse_pinboard_json_export(json_file):
"""Parse JSON-format bookmarks export files (produced by pinboard.in/export/, or wallabag)"""
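For readers skimming the diff, a minimal, self-contained sketch of what the rewritten Pocket HTML loop now yields per line. The regex here is a hypothetical stand-in for the module-level `pattern` (not shown in this hunk), written only to match a well-formed example of the commented export format.

```python
import re
from datetime import datetime

# Hypothetical stand-in for the module's `pattern` regex (not part of this diff)
pattern = re.compile(r'<a href="(.+?)" time_added="(\d+)" tags="(.*?)">(.+?)</a>')

line = '<li><a href="https://example.com/page" time_added="1478739709" tags="tag1,tag2">example title</a></li>'

match = pattern.search(line)
if match:
    url = match.group(1).replace('http://www.readability.com/read?url=', '')
    time = datetime.fromtimestamp(float(match.group(2)))
    tags = match.group(3)
    title = match.group(4).replace(' — Readability', '')
    print({
        'url': url,
        'timestamp': str(time.timestamp()),
        'title': title or None,
        'tags': tags or '',
        'sources': ['ril_export.html'],
    })
```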
@@ -106,14 +106,14 @@ def parse_pinboard_json_export(json_file):
title = (erg.get('description') or '').replace(' — Readability', '')
else:
title = erg['title'].strip()

info = {
'url': url,
'timestamp': timestamp,
'tags': erg.get('tags') or '',
'title': title or None,
'tags': erg.get('tags') or '',
'sources': [json_file.name],
}
info['type'] = get_link_type(info)
yield info
@@ -144,16 +144,13 @@ def parse_rss_export(rss_file):
ts_str = str_between(get_row('pubDate'), '<pubDate>', '</pubDate>')
time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %z")

info = {
yield {
'url': url,
'timestamp': str(time.timestamp()),
'tags': '',
'title': title or None,
'tags': '',
'sources': [rss_file.name],
}
info['type'] = get_link_type(info)

yield info

def parse_shaarli_rss_export(rss_file):
@@ -184,16 +181,14 @@ def parse_shaarli_rss_export(rss_file):
ts_str = str_between(get_row('published'), '<published>', '</published>')
time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")

info = {
yield {
'url': url,
'timestamp': str(time.timestamp()),
'tags': '',
'title': title or None,
'tags': '',
'sources': [rss_file.name],
}
info['type'] = get_link_type(info)

yield info

def parse_netscape_html_export(html_file):
"""Parse netscape-format bookmarks export files (produced by all browsers)"""
@@ -209,16 +204,14 @@ def parse_netscape_html_export(html_file):
url = match.group(1)
time = datetime.fromtimestamp(float(match.group(2)))

info = {
yield {
'url': url,
'timestamp': str(time.timestamp()),
'tags': "",
'title': match.group(3).strip() or None,
'tags': '',
'sources': [html_file.name],
}
info['type'] = get_link_type(info)

yield info

def parse_pinboard_rss_export(rss_file):
"""Parse Pinboard RSS feed files into links"""
@@ -237,18 +230,22 @@ def parse_pinboard_rss_export(rss_file):

# Pinboard includes a colon in its date stamp timezone offsets, which
# Python can't parse. Remove it:
if ":" == ts_str[-3:-2]:
if ts_str and ts_str[-3:-2] == ":":
ts_str = ts_str[:-3]+ts_str[-2:]
time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
info = {

if ts_str:
time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
else:
time = datetime.now()

yield {
'url': url,
'timestamp': str(time.timestamp()),
'tags': tags or '',
'title': title or None,
'tags': tags or '',
'sources': [rss_file.name],
}
info['type'] = get_link_type(info)
yield info

def parse_medium_rss_export(rss_file):
"""Parse Medium RSS feed files into links"""
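As a side note on the timestamp handling above: older Python versions cannot parse a `%z` offset that contains a colon (e.g. `+05:00`), which is why the colon is stripped first. A standalone sketch with a made-up timestamp:

```python
from datetime import datetime

ts_str = "2019-01-15T12:30:45+05:00"        # example value, not from a real feed

# Strip the colon from the offset so "%z" can parse it ("+05:00" -> "+0500")
if ts_str and ts_str[-3:-2] == ":":
    ts_str = ts_str[:-3] + ts_str[-2:]

time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
print(str(time.timestamp()))
```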
@@ -263,15 +260,14 @@ def parse_medium_rss_export(rss_file):
title = item.find("title").text.strip()
ts_str = item.find("pubDate").text
time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %Z")
info = {

yield {
'url': url,
'timestamp': str(time.timestamp()),
'tags': '',
'title': title or None,
'tags': '',
'sources': [rss_file.name],
}
info['type'] = get_link_type(info)
yield info

def parse_plain_text_export(text_file):
@@ -285,15 +281,15 @@

for url in urls:
url = url.strip()
info = {
time = datetime.now()

yield {
'url': url,
'timestamp': str(datetime.now().timestamp()),
'tags': '',
'timestamp': str(time.timestamp()),
'title': None,
'tags': '',
'sources': [text_file.name],
}
info['type'] = get_link_type(info)
yield info

PARSERS = OrderedDict([
@@ -194,8 +194,8 @@
Last updated: <small title="Timestamp: $updated">$updated_date</small>
</div>
<div class="col-lg-4 alert well">
Metadata:
<span class="badge badge-default">$type</span>
Type:
<span class="badge badge-default">$extension</span>

Tags:
<span class="badge badge-success">$tags</span>
@@ -70,6 +70,26 @@ HTML_TITLE_REGEX = re.compile(
r'(.[^<>]+)', # get everything up to these symbols
re.IGNORECASE | re.MULTILINE | re.DOTALL | re.UNICODE,
)
STATICFILE_EXTENSIONS = {
# 99.999% of the time, URLs ending in these extensions are static files
# that can be downloaded as-is, not html pages that need to be rendered
'gif', 'jpeg', 'jpg', 'png', 'tif', 'tiff', 'wbmp', 'ico', 'jng', 'bmp',
'svg', 'svgz', 'webp', 'ps', 'eps', 'ai',
'mp3', 'mp4', 'm4a', 'mpeg', 'mpg', 'mkv', 'mov', 'webm', 'm4v', 'flv', 'wmv', 'avi', 'ogg', 'ts', 'm3u8',
'pdf', 'txt', 'rtf', 'rtfd', 'doc', 'docx', 'ppt', 'pptx', 'xls', 'xlsx',
'atom', 'rss', 'css', 'js', 'json',
'dmg', 'iso', 'img',
'rar', 'war', 'hqx', 'zip', 'gz', 'bz2', '7z',

# Less common extensions to consider adding later
# jar, swf, bin, com, exe, dll, deb
# ear, hqx, eot, wmlc, kml, kmz, cco, jardiff, jnlp, run, msi, msp, msm,
# pl pm, prc pdb, rar, rpm, sea, sit, tcl tk, der, pem, crt, xpi, xspf,
# ra, mng, asx, asf, 3gpp, 3gp, mid, midi, kar, jad, wml, htc, mml

# These are always treated as pages, not as static files, never add them:
# html, htm, shtml, xhtml, xml, aspx, php, cgi
}

### Checks & Tests

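The intent of the new set is a plain membership test on a URL's file extension (the entries are lowercase with no leading dot). A tiny illustration with an abbreviated copy of the set, assuming the codebase's `extension()` helper returns the bare lowercase suffix:

```python
STATICFILE_EXTENSIONS = {'pdf', 'png', 'mp4', 'zip'}    # abbreviated copy for the example

# 'pdf' is what an extension() helper would return for https://example.com/doc.pdf
print('pdf' in STATICFILE_EXTENSIONS)    # True  -> download the file once, as-is
print('html' in STATICFILE_EXTENSIONS)   # False -> archive it as a rendered page
```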
@@ -225,6 +245,7 @@ def save_remote_source(url, timeout=TIMEOUT):

def fetch_page_title(url, timeout=10, progress=SHOW_PROGRESS):
"""Attempt to guess a page's title by downloading the html"""

if not FETCH_TITLE:
return None
@@ -257,8 +278,8 @@ def wget_output_path(link):

urlencode = lambda s: quote(s, encoding='utf-8', errors='replace')

if link['type'] in ('PDF', 'image'):
return urlencode(base_url(link['url']))
if is_static_file(link['url']):
return urlencode(without_scheme(without_fragment(link['url'])))

# Since the wget algorithm for -E (appending .html) is incredibly complex
# instead of trying to emulate it here, we just look in the output folder
@@ -271,6 +292,18 @@ def wget_output_path(link):
full_path,
)

# Wget downloads can save in a number of different ways depending on the url
# https://example.com
# > output/archive/<timestamp>/example.com/index.html
# https://example.com/abc
# > output/archive/<timestamp>/example.com/abc.html
# https://example.com/abc/
# > output/archive/<timestamp>/example.com/abc/index.html
# https://example.com/abc/test.html
# > output/archive/<timestamp>/example.com/abc/test.html

# There's also lots of complexity around how the urlencoding and renaming
# is done for pages with query and hash fragments or extensions like shtml / htm
for _ in range(4):
if os.path.exists(search_dir):
if os.path.isdir(search_dir):

@@ -279,8 +312,8 @@ def wget_output_path(link):
if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", f, re.I | re.M)
]
if html_files:
relative_path = search_dir.split(link_dir)[-1].strip('/')
return urlencode(os.path.join(relative_path, html_files[0]))
path_from_link_dir = search_dir.split(link_dir)[-1].strip('/')
return urlencode(os.path.join(path_from_link_dir, html_files[0]))

# Move up one directory level
search_dir = search_dir.rsplit('/', 1)[0]
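The comments above describe how wget names its output files; the function itself searches the real output folder rather than guessing. For illustration only, a toy function reproducing the documented mapping (not ArchiveBox's actual logic):

```python
from urllib.parse import urlparse

def guess_wget_path(url):
    """Toy approximation of the mapping in the comments above (illustrative only)."""
    parsed = urlparse(url)
    path = parsed.path
    if not path or path == '/':
        return parsed.netloc + '/index.html'
    if path.endswith('/'):
        return parsed.netloc + path + 'index.html'
    if '.' in path.rsplit('/', 1)[-1]:
        return parsed.netloc + path              # already ends in a file extension
    return parsed.netloc + path + '.html'        # wget -E appends .html

for url in ('https://example.com',
            'https://example.com/abc',
            'https://example.com/abc/',
            'https://example.com/abc/test.html'):
    print(guess_wget_path(url))
# example.com/index.html
# example.com/abc.html
# example.com/abc/index.html
# example.com/abc/test.html
```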
@@ -327,19 +360,32 @@ def pretty_path(path):
"""convert paths like .../ArchiveBox/archivebox/../output/abc into output/abc"""
return path.replace(REPO_DIR + '/', '')


def print_error_hints(cmd, pwd, err=None, hints=None, prefix=' '):
"""quote the argument with whitespace in a command so the user can
copy-paste the outputted string directly to run the cmd
"""

# Prettify CMD string and make it safe to copy-paste by quoting arguments
quoted_cmd = ' '.join(
'"{}"'.format(arg) if ' ' in arg else arg
for arg in cmd
)

# Prettify error output hints string and limit to five lines
hints = hints or getattr(err, 'hints', None)
if hints:
hints = hints if isinstance(hints, (list, tuple)) else hints.split('\n')
hints = (
' {}{}{}'.format(ANSI['lightyellow'], line.strip(), ANSI['reset'])
for line in hints[:5] if line.strip()
)
else:
hints = ()

output_lines = [
'{}Failed: {} {}{}'.format(ANSI['red'], err.__class__.__name__, err, ANSI['reset']),
' {}{}{}'.format(ANSI['lightyellow'], hints, ANSI['reset']) if hints else None,
*hints,
'Run to see full output:',
' cd {};'.format(pwd),
' {}'.format(quoted_cmd),
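The quoting step above is self-contained enough to demonstrate on its own; a hypothetical command list shows why arguments with whitespace need the extra quotes:

```python
cmd = ['wget', '--warc-file=archive', 'https://example.com/some page']   # hypothetical command

quoted_cmd = ' '.join(
    '"{}"'.format(arg) if ' ' in arg else arg
    for arg in cmd
)
print(quoted_cmd)   # wget --warc-file=archive "https://example.com/some page"
```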
@@ -364,36 +410,21 @@ def merge_links(a, b):
url = longer('url')
longest_title = longer('title')
cleanest_title = a['title'] if '://' not in (a['title'] or '') else b['title']
link = {
'timestamp': earlier('timestamp'),
return {
'url': url,
'domain': domain(url),
'base_url': base_url(url),
'tags': longer('tags'),
'timestamp': earlier('timestamp'),
'title': longest_title if '://' not in (longest_title or '') else cleanest_title,
'tags': longer('tags'),
'sources': list(set(a.get('sources', []) + b.get('sources', []))),
}
link['type'] = get_link_type(link)
return link

def get_link_type(link):
"""Certain types of links need to be handled specially, this figures out when that's the case"""
def is_static_file(url):
"""Certain URLs just point to a single static file, and
don't need to be re-archived in many formats
"""

if extension(link['url']) == 'pdf':
return 'PDF'
elif extension(link['url']) in ('pdf', 'png', 'jpg', 'jpeg', 'svg', 'bmp', 'gif', 'tiff', 'webp'):
return 'image'
elif 'wikipedia.org' in domain(link['url']).lower():
return 'wiki'
elif 'youtube.com' in domain(link['url']).lower():
return 'youtube'
elif 'soundcloud.com' in domain(link['url']).lower():
return 'soundcloud'
elif 'youku.com' in domain(link['url']).lower():
return 'youku'
elif 'vimeo.com' in domain(link['url']).lower():
return 'vimeo'
return None
# TODO: the proper way is with MIME type detection, not using extension
return extension(url) in STATICFILE_EXTENSIONS

def derived_link_info(link):
"""extend link info with the archive urls and other derived data"""
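This swap is the heart of the commit: instead of classifying links into types ('PDF', 'image', 'youtube', ...) and special-casing each one, callers now ask a single yes/no question. A hedged usage illustration; the exact results depend on the `extension()` helper, which is defined elsewhere in util and not shown in this diff:

```python
from util import is_static_file   # the function added above

print(is_static_file('https://example.com/paper.pdf'))    # expected: True
print(is_static_file('https://example.com/video.mp4'))    # expected: True
print(is_static_file('https://example.com/blog/post'))    # expected: False
```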
@@ -410,7 +441,9 @@ def derived_link_info(link):
'domain': domain(url),
'path': path(url),
'basename': basename(url),
'extension': extension(url),
'base_url': base_url(url),
'is_static': is_static_file(url),
'is_archived': os.path.exists(os.path.join(
ARCHIVE_DIR,
link['timestamp'],
@@ -420,8 +453,7 @@ def derived_link_info(link):
}

# Archive Method Output URLs
extended_info = {
**extended_info,
extended_info.update({
'index_url': 'index.html',
'favicon_url': 'favicon.ico',
'google_favicon_url': 'https://www.google.com/s2/favicons?domain={domain}'.format(**extended_info),
@@ -433,14 +465,13 @@ def derived_link_info(link):
'archive_org_url': 'https://web.archive.org/web/{base_url}'.format(**extended_info),
'git_url': 'git',
'media_url': 'media',

}

# PDF and images are handled slightly differently
# wget, screenshot, & pdf urls all point to the same file
if link['type'] in ('PDF', 'image'):
})
# static binary files like PDF and images are handled slightly differently.
# they're just downloaded once and aren't archived separately multiple times,
# so the wget, screenshot, & pdf urls should all point to the same file
if is_static_file(url):
extended_info.update({
'title': basename(link['url']),
'title': basename(url),
'archive_url': base_url(url),
'pdf_url': base_url(url),
'screenshot_url': base_url(url),
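The net effect for a static-file link, pieced together from the fields above. A sketch only, assuming `basename()` and `base_url()` are importable from util and behave as their names suggest (last path segment and scheme-less URL, respectively):

```python
from util import basename, base_url   # helpers referenced above

url = 'https://example.com/files/report.pdf'
extended_info = {}
# mirrors the update() call above for static files
extended_info.update({
    'title': basename(url),             # e.g. 'report.pdf'
    'archive_url': base_url(url),       # every archive method points at the same file
    'pdf_url': base_url(url),
    'screenshot_url': base_url(url),
})
print(extended_info)
```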