mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-13 22:54:27 -04:00
better UX before titles have been fetched during archiving progress
This commit is contained in:
parent
914750c453
commit
eb5cc8078a
5 changed files with 90 additions and 27 deletions
|
@ -7,7 +7,7 @@ from datetime import datetime
|
||||||
from index import (
|
from index import (
|
||||||
parse_json_link_index,
|
parse_json_link_index,
|
||||||
write_link_index,
|
write_link_index,
|
||||||
patch_index_title_hack,
|
update_main_index,
|
||||||
)
|
)
|
||||||
from config import (
|
from config import (
|
||||||
CURL_BINARY,
|
CURL_BINARY,
|
||||||
|
@ -103,7 +103,9 @@ def archive_link(link_dir, link, overwrite=True):
|
||||||
for archive_method in active_methods:
|
for archive_method in active_methods:
|
||||||
archive_method(link_dir, link, overwrite=overwrite)
|
archive_method(link_dir, link, overwrite=overwrite)
|
||||||
|
|
||||||
|
|
||||||
write_link_index(link_dir, link)
|
write_link_index(link_dir, link)
|
||||||
|
update_main_index(link)
|
||||||
|
|
||||||
except Exception as err:
|
except Exception as err:
|
||||||
print(' ! Failed to archive link: {}: {}'.format(err.__class__.__name__, err))
|
print(' ! Failed to archive link: {}: {}'.format(err.__class__.__name__, err))
|
||||||
|
@ -218,7 +220,7 @@ def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, warc=FETCH_WARC
|
||||||
try:
|
try:
|
||||||
result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)
|
result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)
|
||||||
end()
|
end()
|
||||||
output = wget_output_path(link, look_in=domain_dir)
|
output = wget_output_path(link)
|
||||||
|
|
||||||
output_tail = [' ' + line for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:] if line.strip()]
|
output_tail = [' ' + line for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:] if line.strip()]
|
||||||
|
|
||||||
|
@ -391,11 +393,13 @@ def archive_dot_org(link_dir, link, timeout=TIMEOUT):
|
||||||
output = 'archive.org.txt'
|
output = 'archive.org.txt'
|
||||||
archive_org_url = None
|
archive_org_url = None
|
||||||
|
|
||||||
|
|
||||||
path = os.path.join(link_dir, output)
|
path = os.path.join(link_dir, output)
|
||||||
if os.path.exists(path):
|
if os.path.exists(path):
|
||||||
archive_org_url = open(path, 'r').read().strip()
|
archive_org_url = open(path, 'r').read().strip()
|
||||||
return {'output': archive_org_url, 'status': 'skipped'}
|
return {'output': archive_org_url, 'status': 'skipped'}
|
||||||
|
|
||||||
|
|
||||||
submit_url = 'https://web.archive.org/save/{}'.format(link['url'])
|
submit_url = 'https://web.archive.org/save/{}'.format(link['url'])
|
||||||
CMD = [
|
CMD = [
|
||||||
CURL_BINARY,
|
CURL_BINARY,
|
||||||
|
@ -412,7 +416,6 @@ def archive_dot_org(link_dir, link, timeout=TIMEOUT):
|
||||||
end()
|
end()
|
||||||
|
|
||||||
content_location, errors = parse_archive_dot_org_response(result.stdout)
|
content_location, errors = parse_archive_dot_org_response(result.stdout)
|
||||||
|
|
||||||
if content_location:
|
if content_location:
|
||||||
archive_org_url = 'https://web.archive.org{}'.format(content_location[0])
|
archive_org_url = 'https://web.archive.org{}'.format(content_location[0])
|
||||||
elif len(errors) == 1 and 'RobotAccessControlException' in errors[0]:
|
elif len(errors) == 1 and 'RobotAccessControlException' in errors[0]:
|
||||||
|
@ -427,6 +430,7 @@ def archive_dot_org(link_dir, link, timeout=TIMEOUT):
|
||||||
output = e
|
output = e
|
||||||
print_error_hints(cmd=CMD, pwd=link_dir, err=e)
|
print_error_hints(cmd=CMD, pwd=link_dir, err=e)
|
||||||
|
|
||||||
|
|
||||||
if not isinstance(output, Exception):
|
if not isinstance(output, Exception):
|
||||||
# instead of writing None when archive.org rejects the url write the
|
# instead of writing None when archive.org rejects the url write the
|
||||||
# url to resubmit it to archive.org. This is so when the user visits
|
# url to resubmit it to archive.org. This is so when the user visits
|
||||||
|
@ -499,7 +503,6 @@ def fetch_title(link_dir, link, timeout=TIMEOUT):
|
||||||
# TODO: figure out how to do this without gnarly string replacement
|
# TODO: figure out how to do this without gnarly string replacement
|
||||||
if title:
|
if title:
|
||||||
link['title'] = title
|
link['title'] = title
|
||||||
patch_index_title_hack(link['url'], title)
|
|
||||||
|
|
||||||
return {
|
return {
|
||||||
'cmd': 'fetch_page_title("{}")'.format(link['url']),
|
'cmd': 'fetch_page_title("{}")'.format(link['url']),
|
||||||
|
|
|
@ -22,8 +22,11 @@ from util import (
|
||||||
pretty_path,
|
pretty_path,
|
||||||
check_link_structure,
|
check_link_structure,
|
||||||
check_links_structure,
|
check_links_structure,
|
||||||
|
wget_output_path,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
TITLE_LOADING_MSG = 'Not yet archived...'
|
||||||
|
|
||||||
|
|
||||||
### Homepage index for all the links
|
### Homepage index for all the links
|
||||||
|
|
||||||
|
@ -96,9 +99,20 @@ def write_html_links_index(out_dir, links, finished=False):
|
||||||
with open(os.path.join(TEMPLATES_DIR, 'index_row.html'), 'r', encoding='utf-8') as f:
|
with open(os.path.join(TEMPLATES_DIR, 'index_row.html'), 'r', encoding='utf-8') as f:
|
||||||
link_row_html = f.read()
|
link_row_html = f.read()
|
||||||
|
|
||||||
|
full_links_info = (derived_link_info(link) for link in links)
|
||||||
|
|
||||||
link_rows = '\n'.join(
|
link_rows = '\n'.join(
|
||||||
Template(link_row_html).substitute(**derived_link_info(link))
|
Template(link_row_html).substitute(**{
|
||||||
for link in links
|
**link,
|
||||||
|
'title': (
|
||||||
|
link['title']
|
||||||
|
or (link['base_url'] if link['is_archived'] else TITLE_LOADING_MSG)
|
||||||
|
),
|
||||||
|
'archive_url': (
|
||||||
|
wget_output_path(link) or 'index.html'
|
||||||
|
),
|
||||||
|
})
|
||||||
|
for link in full_links_info
|
||||||
)
|
)
|
||||||
|
|
||||||
template_vars = {
|
template_vars = {
|
||||||
|
@ -118,24 +132,41 @@ def write_html_links_index(out_dir, links, finished=False):
|
||||||
chmod_file(path)
|
chmod_file(path)
|
||||||
|
|
||||||
|
|
||||||
def patch_index_title_hack(link_url, new_title):
|
def update_main_index(link):
|
||||||
"""hack to update just one link's title in the link index json"""
|
"""hack to in-place update one row's info in the generated index html"""
|
||||||
|
|
||||||
|
title = link['latest']['title']
|
||||||
|
successful = len([entry for entry in link['latest'].values() if entry])
|
||||||
|
|
||||||
|
# Patch JSON index
|
||||||
json_path = os.path.join(OUTPUT_DIR, 'index.json')
|
json_path = os.path.join(OUTPUT_DIR, 'index.json')
|
||||||
|
|
||||||
links = parse_json_links_index(OUTPUT_DIR)
|
links = parse_json_links_index(OUTPUT_DIR)
|
||||||
|
|
||||||
changed = False
|
changed = False
|
||||||
for link in links:
|
for json_link in links:
|
||||||
if link['url'] == link_url:
|
if json_link['url'] == link['url']:
|
||||||
link['title'] = new_title
|
json_link['title'] = title
|
||||||
|
json_link['latest'] = link['latest']
|
||||||
changed = True
|
changed = True
|
||||||
break
|
break
|
||||||
|
|
||||||
if changed:
|
if changed:
|
||||||
write_json_links_index(OUTPUT_DIR, links)
|
write_json_links_index(OUTPUT_DIR, links)
|
||||||
|
|
||||||
|
# Patch HTML index
|
||||||
|
html_path = os.path.join(OUTPUT_DIR, 'index.html')
|
||||||
|
|
||||||
|
html = open(html_path, 'r').read().split('\n')
|
||||||
|
for idx, line in enumerate(html):
|
||||||
|
if title and ('<span data-title-for="{}"'.format(link['url']) in line):
|
||||||
|
html[idx] = '<span>{}</span>'.format(title)
|
||||||
|
elif successful and ('<span data-number-for="{}"'.format(link['url']) in line):
|
||||||
|
html[idx] = '<span>{}</span>'.format(successful)
|
||||||
|
break
|
||||||
|
|
||||||
|
with open(html_path, 'w') as f:
|
||||||
|
f.write('\n'.join(html))
|
||||||
|
|
||||||
### Individual link index
|
### Individual link index
|
||||||
|
|
||||||
|
@ -176,10 +207,19 @@ def write_html_link_index(out_dir, link):
|
||||||
|
|
||||||
print(' √ index.html')
|
print(' √ index.html')
|
||||||
|
|
||||||
|
link = derived_link_info(link)
|
||||||
|
|
||||||
with open(path, 'w', encoding='utf-8') as f:
|
with open(path, 'w', encoding='utf-8') as f:
|
||||||
f.write(Template(link_html).substitute({
|
f.write(Template(link_html).substitute({
|
||||||
**derived_link_info(link),
|
**link,
|
||||||
# **link['latest'],
|
'title': (
|
||||||
|
link['title']
|
||||||
|
or (link['base_url'] if link['is_archived'] else TITLE_LOADING_MSG)
|
||||||
|
),
|
||||||
|
'archive_url': (
|
||||||
|
wget_output_path(link)
|
||||||
|
or (link['domain'] if link['is_archived'] else 'about:blank')
|
||||||
|
),
|
||||||
}))
|
}))
|
||||||
|
|
||||||
chmod_file(path)
|
chmod_file(path)
|
||||||
|
|
|
@ -98,6 +98,28 @@
|
||||||
overflow-y: scroll;
|
overflow-y: scroll;
|
||||||
table-layout: fixed;
|
table-layout: fixed;
|
||||||
}
|
}
|
||||||
|
table tr a span[data-archived~=False] {
|
||||||
|
opacity: 0.2;
|
||||||
|
}
|
||||||
|
.files-spinner {
|
||||||
|
height: 15px;
|
||||||
|
width: auto;
|
||||||
|
opacity: 0.5;
|
||||||
|
vertical-align: -2px;
|
||||||
|
}
|
||||||
|
.link-favicon {
|
||||||
|
padding-right: 8px;
|
||||||
|
vertical-align: -4px;
|
||||||
|
}
|
||||||
|
.in-progress {
|
||||||
|
display: none;
|
||||||
|
}
|
||||||
|
body[data-status~=finished] .files-spinner {
|
||||||
|
display: none;
|
||||||
|
}
|
||||||
|
body[data-status~=running] .in-progress {
|
||||||
|
display: inline-block;
|
||||||
|
}
|
||||||
</style>
|
</style>
|
||||||
</head>
|
</head>
|
||||||
<body data-status="$status">
|
<body data-status="$status">
|
||||||
|
@ -121,12 +143,8 @@
|
||||||
<thead>
|
<thead>
|
||||||
<tr>
|
<tr>
|
||||||
<th style="width: 80px;">Bookmarked</th>
|
<th style="width: 80px;">Bookmarked</th>
|
||||||
<th style="width: 26px;">Files</th>
|
|
||||||
<th style="width: 26vw;">Saved Link ($num_links)</th>
|
<th style="width: 26vw;">Saved Link ($num_links)</th>
|
||||||
<th style="width: 30px;">PNG</th>
|
<th style="width: 50px">Saved Files</th>
|
||||||
<th style="width: 30px">PDF</th>
|
|
||||||
<th style="width: 30px">HTML</th>
|
|
||||||
<th style="width: 30px">A.org</th>
|
|
||||||
<th style="width: 16vw;whitespace:nowrap;overflow-x:hidden;">Original URL</th>
|
<th style="width: 16vw;whitespace:nowrap;overflow-x:hidden;">Original URL</th>
|
||||||
</tr>
|
</tr>
|
||||||
</thead>
|
</thead>
|
||||||
|
|
|
@ -1,16 +1,18 @@
|
||||||
<tr>
|
<tr data-url="$url">
|
||||||
<td title="Bookmarked timestamp: $timestamp">$bookmarked_date</td>
|
<td title="Bookmarked timestamp: $timestamp">$bookmarked_date</td>
|
||||||
<td>
|
<td style="text-align: left">
|
||||||
<a href="$link_dir/$index_url" title="Link Index">
|
<a href="$link_dir/$index_url" title="Link Index">
|
||||||
<img src="$link_dir/$favicon_url" onerror="this.src='static/spinner.gif'" class="link-favicon">
|
<img src="$link_dir/$favicon_url" onerror="this.src='static/spinner.gif'" class="link-favicon">
|
||||||
</a>
|
</a>
|
||||||
|
<a href="$link_dir/$archive_url" style="font-size:1.4em;text-decoration:none;color:black;" title="$title">
|
||||||
|
<span data-title-for="$url" data-archived="$is_archived">$title</span>
|
||||||
|
<small style="background-color: #eee;border-radius:4px; float:right">$tags</small>
|
||||||
|
</a>
|
||||||
</td>
|
</td>
|
||||||
<td style="text-align: left"><a href="$link_dir/$archive_url" style="font-size:1.4em;text-decoration:none;color:black;" title="$title">
|
<td>
|
||||||
$title <small style="background-color: #eee;border-radius:4px; float:right">$tags</small>
|
<a href="$link_dir/$index_url">📄
|
||||||
|
<span data-number-for="$url" title="Fetching any missing files...">$num_outputs <img src="static/spinner.gif" class="files-spinner"/></span>
|
||||||
|
</a>
|
||||||
</td>
|
</td>
|
||||||
<td><a href="$link_dir/$screenshot_url" title="Screenshot">🖼</a></td>
|
|
||||||
<td><a href="$link_dir/$pdf_url" title="PDF">📜</a></td>
|
|
||||||
<td><a href="$link_dir/$dom_url" title="DOM">📄</a></td>
|
|
||||||
<td><a href="$archive_org_url" title="Archive.org">🏛</a></td>
|
|
||||||
<td style="text-align: left"><!--🔗 <img src="$google_favicon_url" height="16px">--> <a href="$url">$url</a></td>
|
<td style="text-align: left"><!--🔗 <img src="$google_favicon_url" height="16px">--> <a href="$url">$url</a></td>
|
||||||
</tr>
|
</tr>
|
||||||
|
|
|
@ -244,7 +244,7 @@ def fetch_page_title(url, timeout=10, progress=SHOW_PROGRESS):
|
||||||
# ))
|
# ))
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def wget_output_path(link, look_in=None):
|
def wget_output_path(link):
|
||||||
"""calculate the path to the wgetted .html file, since wget may
|
"""calculate the path to the wgetted .html file, since wget may
|
||||||
adjust some paths to be different than the base_url path.
|
adjust some paths to be different than the base_url path.
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue