better UX before titles have been fetched during archiving progress

2025-05-13 14:44:29 -04:00 · 2019-03-19 18:09:46 -04:00 · 2019-03-19 18:09:46 -04:00 · eb5cc8078a
commit eb5cc8078a
parent 914750c453
5 changed files with 90 additions and 27 deletions
--- a/archivebox/archive_methods.py
+++ b/archivebox/archive_methods.py
@@ -7,7 +7,7 @@ from datetime import datetime
 from index import (
    parse_json_link_index,
    write_link_index,
-    patch_index_title_hack,
+    update_main_index,
 )
 from config import (
    CURL_BINARY,
@@ -103,7 +103,9 @@ def archive_link(link_dir, link, overwrite=True):
        for archive_method in active_methods:
            archive_method(link_dir, link, overwrite=overwrite)

+
        write_link_index(link_dir, link)
+        update_main_index(link)

    except Exception as err:
        print('    ! Failed to archive link: {}: {}'.format(err.__class__.__name__, err))
@@ -218,7 +220,7 @@ def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, warc=FETCH_WARC
    try:
        result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)
        end()
-        output = wget_output_path(link, look_in=domain_dir)
+        output = wget_output_path(link)

        output_tail = ['          ' + line for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:] if line.strip()]

@@ -391,11 +393,13 @@ def archive_dot_org(link_dir, link, timeout=TIMEOUT):
    output = 'archive.org.txt'
    archive_org_url = None

+
    path = os.path.join(link_dir, output)
    if os.path.exists(path):
        archive_org_url = open(path, 'r').read().strip()
        return {'output': archive_org_url, 'status': 'skipped'}

+
    submit_url = 'https://web.archive.org/save/{}'.format(link['url'])
    CMD = [
        CURL_BINARY,
@@ -412,7 +416,6 @@ def archive_dot_org(link_dir, link, timeout=TIMEOUT):
        end()

        content_location, errors = parse_archive_dot_org_response(result.stdout)
-
        if content_location:
            archive_org_url = 'https://web.archive.org{}'.format(content_location[0])
        elif len(errors) == 1 and 'RobotAccessControlException' in errors[0]:
@@ -427,6 +430,7 @@ def archive_dot_org(link_dir, link, timeout=TIMEOUT):
        output = e
        print_error_hints(cmd=CMD, pwd=link_dir, err=e)

+
    if not isinstance(output, Exception):
        # instead of writing None when archive.org rejects the url write the
        # url to resubmit it to archive.org. This is so when the user visits
@@ -499,7 +503,6 @@ def fetch_title(link_dir, link, timeout=TIMEOUT):
    # TODO: figure out how to do this without gnarly string replacement
    if title:
        link['title'] = title
-        patch_index_title_hack(link['url'], title)

    return {
        'cmd': 'fetch_page_title("{}")'.format(link['url']),