fetch page title during archiving process

2025-05-13 22:54:27 -04:00 · 2019-02-19 01:44:54 -05:00 · 2019-02-19 01:44:54 -05:00 · 5a7d00a639
commit 5a7d00a639
parent bb5879a4f7
5 changed files with 44 additions and 15 deletions
--- a/archivebox/archive_methods.py
+++ b/archivebox/archive_methods.py
@@ -12,6 +12,8 @@ from index import wget_output_path, parse_json_link_index, write_link_index
 from links import links_after_timestamp
 from config import (
    CHROME_BINARY,
+    FETCH_FAVICON,
+    FETCH_TITLE,
    FETCH_WGET,
    FETCH_WGET_REQUISITES,
    FETCH_PDF,
@@ -23,7 +25,6 @@ from config import (
    RESOLUTION,
    CHECK_SSL_VALIDITY,
    SUBMIT_ARCHIVE_DOT_ORG,
-    FETCH_FAVICON,
    WGET_USER_AGENT,
    CHROME_USER_DATA_DIR,
    CHROME_SANDBOX,
@@ -36,6 +37,7 @@ from config import (
 )
 from util import (
    check_dependencies,
+    fetch_page_title,
    progress,
    chmod_file,
    pretty_path,
@@ -96,6 +98,9 @@ def archive_link(link_dir, link, overwrite=True):
        if FETCH_FAVICON:
            link = fetch_favicon(link_dir, link, overwrite=overwrite)

+        if FETCH_TITLE:
+            link = fetch_title(link_dir, link, overwrite=overwrite)
+
        if FETCH_WGET:
            link = fetch_wget(link_dir, link, overwrite=overwrite)

@@ -129,7 +134,7 @@ def log_link_archive(link_dir, link, update_existing):
        symbol='*' if update_existing else '+',
        symbol_color=ANSI['black' if update_existing else 'green'],
        now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
-        **link,
+        **{**link, 'title': link['title'] or link['url']},
        **ANSI,
    ))

@@ -492,6 +497,29 @@ def fetch_favicon(link_dir, link, timeout=TIMEOUT):
        'output': output,
    }

+@attach_result_to_link('title')
+def fetch_title(link_dir, link, timeout=TIMEOUT):
+    """fetch the page's title via fetch_page_title unless the link already has a usable one
+
+    Returns a dict with two keys:
+      'cmd':    a string describing the fetch_page_title call for this link's url
+      'output': the existing title, the value returned by fetch_page_title,
+                or the exception instance if the fetch raised
+    """
+
+    cmd_template = 'fetch_page_title("{}")'
+
+    # a title that is set and does not start with "http" (case-insensitive)
+    # is treated as already valid, so nothing is fetched
+    existing = link['title']
+    needs_fetch = (not existing) or existing.lower().startswith('http')
+    if not needs_fetch:
+        return {'output': existing, 'cmd': cmd_template.format(link['url'])}
+
+    stop_progress = progress(timeout, prefix='      ')
+    try:
+        output = fetch_page_title(link['url'], timeout=timeout, progress=False)
+        stop_progress()
+    except Exception as err:
+        # end the progress indicator before printing the failure line
+        stop_progress()
+        failure_line = '        {}Failed: {} {}{}'.format(
+            ANSI['red'],
+            err.__class__.__name__,
+            err,
+            ANSI['reset'],
+        )
+        print(failure_line)
+        # the exception itself is recorded as the output
+        output = err
+
+    return {
+        'cmd': cmd_template.format(link['url']),
+        'output': output,
+    }
+
 @attach_result_to_link('media')
 def fetch_media(link_dir, link, timeout=MEDIA_TIMEOUT, overwrite=False):
    """Download playlists or individual video, audio, and subtitles using youtube-dl"""