From 5a7d00a6399f2b3a256a6059920a9330134b5fd7 Mon Sep 17 00:00:00 2001
From: Nick Sweeting <git@nicksweeting.com>
Date: Tue, 19 Feb 2019 01:44:54 -0500
Subject: [PATCH] fetch page title during archiving process

---
 archivebox/archive_methods.py | 32 ++++++++++++++++++++++++++++++--
 archivebox/config.py          |  1 +
 archivebox/links.py           |  5 ++++-
 archivebox/util.py            | 20 ++++++++------------
 etc/ArchiveBox.conf.default   |  1 +
 5 files changed, 44 insertions(+), 15 deletions(-)

diff --git a/archivebox/archive_methods.py b/archivebox/archive_methods.py
index 0148849d..26530d22 100644
--- a/archivebox/archive_methods.py
+++ b/archivebox/archive_methods.py
@@ -12,6 +12,8 @@ from index import wget_output_path, parse_json_link_index, write_link_index
 from links import links_after_timestamp
 from config import (
     CHROME_BINARY,
+    FETCH_FAVICON,
+    FETCH_TITLE,
     FETCH_WGET,
     FETCH_WGET_REQUISITES,
     FETCH_PDF,
@@ -23,7 +25,6 @@ from config import (
     RESOLUTION,
     CHECK_SSL_VALIDITY,
     SUBMIT_ARCHIVE_DOT_ORG,
-    FETCH_FAVICON,
     WGET_USER_AGENT,
     CHROME_USER_DATA_DIR,
     CHROME_SANDBOX,
@@ -36,6 +37,7 @@ from config import (
 )
 from util import (
     check_dependencies,
+    fetch_page_title,
     progress,
     chmod_file,
     pretty_path,
@@ -96,6 +98,9 @@ def archive_link(link_dir, link, overwrite=True):
         if FETCH_FAVICON:
             link = fetch_favicon(link_dir, link, overwrite=overwrite)
 
+        if FETCH_TITLE:
+            link = fetch_title(link_dir, link, overwrite=overwrite)
+
         if FETCH_WGET:
             link = fetch_wget(link_dir, link, overwrite=overwrite)
 
@@ -129,7 +134,7 @@ def log_link_archive(link_dir, link, update_existing):
         symbol='*' if update_existing else '+',
         symbol_color=ANSI['black' if update_existing else 'green'],
         now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
-        **link,
+        **{**link, 'title': link['title'] or link['url']},
         **ANSI,
     ))
 
@@ -492,6 +497,29 @@ def fetch_favicon(link_dir, link, timeout=TIMEOUT):
         'output': output,
     }
 
+@attach_result_to_link('title')
+def fetch_title(link_dir, link, timeout=TIMEOUT):
+    """try to guess the page's title from its content"""
+
+    # if link already has valid title, skip it
+    if link['title'] and not link['title'].lower().startswith('http'):
+        return {'output': link['title'], 'cmd': 'fetch_page_title("{}")'.format(link['url'])}
+
+    end = progress(timeout, prefix='      ')
+    try:
+        title = fetch_page_title(link['url'], timeout=timeout, progress=False)
+        end()
+        output = title
+    except Exception as e:
+        end()
+        print('        {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
+        output = e
+
+    return {
+        'cmd': 'fetch_page_title("{}")'.format(link['url']),
+        'output': output,
+    }
+
 @attach_result_to_link('media')
 def fetch_media(link_dir, link, timeout=MEDIA_TIMEOUT, overwrite=False):
     """Download playlists or individual video, audio, and subtitles using youtube-dl"""
diff --git a/archivebox/config.py b/archivebox/config.py
index c887c7f2..1202fd3c 100644
--- a/archivebox/config.py
+++ b/archivebox/config.py
@@ -27,6 +27,7 @@ FETCH_WARC =             os.getenv('FETCH_WARC',             'True'
 FETCH_GIT =              os.getenv('FETCH_GIT',              'True'             ).lower() == 'true'
 FETCH_MEDIA =            os.getenv('FETCH_MEDIA',            'True'             ).lower() == 'true'
 FETCH_FAVICON =          os.getenv('FETCH_FAVICON',          'True'             ).lower() == 'true'
+FETCH_TITLE =            os.getenv('FETCH_TITLE',            'True'             ).lower() == 'true'
 SUBMIT_ARCHIVE_DOT_ORG = os.getenv('SUBMIT_ARCHIVE_DOT_ORG', 'True'             ).lower() == 'true'
 
 CHECK_SSL_VALIDITY =     os.getenv('CHECK_SSL_VALIDITY',     'True'             ).lower() == 'true'
diff --git a/archivebox/links.py b/archivebox/links.py
index e544618a..1a88f793 100644
--- a/archivebox/links.py
+++ b/archivebox/links.py
@@ -57,7 +57,7 @@ def validate_links(links):
         raise SystemExit(1)
 
     for link in links:
-        link['title'] = unescape(link['title'])
+        link['title'] = unescape(link['title']) if link['title'] else None
         link['latest'] = link.get('latest') or {}
         
         latest = link['latest']
@@ -76,6 +76,9 @@ def validate_links(links):
         if not latest.get('favicon'):
             latest['favicon'] = None
 
+        if not link['latest'].get('title'):
+            link['latest']['title'] = link['title']
+
     return list(links)
 
 def new_links(all_links, existing_links):
diff --git a/archivebox/util.py b/archivebox/util.py
index 6a91dd76..89f95ccf 100644
--- a/archivebox/util.py
+++ b/archivebox/util.py
@@ -44,6 +44,7 @@ base_url = lambda url: without_scheme(url)  # uniq base url used to dedupe links
 short_ts = lambda ts: ts.split('.')[0]
 
 URL_REGEX = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))[^<\""]+'
+HTML_TITLE_REGEX = '<title>(.[^<>]+)'
 
 
 def check_dependencies():
@@ -227,22 +228,17 @@ def download_url(url):
     return source_path
 
 
-def fetch_page_title(url, default=True):
+def fetch_page_title(url, timeout=10, progress=SHOW_PROGRESS):
     """Attempt to guess a page's title by downloading the html"""
-    if default is True:
-        default = url
-
     try:
-        if SHOW_PROGRESS:
+        if progress:
             sys.stdout.write('.')
             sys.stdout.flush()
-        html_content = urllib.request.urlopen(url, timeout=10).read().decode('utf-8')
+        html_content = urllib.request.urlopen(url, timeout=timeout).read().decode('utf-8')
         match = re.search('<title>(.*?)</title>', html_content)
-        return match.group(1) if match else default or None
+        return match.group(1) if match else None
     except Exception:
-        if default is False:
-            raise
-        return default
+        return None
 
 
 def str_between(string, start, end=None):
@@ -277,19 +273,19 @@ def merge_links(a, b):
     """deterministially merge two links, favoring longer field values over shorter,
     and "cleaner" values over worse ones.
     """
-    longer = lambda key: a[key] if len(a[key]) > len(b[key]) else b[key]
+    longer = lambda key: (a[key] if len(a[key]) > len(b[key]) else b[key]) if (a[key] and b[key]) else (a[key] or b[key])
     earlier = lambda key: a[key] if a[key] < b[key] else b[key]
     
     url = longer('url')
     longest_title = longer('title')
-    cleanest_title = a['title'] if '://' not in a['title'] else b['title']
+    cleanest_title = a['title'] if '://' not in (a['title'] or '') else b['title']
     link = {
         'timestamp': earlier('timestamp'),
         'url': url,
         'domain': domain(url),
         'base_url': base_url(url),
         'tags': longer('tags'),
-        'title': longest_title if '://' not in longest_title else cleanest_title,
+        'title': longest_title if '://' not in (longest_title or '') else cleanest_title,
         'sources': list(set(a.get('sources', []) + b.get('sources', []))),
     }
     link['type'] = get_link_type(link)
@@ -532,7 +528,7 @@ def derived_link_info(link):
             'pdf_link': 'archive/{timestamp}/{base_url}'.format(**link),
             'screenshot_link': 'archive/{timestamp}/{base_url}'.format(**link),
             'dom_link': 'archive/{timestamp}/{base_url}'.format(**link),
-            'title': '{title} ({type})'.format(**link),
+            'title': link['title'] or basename(link['url']),
         })
     return link_info
 
diff --git a/etc/ArchiveBox.conf.default b/etc/ArchiveBox.conf.default
index 08fcbe1c..52dc5c4b 100644
--- a/etc/ArchiveBox.conf.default
+++ b/etc/ArchiveBox.conf.default
@@ -10,6 +10,7 @@
 # FETCH_MEDIA=False
 # FETCH_GIT=True
 # FETCH_FAVICON=True
+# FETCH_TITLE=True
 # SUBMIT_ARCHIVE_DOT_ORG=True
 
 ### To only download new links, and never attempt to update old ones, uncomment this line: