Save the URL as the title for static files or non-HTML files

2025-05-13 06:34:25 -04:00 · 2021-01-30 22:01:49 -05:00 · 2021-01-30 22:01:49 -05:00 · 385daf9af8
commit 385daf9af8
parent 24e24934f7
1 changed files with 5 additions and 4 deletions
--- a/archivebox/extractors/title.py
+++ b/archivebox/extractors/title.py
@ -62,9 +62,6 @@ class TitleParser(HTMLParser):
@enforce_types
 def should_save_title(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
    if is_static_file(link.url):
        return False
    # if link already has valid title, skip it
    if not overwrite and link.title and not link.title.lower().startswith('http'):
        return False
@ -113,7 +110,11 @@ def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -
                                        timestamp=link.timestamp)\
                                .update(title=output)
        else:
-            raise ArchiveError('Unable to detect page title')
+            # if no content was returned, don't save a title (because it might be a temporary error)
            if not html:
                raise ArchiveError('Unable to detect page title')
            # output = html[:128]       # use first bit of content as the title
            output = link.base_url      # use the filename as the title (better UX)
    except Exception as err:
        status = 'failed'
        output = err