tests: Add tests for several different ways to extract the title

2025-05-15 23:54:27 -04:00 · 2020-10-30 08:04:26 -05:00 · 2020-10-30 08:04:26 -05:00 · e7e33ea7a5
commit e7e33ea7a5
parent aede134ab3
4 changed files with 761 additions and 4 deletions
--- a/archivebox/extractors/title.py
+++ b/archivebox/extractors/title.py
@@ -72,6 +72,11 @@ def should_save_title(link: Link, out_dir: Optional[str]=None) -> bool:

    return SAVE_TITLE

+def extract_title_with_regex(html):
+    match = re.search(HTML_TITLE_REGEX, html)
+    output = htmldecode(match.group(1).strip()) if match else None
+    return output
+
@enforce_types
 def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
    """try to guess the page's title from its content"""
@@ -97,10 +102,11 @@ def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -
            parser = TitleParser()
            parser.feed(html)
            output = parser.title
+            if output is None:
+                raise
        except Exception:
            # fallback to regex that can handle broken/malformed html
-            match = re.search(HTML_TITLE_REGEX, html)
-            output = htmldecode(match.group(1).strip()) if match else None
+            output = extract_title_with_regex(html)
        
        # if title is better than the one in the db, update db with new title
        if isinstance(output, str) and output: