mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-15 23:54:27 -04:00
tests: Add tests for several different ways to extract the title
This commit is contained in:
parent
aede134ab3
commit
e7e33ea7a5
4 changed files with 761 additions and 4 deletions
|
@ -72,6 +72,11 @@ def should_save_title(link: Link, out_dir: Optional[str]=None) -> bool:
|
|||
|
||||
return SAVE_TITLE
|
||||
|
||||
def extract_title_with_regex(html):
    """Extract a page title from raw HTML by matching HTML_TITLE_REGEX.

    Used as the fallback for broken/malformed html that the regular
    parser cannot handle.

    Returns the first capture group of the match, whitespace-stripped and
    passed through htmldecode, or None when the pattern does not match.
    """
    found = re.search(HTML_TITLE_REGEX, html)
    if not found:
        # no title-like markup anywhere in the document
        return None
    raw_title = found.group(1).strip()
    return htmldecode(raw_title)
|
||||
|
||||
@enforce_types
|
||||
def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
|
||||
"""try to guess the page's title from its content"""
|
||||
|
@ -97,10 +102,11 @@ def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -
|
|||
parser = TitleParser()
|
||||
parser.feed(html)
|
||||
output = parser.title
|
||||
if output is None:
|
||||
raise
|
||||
except Exception:
|
||||
# fallback to regex that can handle broken/malformed html
|
||||
match = re.search(HTML_TITLE_REGEX, html)
|
||||
output = htmldecode(match.group(1).strip()) if match else None
|
||||
output = extract_title_with_regex(html)
|
||||
|
||||
# if title is better than the one in the db, update db with new title
|
||||
if isinstance(output, str) and output:
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue