mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-28 21:54:14 -04:00
better title regex to match titles surrounded by newlines
This commit is contained in:
parent
1b5201fd58
commit
914750c453
1 changed file with 2 additions and 2 deletions
|
@@ -66,9 +66,9 @@ URL_REGEX = re.compile(
|
||||||
re.IGNORECASE,
|
re.IGNORECASE,
|
||||||
)
|
)
|
||||||
HTML_TITLE_REGEX = re.compile(
|
HTML_TITLE_REGEX = re.compile(
|
||||||
r'<title>' # start matching text after <title> tag
|
r'<title.*?>' # start matching text after <title> tag
|
||||||
r'(.[^<>]+)', # get everything up to these symbols
|
r'(.[^<>]+)', # get everything up to these symbols
|
||||||
re.IGNORECASE,
|
re.IGNORECASE | re.MULTILINE | re.DOTALL | re.UNICODE,
|
||||||
)
|
)
|
||||||
|
|
||||||
### Checks & Tests
|
### Checks & Tests
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue