Only add url-list lines that are real URLs

This commit is contained in:
Nick Sweeting 2021-04-01 03:31:55 -04:00
parent d73f7d7d96
commit f59b6d4189

View file

@@ -1,12 +1,15 @@
__package__ = 'archivebox.parsers' __package__ = 'archivebox.parsers'
__description__ = 'URL list' __description__ = 'URL list'
import re
from typing import IO, Iterable from typing import IO, Iterable
from datetime import datetime from datetime import datetime
from ..index.schema import Link from ..index.schema import Link
from ..util import ( from ..util import (
enforce_types enforce_types,
URL_REGEX,
) )
@@ -17,7 +20,7 @@ def parse_url_list(text_file: IO[str], **_kwargs) -> Iterable[Link]:
text_file.seek(0) text_file.seek(0)
for line in text_file.readlines(): for line in text_file.readlines():
url = line.strip() url = line.strip()
if not url: if (not url) or not re.findall(URL_REGEX, url):
continue continue
yield Link( yield Link(