From f59b6d418940263b82918af1f9104d5c78c7f216 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Thu, 1 Apr 2021 03:31:55 -0400 Subject: [PATCH] only add url-list lines that are real urls --- archivebox/parsers/url_list.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/archivebox/parsers/url_list.py b/archivebox/parsers/url_list.py index a45e5225..66e3961c 100644 --- a/archivebox/parsers/url_list.py +++ b/archivebox/parsers/url_list.py @@ -1,12 +1,15 @@ __package__ = 'archivebox.parsers' __description__ = 'URL list' +import re + from typing import IO, Iterable from datetime import datetime from ..index.schema import Link from ..util import ( - enforce_types + enforce_types, + URL_REGEX, ) @@ -17,7 +20,7 @@ def parse_url_list(text_file: IO[str], **_kwargs) -> Iterable[Link]: text_file.seek(0) for line in text_file.readlines(): url = line.strip() - if not url: + if (not url) or not re.findall(URL_REGEX, url): continue yield Link(