mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-20 18:05:19 -04:00
only add url-list lines that are real urls
This commit is contained in:
parent
d73f7d7d96
commit
f59b6d4189
1 changed file with 5 additions and 2 deletions
|
@@ -1,12 +1,15 @@
|
||||||
__package__ = 'archivebox.parsers'
|
__package__ = 'archivebox.parsers'
|
||||||
__description__ = 'URL list'
|
__description__ = 'URL list'
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
from typing import IO, Iterable
|
from typing import IO, Iterable
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
|
||||||
from ..index.schema import Link
|
from ..index.schema import Link
|
||||||
from ..util import (
|
from ..util import (
|
||||||
enforce_types
|
enforce_types,
|
||||||
|
URL_REGEX,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@@ -17,7 +20,7 @@ def parse_url_list(text_file: IO[str], **_kwargs) -> Iterable[Link]:
|
||||||
text_file.seek(0)
|
text_file.seek(0)
|
||||||
for line in text_file.readlines():
|
for line in text_file.readlines():
|
||||||
url = line.strip()
|
url = line.strip()
|
||||||
if not url:
|
if (not url) or not re.findall(URL_REGEX, url):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
yield Link(
|
yield Link(
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue