Accept local file paths as valid link URLs when parsing generic text files

This commit is contained in:
Nick Sweeting 2020-07-13 11:22:58 -04:00
parent 7cbd068c95
commit 96b1e4a8ec

View file

@ -5,6 +5,7 @@ import re
from typing import IO, Iterable from typing import IO, Iterable
from datetime import datetime from datetime import datetime
from pathlib import Path
from ..index.schema import Link from ..index.schema import Link
from ..util import ( from ..util import (
@ -13,14 +14,28 @@ from ..util import (
URL_REGEX URL_REGEX
) )
@enforce_types @enforce_types
def parse_generic_txt_export(text_file: IO[str]) -> Iterable[Link]: def parse_generic_txt_export(text_file: IO[str]) -> Iterable[Link]:
"""Parse raw links from each line in a text file""" """Parse raw links from each line in a text file"""
text_file.seek(0) text_file.seek(0)
for line in text_file.readlines(): for line in text_file.readlines():
urls = re.findall(URL_REGEX, line) if line.strip() else () if not line.strip():
for url in urls: # type: ignore continue
# if the line is a local file path that resolves, then we can archive it
if Path(line).exists():
yield Link(
url=line,
timestamp=str(datetime.now().timestamp()),
title=None,
tags=None,
sources=[text_file.name],
)
# otherwise look for anything that looks like a URL in the line
for url in re.findall(URL_REGEX, line):
yield Link( yield Link(
url=htmldecode(url), url=htmldecode(url),
timestamp=str(datetime.now().timestamp()), timestamp=str(datetime.now().timestamp()),