From 3e26ae4a66ec5a8390e76bbf51b874f1563b69eb Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Sat, 27 Mar 2021 04:30:40 -0400 Subject: [PATCH] support finding multiple urls as substrings in text --- archivebox/util.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/archivebox/util.py b/archivebox/util.py index a96950bb..2f1bb248 100644 --- a/archivebox/util.py +++ b/archivebox/util.py @@ -56,11 +56,13 @@ ts_to_iso = lambda ts: ts and parse_date(ts).isoformat() URL_REGEX = re.compile( + r'(?=(' r'http[s]?://' # start matching from allowed schemes r'(?:[a-zA-Z]|[0-9]' # followed by allowed alphanum characters r'|[$-_@.&+]|[!*\(\),]' # or allowed symbols r'|(?:%[0-9a-fA-F][0-9a-fA-F]))' # or allowed unicode bytes - r'[^\]\[\(\)<>"\'\s]+', # stop parsing at these symbols + r'[^\]\[\(\)<>"\'\s]+' # stop parsing at these symbols + r'))', re.IGNORECASE, )