From 3e26ae4a66ec5a8390e76bbf51b874f1563b69eb Mon Sep 17 00:00:00 2001
From: Nick Sweeting <git@sweeting.me>
Date: Sat, 27 Mar 2021 04:30:40 -0400
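Subject: [PATCH] support finding multiple URLs as substrings in text

Wrap the URL_REGEX pattern in a capturing group inside a zero-width
lookahead, (?=(...)). Because the lookahead itself consumes no input,
scanning can resume one character after each match start instead of
after the end of the matched URL, so a URL embedded inside another URL
(for example an archive link whose path contains a second http://
address) is reported as well as the outer one.

Note that the overall match is now empty: the URL text is available as
group 1 (re.findall returns it directly; match objects need .group(1)).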
---
 archivebox/util.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/archivebox/util.py b/archivebox/util.py
index a96950bb..2f1bb248 100644
--- a/archivebox/util.py
+++ b/archivebox/util.py
@@ -56,11 +56,13 @@ ts_to_iso = lambda ts: ts and parse_date(ts).isoformat()
 
 
 URL_REGEX = re.compile(
+    r'(?=('
     r'http[s]?://'                    # start matching from allowed schemes
     r'(?:[a-zA-Z]|[0-9]'              # followed by allowed alphanum characters
     r'|[$-_@.&+]|[!*\(\),]'           #    or allowed symbols
     r'|(?:%[0-9a-fA-F][0-9a-fA-F]))'  #    or allowed unicode bytes
-    r'[^\]\[\(\)<>"\'\s]+',         # stop parsing at these symbols
+    r'[^\]\[\(\)<>"\'\s]+'          # stop parsing at these symbols
+    r'))',
     re.IGNORECASE,
 )