diff --git a/archivebox/util.py b/archivebox/util.py index a96950bb..2f1bb248 100644 --- a/archivebox/util.py +++ b/archivebox/util.py @@ -56,11 +56,13 @@ ts_to_iso = lambda ts: ts and parse_date(ts).isoformat() URL_REGEX = re.compile( + r'(?=(' r'http[s]?://' # start matching from allowed schemes r'(?:[a-zA-Z]|[0-9]' # followed by allowed alphanum characters r'|[$-_@.&+]|[!*\(\),]' # or allowed symbols r'|(?:%[0-9a-fA-F][0-9a-fA-F]))' # or allowed unicode bytes - r'[^\]\[\(\)<>"\'\s]+', # stop parsing at these symbols + r'[^\]\[\(\)<>"\'\s]+' # stop parsing at these symbols + r'))', # closes capture group 1 and the (?=...) lookahead opened by r'(?=(' so the match is zero-width and the URL text is in group 1 (NOTE(review): presumably to allow overlapping URLs; confirm callers read group 1 / findall) re.IGNORECASE, )