# Finds every http(s) URL embedded in a blob of text.
#
# The whole pattern sits inside a zero-width lookahead `(?=(...))`, so a match
# consumes no input and the scanner can also report a URL that starts inside
# another one (e.g. "https://web.archive.org/web/http://example.com" yields
# both the outer and the inner URL).  The URL text itself is capture group 1,
# which is what `URL_REGEX.findall(text)` returns.
#
# Matching is case-sensitive on the scheme; no flags are passed.
#
# NOTE(review): a plain space is not in either excluded character set, so a
# match runs on through spaces until '\', '#', one of \f \n \r \t \v, or the
# end of input.  That behaviour is kept unchanged here -- confirm it is what
# the callers expect.
URL_REGEX = re.compile(
    r'(?=('
    r'https?://'                          # scheme: http or https only (ftp is not matched)
    r'(?:[A-Za-z0-9-]+\.)*[A-Za-z0-9-]+'  # host: dotted labels, or a bare name such as "localhost"
    r'[^\\#\f\n\r\t\v?&]*'                # port and path: stops at '#' (the fragment is dropped)
                                          # and at the first '?' or '&' -- a '&' that shows up
                                          # before any '?' is treated as the end of the URL
    r'(?:\?[^\\#\f\n\r\t\v]*)?'           # optional query string, up to but not including '#';
                                          # matched at most once because the class already
                                          # swallows any later '?', so repeating the group
                                          # could never match more and only added ambiguity
    r'))',
)

# Matches '[' followed by one to three ';'-separated decimal numbers and a
# closing 'm' (e.g. "[32m", "[01;31m", "[00;38;5m") -- presumably the tail of
# an ANSI colour escape; the ESC byte itself is not part of the pattern.
# Each number is exposed as a named group; groups that are absent are None.
#
# NOTE(review): the group names were missing from the text under review
# (`(?P\d+)` does not compile).  arg_1 / arg_2 / arg_3 follow upstream
# ArchiveBox -- confirm against whatever reads `match.groupdict()`.
COLOR_REGEX = re.compile(r'\[(?P<arg_1>\d+)(;(?P<arg_2>\d+)(;(?P<arg_3>\d+))?)?m')