mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-13 06:34:25 -04:00
fix URL_REGEX 2
This commit is contained in:
parent
4ae765ec27
commit
e4dc2701ef
1 changed file with 4 additions and 5 deletions
|
@@ -59,12 +59,11 @@ ts_to_iso = lambda ts: ts and parse_date(ts).isoformat()
|
||||||
URL_REGEX = re.compile(
|
URL_REGEX = re.compile(
|
||||||
r'(?=('
|
r'(?=('
|
||||||
r'https?://' #match schemes http and https,but can't match ftp
|
r'https?://' #match schemes http and https,but can't match ftp
|
||||||
r'(?:[A-Za-z0-9-]+\.)+[A-Za-z0-9-]+'#match domain
|
r'(?:[A-Za-z0-9-]+\.)*[A-Za-z0-9-]+'#match domain
|
||||||
r'(?::\d+)?' #match port,mabey not occur
|
r'[^\\#\f\n\r\t\v?&]*' #exclude '#' because don't need fragment,
|
||||||
r'(?:/[^\\#\f\n\r\t\v]*)?' #match path and query,maybe not occur
|
#exclude '?' and '&' because url is invalid when '&' appear before '?'
|
||||||
## r'(?:#[^\]\[\(\)<>"\'\s]*){0,1}' #match fragment,but we don't need it actually
|
r'(?:\?[^\\#\f\n\r\t\v]*)*'
|
||||||
r'))',
|
r'))',
|
||||||
## re.IGNORECASE, #don't need to consider case problem
|
|
||||||
)
|
)
|
||||||
|
|
||||||
COLOR_REGEX = re.compile(r'\[(?P<arg_1>\d+)(;(?P<arg_2>\d+)(;(?P<arg_3>\d+))?)?m')
|
COLOR_REGEX = re.compile(r'\[(?P<arg_1>\d+)(;(?P<arg_2>\d+)(;(?P<arg_3>\d+))?)?m')
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue