diff --git a/archivebox/parse.py b/archivebox/parse.py
index 88f7f3f1..bd311288 100644
--- a/archivebox/parse.py
+++ b/archivebox/parse.py
@@ -27,12 +27,15 @@ from util import (
str_between,
get_link_type,
URL_REGEX,
+ check_url_parsing,
)
def parse_links(path):
"""parse a list of links dictionaries from a bookmark export file"""
+ check_url_parsing()
+
links = []
with open(path, 'r', encoding='utf-8') as file:
print('{green}[*] [{}] Parsing new links from output/sources/{}...{reset}'.format(
@@ -192,7 +195,6 @@ def parse_shaarli_rss_export(rss_file):
yield info
-
def parse_netscape_html_export(html_file):
"""Parse netscape-format bookmarks export files (produced by all browsers)"""
diff --git a/archivebox/util.py b/archivebox/util.py
index 9c93a9fd..3e679825 100644
--- a/archivebox/util.py
+++ b/archivebox/util.py
@@ -58,8 +58,19 @@ base_url = lambda url: without_scheme(url) # uniq base url used to dedupe links
short_ts = lambda ts: ts.split('.')[0]
-URL_REGEX = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))[^<\""]+'
-HTML_TITLE_REGEX = '<title>(.[^<>]+)</title>'
+URL_REGEX = re.compile(
+ r'http[s]?://' # start matching from allowed schemes
+ r'(?:[a-zA-Z]|[0-9]' # followed by allowed alphanum characters
+ r'|[$-_@.&+]|[!*\(\),]' # or allowed symbols
+ r'|(?:%[0-9a-fA-F][0-9a-fA-F]))' # or allowed unicode bytes
+ r'[^\]\[\(\)<>\""\'\s]+', # stop parsing at these symbols
+ re.IGNORECASE,
+)
+HTML_TITLE_REGEX = re.compile(
+ r'<title>' # start matching text after <title> tag
+ r'(.[^<>]+)', # get everything up to these symbols
+ re.IGNORECASE,
+)
def check_dependencies():
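For context on what the tightened character class buys: the old pattern kept matching through ']' and ')', so URLs embedded in Markdown- or wiki-style brackets dragged trailing punctuation and text into the match, while the new pattern stops at those symbols. A rough before/after sketch (sample string and expected output are illustrative only; the patterns are adapted from the lines above):

import re

OLD_URL_REGEX = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))[^<\"]+'
NEW_URL_REGEX = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))'
    r'[^\]\[\(\)<>\"\'\s]+',
    re.IGNORECASE,
)

text = 'see [this page](https://example.com/page?q=1) for details'
print(re.findall(OLD_URL_REGEX, text))  # ['https://example.com/page?q=1) for details']
print(NEW_URL_REGEX.findall(text))      # ['https://example.com/page?q=1']

# the compiled title regex is used the same way, via .search()
HTML_TITLE_REGEX = re.compile(r'<title>(.[^<>]+)', re.IGNORECASE)
print(HTML_TITLE_REGEX.search('<title>Example Page</title>').group(1))  # Example Page
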
@@ -124,6 +135,30 @@ def check_dependencies():
raise SystemExit(1)
+def check_url_parsing():
+ """Check that plain text regex URL parsing works as expected"""
+ test_urls = '''
+ https://example1.com/what/is/happening.html?what=1#how-about-this=1
+ https://example2.com/what/is/happening/?what=1#how-about-this=1
+ HTtpS://example3.com/what/is/happening/?what=1#how-about-this=1f
+ https://example4.com/what/is/happening.html
+ https://example5.com/
+ https://example6.com
+
+ <test>http://example7.com</test>
+ [https://example8.com/what/is/this.php?what=1]
+ [and http://example9.com?what=1&other=3#and-thing=2]
+ <what>https://example10.com#and-thing=2 "</about>
+ abc<this["https://example11.com/what/is#and-thing=2?whoami=23&where=1"]that>def
+ sdflkf[what](https://example12.com/who/what.php?whoami=1#whatami=2)?am=hi
+ example13.bada
+ and example14.badb
+ htt://example15.badc
+ '''
+ # print('\n'.join(re.findall(URL_REGEX, test_urls)))
+ assert len(re.findall(URL_REGEX, test_urls)) == 12
+
+
def chmod_file(path, cwd='.', permissions=OUTPUT_PERMISSIONS, timeout=30):
"""chmod -R /"""