diff --git a/archivebox/parse.py b/archivebox/parse.py
index 88f7f3f1..bd311288 100644
--- a/archivebox/parse.py
+++ b/archivebox/parse.py
@@ -27,12 +27,15 @@ from util import (
     str_between,
     get_link_type,
     URL_REGEX,
+    check_url_parsing,
 )
 
 
 def parse_links(path):
     """parse a list of links dictionaries from a bookmark export file"""
 
+    check_url_parsing()
+
     links = []
     with open(path, 'r', encoding='utf-8') as file:
         print('{green}[*] [{}] Parsing new links from output/sources/{}...{reset}'.format(
@@ -192,7 +195,6 @@ def parse_shaarli_rss_export(rss_file):
         yield info
 
 
-
 def parse_netscape_html_export(html_file):
     """Parse netscape-format bookmarks export files (produced by all browsers)"""
 
diff --git a/archivebox/util.py b/archivebox/util.py
index 9c93a9fd..3e679825 100644
--- a/archivebox/util.py
+++ b/archivebox/util.py
@@ -58,8 +58,19 @@ base_url = lambda url: without_scheme(url) # uniq base url used to dedupe links
 
 short_ts = lambda ts: ts.split('.')[0]
 
-URL_REGEX = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))[^<\""]+'
-HTML_TITLE_REGEX = '(.[^<>]+)'
+URL_REGEX = re.compile(
+    r'http[s]?://'                    # start matching from allowed schemes
+    r'(?:[a-zA-Z]|[0-9]'              # followed by allowed alphanum characters
+    r'|[$-_@.&+]|[!*\(\),]'           # or allowed symbols
+    r'|(?:%[0-9a-fA-F][0-9a-fA-F]))'  # or percent-encoded bytes (%XX)
+    r'[^\]\[\(\)<>\""\'\s]+',         # stop parsing at these symbols
+    re.IGNORECASE,
+)
+HTML_TITLE_REGEX = re.compile(
+    r'<title>'                        # start matching text after <title> tag
+    r'(.[^<>]+)',                     # get everything up to these symbols
+    re.IGNORECASE,
+)
 
 
 def check_dependencies():
@@ -124,6 +135,47 @@ def check_dependencies():
         raise SystemExit(1)
 
 
+def check_url_parsing():
+    """Self-test that URL_REGEX pulls the expected URLs out of plain text.
+
+    Runs URL_REGEX over a fixed sample containing bare URLs, URLs wrapped in
+    tags, brackets, quotes and a markdown link, and three strings that are
+    not valid http(s) URLs.
+
+    Raises:
+        AssertionError: if the number of matches differs from the 12 valid
+            URLs in the sample. Raised explicitly rather than with `assert`
+            so the check still runs under `python -O`.
+    """
+    test_urls = '''
+    https://example1.com/what/is/happening.html?what=1#how-about-this=1
+    https://example2.com/what/is/happening/?what=1#how-about-this=1
+    HTtpS://example3.com/what/is/happening/?what=1#how-about-this=1f
+    https://example4.com/what/is/happening.html
+    https://example5.com/
+    https://example6.com
+
+    <test>http://example7.com</test>
+    [https://example8.com/what/is/this.php?what=1]
+    [and http://example9.com?what=1&other=3#and-thing=2]
+    <what>https://example10.com#and-thing=2 "</about>
+    abc<this["https://example11.com/what/is#and-thing=2?whoami=23&where=1"]that>def
+    sdflkf[what](https://example12.com/who/what.php?whoami=1#whatami=2)?am=hi
+    example13.bada
+    and example14.badb
+    <or>htt://example15.badc</that>
+    '''
+    expected_count = 12  # example1 through example12; 13-15 must not match
+    found_urls = re.findall(URL_REGEX, test_urls)
+    if len(found_urls) != expected_count:
+        # list what was matched so a regex regression is easy to diagnose
+        raise AssertionError(
+            'URL_REGEX matched {} URLs in the self-test sample, expected {}:\n{}'.format(
+                len(found_urls), expected_count, '\n'.join(found_urls),
+            )
+        )
+
+
 def chmod_file(path, cwd='.', permissions=OUTPUT_PERMISSIONS, timeout=30):
     """chmod -R <permissions> <cwd>/<path>"""
 