mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-28 21:54:14 -04:00
check more url parsing invariants on startup
This commit is contained in:
parent
fea0b89dbe
commit
5fb9ca389f
1 changed file with 45 additions and 31 deletions
|
@@ -68,7 +68,6 @@ def parse_links_memory(urls: List[str], root_url: Optional[str]=None):
|
||||||
"""
|
"""
|
||||||
parse a list of URLS without touching the filesystem
|
parse a list of URLS without touching the filesystem
|
||||||
"""
|
"""
|
||||||
check_url_parsing_invariants()
|
|
||||||
|
|
||||||
timer = TimedProgress(TIMEOUT * 4)
|
timer = TimedProgress(TIMEOUT * 4)
|
||||||
#urls = list(map(lambda x: x + "\n", urls))
|
#urls = list(map(lambda x: x + "\n", urls))
|
||||||
|
@@ -89,8 +88,6 @@ def parse_links(source_file: str, root_url: Optional[str]=None) -> Tuple[List[Li
|
||||||
RSS feed, bookmarks export, or text file
|
RSS feed, bookmarks export, or text file
|
||||||
"""
|
"""
|
||||||
|
|
||||||
check_url_parsing_invariants()
|
|
||||||
|
|
||||||
timer = TimedProgress(TIMEOUT * 4)
|
timer = TimedProgress(TIMEOUT * 4)
|
||||||
with open(source_file, 'r', encoding='utf-8') as file:
|
with open(source_file, 'r', encoding='utf-8') as file:
|
||||||
links, parser = run_parser_functions(file, timer, root_url=root_url)
|
links, parser = run_parser_functions(file, timer, root_url=root_url)
|
||||||
|
@@ -173,31 +170,48 @@ def save_file_as_source(path: str, timeout: int=TIMEOUT, filename: str='{ts}-{ba
|
||||||
return source_path
|
return source_path
|
||||||
|
|
||||||
|
|
||||||
def check_url_parsing_invariants() -> None:
|
# Check that plain text regex URL parsing works as expected
|
||||||
"""Check that plain text regex URL parsing works as expected"""
|
# this is last-line-of-defense to make sure the URL_REGEX isn't
|
||||||
|
# misbehaving due to some OS-level or environment level quirks (e.g. bad regex lib)
|
||||||
# this is last-line-of-defense to make sure the URL_REGEX isn't
|
# the consequences of bad URL parsing could be disastrous and lead to many
|
||||||
# misbehaving, as the consequences could be disastrous and lead to many
|
# incorrect/badly parsed links being added to the archive, so this is worth the cost of checking
|
||||||
# incorrect/badly parsed links being added to the archive
|
_test_url_strs = {
|
||||||
|
'example.com': 0,
|
||||||
test_urls = '''
|
'/example.com': 0,
|
||||||
https://example1.com/what/is/happening.html?what=1#how-about-this=1
|
'//example.com': 0,
|
||||||
https://example2.com/what/is/happening/?what=1#how-about-this=1
|
':/example.com': 0,
|
||||||
HTtpS://example3.com/what/is/happening/?what=1#how-about-this=1f
|
'://example.com': 0,
|
||||||
https://example4.com/what/is/happening.html
|
'htt://example8.com': 0,
|
||||||
https://example5.com/
|
'/htt://example.com': 0,
|
||||||
https://example6.com
|
'https://example': 1,
|
||||||
|
'https://localhost/2345': 1,
|
||||||
<test>http://example7.com</test>
|
'https://localhost:1234/123': 1,
|
||||||
[https://example8.com/what/is/this.php?what=1]
|
'://': 0,
|
||||||
[and http://example9.com?what=1&other=3#and-thing=2]
|
'https://': 0,
|
||||||
<what>https://example10.com#and-thing=2 "</about>
|
'http://': 0,
|
||||||
abc<this["https://example11.com/what/is#and-thing=2?whoami=23&where=1"]that>def
|
'ftp://': 0,
|
||||||
sdflkf[what](https://example12.com/who/what.php?whoami=1#whatami=2)?am=hi
|
'ftp://example.com': 0,
|
||||||
example13.bada
|
'https://example.com': 1,
|
||||||
and example14.badb
|
'https://example.com/': 1,
|
||||||
<or>htt://example15.badc</that>
|
'https://a.example.com': 1,
|
||||||
'''
|
'https://a.example.com/': 1,
|
||||||
# print('\n'.join(re.findall(URL_REGEX, test_urls)))
|
'https://a.example.com/what/is/happening.html': 1,
|
||||||
assert len(re.findall(URL_REGEX, test_urls)) == 12
|
'https://a.example.com/what/ís/happening.html': 1,
|
||||||
|
'https://a.example.com/what/is/happening.html?what=1&2%20b#höw-about-this=1a': 1,
|
||||||
|
'https://a.example.com/what/is/happéning/?what=1&2%20b#how-aboüt-this=1a': 1,
|
||||||
|
'HTtpS://a.example.com/what/is/happening/?what=1&2%20b#how-about-this=1af&2f%20b': 1,
|
||||||
|
'https://example.com/?what=1#how-about-this=1&2%20baf': 1,
|
||||||
|
'https://example.com?what=1#how-about-this=1&2%20baf': 1,
|
||||||
|
'<test>http://example7.com</test>': 1,
|
||||||
|
'[https://example8.com/what/is/this.php?what=1]': 1,
|
||||||
|
'[and http://example9.com?what=1&other=3#and-thing=2]': 1,
|
||||||
|
'<what>https://example10.com#and-thing=2 "</about>': 1,
|
||||||
|
'abc<this["https://example11.com/what/is#and-thing=2?whoami=23&where=1"]that>def': 1,
|
||||||
|
'sdflkf[what](https://example12.com/who/what.php?whoami=1#whatami=2)?am=hi': 1,
|
||||||
|
'<or>http://examplehttp://15.badc</that>': 2,
|
||||||
|
'https://a.example.com/one.html?url=http://example.com/inside/of/another?=http://': 2,
|
||||||
|
'[https://a.example.com/one.html?url=http://example.com/inside/of/another?=](http://a.example.com)': 3,
|
||||||
|
}
|
||||||
|
for url_str, num_urls in _test_url_strs.items():
|
||||||
|
assert len(re.findall(URL_REGEX, url_str)) == num_urls, (
|
||||||
|
f'{url_str} does not contain {num_urls} urls')
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue