mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-13 06:34:25 -04:00
fix urljoin bug causing multiple slashes to be merged into one
This commit is contained in:
parent
6cb357e76c
commit
e5aba0dc2e
1 changed files with 85 additions and 3 deletions
|
@ -40,9 +40,20 @@ def parse_generic_html_export(html_file: IO[str], root_url: Optional[str]=None,
|
||||||
parser.feed(line)
|
parser.feed(line)
|
||||||
for url in parser.urls:
|
for url in parser.urls:
|
||||||
if root_url:
|
if root_url:
|
||||||
# resolve relative urls /home.html -> https://example.com/home.html
|
url_is_absolute = (url.lower().startswith('http://') or url.lower().startswith('https://'))
|
||||||
url = urljoin(root_url, url)
|
# url = https://abc.com => True
|
||||||
# TODO: fix double // getting stripped by urljoin bug https://github.com/python/cpython/issues/96015
|
# url = /page.php?next=https://example.com => False
|
||||||
|
|
||||||
|
if not url_is_absolute: # resolve it by joining it with root_url
|
||||||
|
relative_path = url
|
||||||
|
|
||||||
|
url = urljoin(root_url, relative_path) # https://example.com/somepage.html + /home.html
|
||||||
|
# => https://example.com/home.html
|
||||||
|
|
||||||
|
# special case to handle bug around // handling, crucial for urls that contain sub-urls
|
||||||
|
# e.g. https://web.archive.org/web/https://example.com
|
||||||
|
if did_urljoin_misbehave(root_url, relative_path, url):
|
||||||
|
url = fix_urljoin_bug(url)
|
||||||
|
|
||||||
for archivable_url in find_all_urls(url):
|
for archivable_url in find_all_urls(url):
|
||||||
yield Link(
|
yield Link(
|
||||||
|
@ -57,3 +68,74 @@ def parse_generic_html_export(html_file: IO[str], root_url: Optional[str]=None,
|
||||||
KEY = 'html'
|
KEY = 'html'
|
||||||
NAME = 'Generic HTML'
|
NAME = 'Generic HTML'
|
||||||
PARSER = parse_generic_html_export
|
PARSER = parse_generic_html_export
|
||||||
|
|
||||||
|
|
||||||
|
#### WORKAROUND CODE FOR https://github.com/python/cpython/issues/96015 ####
|
||||||
|
|
||||||
|
def did_urljoin_misbehave(root_url: str, relative_path: str, final_url: str) -> bool:
|
||||||
|
"""
|
||||||
|
Handle urljoin edge case bug where multiple slashes get turned into a single slash:
|
||||||
|
- https://github.com/python/cpython/issues/96015
|
||||||
|
- https://github.com/ArchiveBox/ArchiveBox/issues/1411
|
||||||
|
|
||||||
|
This workaround only fixes the most common case of a sub-URL inside an outer URL, e.g.:
|
||||||
|
https://web.archive.org/web/https://example.com/some/inner/url
|
||||||
|
|
||||||
|
But there are other valid URLs containing // that are not fixed by this workaround, e.g.:
|
||||||
|
https://example.com/drives/C//some/file
|
||||||
|
"""
|
||||||
|
|
||||||
|
# if relative path is actually an absolute url, cut off its own scheme so we check the path component only
|
||||||
|
relative_path = relative_path.lower()
|
||||||
|
if relative_path.startswith('http://') or relative_path.startswith('https://'):
|
||||||
|
relative_path = relative_path.split('://', 1)[-1]
|
||||||
|
|
||||||
|
# TODO: properly fix all double // getting stripped by urljoin, not just ://
|
||||||
|
original_path_had_suburl = '://' in relative_path
|
||||||
|
original_root_had_suburl = '://' in root_url[8:] # ignore first 8 chars because root always starts with https://
|
||||||
|
final_joined_has_suburl = '://' in final_url[8:] # ignore first 8 chars because final always starts with https://
|
||||||
|
|
||||||
|
urljoin_broke_suburls = (
|
||||||
|
(original_root_had_suburl or original_path_had_suburl)
|
||||||
|
and not final_joined_has_suburl
|
||||||
|
)
|
||||||
|
return urljoin_broke_suburls
|
||||||
|
|
||||||
|
|
||||||
|
def fix_urljoin_bug(url: str, nesting_limit=5):
|
||||||
|
"""
|
||||||
|
recursively replace broken suburls .../http:/... with .../http://...
|
||||||
|
|
||||||
|
basically equivalent to this for 99.9% of cases:
|
||||||
|
url = url.replace('/http:/', '/http://')
|
||||||
|
url = url.replace('/https:/', '/https://')
|
||||||
|
except this handles:
|
||||||
|
other schemes besides http/https (e.g. https://example.com/link/git+ssh://github.com/example)
|
||||||
|
other preceding separators besides / (e.g. https://example.com/login/?next=https://example.com/home)
|
||||||
|
fixing multiple suburls recursively
|
||||||
|
"""
|
||||||
|
input_url = url
|
||||||
|
for _ in range(nesting_limit):
|
||||||
|
url = re.sub(
|
||||||
|
r'(?P<root>.+?)' # https://web.archive.org/web
|
||||||
|
+ r'(?P<separator>[-=/_&+%$#@!*\(\\])' # /
|
||||||
|
+ r'(?P<subscheme>[a-zA-Z0-9+_-]{1,32}?):/' # http:/
|
||||||
|
+ r'(?P<suburl>[^/\\]+)', # example.com
|
||||||
|
r"\1\2\3://\4",
|
||||||
|
input_url,
|
||||||
|
re.IGNORECASE | re.UNICODE,
|
||||||
|
)
|
||||||
|
if url == input_url:
|
||||||
|
break # nothing left to replace, all suburls are fixed
|
||||||
|
input_url = url
|
||||||
|
|
||||||
|
return url
|
||||||
|
|
||||||
|
|
||||||
|
# sanity check to make sure workaround code works as expected and doesnt introduce *more* bugs
|
||||||
|
assert did_urljoin_misbehave('https://web.archive.org/web/https://example.com', 'abc.html', 'https://web.archive.org/web/https:/example.com/abc.html') == True
|
||||||
|
assert did_urljoin_misbehave('http://example.com', 'https://web.archive.org/web/http://example.com/abc.html', 'https://web.archive.org/web/http:/example.com/abc.html') == True
|
||||||
|
assert fix_urljoin_bug('https:/example.com') == 'https:/example.com' # should not modify original url's scheme, only sub-urls
|
||||||
|
assert fix_urljoin_bug('https://web.archive.org/web/https:/example.com/abc.html') == 'https://web.archive.org/web/https://example.com/abc.html'
|
||||||
|
assert fix_urljoin_bug('http://example.com/link/git+ssh:/github.com/example?next=ftp:/example.com') == 'http://example.com/link/git+ssh://github.com/example?next=ftp://example.com'
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue