mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-14 07:04:27 -04:00
properly handle querystrings for wget .html appended links
This commit is contained in:
parent
6bb91fbb45
commit
2265f2aaf0
1 changed file with 8 additions and 2 deletions
10
parse.py
10
parse.py
|
@@ -175,16 +175,22 @@ def html_appended_url(link):
|
||||||
See docs on wget --adjust-extension."""
|
See docs on wget --adjust-extension."""
|
||||||
|
|
||||||
split_url = link['url'].split('#', 1)
|
split_url = link['url'].split('#', 1)
|
||||||
|
query = ('%3F' + link['url'].split('?', 1)[-1]) if '?' in link['url'] else ''
|
||||||
|
|
||||||
if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", split_url[0], re.I | re.M):
|
if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", split_url[0], re.I | re.M):
|
||||||
# already ends in .html
|
# already ends in .html
|
||||||
return link['base_url']
|
return link['base_url']
|
||||||
else:
|
else:
|
||||||
# .html needs to be appended
|
# .html needs to be appended
|
||||||
without_scheme = split_url[0].split('://', 1)[-1]
|
without_scheme = split_url[0].split('://', 1)[-1].split('?', 1)[0]
|
||||||
if without_scheme.endswith('/'):
|
if without_scheme.endswith('/'):
|
||||||
|
if query:
|
||||||
|
return '#'.join([without_scheme + 'index.html' + query + '.html', *split_url[1:]])
|
||||||
return '#'.join([without_scheme + 'index.html', *split_url[1:]])
|
return '#'.join([without_scheme + 'index.html', *split_url[1:]])
|
||||||
return '#'.join([without_scheme + '.html', *split_url[1:]])
|
else:
|
||||||
|
if query:
|
||||||
|
return '#'.join([without_scheme + 'index.html' + query + '.html', *split_url[1:]])
|
||||||
|
return '#'.join([without_scheme + '.html', *split_url[1:]])
|
||||||
|
|
||||||
|
|
||||||
def derived_link_info(link):
|
def derived_link_info(link):
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue