Mirror of https://github.com/ArchiveBox/ArchiveBox.git (synced 2025-05-14 07:04:27 -04:00)
Commit b7cae4f72e: fix wget_output_path urlencoding
Parent: 64e6eb5f7b
3 changed files with 36 additions and 16 deletions
@@ -7,7 +7,7 @@ from subprocess import run, PIPE, DEVNULL
 
 from peekable import Peekable
 
-from index import html_appended_url, parse_json_link_index, write_link_index
+from index import wget_output_path, parse_json_link_index, write_link_index
 from links import links_after_timestamp
 from config import (
     ARCHIVE_DIR,
@@ -182,8 +182,9 @@ def attach_result_to_link(method):
 def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, timeout=TIMEOUT):
     """download full site using wget"""
 
-    if os.path.exists(os.path.join(link_dir, link['domain'])):
-        return {'output': html_appended_url(link), 'status': 'skipped'}
+    domain_dir = os.path.join(link_dir, link['domain'])
+    if os.path.exists(domain_dir):
+        return {'output': wget_output_path(link, look_in=domain_dir), 'status': 'skipped'}
 
     CMD = [
         # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
@@ -220,7 +221,7 @@ def fetch_pdf(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DI
     """print PDF of site to file using chrome --headless"""
 
     if link['type'] in ('PDF', 'image'):
-        return {'output': html_appended_url(link)}
+        return {'output': wget_output_path(link)}
 
     if os.path.exists(os.path.join(link_dir, 'output.pdf')):
         return {'output': 'output.pdf', 'status': 'skipped'}
@@ -256,7 +257,7 @@ def fetch_screenshot(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_
     """take screenshot of site using chrome --headless"""
 
     if link['type'] in ('PDF', 'image'):
-        return {'output': html_appended_url(link)}
+        return {'output': wget_output_path(link)}
 
     if os.path.exists(os.path.join(link_dir, 'screenshot.png')):
         return {'output': 'screenshot.png', 'status': 'skipped'}
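
Note on the fetch_wget change above: with wget --adjust-extension (-E), the on-disk filename can differ from the URL path (a .html suffix gets appended to text/html downloads), so the skip path now asks wget_output_path to inspect the folder wget actually wrote to. A minimal usage sketch, with hypothetical link values and folder names:

    import os
    from util import wget_output_path  # helper renamed/extended in this commit

    # Hypothetical link record and output folder, for illustration only:
    link = {
        'url': 'https://example.com/post/123',
        'base_url': 'example.com/post/123',
        'domain': 'example.com',
        'type': None,
        'timestamp': '1234567890',
    }
    link_dir = 'html/archive/1234567890'
    domain_dir = os.path.join(link_dir, link['domain'])

    if os.path.exists(domain_dir):
        # look_in makes wget_output_path list this folder for the .html file
        # wget actually produced (e.g. 123.html) instead of guessing from the URL
        print({'output': wget_output_path(link, look_in=domain_dir), 'status': 'skipped'})
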
index.py (3 lines changed)
@@ -14,10 +14,11 @@ from config import (
     ARCHIVE_DIR,
     ANSI,
     GIT_SHA,
+    FOOTER_INFO,
 )
 from util import (
     chmod_file,
-    html_appended_url,
+    wget_output_path,
     derived_link_info,
 )
 
util.py (38 lines changed)
@@ -8,10 +8,12 @@ import requests
 from datetime import datetime
 from subprocess import run, PIPE, DEVNULL
 from multiprocessing import Process
+from urllib.parse import quote
 
 from config import (
     IS_TTY,
     ARCHIVE_PERMISSIONS,
+    HTML_FOLDER,
     ARCHIVE_DIR,
     TIMEOUT,
     TERM_WIDTH,
@@ -394,35 +396,51 @@ def cleanup_archive(archive_path, links):
         print(' '+ '\n '.join(unmatched))
 
 
-def html_appended_url(link):
+def wget_output_path(link, look_in=None):
     """calculate the path to the wgetted .html file, since wget may
     adjust some paths to be different than the base_url path.
 
-    See docs on wget --adjust-extension.
+    See docs on wget --adjust-extension (-E)
     """
 
     if link['type'] in ('PDF', 'image'):
-        return link['base_url']
+        return quote(link['base_url'])
 
+    # Since the wget algorithm for -E (appending .html) is incredibly complex,
+    # instead of trying to emulate it here, we just look in the output folder
+    # to see what html file wget actually created as the output
+    wget_folder = link['base_url'].rsplit('/', 1)[0]
+    look_in = look_in or os.path.join(HTML_FOLDER, 'archive', link['timestamp'], wget_folder)
+
+    if look_in and os.path.exists(look_in):
+        html_files = [
+            f for f in os.listdir(look_in)
+            if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", f, re.I | re.M)
+        ]
+        if html_files:
+            return quote(os.path.join(wget_folder, html_files[0]))
+
+    # If finding the actual output file didn't work, fall back to the buggy
+    # implementation of the wget .html appending algorithm
     split_url = link['url'].split('#', 1)
     query = ('%3F' + link['url'].split('?', 1)[-1]) if '?' in link['url'] else ''
 
     if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", split_url[0], re.I | re.M):
         # already ends in .html
-        return link['base_url']
+        return quote(link['base_url'])
     else:
         # .html needs to be appended
         without_scheme = split_url[0].split('://', 1)[-1].split('?', 1)[0]
         if without_scheme.endswith('/'):
             if query:
-                return '#'.join([without_scheme + 'index.html' + query + '.html', *split_url[1:]])
-            return '#'.join([without_scheme + 'index.html', *split_url[1:]])
+                return quote('#'.join([without_scheme + 'index.html' + query + '.html', *split_url[1:]]))
+            return quote('#'.join([without_scheme + 'index.html', *split_url[1:]]))
         else:
             if query:
-                return '#'.join([without_scheme + '/index.html' + query + '.html', *split_url[1:]])
+                return quote('#'.join([without_scheme + '/index.html' + query + '.html', *split_url[1:]]))
             elif '/' in without_scheme:
-                return '#'.join([without_scheme + '.html', *split_url[1:]])
-            return link['base_url'] + '/index.html'
+                return quote('#'.join([without_scheme + '.html', *split_url[1:]]))
+            return quote(link['base_url'] + '/index.html')
 
 
 def derived_link_info(link):
@@ -434,7 +452,7 @@ def derived_link_info(link):
         'google_favicon_url': 'https://www.google.com/s2/favicons?domain={domain}'.format(**link),
         'favicon_url': 'archive/{timestamp}/favicon.ico'.format(**link),
         'files_url': 'archive/{timestamp}/index.html'.format(**link),
-        'archive_url': 'archive/{}/{}'.format(link['timestamp'], html_appended_url(link)),
+        'archive_url': 'archive/{}/{}'.format(link['timestamp'], wget_output_path(link)),
         'pdf_link': 'archive/{timestamp}/output.pdf'.format(**link),
         'screenshot_link': 'archive/{timestamp}/screenshot.png'.format(**link),
         'archive_org_url': 'https://web.archive.org/web/{base_url}'.format(**link),
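
The derived_link_info change is where the encoding fix becomes visible in the generated index: the value from wget_output_path is interpolated straight into an archive/... href. A hedged example with made-up values:

    from urllib.parse import quote

    # Made-up values; wget can leave literal spaces and '?' in filenames on disk,
    # so the path must be percent-encoded before being used in an href:
    timestamp = '1234567890'
    output_path = quote('example.com/some page?id=1.html')
    archive_url = 'archive/{}/{}'.format(timestamp, output_path)
    print(archive_url)
    # archive/1234567890/example.com/some%20page%3Fid%3D1.html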