Fix archive.org response header parsing to match header names case-insensitively

This commit is contained in:
Nick Sweeting 2018-04-25 03:49:26 -04:00
parent 31476fe21b
commit a532d11549

View file

@@ -2,6 +2,7 @@ import os
import sys import sys
from functools import wraps from functools import wraps
from collections import defaultdict
from datetime import datetime from datetime import datetime
from subprocess import run, PIPE, DEVNULL from subprocess import run, PIPE, DEVNULL
@@ -308,20 +309,27 @@ def archive_dot_org(link_dir, link, timeout=TIMEOUT):
end() end()
# Parse archive.org response headers # Parse archive.org response headers
headers = result.stdout.splitlines() headers = defaultdict(list)
content_location = [h for h in headers if b'content-location: ' in h]
errors = [h for h in headers if h and b'X-Archive-Wayback-Runtime-Error: ' in h] # lowercase all the header names and store in dict
for header in result.stdout.splitlines():
if b':' not in header or not header.strip():
continue
name, val = header.decode().split(':', 1)
headers[name.lower().strip()].append(val.strip())
# Get successful archive url in "content-location" header or any errors
content_location = headers['content-location']
errors = headers['x-archive-wayback-runtime-error']
if content_location: if content_location:
archive_path = content_location[0].split(b'content-location: ', 1)[-1].decode('utf-8') saved_url = 'https://web.archive.org{}'.format(content_location[0])
saved_url = 'https://web.archive.org{}'.format(archive_path)
success = True success = True
elif len(errors) == 1 and 'RobotAccessControlException' in errors[0]:
elif len(errors) == 1 and b'RobotAccessControlException' in errors[0]:
output = submit_url output = submit_url
# raise Exception('Archive.org denied by {}/robots.txt'.format(link['domain'])) # raise Exception('Archive.org denied by {}/robots.txt'.format(link['domain']))
elif errors: elif errors:
raise Exception(', '.join(e.decode() for e in errors)) raise Exception(', '.join(errors))
else: else:
raise Exception('Failed to find "content-location" URL header in Archive.org response.') raise Exception('Failed to find "content-location" URL header in Archive.org response.')
except Exception as e: except Exception as e: