mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-14 07:04:27 -04:00
fix archive.org header case parsing
This commit is contained in:
parent
31476fe21b
commit
a532d11549
1 changed file with 16 additions and 8 deletions
|
@@ -2,6 +2,7 @@ import os
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
from functools import wraps
|
from functools import wraps
|
||||||
|
from collections import defaultdict
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from subprocess import run, PIPE, DEVNULL
|
from subprocess import run, PIPE, DEVNULL
|
||||||
|
|
||||||
|
@@ -308,20 +309,27 @@ def archive_dot_org(link_dir, link, timeout=TIMEOUT):
|
||||||
end()
|
end()
|
||||||
|
|
||||||
# Parse archive.org response headers
|
# Parse archive.org response headers
|
||||||
headers = result.stdout.splitlines()
|
headers = defaultdict(list)
|
||||||
content_location = [h for h in headers if b'content-location: ' in h]
|
|
||||||
errors = [h for h in headers if h and b'X-Archive-Wayback-Runtime-Error: ' in h]
|
# lowercase all the header names and store in dict
|
||||||
|
for header in result.stdout.splitlines():
|
||||||
|
if b':' not in header or not header.strip():
|
||||||
|
continue
|
||||||
|
name, val = header.decode().split(':', 1)
|
||||||
|
headers[name.lower().strip()].append(val.strip())
|
||||||
|
|
||||||
|
# Get successful archive url in "content-location" header or any errors
|
||||||
|
content_location = headers['content-location']
|
||||||
|
errors = headers['x-archive-wayback-runtime-error']
|
||||||
|
|
||||||
if content_location:
|
if content_location:
|
||||||
archive_path = content_location[0].split(b'content-location: ', 1)[-1].decode('utf-8')
|
saved_url = 'https://web.archive.org{}'.format(content_location[0])
|
||||||
saved_url = 'https://web.archive.org{}'.format(archive_path)
|
|
||||||
success = True
|
success = True
|
||||||
|
elif len(errors) == 1 and 'RobotAccessControlException' in errors[0]:
|
||||||
elif len(errors) == 1 and b'RobotAccessControlException' in errors[0]:
|
|
||||||
output = submit_url
|
output = submit_url
|
||||||
# raise Exception('Archive.org denied by {}/robots.txt'.format(link['domain']))
|
# raise Exception('Archive.org denied by {}/robots.txt'.format(link['domain']))
|
||||||
elif errors:
|
elif errors:
|
||||||
raise Exception(', '.join(e.decode() for e in errors))
|
raise Exception(', '.join(errors))
|
||||||
else:
|
else:
|
||||||
raise Exception('Failed to find "content-location" URL header in Archive.org response.')
|
raise Exception('Failed to find "content-location" URL header in Archive.org response.')
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue