Merge pull request #264 from shakkhar/v0.4.3+

V0.4.3+
This commit is contained in:
Nick Sweeting 2019-09-25 01:36:51 -04:00 committed by GitHub
commit 374dd39d7a
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 3 additions and 44 deletions

View file

@ -59,6 +59,7 @@ class ArchiveResult:
} }
info['start_ts'] = parse_date(info['start_ts']) info['start_ts'] = parse_date(info['start_ts'])
info['end_ts'] = parse_date(info['end_ts']) info['end_ts'] = parse_date(info['end_ts'])
info['cmd_version'] = info.get('cmd_version')
return cls(**info) return cls(**info)
def to_dict(self, *keys) -> dict: def to_dict(self, *keys) -> dict:

View file

@ -10,6 +10,7 @@ from urllib.request import Request, urlopen
from urllib.parse import urlparse, quote, unquote from urllib.parse import urlparse, quote, unquote
from html import escape, unescape from html import escape, unescape
from datetime import datetime from datetime import datetime
from dateutil import parser as dateparser
from base32_crockford import encode as base32_encode # type: ignore from base32_crockford import encode as base32_encode # type: ignore
import json as pyjson import json as pyjson
@ -140,51 +141,8 @@ def parse_date(date: Any) -> Optional[datetime]:
date = str(date) date = str(date)
if isinstance(date, str): if isinstance(date, str):
if date.replace('.', '').isdigit(): return dateparser.parse(date)
# this is a brittle attempt at unix timestamp parsing (which is
# notoriously hard to do). It may lead to dates being off by
# anything from hours to decades, depending on which app, OS,
# and system time configuration was used for the original timestamp
# more info: https://github.com/pirate/ArchiveBox/issues/119
# Note: always always always store the original timestamp string
# somewhere independently of the parsed datetime, so that later
# bugs don't repeatedly misparse and rewrite increasingly worse dates.
# the correct date can always be re-derived from the timestamp str
timestamp = float(date)
EARLIEST_POSSIBLE = 473403600.0 # 1985
LATEST_POSSIBLE = 1735707600.0 # 2025
if EARLIEST_POSSIBLE < timestamp < LATEST_POSSIBLE:
# number is seconds
return datetime.fromtimestamp(timestamp)
elif EARLIEST_POSSIBLE * 1000 < timestamp < LATEST_POSSIBLE * 1000:
# number is milliseconds
return datetime.fromtimestamp(timestamp / 1000)
elif EARLIEST_POSSIBLE * 1000*1000 < timestamp < LATEST_POSSIBLE * 1000*1000:
# number is microseconds
return datetime.fromtimestamp(timestamp / (1000*1000))
else:
# continue to the end and raise a parsing failed error.
# we don't want to even attempt parsing timestamp strings that
# aren't within these ranges
pass
if '-' in date:
# 2019-04-07T05:44:39.227520
try:
return datetime.fromisoformat(date)
except Exception:
pass
try:
return datetime.strptime(date, '%Y-%m-%d %H:%M')
except Exception:
pass
raise ValueError('Tried to parse invalid date! {}'.format(date)) raise ValueError('Tried to parse invalid date! {}'.format(date))