mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-13 06:34:25 -04:00
util.py: Use dateparser to parse date strings.
This commit is contained in:
parent
ca9c9ef956
commit
0bb216ce02
1 changed file with 2 additions and 44 deletions
|
@ -10,6 +10,7 @@ from urllib.request import Request, urlopen
|
||||||
from urllib.parse import urlparse, quote, unquote
|
from urllib.parse import urlparse, quote, unquote
|
||||||
from html import escape, unescape
|
from html import escape, unescape
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
from dateutil import parser as dateparser
|
||||||
|
|
||||||
from base32_crockford import encode as base32_encode # type: ignore
|
from base32_crockford import encode as base32_encode # type: ignore
|
||||||
import json as pyjson
|
import json as pyjson
|
||||||
|
@ -140,51 +141,8 @@ def parse_date(date: Any) -> Optional[datetime]:
|
||||||
date = str(date)
|
date = str(date)
|
||||||
|
|
||||||
if isinstance(date, str):
|
if isinstance(date, str):
|
||||||
if date.replace('.', '').isdigit():
|
return dateparser.parse(date)
|
||||||
# this is a brittle attempt at unix timestamp parsing (which is
|
|
||||||
# notoriously hard to do). It may lead to dates being off by
|
|
||||||
# anything from hours to decades, depending on which app, OS,
|
|
||||||
# and system time configuration was used for the original timestamp
|
|
||||||
# more info: https://github.com/pirate/ArchiveBox/issues/119
|
|
||||||
|
|
||||||
# Note: always always always store the original timestamp string
|
|
||||||
# somewhere independently of the parsed datetime, so that later
|
|
||||||
# bugs don't repeatedly misparse and rewrite increasingly worse dates.
|
|
||||||
# the correct date can always be re-derived from the timestamp str
|
|
||||||
timestamp = float(date)
|
|
||||||
|
|
||||||
EARLIEST_POSSIBLE = 473403600.0 # 1985
|
|
||||||
LATEST_POSSIBLE = 1735707600.0 # 2025
|
|
||||||
|
|
||||||
if EARLIEST_POSSIBLE < timestamp < LATEST_POSSIBLE:
|
|
||||||
# number is seconds
|
|
||||||
return datetime.fromtimestamp(timestamp)
|
|
||||||
|
|
||||||
elif EARLIEST_POSSIBLE * 1000 < timestamp < LATEST_POSSIBLE * 1000:
|
|
||||||
# number is milliseconds
|
|
||||||
return datetime.fromtimestamp(timestamp / 1000)
|
|
||||||
|
|
||||||
elif EARLIEST_POSSIBLE * 1000*1000 < timestamp < LATEST_POSSIBLE * 1000*1000:
|
|
||||||
# number is microseconds
|
|
||||||
return datetime.fromtimestamp(timestamp / (1000*1000))
|
|
||||||
|
|
||||||
else:
|
|
||||||
# continue to the end and raise a parsing failed error.
|
|
||||||
# we don't want to even attempt parsing timestamp strings that
|
|
||||||
# aren't within these ranges
|
|
||||||
pass
|
|
||||||
|
|
||||||
if '-' in date:
|
|
||||||
# 2019-04-07T05:44:39.227520
|
|
||||||
try:
|
|
||||||
return datetime.fromisoformat(date)
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
try:
|
|
||||||
return datetime.strptime(date, '%Y-%m-%d %H:%M')
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
|
|
||||||
raise ValueError('Tried to parse invalid date! {}'.format(date))
|
raise ValueError('Tried to parse invalid date! {}'.format(date))
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue