Mirror of https://github.com/ArchiveBox/ArchiveBox.git, synced 2025-05-14 15:14:31 -04:00
commit 73f46b0b29
parent 35c05c321f

    add proper typechecked json parsing and dumping

3 changed files with 75 additions and 22 deletions
@@ -121,18 +121,12 @@ def write_json_links_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:
 
 def parse_json_links_index(out_dir: str=OUTPUT_DIR) -> Iterator[Link]:
     """parse a archive index json file and return the list of links"""
 
-    allowed_fields = {f.name for f in fields(Link)}
-
     index_path = os.path.join(out_dir, 'index.json')
     if os.path.exists(index_path):
         with open(index_path, 'r', encoding='utf-8') as f:
             links = json.load(f)['links']
             for link_json in links:
-                yield Link(**{
-                    key: val
-                    for key, val in link_json.items()
-                    if key in allowed_fields
-                })
+                yield Link.from_json(link_json)
 
     return ()
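The inline allowed_fields filtering deleted here moves into Link.from_json (added further down in this commit). A minimal self-contained sketch of that idiom, with a hypothetical Row dataclass standing in for Link:

from dataclasses import dataclass, fields

@dataclass(frozen=True)
class Row:  # hypothetical stand-in for Link
    url: str
    timestamp: str

# keys not declared as dataclass fields are silently dropped
raw = {'url': 'https://example.com', 'timestamp': '1554344321', 'junk': 1}
allowed_fields = {f.name for f in fields(Row)}
row = Row(**{key: val for key, val in raw.items() if key in allowed_fields})
print(row)  # Row(url='https://example.com', timestamp='1554344321')

Filtering against dataclasses.fields keeps old index.json files loadable even when they carry keys a newer schema no longer declares.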
@@ -4,7 +4,7 @@ from datetime import datetime
 
 from typing import List, Dict, Any, Optional, Union
 
-from dataclasses import dataclass, asdict, field
+from dataclasses import dataclass, asdict, field, fields
 
 
 class ArchiveError(Exception):
@@ -28,11 +28,38 @@ class ArchiveResult:
     schema: str = 'ArchiveResult'
 
     def __post_init__(self):
-        assert self.schema == self.__class__.__name__
+        self.typecheck()
 
     def _asdict(self):
         return asdict(self)
 
+    def typecheck(self) -> None:
+        assert self.schema == self.__class__.__name__
+        assert isinstance(self.status, str) and self.status
+        assert isinstance(self.start_ts, datetime)
+        assert isinstance(self.end_ts, datetime)
+        assert isinstance(self.cmd, list)
+        assert all(isinstance(arg, str) and arg for arg in self.cmd)
+        assert self.pwd is None or isinstance(self.pwd, str) and self.pwd
+        assert self.cmd_version is None or isinstance(self.cmd_version, str) and self.cmd_version
+        assert self.output is None or isinstance(self.output, (str, Exception))
+        if isinstance(self.output, str):
+            assert self.output
+
+    @classmethod
+    def from_json(cls, json_info):
+        from .util import parse_date
+
+        allowed_fields = {f.name for f in fields(cls)}
+        info = {
+            key: val
+            for key, val in json_info.items()
+            if key in allowed_fields
+        }
+        info['start_ts'] = parse_date(info['start_ts'])
+        info['end_ts'] = parse_date(info['end_ts'])
+        return cls(**info)
+
     @property
     def duration(self) -> int:
         return (self.end_ts - self.start_ts).seconds
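The pattern this hunk introduces (validate every field in an explicit typecheck method called from __post_init__) can be exercised in isolation. A minimal sketch, with a hypothetical MiniResult carrying only a few of ArchiveResult's fields:

from dataclasses import dataclass
from datetime import datetime
from typing import List, Optional

@dataclass(frozen=True)
class MiniResult:  # hypothetical stand-in for ArchiveResult
    status: str
    start_ts: datetime
    end_ts: datetime
    cmd: List[str]
    output: Optional[str] = None
    schema: str = 'MiniResult'

    def __post_init__(self):
        self.typecheck()

    def typecheck(self) -> None:
        assert self.schema == self.__class__.__name__
        assert isinstance(self.status, str) and self.status
        assert isinstance(self.start_ts, datetime)
        assert isinstance(self.end_ts, datetime)
        assert all(isinstance(arg, str) and arg for arg in self.cmd)

MiniResult('succeeded', datetime.now(), datetime.now(), ['wget'])
try:
    MiniResult('', datetime.now(), datetime.now(), ['wget'])  # empty status
except AssertionError:
    print('typecheck rejected the bad record')

Because __post_init__ runs on every construction, a record that fails its asserts can never exist, which is what lets from_json trust the data it re-hydrates.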
@@ -49,17 +76,7 @@ class Link:
     schema: str = 'Link'
 
     def __post_init__(self):
-        """fix any history result items to be type-checked ArchiveResults"""
-        assert self.schema == self.__class__.__name__
-        cast_history = {}
-        for method, method_history in self.history.items():
-            cast_history[method] = []
-            for result in method_history:
-                if isinstance(result, dict):
-                    result = ArchiveResult(**result)
-                cast_history[method].append(result)
-
-        object.__setattr__(self, 'history', cast_history)
+        self.typecheck()
 
     def overwrite(self, **kwargs):
         """pure functional version of dict.update that returns a new instance"""
@@ -76,6 +93,22 @@ class Link:
         if not self.timestamp or not other.timestamp:
             return
         return float(self.timestamp) > float(other.timestamp)
 
+    def typecheck(self) -> None:
+        assert self.schema == self.__class__.__name__
+        assert isinstance(self.timestamp, str) and self.timestamp
+        assert self.timestamp.replace('.', '').isdigit()
+        assert isinstance(self.url, str) and '://' in self.url
+        assert self.updated is None or isinstance(self.updated, datetime)
+        assert self.title is None or isinstance(self.title, str) and self.title
+        assert self.tags is None or isinstance(self.tags, str) and self.tags
+        assert isinstance(self.sources, list)
+        assert all(isinstance(source, str) and source for source in self.sources)
+        assert isinstance(self.history, dict)
+        for method, results in self.history.items():
+            assert isinstance(method, str) and method
+            assert isinstance(results, list)
+            assert all(isinstance(result, ArchiveResult) for result in results)
+
     def _asdict(self, extended=False):
         info = {
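One note on the timestamp assert added above: stripping dots with replace('.', '') and then calling isdigit() accepts integer and fractional epoch strings while rejecting dates and other punctuation, though a string with several dots would also pass, so this is a sanity check rather than a full parser. A standalone reproduction, with valid_timestamp as a hypothetical helper:

def valid_timestamp(ts) -> bool:
    # mirrors: isinstance(ts, str) and ts and ts.replace('.', '').isdigit()
    return isinstance(ts, str) and bool(ts) and ts.replace('.', '').isdigit()

print(valid_timestamp('1554344321'))     # True: integer epoch string
print(valid_timestamp('1554344321.25'))  # True: fractional epoch string
print(valid_timestamp('2019-04-03'))     # False: dashes are not digits
print(valid_timestamp(1554344321))       # False: must be a str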
@@ -108,6 +141,32 @@ class Link:
         })
         return info
 
+    @classmethod
+    def from_json(cls, json_info):
+        from .util import parse_date
+
+        allowed_fields = {f.name for f in fields(cls)}
+        info = {
+            key: val
+            for key, val in json_info.items()
+            if key in allowed_fields
+        }
+        info['updated'] = parse_date(info['updated'])
+
+        json_history = info['history']
+        cast_history = {}
+
+        for method, method_history in json_history.items():
+            cast_history[method] = []
+            for json_result in method_history:
+                assert isinstance(json_result, dict), 'Items in Link["history"][method] must be dicts'
+                cast_result = ArchiveResult.from_json(json_result)
+                cast_history[method].append(cast_result)
+
+        info['history'] = cast_history
+        return cls(**info)
+
+
     @property
     def link_dir(self) -> str:
         from .config import ARCHIVE_DIR
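This hunk is also where the dict-to-ArchiveResult casting removed from Link.__post_init__ earlier now lives. A self-contained sketch of the casting loop, with a plain tuple standing in for what ArchiveResult.from_json would return:

json_history = {
    'wget': [{'status': 'succeeded'}, {'status': 'failed'}],
    'screenshot': [{'status': 'skipped'}],
}

cast_history = {}
for method, method_history in json_history.items():
    cast_history[method] = []
    for json_result in method_history:
        assert isinstance(json_result, dict), 'Items in history[method] must be dicts'
        # the real code calls ArchiveResult.from_json(json_result) here
        cast_history[method].append(('ArchiveResult', json_result['status']))

print(cast_history['wget'])  # [('ArchiveResult', 'succeeded'), ('ArchiveResult', 'failed')]

Doing the casting during parsing rather than in __post_init__ means a constructed Link always holds real ArchiveResult objects, which the new Link.typecheck asserts.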
@@ -675,8 +675,8 @@ class ExtendedEncoder(JSONEncoder):
         return JSONEncoder.default(self, obj)
 
 
-def atomic_write(contents: Union[dict, str], path: str):
-    """Safe atomic file write and swap using a tmp file"""
+def atomic_write(contents: Union[dict, str], path: str) -> None:
+    """Safe atomic write to filesystem by writing to temp file + atomic rename"""
     try:
         tmp_file = '{}.tmp'.format(path)
         with open(tmp_file, 'w+', encoding='utf-8') as f:
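For context on the function whose signature and docstring change here, a sketch of the temp-file-plus-rename pattern the new docstring describes. This is an illustration, not the repo's implementation: os.replace is atomic on POSIX when source and destination are on the same filesystem, and the real util.py presumably serializes dicts through its ExtendedEncoder:

import json
import os

def atomic_write_sketch(contents, path: str) -> None:
    """Write to a temp file, then atomically rename it into place."""
    tmp_file = '{}.tmp'.format(path)
    try:
        with open(tmp_file, 'w+', encoding='utf-8') as f:
            if isinstance(contents, dict):
                json.dump(contents, f, indent=4)  # real code likely passes cls=ExtendedEncoder
            else:
                f.write(contents)
        os.replace(tmp_file, path)  # readers see either the old file or the new one, never a partial write
    finally:
        if os.path.exists(tmp_file):
            os.remove(tmp_file)  # clean up only if the rename never happened

atomic_write_sketch({'links': []}, 'example.json')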