From 0272c9b8c0b6c2d3230d98f8e6371035d18c4088 Mon Sep 17 00:00:00 2001
From: Nick Sweeting
Date: Thu, 11 Apr 2019 03:41:25 -0400
Subject: [PATCH] deduplicate method history when merging links

---
 archivebox/legacy/index.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/archivebox/legacy/index.py b/archivebox/legacy/index.py
index a28192b2..5edde1b6 100644
--- a/archivebox/legacy/index.py
+++ b/archivebox/legacy/index.py
@@ -28,6 +28,7 @@ from .util import (
     TimedProgress,
     copy_and_overwrite,
     atomic_write,
+    ExtendedEncoder,
 )
 from .parse import parse_links
 from .logs import (
@@ -93,6 +94,16 @@ def merge_links(a: Link, b: Link) -> Link:
         method: (a.history.get(method) or []) + (b.history.get(method) or [])
         for method in all_methods
     }
+    for method in all_methods:
+        deduped_jsons = {
+            json.dumps(result, sort_keys=True, cls=ExtendedEncoder)
+            for result in history[method]
+        }
+        history[method] = list(reversed(sorted(
+            (ArchiveResult.from_json(json.loads(result)) for result in deduped_jsons),
+            key=lambda result: result.start_ts,
+        )))
+
 
     return Link(
         url=url,