From 1e759084f3bb4fb7545d9819e190b746826e3739 Mon Sep 17 00:00:00 2001
From: Nick Sweeting
Date: Tue, 16 Apr 2019 23:19:44 -0400
Subject: [PATCH] dedupe urls using exact url instead of fuzzy url

---
 archivebox/legacy/index.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/archivebox/legacy/index.py b/archivebox/legacy/index.py
index 20fb0dc9..c76da968 100644
--- a/archivebox/legacy/index.py
+++ b/archivebox/legacy/index.py
@@ -149,11 +149,10 @@ def uniquefied_links(sorted_links: Iterable[Link]) -> Iterable[Link]:
     unique_urls: OrderedDict[str, Link] = OrderedDict()
 
     for link in sorted_links:
-        fuzzy = fuzzy_url(link.url)
-        if fuzzy in unique_urls:
+        if link.base_url in unique_urls:
             # merge with any other links that share the same url
-            link = merge_links(unique_urls[fuzzy], link)
-        unique_urls[fuzzy] = link
+            link = merge_links(unique_urls[link.base_url], link)
+        unique_urls[link.base_url] = link
 
     unique_timestamps: OrderedDict[str, Link] = OrderedDict()
     for link in unique_urls.values():
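
For readers skimming the diff, here is a minimal runnable sketch of how the `uniquefied_links` loop behaves after this change. The `Link`, `merge_links`, and `base_url` definitions below are simplified stand-ins for the real ones in `archivebox/legacy` (in particular, `base_url` is assumed here to strip only the URL scheme, while the removed `fuzzy_url` presumably collapsed more URL variants); this is a sketch of the dedupe technique, not ArchiveBox's actual implementation.

```python
from collections import OrderedDict
from dataclasses import dataclass, replace
from typing import Iterable

@dataclass(frozen=True)
class Link:
    url: str
    timestamp: str

    @property
    def base_url(self) -> str:
        # assumption: the real base_url strips only the scheme,
        # so http/https duplicates still collapse to one key
        return self.url.split('://', 1)[-1]

def merge_links(a: Link, b: Link) -> Link:
    # simplified merge: keep the earliest timestamp
    # (the real merge_links also reconciles other Link fields)
    return replace(a, timestamp=min(a.timestamp, b.timestamp))

def uniquefied_links(sorted_links: Iterable[Link]) -> Iterable[Link]:
    unique_urls: 'OrderedDict[str, Link]' = OrderedDict()
    for link in sorted_links:
        if link.base_url in unique_urls:
            # merge with any other links that share the same url
            link = merge_links(unique_urls[link.base_url], link)
        unique_urls[link.base_url] = link
    return unique_urls.values()

links = [
    Link('https://example.com/page',  '1555470000'),
    Link('http://example.com/page',   '1555470001'),  # same base_url: merged
    Link('https://example.com/page/', '1555470002'),  # exact key differs: kept
]
print([l.url for l in uniquefied_links(links)])
# ['https://example.com/page', 'https://example.com/page/']
```

The upshot of keying the dedupe dict on `link.base_url` instead of `fuzzy_url(link.url)` is that only URLs sharing the same exact key are merged; looser variants that the fuzzy key used to conflate are now kept as separate entries.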