Dedupe URLs using exact URL instead of fuzzy URL

This commit is contained in:
Nick Sweeting 2019-04-16 23:19:44 -04:00
parent 6e5a77e1ad
commit 1e759084f3

View file

@@ -149,11 +149,10 @@ def uniquefied_links(sorted_links: Iterable[Link]) -> Iterable[Link]:
unique_urls: OrderedDict[str, Link] = OrderedDict()
for link in sorted_links:
fuzzy = fuzzy_url(link.url)
if fuzzy in unique_urls:
if link.base_url in unique_urls:
# merge with any other links that share the same url
link = merge_links(unique_urls[fuzzy], link)
unique_urls[fuzzy] = link
link = merge_links(unique_urls[link.base_url], link)
unique_urls[link.base_url] = link
unique_timestamps: OrderedDict[str, Link] = OrderedDict()
for link in unique_urls.values():