Mirror of https://github.com/ArchiveBox/ArchiveBox.git
dedupe urls using exact url instead of fuzzy url
This commit is contained in:
parent 6e5a77e1ad
commit 1e759084f3

1 changed file with 3 additions and 4 deletions
@@ -149,11 +149,10 @@ def uniquefied_links(sorted_links: Iterable[Link]) -> Iterable[Link]:
     unique_urls: OrderedDict[str, Link] = OrderedDict()
 
     for link in sorted_links:
-        fuzzy = fuzzy_url(link.url)
-        if fuzzy in unique_urls:
+        if link.base_url in unique_urls:
             # merge with any other links that share the same url
-            link = merge_links(unique_urls[fuzzy], link)
-        unique_urls[fuzzy] = link
+            link = merge_links(unique_urls[link.base_url], link)
+        unique_urls[link.base_url] = link
 
     unique_timestamps: OrderedDict[str, Link] = OrderedDict()
     for link in unique_urls.values():
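For context on what this changes in practice, here is a minimal standalone sketch of the two keying strategies. The Link dataclass, merge_links, and fuzzy_url below are simplified stand-ins, not ArchiveBox's real implementations (the actual commit keys on link.base_url, which applies its own normalization); the assumed fuzzy_url drops the scheme, a leading "www.", and any trailing slash, which is the kind of normalization that caused distinct URLs to be merged before this change.

from collections import OrderedDict
from dataclasses import dataclass, field
from typing import Callable, Iterable

@dataclass
class Link:
    url: str
    tags: set = field(default_factory=set)

def merge_links(a: Link, b: Link) -> Link:
    # simplified stand-in: keep the first-seen url, union the rest of the metadata
    return Link(url=a.url, tags=a.tags | b.tags)

def fuzzy_url(url: str) -> str:
    # hypothetical simplification of fuzzy matching:
    # drop the scheme, a leading "www.", and any trailing slash
    url = url.split('://', 1)[-1]
    if url.startswith('www.'):
        url = url[len('www.'):]
    return url.rstrip('/')

def uniquefied_links(links: Iterable[Link], key: Callable[[str], str]) -> 'OrderedDict[str, Link]':
    # same loop shape as the patched function, parameterized on the dedupe key
    unique_urls: 'OrderedDict[str, Link]' = OrderedDict()
    for link in links:
        k = key(link.url)
        if k in unique_urls:
            # merge with any other links that share the same url
            link = merge_links(unique_urls[k], link)
        unique_urls[k] = link
    return unique_urls

links = [
    Link('https://example.com/page',     {'a'}),
    Link('http://www.example.com/page/', {'b'}),
]

print(len(uniquefied_links(links, fuzzy_url)))    # 1 -- old behavior: variants collapse
print(len(uniquefied_links(links, lambda u: u)))  # 2 -- new behavior: exact urls stay separate

With exact keys, two saves of the same page under http vs https, or with and without a trailing slash, are kept as separate entries instead of being merged into one.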