mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-13 22:54:27 -04:00
dedupe urls using exact url instead of fuzzy url
This commit is contained in:
parent
6e5a77e1ad
commit
1e759084f3
1 changed file with 3 additions and 4 deletions
|
@@ -149,11 +149,10 @@ def uniquefied_links(sorted_links: Iterable[Link]) -> Iterable[Link]:
|
|||
unique_urls: OrderedDict[str, Link] = OrderedDict()
|
||||
|
||||
for link in sorted_links:
|
||||
fuzzy = fuzzy_url(link.url)
|
||||
if fuzzy in unique_urls:
|
||||
if link.base_url in unique_urls:
|
||||
# merge with any other links that share the same url
|
||||
link = merge_links(unique_urls[fuzzy], link)
|
||||
unique_urls[fuzzy] = link
|
||||
link = merge_links(unique_urls[link.base_url], link)
|
||||
unique_urls[link.base_url] = link
|
||||
|
||||
unique_timestamps: OrderedDict[str, Link] = OrderedDict()
|
||||
for link in unique_urls.values():
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue