mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-13 22:54:27 -04:00
finished manual link merging logic to fix folder conflicts
This commit is contained in:
parent
db26ab9aa9
commit
318b9ae1db
2 changed files with 113 additions and 39 deletions
27
links.py
27
links.py
|
@ -35,8 +35,9 @@ Link {
|
|||
from util import (
|
||||
domain,
|
||||
base_url,
|
||||
get_str_between,
|
||||
str_between,
|
||||
get_link_type,
|
||||
merge_links,
|
||||
)
|
||||
|
||||
|
||||
|
@ -89,30 +90,6 @@ def sorted_links(links):
|
|||
sort_func = lambda link: (link['timestamp'], link['url'])
|
||||
return sorted(links, key=sort_func, reverse=True)
|
||||
|
||||
|
||||
|
||||
def merge_links(a, b):
    """Deterministically merge two links into a single new link dict.

    Longer field values are favored over shorter ones, and "cleaner" titles
    (those not containing '://') are favored over worse ones.

    Both ``a`` and ``b`` must provide the keys 'url', 'title', 'tags' and
    'timestamp'; 'sources' is optional and treated as an empty list when
    missing. When two values tie in length (or the timestamps are equal),
    the value from ``b`` is used.

    Returns a new dict with the keys 'timestamp', 'url', 'domain',
    'base_url', 'tags', 'title', 'sources' and 'type'. Neither input dict
    is modified.
    """

    def longer(key):
        # Strictly-longer wins for `a`; ties fall through to `b`.
        return a[key] if len(a[key]) > len(b[key]) else b[key]

    def earlier(key):
        # NOTE(review): values are compared with `<` as-is; if timestamps are
        # strings this is a lexicographic comparison -- confirm they share a
        # common width/format.
        return a[key] if a[key] < b[key] else b[key]

    url = longer('url')
    longest_title = longer('title')
    # Fallback title used only when the longest title looks like a URL.
    cleanest_title = a['title'] if '://' not in a['title'] else b['title']
    link = {
        'timestamp': earlier('timestamp'),
        'url': url,
        'domain': domain(url),
        'base_url': base_url(url),
        'tags': longer('tags'),
        'title': longest_title if '://' not in longest_title else cleanest_title,
        # De-duplicate, then sort: plain set iteration order for strings can
        # differ between interpreter runs (hash randomization), which would
        # make the merged result non-deterministic.
        'sources': sorted(set(a.get('sources', []) + b.get('sources', []))),
    }
    # The link type is derived from the merged fields, so it is set last.
    link['type'] = get_link_type(link)
    return link
|
||||
|
||||
def links_after_timestamp(links, timestamp=None):
|
||||
if not timestamp:
|
||||
yield from links
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue