mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-15 07:34:27 -04:00
Optionally import only new links
When periodically importing a huge list of links (for example, a big dump of links from a bookmark service) that contains a lot of broken links, these links will always be rechecked on every import. To skip this, the environment variable ONLY_NEW can be set to import only new links and skip the rest altogether. This partially fixes #95.
This commit is contained in:
parent
bf6e8f03e4
commit
69c007ce85
4 changed files with 37 additions and 4 deletions
|
@ -74,6 +74,21 @@ def validate_links(links):
|
|||
|
||||
return list(links)
|
||||
|
||||
def new_links(imported_links, existing_links):
    """
    Return all links which are in the imported_links but not in the existing_links.

    This is used to determine which links are new and not indexed yet. Set the
    ONLY_NEW environment variable to activate this filter mechanism.

    Links are compared by their 'url' value only. The order of imported_links
    is preserved, and duplicates within imported_links are kept as-is.
    """
    # Collect the known URLs once so each membership test is O(1) instead of
    # rescanning existing_links for every imported link. This also makes the
    # function safe when existing_links is a one-shot iterator.
    existing_urls = {e_link['url'] for e_link in existing_links}
    return [
        i_link for i_link in imported_links
        if i_link['url'] not in existing_urls
    ]
|
||||
|
||||
def archivable_links(links):
|
||||
"""remove chrome://, about:// or other schemed links that cant be archived"""
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue