From 69c007ce8536db173b4ba367236a811848464808 Mon Sep 17 00:00:00 2001
From: Aaron Fischer
Date: Fri, 19 Oct 2018 21:28:38 +0200
Subject: [PATCH 1/4] Optionally import only new links

When a huge list of links is imported periodically (for example from a big
dump of links from a bookmark service) and contains a lot of broken links,
these links are rechecked on every run. To skip this, the ONLY_NEW
environment variable can be used to import only new links and ignore the
rest altogether.

This partially fixes #95.
---
 README.md           |  6 ++++++
 archiver/archive.py | 19 +++++++++++++++----
 archiver/config.py  |  1 +
 archiver/links.py   | 15 +++++++++++++++
 4 files changed, 37 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 6b05a0b3..5f159e08 100644
--- a/README.md
+++ b/README.md
@@ -140,6 +140,11 @@ You can run it in parallel by using the `resume` feature, or by manually splitti
 ```
 Users have reported running it with 50k+ bookmarks with success (though it will take more RAM while running).
 
+If you have already imported a huge list of bookmarks and want to import only
+new bookmarks, you can use the `ONLY_NEW` environment variable. This is useful
+if you import a bookmark dump periodically and want to skip broken links that
+are already in the index.
+
 ## Configuration
 
 You can tweak parameters via environment variables, or by editing `config.py` directly:
@@ -158,6 +163,7 @@ env CHROME_BINARY=google-chrome-stable RESOLUTION=1440,900 FETCH_PDF=False ./arc
 
 **Archive Options:**
 - maximum allowed download time per link: `TIMEOUT` values: [`60`]/`30`/`...`
+- import only new links: `ONLY_NEW` values: `True`/[`False`]
 - archive methods (values: [`True`]/`False`):
   - fetch page with wget: `FETCH_WGET`
   - fetch images/css/js with wget: `FETCH_WGET_REQUISITES` (True is highly recommended)
diff --git a/archiver/archive.py b/archiver/archive.py
index c795b316..b3384bc7 100755
--- a/archiver/archive.py
+++ b/archiver/archive.py
@@ -10,7 +10,10 @@ from datetime import datetime
 from subprocess import run
 
 from parse import parse_links
-from links import validate_links
+from links import (
+    new_links,
+    validate_links
+)
 from archive_methods import archive_links, _RESULTS_TOTALS
 from index import (
     write_links_index,
@@ -19,6 +22,7 @@ from index import (
     parse_json_link_index,
 )
 from config import (
+    ONLY_NEW,
     OUTPUT_PERMISSIONS,
     OUTPUT_DIR,
     ANSI,
@@ -45,7 +49,7 @@ def print_help():
     print("    ./bin/bookmark-archiver ~/Downloads/bookmarks_export.html\n")
 
 
-def merge_links(archive_path=OUTPUT_DIR, import_path=None):
+def merge_links(archive_path=OUTPUT_DIR, import_path=None, only_new=False):
     """get new links from file and optionally append them to links in existing archive"""
     all_links = []
     if import_path:
@@ -76,6 +80,9 @@ def merge_links(archive_path=OUTPUT_DIR, import_path=None):
     #     **ANSI,
     # ))
 
+    if only_new:
+        return new_links(all_links, existing_links)
+
     return all_links
 
 def update_archive(archive_path, links, source=None, resume=None, append=True):
@@ -158,7 +165,7 @@ if __name__ == '__main__':
         source = download_url(source)
 
     # Step 1: Parse the links and dedupe them with existing archive
-    links = merge_links(archive_path=out_dir, import_path=source)
+    links = merge_links(archive_path=out_dir, import_path=source, only_new=False)
 
     # Step 2: Write new index
     write_links_index(out_dir=out_dir, links=links)
@@ -167,4 +174,8 @@ if __name__ == '__main__':
     # cleanup_archive(out_dir, links)
 
     # Step 4: Run the archive methods for each link
-    update_archive(out_dir, links, source=source, resume=resume, append=True)
+    if ONLY_NEW:
+        new_links = merge_links(archive_path=out_dir, import_path=source, only_new=True)
+        update_archive(out_dir, new_links, source=source, resume=resume, append=True)
+    else:
+        update_archive(out_dir, links, source=source, resume=resume, append=True)
diff --git a/archiver/config.py b/archiver/config.py
index 2817cdef..1fc2eb0a 100644
--- a/archiver/config.py
+++ b/archiver/config.py
@@ -13,6 +13,7 @@ from subprocess import run, PIPE
 IS_TTY = sys.stdout.isatty()
 USE_COLOR = os.getenv('USE_COLOR', str(IS_TTY) ).lower() == 'true'
 SHOW_PROGRESS = os.getenv('SHOW_PROGRESS', str(IS_TTY) ).lower() == 'true'
+ONLY_NEW = os.getenv('ONLY_NEW', 'False' ).lower() == 'true'
 FETCH_WGET = os.getenv('FETCH_WGET', 'True' ).lower() == 'true'
 FETCH_WGET_REQUISITES = os.getenv('FETCH_WGET_REQUISITES', 'True' ).lower() == 'true'
 FETCH_AUDIO = os.getenv('FETCH_AUDIO', 'False' ).lower() == 'true'
diff --git a/archiver/links.py b/archiver/links.py
index 04e7ed24..a8e8b7e5 100644
--- a/archiver/links.py
+++ b/archiver/links.py
@@ -74,6 +74,21 @@ def validate_links(links):
 
     return list(links)
 
+def new_links(imported_links, existing_links):
+    """
+    Return all links which are in the imported_links but not in the existing_links.
+    This is used to determine which links are new and not indexed yet. Set the
+    ONLY_NEW environment variable to activate this filter mechanism.
+    """
+    new_links = []
+    for i_link in imported_links:
+        found_link_in_existing_links = False
+        for e_link in existing_links:
+            if i_link['url'] == e_link['url']:
+                found_link_in_existing_links = True
+        if not found_link_in_existing_links:
+            new_links.append(i_link)
+    return new_links
 
 def archivable_links(links):
     """remove chrome://, about:// or other schemed links that cant be archived"""

From b1b6be4f13a403420a58fc8c06462c605deb16ed Mon Sep 17 00:00:00 2001
From: Aaron Fischer
Date: Fri, 19 Oct 2018 22:35:08 +0200
Subject: [PATCH 2/4] merge_links() used wrong index

Because merge_links() uses the index, we need to compute the new links
_before_ we manipulate the index with write_links_index(). This has the
negative side effect that the "Adding X new links ..." message would be
printed twice (because we call merge_links() twice), so the message is now
only printed when only_new is not set.
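To illustrate the ordering problem, here is a minimal, self-contained sketch
(not the real archiver code): the in-memory `index` list stands in for
index.json, the helper mirrors the set-based new_links() this series ends up
with, and the example URLs are made up.

```
# Hypothetical data: one link already archived, a fresh dump containing it plus one new link.
index = [{'url': 'https://example.com/old'}]
imported = [{'url': 'https://example.com/old'},
            {'url': 'https://example.com/new'}]

def new_links(all_links, existing_links):
    # Same filtering idea as archiver/links.py after patch 4.
    existing_urls = {link['url'] for link in existing_links}
    return [link for link in all_links if link['url'] not in existing_urls]

# Correct order: diff the import against the index *before* updating it.
fresh = new_links(imported, index)        # -> only the /new link
index = index + fresh                     # stands in for write_links_index()

# Wrong order: update the index first, then diff -> nothing left to archive.
assert new_links(imported, index) == []
print(fresh)
```

If the diff were taken after the index update, ONLY_NEW would always archive
nothing, which is why the new_links computation has to happen first.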
---
 archiver/archive.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/archiver/archive.py b/archiver/archive.py
index b3384bc7..64aa0f25 100755
--- a/archiver/archive.py
+++ b/archiver/archive.py
@@ -64,7 +64,7 @@ def merge_links(archive_path=OUTPUT_DIR, import_path=None, only_new=False):
     all_links = validate_links(existing_links + all_links)
 
     num_new_links = len(all_links) - len(existing_links)
-    if num_new_links:
+    if num_new_links and not only_new:
         print('[{green}+{reset}] [{}] Adding {} new links from {} to {}/index.json'.format(
             datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
             num_new_links,
@@ -166,7 +166,8 @@ if __name__ == '__main__':
 
     # Step 1: Parse the links and dedupe them with existing archive
     links = merge_links(archive_path=out_dir, import_path=source, only_new=False)
-
+    new_links = merge_links(archive_path=out_dir, import_path=source, only_new=True)
+
     # Step 2: Write new index
     write_links_index(out_dir=out_dir, links=links)
 
@@ -175,7 +176,6 @@ if __name__ == '__main__':
 
     # Step 4: Run the archive methods for each link
     if ONLY_NEW:
-        new_links = merge_links(archive_path=out_dir, import_path=source, only_new=True)
         update_archive(out_dir, new_links, source=source, resume=resume, append=True)
     else:
         update_archive(out_dir, links, source=source, resume=resume, append=True)

From ebc327bb897c137c66ee4a0cbb0b616f17175897 Mon Sep 17 00:00:00 2001
From: Aaron Fischer
Date: Sun, 21 Oct 2018 22:36:32 +0200
Subject: [PATCH 3/4] Turn the O(n^2) loop into an O(n) one

---
 archiver/links.py | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

diff --git a/archiver/links.py b/archiver/links.py
index a8e8b7e5..990df953 100644
--- a/archiver/links.py
+++ b/archiver/links.py
@@ -74,21 +74,14 @@ def validate_links(links):
 
     return list(links)
 
-def new_links(imported_links, existing_links):
+def new_links(all_links, existing_links):
     """
-    Return all links which are in the imported_links but not in the existing_links.
+    Return all links which are in the all_links but not in the existing_links.
     This is used to determine which links are new and not indexed yet. Set the
     ONLY_NEW environment variable to activate this filter mechanism.
     """
-    new_links = []
-    for i_link in imported_links:
-        found_link_in_existing_links = False
-        for e_link in existing_links:
-            if i_link['url'] == e_link['url']:
-                found_link_in_existing_links = True
-        if not found_link_in_existing_links:
-            new_links.append(i_link)
-    return new_links
+    existing_urls = list(map(lambda l: l['url'], existing_links))
+    return list(filter(lambda l: l['url'] not in existing_urls, all_links))
 
 def archivable_links(links):
     """remove chrome://, about:// or other schemed links that cant be archived"""

From a2f5fa8ba69ed87916208e3f0439509f7a72da98 Mon Sep 17 00:00:00 2001
From: Nick Sweeting
Date: Wed, 24 Oct 2018 21:07:35 +0200
Subject: [PATCH 4/4] Use a more appropriate coding style from @pirate

Co-Authored-By: f0086
---
 archiver/links.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/archiver/links.py b/archiver/links.py
index 990df953..a16ca594 100644
--- a/archiver/links.py
+++ b/archiver/links.py
@@ -80,8 +80,8 @@ def new_links(all_links, existing_links):
     This is used to determine which links are new and not indexed yet. Set the
     ONLY_NEW environment variable to activate this filter mechanism.
""" - existing_urls = list(map(lambda l: l['url'], existing_links)) - return list(filter(lambda l: l['url'] not in existing_urls, all_links)) + existing_urls = {link['url'] for link in existing_links} + return [link for link in all_links if link['url'] not in existing_urls] def archivable_links(links): """remove chrome://, about:// or other schemed links that cant be archived"""