Mirror of https://github.com/ArchiveBox/ArchiveBox.git (synced 2025-05-14 15:14:31 -04:00)

Merge pull request #107 from f0086/import-only-new-links
Optionally import only new links

Commit 678ce229c4: 4 changed files with 32 additions and 6 deletions
README.md

@@ -142,6 +142,11 @@ You can run it in parallel by using the `resume` feature, or by manually splitti
 ```
 
 Users have reported running it with 50k+ bookmarks with success (though it will take more RAM while running).
 
+If you already imported a huge list of bookmarks and want to import only new
+bookmarks, you can use the `ONLY_NEW` environment variable. This is useful if
+you want to import a bookmark dump periodically and want to skip broken links
+which are already in the index.
+
 ## Configuration
 
 You can tweak parameters via environment variables, or by editing `config.py` directly:
@@ -160,6 +165,7 @@ env CHROME_BINARY=google-chrome-stable RESOLUTION=1440,900 FETCH_PDF=False ./arc
 
 **Archive Options:**
 - maximum allowed download time per link: `TIMEOUT` values: [`60`]/`30`/`...`
+- import only new links: `ONLY_NEW` values `True`/[`False`]
 - archive methods (values: [`True`]/`False`):
   - fetch page with wget: `FETCH_WGET`
   - fetch images/css/js with wget: `FETCH_WGET_REQUISITES` (True is highly recommended)
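Judging from the invocation style shown elsewhere in this diff (the `env ... FETCH_PDF=False ./arc...` example in the hunk context above and the `./bin/bookmark-archiver ~/Downloads/bookmarks_export.html` line in `print_help()` below), enabling the option would presumably look something like `env ONLY_NEW=True ./bin/bookmark-archiver ~/Downloads/bookmarks_export.html`: the same bookmark export can then be re-imported periodically, and only links not already in the index get archived.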
archive.py

@@ -10,7 +10,10 @@ from datetime import datetime
 from subprocess import run
 
 from parse import parse_links
-from links import validate_links
+from links import (
+    new_links,
+    validate_links
+)
 from archive_methods import archive_links, _RESULTS_TOTALS
 from index import (
     write_links_index,
@@ -19,6 +22,7 @@ from index import (
     parse_json_link_index,
 )
 from config import (
+    ONLY_NEW,
     OUTPUT_PERMISSIONS,
     OUTPUT_DIR,
     ANSI,
@@ -45,7 +49,7 @@ def print_help():
     print("    ./bin/bookmark-archiver ~/Downloads/bookmarks_export.html\n")
 
 
-def merge_links(archive_path=OUTPUT_DIR, import_path=None):
+def merge_links(archive_path=OUTPUT_DIR, import_path=None, only_new=False):
     """get new links from file and optionally append them to links in existing archive"""
     all_links = []
     if import_path:
@@ -60,7 +64,7 @@ def merge_links(archive_path=OUTPUT_DIR, import_path=None):
     all_links = validate_links(existing_links + all_links)
 
     num_new_links = len(all_links) - len(existing_links)
-    if num_new_links:
+    if num_new_links and not only_new:
         print('[{green}+{reset}] [{}] Adding {} new links from {} to {}/index.json'.format(
             datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
             num_new_links,
@@ -76,6 +80,9 @@ def merge_links(archive_path=OUTPUT_DIR, import_path=None):
     #     **ANSI,
     # ))
 
+    if only_new:
+        return new_links(all_links, existing_links)
+
     return all_links
 
 def update_archive(archive_path, links, source=None, resume=None, append=True):
@@ -158,7 +165,8 @@ if __name__ == '__main__':
         source = download_url(source)
 
     # Step 1: Parse the links and dedupe them with existing archive
-    links = merge_links(archive_path=out_dir, import_path=source)
+    links = merge_links(archive_path=out_dir, import_path=source, only_new=False)
+    new_links = merge_links(archive_path=out_dir, import_path=source, only_new=True)
 
     # Step 2: Write new index
     write_links_index(out_dir=out_dir, links=links)
@@ -167,4 +175,7 @@ if __name__ == '__main__':
     # cleanup_archive(out_dir, links)
 
     # Step 4: Run the archive methods for each link
-    update_archive(out_dir, links, source=source, resume=resume, append=True)
+    if ONLY_NEW:
+        update_archive(out_dir, new_links, source=source, resume=resume, append=True)
+    else:
+        update_archive(out_dir, links, source=source, resume=resume, append=True)
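To make the new control flow easier to follow, here is a minimal, self-contained sketch of what the `only_new` path does conceptually: the full index is still built from existing plus imported links, while the archive step can be pointed at only the links that were not present before. The link dicts below are simplified stand-ins with just a `url` key, and this `merge_links` is a stripped-down approximation of the function in the diff, not the project's real code.

```python
# Minimal sketch of the ONLY_NEW behaviour (simplified stand-in, not the real archive.py).

def new_links(all_links, existing_links):
    """Return links from all_links whose URL is not already present in existing_links."""
    existing_urls = {link['url'] for link in existing_links}
    return [link for link in all_links if link['url'] not in existing_urls]

def merge_links(existing_links, imported_links, only_new=False):
    """Combine existing and imported links; with only_new=True return only the unseen ones."""
    seen = {link['url'] for link in existing_links}
    all_links = existing_links + [link for link in imported_links if link['url'] not in seen]
    if only_new:
        return new_links(all_links, existing_links)
    return all_links

if __name__ == '__main__':
    existing = [{'url': 'https://example.com/a'}]
    imported = [{'url': 'https://example.com/a'}, {'url': 'https://example.com/b'}]

    links = merge_links(existing, imported, only_new=False)       # full index: a and b
    newly_added = merge_links(existing, imported, only_new=True)  # only b

    ONLY_NEW = True  # would normally come from the ONLY_NEW environment variable
    to_archive = newly_added if ONLY_NEW else links
    print([link['url'] for link in to_archive])  # ['https://example.com/b']
```

Note that in the diff itself the full merged list is still written to `index.json` in both modes; only the set of links handed to `update_archive()` changes when `ONLY_NEW` is set.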
config.py

@@ -13,6 +13,7 @@ from subprocess import run, PIPE
 IS_TTY = sys.stdout.isatty()
 USE_COLOR = os.getenv('USE_COLOR', str(IS_TTY)).lower() == 'true'
 SHOW_PROGRESS = os.getenv('SHOW_PROGRESS', str(IS_TTY)).lower() == 'true'
+ONLY_NEW = os.getenv('ONLY_NEW', 'False').lower() == 'true'
 FETCH_WGET = os.getenv('FETCH_WGET', 'True').lower() == 'true'
 FETCH_WGET_REQUISITES = os.getenv('FETCH_WGET_REQUISITES', 'True').lower() == 'true'
 FETCH_AUDIO = os.getenv('FETCH_AUDIO', 'False').lower() == 'true'
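One consequence of the added line: the flag is parsed with the same string comparison as the other boolean options, so only the value `true` (in any letter casing) enables it, and anything else, including `1` or `yes`, leaves it disabled. A quick illustration of how that pattern evaluates:

```python
# Illustration of the boolean env-var pattern used in config.py:
# only the string 'true' (case-insensitive) enables a flag.
import os

os.environ['ONLY_NEW'] = 'True'
print(os.getenv('ONLY_NEW', 'False').lower() == 'true')  # True

os.environ['ONLY_NEW'] = '1'  # '1' or 'yes' would NOT enable the flag
print(os.getenv('ONLY_NEW', 'False').lower() == 'true')  # False
```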
links.py

@@ -74,6 +74,14 @@ def validate_links(links):
 
     return list(links)
 
+def new_links(all_links, existing_links):
+    """
+    Return all links which are in the all_links but not in the existing_links.
+    This is used to determine which links are new and not indexed jet. Set the
+    ONLY_NEW environment variable to activate this filter mechanism.
+    """
+    existing_urls = {link['url'] for link in existing_links}
+    return [link for link in all_links if link['url'] not in existing_urls]
 
 def archivable_links(links):
     """remove chrome://, about:// or other schemed links that cant be archived"""
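A detail worth noting about the `new_links()` helper added above: membership is decided purely by the `url` field, so a bookmark that reappears with a different title or tags is still treated as already indexed. A standalone snippet mirroring its body (the example links are made up for illustration):

```python
# Mirrors the filter inside new_links(): compare URLs only, ignore other fields.
existing = [{'url': 'https://example.com', 'title': 'Old title'}]
incoming = [
    {'url': 'https://example.com', 'title': 'New title'},   # same URL, so not "new"
    {'url': 'https://example.org', 'title': 'Fresh link'},  # genuinely new
]

existing_urls = {link['url'] for link in existing}
print([link for link in incoming if link['url'] not in existing_urls])
# [{'url': 'https://example.org', 'title': 'Fresh link'}]
```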