From 69c007ce8536db173b4ba367236a811848464808 Mon Sep 17 00:00:00 2001
From: Aaron Fischer
Date: Fri, 19 Oct 2018 21:28:38 +0200
Subject: [PATCH 1/4] Optionally import only new links

When a huge list of links is imported periodically (for example from a big
dump of links from a bookmark service) and contains a lot of broken links,
these links are rechecked on every run. To skip this, the ONLY_NEW
environment variable can be used to import only new links and ignore the
rest altogether.

This partially fixes #95.
---
 README.md           |  6 ++++++
 archiver/archive.py | 19 +++++++++++++++----
 archiver/config.py  |  1 +
 archiver/links.py   | 15 +++++++++++++++
 4 files changed, 37 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 6b05a0b3..5f159e08 100644
--- a/README.md
+++ b/README.md
@@ -140,6 +140,11 @@ You can run it in parallel by using the `resume` feature, or by manually splitti
 ```
 Users have reported running it with 50k+ bookmarks with success (though it will take more RAM while running).
 
+If you have already imported a huge list of bookmarks and want to import only
+new bookmarks, you can use the `ONLY_NEW` environment variable. This is useful
+if you import a bookmark dump periodically and want to skip broken links that
+are already in the index.
+
 ## Configuration
 
 You can tweak parameters via environment variables, or by editing `config.py` directly:
@@ -158,6 +163,7 @@ env CHROME_BINARY=google-chrome-stable RESOLUTION=1440,900 FETCH_PDF=False ./arc
 
 **Archive Options:**
 - maximum allowed download time per link: `TIMEOUT` values: [`60`]/`30`/`...`
+- import only new links: `ONLY_NEW` values: `True`/[`False`]
 - archive methods (values: [`True`]/`False`):
   - fetch page with wget: `FETCH_WGET`
   - fetch images/css/js with wget: `FETCH_WGET_REQUISITES` (True is highly recommended)
diff --git a/archiver/archive.py b/archiver/archive.py
index c795b316..b3384bc7 100755
--- a/archiver/archive.py
+++ b/archiver/archive.py
@@ -10,7 +10,10 @@ from datetime import datetime
 from subprocess import run
 
 from parse import parse_links
-from links import validate_links
+from links import (
+    new_links,
+    validate_links
+)
 from archive_methods import archive_links, _RESULTS_TOTALS
 from index import (
     write_links_index,
@@ -19,6 +22,7 @@ from index import (
     parse_json_link_index,
 )
 from config import (
+    ONLY_NEW,
     OUTPUT_PERMISSIONS,
     OUTPUT_DIR,
     ANSI,
@@ -45,7 +49,7 @@ def print_help():
     print("    ./bin/bookmark-archiver ~/Downloads/bookmarks_export.html\n")
 
 
-def merge_links(archive_path=OUTPUT_DIR, import_path=None):
+def merge_links(archive_path=OUTPUT_DIR, import_path=None, only_new=False):
     """get new links from file and optionally append them to links in existing archive"""
     all_links = []
     if import_path:
@@ -76,6 +80,9 @@ def merge_links(archive_path=OUTPUT_DIR, import_path=None):
     #     **ANSI,
     # ))
 
+    if only_new:
+        return new_links(all_links, existing_links)
+
     return all_links
 
 def update_archive(archive_path, links, source=None, resume=None, append=True):
@@ -158,7 +165,7 @@ if __name__ == '__main__':
         source = download_url(source)
 
     # Step 1: Parse the links and dedupe them with existing archive
-    links = merge_links(archive_path=out_dir, import_path=source)
+    links = merge_links(archive_path=out_dir, import_path=source, only_new=False)
 
     # Step 2: Write new index
     write_links_index(out_dir=out_dir, links=links)
@@ -167,4 +174,8 @@ if __name__ == '__main__':
     # cleanup_archive(out_dir, links)
 
     # Step 4: Run the archive methods for each link
-    update_archive(out_dir, links, source=source, resume=resume, append=True)
+    if ONLY_NEW:
+        new_links = merge_links(archive_path=out_dir, import_path=source, only_new=True)
+        update_archive(out_dir, new_links, source=source, resume=resume, append=True)
+    else:
+        update_archive(out_dir, links, source=source, resume=resume, append=True)
diff --git a/archiver/config.py b/archiver/config.py
index 2817cdef..1fc2eb0a 100644
--- a/archiver/config.py
+++ b/archiver/config.py
@@ -13,6 +13,7 @@ from subprocess import run, PIPE
 IS_TTY = sys.stdout.isatty()
 USE_COLOR = os.getenv('USE_COLOR', str(IS_TTY) ).lower() == 'true'
 SHOW_PROGRESS = os.getenv('SHOW_PROGRESS', str(IS_TTY) ).lower() == 'true'
+ONLY_NEW = os.getenv('ONLY_NEW', 'False' ).lower() == 'true'
 FETCH_WGET = os.getenv('FETCH_WGET', 'True' ).lower() == 'true'
 FETCH_WGET_REQUISITES = os.getenv('FETCH_WGET_REQUISITES', 'True' ).lower() == 'true'
 FETCH_AUDIO = os.getenv('FETCH_AUDIO', 'False' ).lower() == 'true'
diff --git a/archiver/links.py b/archiver/links.py
index 04e7ed24..a8e8b7e5 100644
--- a/archiver/links.py
+++ b/archiver/links.py
@@ -74,6 +74,21 @@ def validate_links(links):
 
     return list(links)
 
+def new_links(imported_links, existing_links):
+    """
+    Return all links which are in the imported_links but not in the existing_links.
+    This is used to determine which links are new and not indexed yet. Set the
+    ONLY_NEW environment variable to activate this filter mechanism.
+    """
+    new_links = []
+    for i_link in imported_links:
+        found_link_in_existing_links = False
+        for e_link in existing_links:
+            if i_link['url'] == e_link['url']:
+                found_link_in_existing_links = True
+        if not found_link_in_existing_links:
+            new_links.append(i_link)
+    return new_links
 
 def archivable_links(links):
     """remove chrome://, about:// or other schemed links that cant be archived"""

From b1b6be4f13a403420a58fc8c06462c605deb16ed Mon Sep 17 00:00:00 2001
From: Aaron Fischer
Date: Fri, 19 Oct 2018 22:35:08 +0200
Subject: [PATCH 2/4] merge_links() used wrong index

Because merge_links() uses the index, we need to compute the new links
_before_ we manipulate the index with write_links_index(). This has the
negative side effect that the "Adding X new links ..." message would be
printed twice (because we call merge_links() twice), so the message is now
only printed when only_new is not set.
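To illustrate the ordering problem, here is a minimal, self-contained sketch
(not the real archiver code): the in-memory `index` list stands in for
index.json, the helper mirrors the set-based new_links() this series ends up
with, and the example URLs are made up.

```
# Hypothetical data: one link already archived, a fresh dump containing it plus one new link.
index = [{'url': 'https://example.com/old'}]
imported = [{'url': 'https://example.com/old'},
            {'url': 'https://example.com/new'}]

def new_links(all_links, existing_links):
    # Same filtering idea as archiver/links.py after patch 4.
    existing_urls = {link['url'] for link in existing_links}
    return [link for link in all_links if link['url'] not in existing_urls]

# Correct order: diff the import against the index *before* updating it.
fresh = new_links(imported, index)        # -> only the /new link
index = index + fresh                     # stands in for write_links_index()

# Wrong order: update the index first, then diff -> nothing left to archive.
assert new_links(imported, index) == []
print(fresh)
```

If the diff were taken after the index update, ONLY_NEW would always archive
nothing, which is why the new_links computation has to happen first.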
---
 archiver/archive.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/archiver/archive.py b/archiver/archive.py
index b3384bc7..64aa0f25 100755
--- a/archiver/archive.py
+++ b/archiver/archive.py
@@ -64,7 +64,7 @@ def merge_links(archive_path=OUTPUT_DIR, import_path=None, only_new=False):
     all_links = validate_links(existing_links + all_links)
 
     num_new_links = len(all_links) - len(existing_links)
-    if num_new_links:
+    if num_new_links and not only_new:
         print('[{green}+{reset}] [{}] Adding {} new links from {} to {}/index.json'.format(
             datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
             num_new_links,
@@ -166,7 +166,8 @@ if __name__ == '__main__':
 
     # Step 1: Parse the links and dedupe them with existing archive
     links = merge_links(archive_path=out_dir, import_path=source, only_new=False)
-
+    new_links = merge_links(archive_path=out_dir, import_path=source, only_new=True)
+
     # Step 2: Write new index
     write_links_index(out_dir=out_dir, links=links)
 
@@ -175,7 +176,6 @@ if __name__ == '__main__':
 
     # Step 4: Run the archive methods for each link
     if ONLY_NEW:
-        new_links = merge_links(archive_path=out_dir, import_path=source, only_new=True)
         update_archive(out_dir, new_links, source=source, resume=resume, append=True)
     else:
         update_archive(out_dir, links, source=source, resume=resume, append=True)

From ebc327bb897c137c66ee4a0cbb0b616f17175897 Mon Sep 17 00:00:00 2001
From: Aaron Fischer
Date: Sun, 21 Oct 2018 22:36:32 +0200
Subject: [PATCH 3/4] Turn the O(n^2) loop into an O(n) one

---
 archiver/links.py | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

diff --git a/archiver/links.py b/archiver/links.py
index a8e8b7e5..990df953 100644
--- a/archiver/links.py
+++ b/archiver/links.py
@@ -74,21 +74,14 @@ def validate_links(links):
 
     return list(links)
 
-def new_links(imported_links, existing_links):
+def new_links(all_links, existing_links):
     """
-    Return all links which are in the imported_links but not in the existing_links.
+    Return all links which are in the all_links but not in the existing_links.
     This is used to determine which links are new and not indexed yet. Set the
     ONLY_NEW environment variable to activate this filter mechanism.
     """
-    new_links = []
-    for i_link in imported_links:
-        found_link_in_existing_links = False
-        for e_link in existing_links:
-            if i_link['url'] == e_link['url']:
-                found_link_in_existing_links = True
-        if not found_link_in_existing_links:
-            new_links.append(i_link)
-    return new_links
+    existing_urls = list(map(lambda l: l['url'], existing_links))
+    return list(filter(lambda l: l['url'] not in existing_urls, all_links))
 
 def archivable_links(links):
     """remove chrome://, about:// or other schemed links that cant be archived"""

From a2f5fa8ba69ed87916208e3f0439509f7a72da98 Mon Sep 17 00:00:00 2001
From: Nick Sweeting
Date: Wed, 24 Oct 2018 21:07:35 +0200
Subject: [PATCH 4/4] Use a more appropriate coding style from @pirate

Co-Authored-By: f0086
---
 archiver/links.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/archiver/links.py b/archiver/links.py
index 990df953..a16ca594 100644
--- a/archiver/links.py
+++ b/archiver/links.py
@@ -80,8 +80,8 @@ def new_links(all_links, existing_links):
     This is used to determine which links are new and not indexed yet. Set the
     ONLY_NEW environment variable to activate this filter mechanism.
""" - existing_urls = list(map(lambda l: l['url'], existing_links)) - return list(filter(lambda l: l['url'] not in existing_urls, all_links)) + existing_urls = {link['url'] for link in existing_links} + return [link for link in all_links if link['url'] not in existing_urls] def archivable_links(links): """remove chrome://, about:// or other schemed links that cant be archived"""