Mirror of https://github.com/ArchiveBox/ArchiveBox.git (synced 2025-05-14 15:14:31 -04:00)

Merge pull request #107 from f0086/import-only-new-links
Optionally import only new links

Commit 678ce229c4: 4 changed files with 32 additions and 6 deletions
README.md

@@ -142,6 +142,11 @@ You can run it in parallel by using the `resume` feature, or by manually splitti
 ```
 
 Users have reported running it with 50k+ bookmarks with success (though it will take more RAM while running).
 
+If you already imported a huge list of bookmarks and want to import only new
+bookmarks, you can use the `ONLY_NEW` environment variable. This is useful if
+you want to import a bookmark dump periodically and want to skip broken links
+which are already in the index.
+
 ## Configuration
 
 You can tweak parameters via environment variables, or by editing `config.py` directly:
@@ -160,6 +165,7 @@ env CHROME_BINARY=google-chrome-stable RESOLUTION=1440,900 FETCH_PDF=False ./arc
 
 **Archive Options:**
 - maximum allowed download time per link: `TIMEOUT` values: [`60`]/`30`/`...`
+- import only new links: `ONLY_NEW` values `True`/[`False`]
 - archive methods (values: [`True`]/`False`):
   - fetch page with wget: `FETCH_WGET`
   - fetch images/css/js with wget: `FETCH_WGET_REQUISITES` (True is highly recommended)
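Judging from the invocation style shown elsewhere in this diff (the `env ... FETCH_PDF=False ./arc...` example in the hunk context above and the `./bin/bookmark-archiver ~/Downloads/bookmarks_export.html` line in `print_help()` below), enabling the option would presumably look something like `env ONLY_NEW=True ./bin/bookmark-archiver ~/Downloads/bookmarks_export.html`: the same bookmark export can then be re-imported periodically, and only links not already in the index get archived.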
archive.py

@@ -10,7 +10,10 @@ from datetime import datetime
 from subprocess import run
 
 from parse import parse_links
-from links import validate_links
+from links import (
+    new_links,
+    validate_links
+)
 from archive_methods import archive_links, _RESULTS_TOTALS
 from index import (
     write_links_index,
@@ -19,6 +22,7 @@ from index import (
     parse_json_link_index,
 )
 from config import (
+    ONLY_NEW,
     OUTPUT_PERMISSIONS,
     OUTPUT_DIR,
     ANSI,
@@ -45,7 +49,7 @@ def print_help():
     print("    ./bin/bookmark-archiver ~/Downloads/bookmarks_export.html\n")
 
 
-def merge_links(archive_path=OUTPUT_DIR, import_path=None):
+def merge_links(archive_path=OUTPUT_DIR, import_path=None, only_new=False):
     """get new links from file and optionally append them to links in existing archive"""
     all_links = []
     if import_path:
@@ -60,7 +64,7 @@ def merge_links(archive_path=OUTPUT_DIR, import_path=None):
     all_links = validate_links(existing_links + all_links)
 
     num_new_links = len(all_links) - len(existing_links)
-    if num_new_links:
+    if num_new_links and not only_new:
         print('[{green}+{reset}] [{}] Adding {} new links from {} to {}/index.json'.format(
             datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
             num_new_links,
@@ -76,6 +80,9 @@ def merge_links(archive_path=OUTPUT_DIR, import_path=None):
     #     **ANSI,
     # ))
 
+    if only_new:
+        return new_links(all_links, existing_links)
+
     return all_links
 
 def update_archive(archive_path, links, source=None, resume=None, append=True):
@@ -158,7 +165,8 @@ if __name__ == '__main__':
         source = download_url(source)
 
     # Step 1: Parse the links and dedupe them with existing archive
-    links = merge_links(archive_path=out_dir, import_path=source)
+    links = merge_links(archive_path=out_dir, import_path=source, only_new=False)
+    new_links = merge_links(archive_path=out_dir, import_path=source, only_new=True)
 
     # Step 2: Write new index
     write_links_index(out_dir=out_dir, links=links)
@@ -167,4 +175,7 @@ if __name__ == '__main__':
     # cleanup_archive(out_dir, links)
 
     # Step 4: Run the archive methods for each link
-    update_archive(out_dir, links, source=source, resume=resume, append=True)
+    if ONLY_NEW:
+        update_archive(out_dir, new_links, source=source, resume=resume, append=True)
+    else:
+        update_archive(out_dir, links, source=source, resume=resume, append=True)
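To make the new control flow easier to follow, here is a minimal, self-contained sketch of what the `only_new` path does conceptually: the full index is still built from existing plus imported links, while the archive step can be pointed at only the links that were not present before. The link dicts below are simplified stand-ins with just a `url` key, and this `merge_links` is a stripped-down approximation of the function in the diff, not the project's real code.

```python
# Minimal sketch of the ONLY_NEW behaviour (simplified stand-in, not the real archive.py).

def new_links(all_links, existing_links):
    """Return links from all_links whose URL is not already present in existing_links."""
    existing_urls = {link['url'] for link in existing_links}
    return [link for link in all_links if link['url'] not in existing_urls]

def merge_links(existing_links, imported_links, only_new=False):
    """Combine existing and imported links; with only_new=True return only the unseen ones."""
    seen = {link['url'] for link in existing_links}
    all_links = existing_links + [link for link in imported_links if link['url'] not in seen]
    if only_new:
        return new_links(all_links, existing_links)
    return all_links

if __name__ == '__main__':
    existing = [{'url': 'https://example.com/a'}]
    imported = [{'url': 'https://example.com/a'}, {'url': 'https://example.com/b'}]

    links = merge_links(existing, imported, only_new=False)       # full index: a and b
    newly_added = merge_links(existing, imported, only_new=True)  # only b

    ONLY_NEW = True  # would normally come from the ONLY_NEW environment variable
    to_archive = newly_added if ONLY_NEW else links
    print([link['url'] for link in to_archive])  # ['https://example.com/b']
```

Note that in the diff itself the full merged list is still written to `index.json` in both modes; only the set of links handed to `update_archive()` changes when `ONLY_NEW` is set.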
config.py

@@ -13,6 +13,7 @@ from subprocess import run, PIPE
 IS_TTY = sys.stdout.isatty()
 USE_COLOR = os.getenv('USE_COLOR', str(IS_TTY)).lower() == 'true'
 SHOW_PROGRESS = os.getenv('SHOW_PROGRESS', str(IS_TTY)).lower() == 'true'
+ONLY_NEW = os.getenv('ONLY_NEW', 'False').lower() == 'true'
 FETCH_WGET = os.getenv('FETCH_WGET', 'True').lower() == 'true'
 FETCH_WGET_REQUISITES = os.getenv('FETCH_WGET_REQUISITES', 'True').lower() == 'true'
 FETCH_AUDIO = os.getenv('FETCH_AUDIO', 'False').lower() == 'true'
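One consequence of the added line: the flag is parsed with the same string comparison as the other boolean options, so only the value `true` (in any letter casing) enables it, and anything else, including `1` or `yes`, leaves it disabled. A quick illustration of how that pattern evaluates:

```python
# Illustration of the boolean env-var pattern used in config.py:
# only the string 'true' (case-insensitive) enables a flag.
import os

os.environ['ONLY_NEW'] = 'True'
print(os.getenv('ONLY_NEW', 'False').lower() == 'true')  # True

os.environ['ONLY_NEW'] = '1'  # '1' or 'yes' would NOT enable the flag
print(os.getenv('ONLY_NEW', 'False').lower() == 'true')  # False
```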
links.py

@@ -74,6 +74,14 @@ def validate_links(links):
 
     return list(links)
 
+def new_links(all_links, existing_links):
+    """
+    Return all links which are in the all_links but not in the existing_links.
+    This is used to determine which links are new and not indexed jet. Set the
+    ONLY_NEW environment variable to activate this filter mechanism.
+    """
+    existing_urls = {link['url'] for link in existing_links}
+    return [link for link in all_links if link['url'] not in existing_urls]
 
 def archivable_links(links):
     """remove chrome://, about:// or other schemed links that cant be archived"""
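A detail worth noting about the `new_links()` helper added above: membership is decided purely by the `url` field, so a bookmark that reappears with a different title or tags is still treated as already indexed. A standalone snippet mirroring its body (the example links are made up for illustration):

```python
# Mirrors the filter inside new_links(): compare URLs only, ignore other fields.
existing = [{'url': 'https://example.com', 'title': 'Old title'}]
incoming = [
    {'url': 'https://example.com', 'title': 'New title'},   # same URL, so not "new"
    {'url': 'https://example.org', 'title': 'Fresh link'},  # genuinely new
]

existing_urls = {link['url'] for link in existing}
print([link for link in incoming if link['url'] not in existing_urls])
# [{'url': 'https://example.org', 'title': 'Fresh link'}]
```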