major codebase-wide code cleanups
This commit is contained in:
parent c806068683
commit e6bd1f8ca8

8 changed files with 825 additions and 743 deletions

archivebox

@@ -1,225 +1,132 @@
#!/usr/bin/env python3
# ArchiveBox
# Nick Sweeting 2017 | MIT License
# https://github.com/pirate/ArchiveBox

"""
ArchiveBox command line application.

./archive and ./bin/archivebox both point to this file,
but you can also run it directly using `python3 archive.py`

Usage & Documentation:
    https://github.com/pirate/ArchiveBox/Wiki
"""

import os
import sys

from datetime import datetime
from peekable import Peekable

from parse import parse_links
from links import validate_links, links_after_timestamp
from archive_methods import archive_link, _RESULTS_TOTALS
from index import (
    write_links_index,
    parse_json_links_index,
)
from links import links_after_timestamp
from index import write_links_index, load_links_index
from archive_methods import archive_link
from config import (
    ARCHIVE_DIR,
    ONLY_NEW,
    OUTPUT_DIR,
    REPO_DIR,
    ANSI,
    GIT_SHA,
)
from util import (
    check_dependencies,
    save_remote_source,
    save_stdin_source,
    pretty_path,
    check_links_structure,
)
from logs import (
    log_archiving_started,
    log_archiving_paused,
    log_archiving_finished,
)

__AUTHOR__ = 'Nick Sweeting <git@nicksweeting.com>'
__VERSION__ = GIT_SHA
__DESCRIPTION__ = 'ArchiveBox Usage: Create a browsable html archive of a list of links.'
__DESCRIPTION__ = 'ArchiveBox: The self-hosted internet archive.'
__DOCUMENTATION__ = 'https://github.com/pirate/ArchiveBox/wiki'


def print_help():
    print(__DESCRIPTION__)
    print("Documentation: {}\n".format(__DOCUMENTATION__))
    print('ArchiveBox: The self-hosted internet archive.\n')
    print("Documentation:")
    print("    https://github.com/pirate/ArchiveBox/wiki\n")
    print("Usage:")
    print("    ./bin/archivebox ~/Downloads/bookmarks_export.html\n")
    print("")
    print("    ./bin/archivebox https://example.com/feed.rss\n")
    print("")
    print("    echo 'https://example.com' | ./bin/archivebox\n")
    print("    ./bin/archivebox ~/Downloads/bookmarks_export.html\n")
    print("    ./bin/archivebox https://example.com/feed.rss\n")
    print("    ./bin/archivebox 15109948213.123\n")


def load_links(archive_path=OUTPUT_DIR, import_path=None):
    """get new links from file and optionally append them to links in existing archive"""

    existing_links = []
    if archive_path:
        existing_links = parse_json_links_index(archive_path)
        check_links_structure(existing_links)

    new_links = []
    if import_path:
        # parse and validate the import file
        raw_links, parser_name = parse_links(import_path)
        new_links = validate_links(raw_links)
        check_links_structure(new_links)

    # merge existing links in archive_path and new links
    all_links = validate_links(existing_links + new_links)
    check_links_structure(all_links)
    num_new_links = len(all_links) - len(existing_links)

    if import_path and parser_name:
        print(' > Adding {} new links to index (parsed import as {})'.format(
            num_new_links,
            parser_name,
        ))

    return all_links, new_links

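# Illustrative usage sketch (not part of this commit): load_links merges whatever
# is already in the JSON index with a freshly parsed import file, returning both
# the full deduped list and only the newly added links, roughly:
#
#   all_links, new_links = load_links(archive_path=OUTPUT_DIR,
#                                     import_path='output/sources/bookmarks.html')
#
# The import_path shown above is hypothetical; any exported bookmarks file works.
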
def update_archive(archive_path, links, source=None, resume=None, append=True):
    """update or create index.html+json given a path to an export file containing new links"""

    start_ts = datetime.now().timestamp()

    if resume:
        print('{green}[▶] [{}] Resuming archive downloading from {}...{reset}'.format(
            datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            resume,
            **ANSI,
        ))
    else:
        print('{green}[▶] [{}] Updating content for {} pages in archive...{reset}'.format(
            datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            len(links),
            **ANSI,
        ))

    check_links_structure(links)

    # prefetch the first link off the generator so that if we pause or fail
    # immediately we can show that we paused on the first link and not just None
    to_archive = Peekable(links_after_timestamp(links, resume))
    idx, link = 0, to_archive.peek(0)

    # loop over links and archive them
    try:
        check_dependencies()
        for idx, link in enumerate(to_archive):
            link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
            archive_link(link_dir, link)

    except (KeyboardInterrupt, SystemExit, Exception) as e:
        # if isinstance(e, KeyboardInterrupt):
        #     # Step 4: Re-write links index with updated titles, icons, and resources
        #     all_links, _ = load_links(archive_path=out_dir)
        #     write_links_index(out_dir=out_dir, links=all_links, finished=True)
        print()
        print('\n{lightyellow}[X] [{now}] Downloading paused on link {timestamp} ({idx}/{total}){reset}'.format(
            **ANSI,
            now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            idx=idx+1,
            timestamp=link['timestamp'],
            total=len(links),
        ))
        print(' To view your archive, open: {}/index.html'.format(OUTPUT_DIR.replace(REPO_DIR + '/', '')))
        print(' Continue where you left off by running:')
        print(' {} {}'.format(
            pretty_path(sys.argv[0]),
            link['timestamp'],
        ))
        if not isinstance(e, KeyboardInterrupt):
            print()
            raise e
        raise SystemExit(1)

    # print timing information & summary
    end_ts = datetime.now().timestamp()
    seconds = end_ts - start_ts
    if seconds > 60:
        duration = '{0:.2f} min'.format(seconds / 60, 2)
    else:
        duration = '{0:.2f} sec'.format(seconds, 2)

    print('{}[√] [{}] Update of {} pages complete ({}){}'.format(
        ANSI['green'],
        datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        len(links),
        duration,
        ANSI['reset'],
    ))
    print(' - {} entries skipped'.format(_RESULTS_TOTALS['skipped']))
    print(' - {} entries updated'.format(_RESULTS_TOTALS['succeded']))
    print(' - {} errors'.format(_RESULTS_TOTALS['failed']))
    print(' To view your archive, open: {}/index.html'.format(OUTPUT_DIR.replace(REPO_DIR + '/', '')))

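# The local `peekable` module is not shown in this diff; a minimal sketch of the
# behavior `update_archive` above relies on might look like the class below
# (assumption -- only .peek(0) and normal iteration are actually used):
class _PeekableSketch:
    """Wrap an iterator so upcoming items can be inspected without consuming them."""
    def __init__(self, iterable):
        self.iterator = iter(iterable)
        self.lookahead = []

    def peek(self, n=0):
        # buffer items up to index n, then return the item at that offset
        while len(self.lookahead) <= n:
            self.lookahead.append(next(self.iterator))
        return self.lookahead[n]

    def __iter__(self):
        return self

    def __next__(self):
        # serve buffered items first so peeking never skips anything
        if self.lookahead:
            return self.lookahead.pop(0)
        return next(self.iterator)
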
if __name__ == '__main__':
    argc = len(sys.argv)

    if set(sys.argv).intersection(('-h', '--help', 'help')):
def main(*args):
    if set(args).intersection(('-h', '--help', 'help')) or len(args) > 2:
        print_help()
        raise SystemExit(0)

    source = sys.argv[1] if argc > 1 else None  # path of links file to import
    resume = sys.argv[2] if argc > 2 else None  # timestamp to resume downloading from

    stdin_raw_text = ''
    ### Handle CLI arguments
    #     ./archive bookmarks.html
    #     ./archive 1523422111.234
    import_path, resume = None, None
    if len(args) == 2:
        # if the argument is a string, it's an import_path file to import
        # if it's a number, it's a timestamp to resume archiving from
        if args[1].replace('.', '').isdigit():
            import_path, resume = None, args[1]
        else:
            import_path, resume = args[1], None

    ### Set up output folder
    if not os.path.exists(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)

    ### Handle ingesting urls piped in through stdin
    # (e.g. if user does cat example_urls.txt | ./archive)
    if not sys.stdin.isatty():
        stdin_raw_text = sys.stdin.read()
        if stdin_raw_text and import_path:
            print(
                '[X] You should pass either a path as an argument, '
                'or pass a list of links via stdin, but not both.\n'
            )
            print_help()
            raise SystemExit(1)

    if source and stdin_raw_text:
        print(
            '[X] You should pass either a path as an argument, '
            'or pass a list of links via stdin, but not both.\n'
        )
        print_help()
        raise SystemExit(1)
        import_path = save_stdin_source(stdin_raw_text)

    ### Handle ingesting urls from a remote file/feed
    # (e.g. if an RSS feed URL is used as the import path)
    if import_path and any(import_path.startswith(s) for s in ('http://', 'https://', 'ftp://')):
        import_path = save_remote_source(import_path)

    ### Run the main archive update process
    update_archive_data(import_path=import_path, resume=resume)


    if argc == 1:
        source, resume = None, None
    elif argc == 2:
        if all(d.isdigit() for d in sys.argv[1].split('.')):
            # argv[1] is a resume timestamp
            source, resume = None, sys.argv[1]
        else:
            # argv[1] is a path to a file to import
            source, resume = sys.argv[1].strip(), None
    elif argc == 3:
        source, resume = sys.argv[1].strip(), sys.argv[2]
    else:
        print_help()
        raise SystemExit(1)

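# Both the old and new argument handling above use the same heuristic to decide
# whether a lone argument is a resume timestamp or a file/URL to import. A small
# sketch of that check (is_timestamp is not a real helper in this file):
def is_timestamp(arg):
    # '1523422111.234'        -> True  (digits and dots only: resume point)
    # 'bookmarks_export.html' -> False (anything else: treat as import path)
    return arg.replace('.', '').isdigit()
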
def update_archive_data(import_path=None, resume=None):
    """The main ArchiveBox entrypoint. Everything starts here."""
    check_dependencies()

    # See if archive folder already exists
    for out_dir in (OUTPUT_DIR, 'bookmarks', 'pocket', 'pinboard', 'html'):
        if os.path.exists(out_dir):
            break
    else:
        out_dir = OUTPUT_DIR
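    # Note on the for/else above: Python runs a loop's else branch only when the
    # loop finishes without hitting `break`, so out_dir falls back to OUTPUT_DIR
    # only if none of the candidate folders exist. A tiny illustration:
    #
    #   for name in ('bookmarks', 'pocket'):
    #       if os.path.exists(name):
    #           break
    #   else:
    #       name = 'output'   # reached only when no folder above was found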
    # Step 1: Load list of links from the existing index
    # merge in and dedupe new links from import_path
    all_links, new_links = load_links_index(out_dir=OUTPUT_DIR, import_path=import_path)

    # Step 0: Download url to local file (only happens if a URL is specified instead of local path)
    if source and any(source.startswith(s) for s in ('http://', 'https://', 'ftp://')):
        source = save_remote_source(source)
    elif stdin_raw_text:
        source = save_stdin_source(stdin_raw_text)

    # Step 1: Parse the links and dedupe them with existing archive
    all_links, new_links = load_links(archive_path=out_dir, import_path=source)

    # Step 2: Write new index
    write_links_index(out_dir=out_dir, links=all_links)
    # Step 2: Write updated index with deduped old and new links back to disk
    write_links_index(out_dir=OUTPUT_DIR, links=all_links)

    # Step 3: Run the archive methods for each link
    if ONLY_NEW:
        update_archive(out_dir, new_links, source=source, resume=resume, append=True)
    else:
        update_archive(out_dir, all_links, source=source, resume=resume, append=True)
    links = new_links if ONLY_NEW else all_links
    log_archiving_started(len(links), resume)
    idx, link = 0, 0
    try:
        for idx, link in enumerate(links_after_timestamp(links, resume)):
            link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
            archive_link(link_dir, link)

    except KeyboardInterrupt:
        log_archiving_paused(len(links), idx, link and link['timestamp'])
        raise SystemExit(0)

    except:
        print()
        raise

    log_archiving_finished(len(links))

    # Step 4: Re-write links index with updated titles, icons, and resources
    all_links, _ = load_links(archive_path=out_dir)
    write_links_index(out_dir=out_dir, links=all_links, finished=True)
    all_links, _ = load_links_index(out_dir=OUTPUT_DIR)
    write_links_index(out_dir=OUTPUT_DIR, links=all_links, finished=True)

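# links_after_timestamp is imported from links.py and is not shown in this diff;
# based on how it is called above, a rough sketch of the assumed behavior (skip
# links until the resume timestamp is reached, or yield everything when there is
# no resume point) might look like:
def _links_after_timestamp_sketch(links, timestamp=None):
    if not timestamp:
        yield from links
        return
    for link in links:
        try:
            if float(link['timestamp']) >= float(timestamp):
                yield link
        except (ValueError, TypeError):
            # skip links whose timestamp can't be compared numerically
            continue
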
if __name__ == '__main__':
    main(*sys.argv)