Mirror of https://github.com/ArchiveBox/ArchiveBox.git (synced 2025-05-13 06:34:25 -04:00)
major codebase-wide code cleanups

parent: c806068683
commit: e6bd1f8ca8
8 changed files with 825 additions and 743 deletions
|
@ -1,225 +1,132 @@
|
||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
# ArchiveBox
|
"""
|
||||||
# Nick Sweeting 2017 | MIT License
|
ArchiveBox command line application.
|
||||||
# https://github.com/pirate/ArchiveBox
|
|
||||||
|
./archive and ./bin/archivebox both point to this file,
|
||||||
|
but you can also run it directly using `python3 archive.py`
|
||||||
|
|
||||||
|
Usage & Documentation:
|
||||||
|
https://github.com/pirate/ArchiveBox/Wiki
|
||||||
|
"""
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
from datetime import datetime
|
from links import links_after_timestamp
|
||||||
from peekable import Peekable
|
from index import write_links_index, load_links_index
|
||||||
|
from archive_methods import archive_link
|
||||||
|
|
||||||
from parse import parse_links
|
|
||||||
from links import validate_links, links_after_timestamp
|
|
||||||
from archive_methods import archive_link, _RESULTS_TOTALS
|
|
||||||
from index import (
|
|
||||||
write_links_index,
|
|
||||||
parse_json_links_index,
|
|
||||||
)
|
|
||||||
from config import (
|
from config import (
|
||||||
ARCHIVE_DIR,
|
ARCHIVE_DIR,
|
||||||
ONLY_NEW,
|
ONLY_NEW,
|
||||||
OUTPUT_DIR,
|
OUTPUT_DIR,
|
||||||
REPO_DIR,
|
|
||||||
ANSI,
|
|
||||||
GIT_SHA,
|
GIT_SHA,
|
||||||
)
|
)
|
||||||
from util import (
|
from util import (
|
||||||
check_dependencies,
|
check_dependencies,
|
||||||
save_remote_source,
|
save_remote_source,
|
||||||
save_stdin_source,
|
save_stdin_source,
|
||||||
pretty_path,
|
)
|
||||||
check_links_structure,
|
from logs import (
|
||||||
|
log_archiving_started,
|
||||||
|
log_archiving_paused,
|
||||||
|
log_archiving_finished,
|
||||||
)
|
)
|
||||||
|
|
||||||
__AUTHOR__ = 'Nick Sweeting <git@nicksweeting.com>'
|
__AUTHOR__ = 'Nick Sweeting <git@nicksweeting.com>'
|
||||||
__VERSION__ = GIT_SHA
|
__VERSION__ = GIT_SHA
|
||||||
__DESCRIPTION__ = 'ArchiveBox Usage: Create a browsable html archive of a list of links.'
|
__DESCRIPTION__ = 'ArchiveBox: The self-hosted internet archive.'
|
||||||
__DOCUMENTATION__ = 'https://github.com/pirate/ArchiveBox/wiki'
|
__DOCUMENTATION__ = 'https://github.com/pirate/ArchiveBox/wiki'
|
||||||
|
|
||||||
|
|
||||||
def print_help():
|
def print_help():
|
||||||
print(__DESCRIPTION__)
|
print('ArchiveBox: The self-hosted internet archive.\n')
|
||||||
print("Documentation: {}\n".format(__DOCUMENTATION__))
|
print("Documentation:")
|
||||||
|
print(" https://github.com/pirate/ArchiveBox/wiki\n")
|
||||||
print("Usage:")
|
print("Usage:")
|
||||||
print(" ./bin/archivebox ~/Downloads/bookmarks_export.html\n")
|
|
||||||
print("")
|
|
||||||
print(" ./bin/archivebox https://example.com/feed.rss\n")
|
|
||||||
print("")
|
|
||||||
print(" echo 'https://examplecom' | ./bin/archivebox\n")
|
print(" echo 'https://examplecom' | ./bin/archivebox\n")
|
||||||
|
print(" ./bin/archivebox ~/Downloads/bookmarks_export.html\n")
|
||||||
|
print(" ./bin/archivebox https://example.com/feed.rss\n")
|
||||||
|
print(" ./bin/archivebox 15109948213.123\n")
|
||||||
|
|
||||||
|
|
||||||
def load_links(archive_path=OUTPUT_DIR, import_path=None):
|
def main(*args):
|
||||||
"""get new links from file and optionally append them to links in existing archive"""
|
if set(args).intersection(('-h', '--help', 'help')) or len(args) > 2:
|
||||||
|
|
||||||
existing_links = []
|
|
||||||
if archive_path:
|
|
||||||
existing_links = parse_json_links_index(archive_path)
|
|
||||||
check_links_structure(existing_links)
|
|
||||||
|
|
||||||
new_links = []
|
|
||||||
if import_path:
|
|
||||||
# parse and validate the import file
|
|
||||||
raw_links, parser_name = parse_links(import_path)
|
|
||||||
new_links = validate_links(raw_links)
|
|
||||||
check_links_structure(new_links)
|
|
||||||
|
|
||||||
# merge existing links in archive_path and new links
|
|
||||||
all_links = validate_links(existing_links + new_links)
|
|
||||||
check_links_structure(all_links)
|
|
||||||
num_new_links = len(all_links) - len(existing_links)
|
|
||||||
|
|
||||||
if import_path and parser_name:
|
|
||||||
print(' > Adding {} new links to index (parsed import as {})'.format(
|
|
||||||
num_new_links,
|
|
||||||
parser_name,
|
|
||||||
))
|
|
||||||
|
|
||||||
return all_links, new_links
|
|
||||||
|
|
||||||
|
|
||||||
def update_archive(archive_path, links, source=None, resume=None, append=True):
|
|
||||||
"""update or create index.html+json given a path to an export file containing new links"""
|
|
||||||
|
|
||||||
start_ts = datetime.now().timestamp()
|
|
||||||
|
|
||||||
if resume:
|
|
||||||
print('{green}[▶] [{}] Resuming archive downloading from {}...{reset}'.format(
|
|
||||||
datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
|
|
||||||
resume,
|
|
||||||
**ANSI,
|
|
||||||
))
|
|
||||||
else:
|
|
||||||
print('{green}[▶] [{}] Updating content for {} pages in archive...{reset}'.format(
|
|
||||||
datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
|
|
||||||
len(links),
|
|
||||||
**ANSI,
|
|
||||||
))
|
|
||||||
|
|
||||||
check_links_structure(links)
|
|
||||||
|
|
||||||
# prefetch the first link off the generator so that if we pause or fail
|
|
||||||
# immediately we can show that we paused on the first link and not just None
|
|
||||||
to_archive = Peekable(links_after_timestamp(links, resume))
|
|
||||||
idx, link = 0, to_archive.peek(0)
|
|
||||||
|
|
||||||
# loop over links and archive them
|
|
||||||
try:
|
|
||||||
check_dependencies()
|
|
||||||
for idx, link in enumerate(to_archive):
|
|
||||||
link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
|
|
||||||
archive_link(link_dir, link)
|
|
||||||
|
|
||||||
except (KeyboardInterrupt, SystemExit, Exception) as e:
|
|
||||||
# if isinstance(e, KeyboardInterrupt):
|
|
||||||
# # Step 4: Re-write links index with updated titles, icons, and resources
|
|
||||||
# all_links, _ = load_links(archive_path=out_dir)
|
|
||||||
# write_links_index(out_dir=out_dir, links=all_links, finished=True)
|
|
||||||
print()
|
|
||||||
print('\n{lightyellow}[X] [{now}] Downloading paused on link {timestamp} ({idx}/{total}){reset}'.format(
|
|
||||||
**ANSI,
|
|
||||||
now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
|
|
||||||
idx=idx+1,
|
|
||||||
timestamp=link['timestamp'],
|
|
||||||
total=len(links),
|
|
||||||
))
|
|
||||||
print(' To view your archive, open: {}/index.html'.format(OUTPUT_DIR.replace(REPO_DIR + '/', '')))
|
|
||||||
print(' Continue where you left off by running:')
|
|
||||||
print(' {} {}'.format(
|
|
||||||
pretty_path(sys.argv[0]),
|
|
||||||
link['timestamp'],
|
|
||||||
))
|
|
||||||
if not isinstance(e, KeyboardInterrupt):
|
|
||||||
print()
|
|
||||||
raise e
|
|
||||||
raise SystemExit(1)
|
|
||||||
|
|
||||||
# print timing information & summary
|
|
||||||
end_ts = datetime.now().timestamp()
|
|
||||||
seconds = end_ts - start_ts
|
|
||||||
if seconds > 60:
|
|
||||||
duration = '{0:.2f} min'.format(seconds / 60, 2)
|
|
||||||
else:
|
|
||||||
duration = '{0:.2f} sec'.format(seconds, 2)
|
|
||||||
|
|
||||||
print('{}[√] [{}] Update of {} pages complete ({}){}'.format(
|
|
||||||
ANSI['green'],
|
|
||||||
datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
|
|
||||||
len(links),
|
|
||||||
duration,
|
|
||||||
ANSI['reset'],
|
|
||||||
))
|
|
||||||
print(' - {} entries skipped'.format(_RESULTS_TOTALS['skipped']))
|
|
||||||
print(' - {} entries updated'.format(_RESULTS_TOTALS['succeded']))
|
|
||||||
print(' - {} errors'.format(_RESULTS_TOTALS['failed']))
|
|
||||||
print(' To view your archive, open: {}/index.html'.format(OUTPUT_DIR.replace(REPO_DIR + '/', '')))
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
argc = len(sys.argv)
|
|
||||||
|
|
||||||
if set(sys.argv).intersection(('-h', '--help', 'help')):
|
|
||||||
print_help()
|
print_help()
|
||||||
raise SystemExit(0)
|
raise SystemExit(0)
|
||||||
|
|
||||||
source = sys.argv[1] if argc > 1 else None # path of links file to import
|
### Handle CLI arguments
|
||||||
resume = sys.argv[2] if argc > 2 else None # timestamp to resume downloading from
|
# ./archive bookmarks.html
|
||||||
|
# ./archive 1523422111.234
|
||||||
stdin_raw_text = ''
|
import_path, resume = None, None
|
||||||
|
if len(args) == 2:
|
||||||
|
# if the argument is a string, it's an import_path file to import
|
||||||
|
# if it's a number, it's a timestamp to resume archiving from
|
||||||
|
if args[1].replace('.', '').isdigit():
|
||||||
|
import_path, resume = None, args[1]
|
||||||
|
else:
|
||||||
|
import_path, resume = args[1], None
|
||||||
|
|
||||||
|
### Set up output folder
|
||||||
|
if not os.path.exists(OUTPUT_DIR):
|
||||||
|
os.makedirs(OUTPUT_DIR)
|
||||||
|
|
||||||
|
### Handle ingesting urls piped in through stdin
|
||||||
|
# (e.g. if user does cat example_urls.txt | ./archive)
|
||||||
if not sys.stdin.isatty():
|
if not sys.stdin.isatty():
|
||||||
stdin_raw_text = sys.stdin.read()
|
stdin_raw_text = sys.stdin.read()
|
||||||
|
if stdin_raw_text and import_path:
|
||||||
|
print(
|
||||||
|
'[X] You should pass either a path as an argument, '
|
||||||
|
'or pass a list of links via stdin, but not both.\n'
|
||||||
|
)
|
||||||
|
print_help()
|
||||||
|
raise SystemExit(1)
|
||||||
|
|
||||||
if source and stdin_raw_text:
|
import_path = save_stdin_source(stdin_raw_text)
|
||||||
print(
|
|
||||||
'[X] You should pass either a path as an argument, '
|
### Handle ingesting urls from a remote file/feed
|
||||||
'or pass a list of links via stdin, but not both.\n'
|
# (e.g. if an RSS feed URL is used as the import path)
|
||||||
)
|
if import_path and any(import_path.startswith(s) for s in ('http://', 'https://', 'ftp://')):
|
||||||
print_help()
|
import_path = save_remote_source(import_path)
|
||||||
raise SystemExit(1)
|
|
||||||
|
### Run the main archive update process
|
||||||
|
update_archive_data(import_path=import_path, resume=resume)
|
||||||
|
|
||||||
|
|
||||||
if argc == 1:
|
def update_archive_data(import_path=None, resume=None):
|
||||||
source, resume = None, None
|
"""The main ArchiveBox entrancepoint. Everything starts here."""
|
||||||
elif argc == 2:
|
check_dependencies()
|
||||||
if all(d.isdigit() for d in sys.argv[1].split('.')):
|
|
||||||
# argv[1] is a resume timestamp
|
|
||||||
source, resume = None, sys.argv[1]
|
|
||||||
else:
|
|
||||||
# argv[1] is a path to a file to import
|
|
||||||
source, resume = sys.argv[1].strip(), None
|
|
||||||
elif argc == 3:
|
|
||||||
source, resume = sys.argv[1].strip(), sys.argv[2]
|
|
||||||
else:
|
|
||||||
print_help()
|
|
||||||
raise SystemExit(1)
|
|
||||||
|
|
||||||
# See if archive folder already exists
|
# Step 1: Load list of links from the existing index
|
||||||
for out_dir in (OUTPUT_DIR, 'bookmarks', 'pocket', 'pinboard', 'html'):
|
# merge in and dedupe new links from import_path
|
||||||
if os.path.exists(out_dir):
|
all_links, new_links = load_links_index(out_dir=OUTPUT_DIR, import_path=import_path)
|
||||||
break
|
|
||||||
else:
|
|
||||||
out_dir = OUTPUT_DIR
|
|
||||||
|
|
||||||
# Step 0: Download url to local file (only happens if a URL is specified instead of local path)
|
# Step 2: Write updated index with deduped old and new links back to disk
|
||||||
if source and any(source.startswith(s) for s in ('http://', 'https://', 'ftp://')):
|
write_links_index(out_dir=OUTPUT_DIR, links=all_links)
|
||||||
source = save_remote_source(source)
|
|
||||||
elif stdin_raw_text:
|
|
||||||
source = save_stdin_source(stdin_raw_text)
|
|
||||||
|
|
||||||
# Step 1: Parse the links and dedupe them with existing archive
|
|
||||||
all_links, new_links = load_links(archive_path=out_dir, import_path=source)
|
|
||||||
|
|
||||||
# Step 2: Write new index
|
|
||||||
write_links_index(out_dir=out_dir, links=all_links)
|
|
||||||
|
|
||||||
# Step 3: Run the archive methods for each link
|
# Step 3: Run the archive methods for each link
|
||||||
if ONLY_NEW:
|
links = new_links if ONLY_NEW else all_links
|
||||||
update_archive(out_dir, new_links, source=source, resume=resume, append=True)
|
log_archiving_started(len(links), resume)
|
||||||
else:
|
idx, link = 0, 0
|
||||||
update_archive(out_dir, all_links, source=source, resume=resume, append=True)
|
try:
|
||||||
|
for idx, link in enumerate(links_after_timestamp(links, resume)):
|
||||||
|
link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
|
||||||
|
archive_link(link_dir, link)
|
||||||
|
|
||||||
|
except KeyboardInterrupt:
|
||||||
|
log_archiving_paused(len(links), idx, link and link['timestamp'])
|
||||||
|
raise SystemExit(0)
|
||||||
|
|
||||||
|
except:
|
||||||
|
print()
|
||||||
|
raise
|
||||||
|
|
||||||
|
log_archiving_finished(len(links))
|
||||||
|
|
||||||
# Step 4: Re-write links index with updated titles, icons, and resources
|
# Step 4: Re-write links index with updated titles, icons, and resources
|
||||||
all_links, _ = load_links(archive_path=out_dir)
|
all_links, _ = load_links_index(out_dir=OUTPUT_DIR)
|
||||||
write_links_index(out_dir=out_dir, links=all_links, finished=True)
|
write_links_index(out_dir=OUTPUT_DIR, links=all_links, finished=True)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
main(*sys.argv)
|
||||||
|
|
|
@ -3,18 +3,18 @@ import os
|
||||||
from functools import wraps
|
from functools import wraps
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
from stdlib_patches import run, PIPE, DEVNULL
|
||||||
|
|
||||||
from index import (
|
from index import (
|
||||||
parse_json_link_index,
|
|
||||||
write_link_index,
|
write_link_index,
|
||||||
update_main_index,
|
patch_links_index,
|
||||||
|
load_json_link_index,
|
||||||
)
|
)
|
||||||
from config import (
|
from config import (
|
||||||
CURL_BINARY,
|
CURL_BINARY,
|
||||||
GIT_BINARY,
|
GIT_BINARY,
|
||||||
WGET_BINARY,
|
WGET_BINARY,
|
||||||
YOUTUBEDL_BINARY,
|
YOUTUBEDL_BINARY,
|
||||||
CHROME_BINARY,
|
|
||||||
FETCH_FAVICON,
|
FETCH_FAVICON,
|
||||||
FETCH_TITLE,
|
FETCH_TITLE,
|
||||||
FETCH_WGET,
|
FETCH_WGET,
|
||||||
|
@ -25,62 +25,37 @@ from config import (
|
||||||
FETCH_WARC,
|
FETCH_WARC,
|
||||||
FETCH_GIT,
|
FETCH_GIT,
|
||||||
FETCH_MEDIA,
|
FETCH_MEDIA,
|
||||||
RESOLUTION,
|
|
||||||
CHECK_SSL_VALIDITY,
|
|
||||||
SUBMIT_ARCHIVE_DOT_ORG,
|
SUBMIT_ARCHIVE_DOT_ORG,
|
||||||
COOKIES_FILE,
|
|
||||||
WGET_USER_AGENT,
|
|
||||||
CHROME_USER_AGENT,
|
|
||||||
CHROME_USER_DATA_DIR,
|
|
||||||
CHROME_HEADLESS,
|
|
||||||
CHROME_SANDBOX,
|
|
||||||
TIMEOUT,
|
TIMEOUT,
|
||||||
MEDIA_TIMEOUT,
|
MEDIA_TIMEOUT,
|
||||||
ANSI,
|
ANSI,
|
||||||
ARCHIVE_DIR,
|
OUTPUT_DIR,
|
||||||
GIT_DOMAINS,
|
GIT_DOMAINS,
|
||||||
GIT_SHA,
|
GIT_SHA,
|
||||||
|
WGET_USER_AGENT,
|
||||||
|
CHECK_SSL_VALIDITY,
|
||||||
|
COOKIES_FILE,
|
||||||
)
|
)
|
||||||
from util import (
|
from util import (
|
||||||
domain,
|
domain,
|
||||||
|
extension,
|
||||||
without_query,
|
without_query,
|
||||||
without_fragment,
|
without_fragment,
|
||||||
fetch_page_title,
|
fetch_page_title,
|
||||||
is_static_file,
|
is_static_file,
|
||||||
progress,
|
progress,
|
||||||
chmod_file,
|
chmod_file,
|
||||||
pretty_path,
|
|
||||||
print_error_hints,
|
|
||||||
check_link_structure,
|
check_link_structure,
|
||||||
wget_output_path,
|
wget_output_path,
|
||||||
run, PIPE, DEVNULL,
|
chrome_args,
|
||||||
|
)
|
||||||
|
from logs import (
|
||||||
|
_LAST_RUN_STATS,
|
||||||
|
log_link_archiving_started,
|
||||||
|
log_link_archiving_failed,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
_RESULTS_TOTALS = { # globals are bad, mmkay
|
|
||||||
'skipped': 0,
|
|
||||||
'succeded': 0,
|
|
||||||
'failed': 0,
|
|
||||||
}
|
|
||||||
|
|
||||||
def load_link_index(link_dir, link):
|
|
||||||
"""check for an existing link archive in the given directory,
|
|
||||||
and load+merge it into the given link dict
|
|
||||||
"""
|
|
||||||
is_new = not os.path.exists(link_dir)
|
|
||||||
if is_new:
|
|
||||||
os.makedirs(link_dir)
|
|
||||||
else:
|
|
||||||
link = {
|
|
||||||
**parse_json_link_index(link_dir),
|
|
||||||
**link,
|
|
||||||
}
|
|
||||||
|
|
||||||
check_link_structure(link)
|
|
||||||
print_link_status_line(link_dir, link, is_new)
|
|
||||||
|
|
||||||
return link
|
|
||||||
|
|
||||||
|
|
||||||
class ArchiveError(Exception):
|
class ArchiveError(Exception):
|
||||||
def __init__(self, message, hints=None):
|
def __init__(self, message, hints=None):
|
||||||
|
@ -105,32 +80,24 @@ def archive_link(link_dir, link, overwrite=True):
|
||||||
active_methods = [method for toggle, method in ARCHIVE_METHODS if toggle]
|
active_methods = [method for toggle, method in ARCHIVE_METHODS if toggle]
|
||||||
|
|
||||||
try:
|
try:
|
||||||
link = load_link_index(link_dir, link)
|
is_new = not os.path.exists(link_dir)
|
||||||
|
if is_new:
|
||||||
|
os.makedirs(link_dir)
|
||||||
|
|
||||||
|
link = load_json_link_index(link_dir, link)
|
||||||
|
log_link_archiving_started(link_dir, link, is_new)
|
||||||
|
|
||||||
for archive_method in active_methods:
|
for archive_method in active_methods:
|
||||||
archive_method(link_dir, link, overwrite=overwrite)
|
archive_method(link_dir, link, overwrite=overwrite)
|
||||||
|
|
||||||
|
|
||||||
write_link_index(link_dir, link)
|
write_link_index(link_dir, link)
|
||||||
update_main_index(link)
|
patch_links_index(link)
|
||||||
|
|
||||||
except Exception as err:
|
except Exception as err:
|
||||||
print(' ! Failed to archive link: {}: {}'.format(err.__class__.__name__, err))
|
print(' ! Failed to archive link: {}: {}'.format(err.__class__.__name__, err))
|
||||||
|
|
||||||
return link
|
return link
|
||||||
|
|
||||||
def print_link_status_line(link_dir, link, is_new):
|
|
||||||
print('[{symbol_color}{symbol}{reset}] [{now}] "{title}"\n {blue}{url}{reset}'.format(
|
|
||||||
symbol='+' if is_new else '*',
|
|
||||||
symbol_color=ANSI['green' if is_new else 'black'],
|
|
||||||
now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
|
|
||||||
**{**link, 'title': link['title'] or link['url']},
|
|
||||||
**ANSI,
|
|
||||||
))
|
|
||||||
|
|
||||||
print(' > {}{}'.format(pretty_path(link_dir), ' (new)' if is_new else ''))
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def attach_result_to_link(method):
|
def attach_result_to_link(method):
|
||||||
"""
|
"""
|
||||||
|
@ -178,15 +145,75 @@ def attach_result_to_link(method):
|
||||||
link['history'][method].append(history_entry)
|
link['history'][method].append(history_entry)
|
||||||
link['latest'][method] = result['output']
|
link['latest'][method] = result['output']
|
||||||
|
|
||||||
_RESULTS_TOTALS[history_entry['status']] += 1
|
_LAST_RUN_STATS[history_entry['status']] += 1
|
||||||
|
|
||||||
return link
|
return link
|
||||||
return timed_fetch_func
|
return timed_fetch_func
|
||||||
return decorator
|
return decorator
|
||||||
|
|
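
The hunk above only shows the tail of the attach_result_to_link decorator (the history/latest bookkeeping and the _LAST_RUN_STATS counter bump). The sketch below is a minimal, self-contained approximation of that pattern, not the actual ArchiveBox implementation; the timing fields and the stand-in RUN_STATS dict are assumptions, while the 'succeded' spelling follows the keys used in this commit.

from datetime import datetime

# Stand-in for logs._LAST_RUN_STATS; the 'succeded' spelling matches this commit's keys.
RUN_STATS = {'skipped': 0, 'succeded': 0, 'failed': 0}

def attach_result_to_link(method):
    """Decorator factory: run a fetcher, then record its result on the link dict."""
    def decorator(fetch_func):
        def timed_fetch_func(link_dir, link, **kwargs):
            start_ts = datetime.now().timestamp()
            result = fetch_func(link_dir, link, **kwargs)
            end_ts = datetime.now().timestamp()

            # Derive a status if the fetcher didn't report one explicitly (assumption).
            status = result.get('status') or ('failed' if isinstance(result.get('output'), Exception) else 'succeded')
            history_entry = {'start_ts': start_ts, 'end_ts': end_ts, 'status': status, **result}

            link.setdefault('history', {}).setdefault(method, []).append(history_entry)
            link.setdefault('latest', {})[method] = result.get('output')
            RUN_STATS[status] += 1
            return link
        return timed_fetch_func
    return decorator

@attach_result_to_link('title')
def fetch_title_stub(link_dir, link, timeout=60):
    # dummy fetcher used only to exercise the decorator
    return {'output': 'Example Title', 'status': 'succeded'}

link = {'url': 'https://example.com', 'title': None}
fetch_title_stub('/tmp/example', link)
print(link['latest']['title'], RUN_STATS['succeded'])  # Example Title 1
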
||||||
|
@attach_result_to_link('title')
|
||||||
|
def fetch_title(link_dir, link, timeout=TIMEOUT):
|
||||||
|
"""try to guess the page's title from its content"""
|
||||||
|
|
||||||
|
# if link already has valid title, skip it
|
||||||
|
if link['title'] and not link['title'].lower().startswith('http'):
|
||||||
|
return {'output': link['title'], 'status': 'skipped'}
|
||||||
|
|
||||||
|
if is_static_file(link['url']):
|
||||||
|
return {'output': None, 'status': 'skipped'}
|
||||||
|
|
||||||
|
end = progress(timeout, prefix=' ')
|
||||||
|
try:
|
||||||
|
title = fetch_page_title(link['url'], timeout=timeout, progress=False)
|
||||||
|
end()
|
||||||
|
output = title
|
||||||
|
except Exception as e:
|
||||||
|
end()
|
||||||
|
output = e
|
||||||
|
print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
|
||||||
|
|
||||||
|
if title and title.strip():
|
||||||
|
link['title'] = title
|
||||||
|
output = title
|
||||||
|
|
||||||
|
return {
|
||||||
|
'cmd': 'fetch_page_title("{}")'.format(link['url']),
|
||||||
|
'output': output,
|
||||||
|
}
|
||||||
|
|
||||||
|
@attach_result_to_link('favicon')
|
||||||
|
def fetch_favicon(link_dir, link, timeout=TIMEOUT):
|
||||||
|
"""download site favicon from google's favicon api"""
|
||||||
|
|
||||||
|
output = 'favicon.ico'
|
||||||
|
if os.path.exists(os.path.join(link_dir, output)):
|
||||||
|
return {'output': output, 'status': 'skipped'}
|
||||||
|
|
||||||
|
CMD = [
|
||||||
|
CURL_BINARY,
|
||||||
|
'--max-time', str(timeout),
|
||||||
|
'--location',
|
||||||
|
'--output', output,
|
||||||
|
*(() if CHECK_SSL_VALIDITY else ('--insecure',)),
|
||||||
|
'https://www.google.com/s2/favicons?domain={}'.format(domain(link['url'])),
|
||||||
|
]
|
||||||
|
end = progress(timeout, prefix=' ')
|
||||||
|
try:
|
||||||
|
run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)
|
||||||
|
end()
|
||||||
|
chmod_file(output, cwd=link_dir)
|
||||||
|
except Exception as e:
|
||||||
|
end()
|
||||||
|
output = e
|
||||||
|
print_error_hints(cmd=CMD, pwd=link_dir, err=e)
|
||||||
|
|
||||||
|
return {
|
||||||
|
'cmd': CMD,
|
||||||
|
'output': output,
|
||||||
|
}
|
||||||
|
|
||||||
@attach_result_to_link('wget')
|
@attach_result_to_link('wget')
|
||||||
def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, warc=FETCH_WARC, timeout=TIMEOUT):
|
def fetch_wget(link_dir, link, timeout=TIMEOUT):
|
||||||
"""download full site using wget"""
|
"""download full site using wget"""
|
||||||
|
|
||||||
domain_dir = os.path.join(link_dir, domain(link['url']))
|
domain_dir = os.path.join(link_dir, domain(link['url']))
|
||||||
|
@ -194,7 +221,7 @@ def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, warc=FETCH_WARC
|
||||||
if os.path.exists(domain_dir) and existing_file:
|
if os.path.exists(domain_dir) and existing_file:
|
||||||
return {'output': existing_file, 'status': 'skipped'}
|
return {'output': existing_file, 'status': 'skipped'}
|
||||||
|
|
||||||
if warc:
|
if FETCH_WARC:
|
||||||
warc_dir = os.path.join(link_dir, 'warc')
|
warc_dir = os.path.join(link_dir, 'warc')
|
||||||
os.makedirs(warc_dir, exist_ok=True)
|
os.makedirs(warc_dir, exist_ok=True)
|
||||||
warc_path = os.path.join('warc', str(int(datetime.now().timestamp())))
|
warc_path = os.path.join('warc', str(int(datetime.now().timestamp())))
|
||||||
|
@ -213,8 +240,8 @@ def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, warc=FETCH_WARC
|
||||||
'-e', 'robots=off',
|
'-e', 'robots=off',
|
||||||
'--restrict-file-names=unix',
|
'--restrict-file-names=unix',
|
||||||
'--timeout={}'.format(timeout),
|
'--timeout={}'.format(timeout),
|
||||||
*(() if warc else ('--timestamping',)),
|
*(() if FETCH_WARC else ('--timestamping',)),
|
||||||
*(('--warc-file={}'.format(warc_path),) if warc else ()),
|
*(('--warc-file={}'.format(warc_path),) if FETCH_WARC else ()),
|
||||||
*(('--page-requisites',) if FETCH_WGET_REQUISITES else ()),
|
*(('--page-requisites',) if FETCH_WGET_REQUISITES else ()),
|
||||||
*(('--user-agent={}'.format(WGET_USER_AGENT),) if WGET_USER_AGENT else ()),
|
*(('--user-agent={}'.format(WGET_USER_AGENT),) if WGET_USER_AGENT else ()),
|
||||||
*(('--load-cookies', COOKIES_FILE) if COOKIES_FILE else ()),
|
*(('--load-cookies', COOKIES_FILE) if COOKIES_FILE else ()),
|
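
For readers unfamiliar with the *(... if ... else ()) idiom used throughout the wget command above, here is a minimal standalone sketch of how those conditional tuple-splats assemble an argument list; the toggle values below are placeholders, not ArchiveBox configuration.

# Flags appear in the final command only when their toggle is truthy.
FETCH_WARC = True
WGET_USER_AGENT = 'ArchiveBox/0.x'
COOKIES_FILE = None

cmd = [
    'wget',
    '--timeout=60',
    *(() if FETCH_WARC else ('--timestamping',)),
    *(('--warc-file=warc/123',) if FETCH_WARC else ()),
    *(('--user-agent={}'.format(WGET_USER_AGENT),) if WGET_USER_AGENT else ()),
    *(('--load-cookies', COOKIES_FILE) if COOKIES_FILE else ()),
    'https://example.com',
]
print(cmd)  # COOKIES_FILE is None, so no --load-cookies pair is included
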
||||||
|
@ -233,7 +260,8 @@ def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, warc=FETCH_WARC
|
||||||
if line.strip()
|
if line.strip()
|
||||||
]
|
]
|
||||||
|
|
||||||
# parse out number of files downloaded from "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
|
# parse out number of files downloaded from last line of stderr:
|
||||||
|
# "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
|
||||||
files_downloaded = (
|
files_downloaded = (
|
||||||
int(output_tail[-1].strip().split(' ', 2)[1] or 0)
|
int(output_tail[-1].strip().split(' ', 2)[1] or 0)
|
||||||
if 'Downloaded:' in output_tail[-1]
|
if 'Downloaded:' in output_tail[-1]
|
||||||
|
@ -263,20 +291,19 @@ def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, warc=FETCH_WARC
|
||||||
'output': output,
|
'output': output,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@attach_result_to_link('pdf')
|
@attach_result_to_link('pdf')
|
||||||
def fetch_pdf(link_dir, link, timeout=TIMEOUT, **chrome_kwargs):
|
def fetch_pdf(link_dir, link, timeout=TIMEOUT):
|
||||||
"""print PDF of site to file using chrome --headless"""
|
"""print PDF of site to file using chrome --headless"""
|
||||||
|
|
||||||
if is_static_file(link['url']):
|
if is_static_file(link['url']):
|
||||||
return {'output': wget_output_path(link), 'status': 'skipped'}
|
return {'output': None, 'status': 'skipped'}
|
||||||
|
|
||||||
output = 'output.pdf'
|
output = 'output.pdf'
|
||||||
if os.path.exists(os.path.join(link_dir, output)):
|
if os.path.exists(os.path.join(link_dir, output)):
|
||||||
return {'output': output, 'status': 'skipped'}
|
return {'output': output, 'status': 'skipped'}
|
||||||
|
|
||||||
CMD = [
|
CMD = [
|
||||||
*chrome_headless(timeout=timeout, **chrome_kwargs),
|
*chrome_args(timeout=timeout),
|
||||||
'--print-to-pdf',
|
'--print-to-pdf',
|
||||||
link['url']
|
link['url']
|
||||||
]
|
]
|
||||||
|
@ -302,18 +329,18 @@ def fetch_pdf(link_dir, link, timeout=TIMEOUT, **chrome_kwargs):
|
||||||
}
|
}
|
||||||
|
|
||||||
@attach_result_to_link('screenshot')
|
@attach_result_to_link('screenshot')
|
||||||
def fetch_screenshot(link_dir, link, timeout=TIMEOUT, **chrome_kwargs):
|
def fetch_screenshot(link_dir, link, timeout=TIMEOUT):
|
||||||
"""take screenshot of site using chrome --headless"""
|
"""take screenshot of site using chrome --headless"""
|
||||||
|
|
||||||
if is_static_file(link['url']):
|
if is_static_file(link['url']):
|
||||||
return {'output': wget_output_path(link), 'status': 'skipped'}
|
return {'output': None, 'status': 'skipped'}
|
||||||
|
|
||||||
output = 'screenshot.png'
|
output = 'screenshot.png'
|
||||||
if os.path.exists(os.path.join(link_dir, output)):
|
if os.path.exists(os.path.join(link_dir, output)):
|
||||||
return {'output': output, 'status': 'skipped'}
|
return {'output': output, 'status': 'skipped'}
|
||||||
|
|
||||||
CMD = [
|
CMD = [
|
||||||
*chrome_headless(timeout=timeout, **chrome_kwargs),
|
*chrome_args(timeout=timeout),
|
||||||
'--screenshot',
|
'--screenshot',
|
||||||
link['url'],
|
link['url'],
|
||||||
]
|
]
|
||||||
|
@ -337,18 +364,19 @@ def fetch_screenshot(link_dir, link, timeout=TIMEOUT, **chrome_kwargs):
|
||||||
}
|
}
|
||||||
|
|
||||||
@attach_result_to_link('dom')
|
@attach_result_to_link('dom')
|
||||||
def fetch_dom(link_dir, link, timeout=TIMEOUT, **chrome_kwargs):
|
def fetch_dom(link_dir, link, timeout=TIMEOUT):
|
||||||
"""print HTML of site to file using chrome --dump-html"""
|
"""print HTML of site to file using chrome --dump-html"""
|
||||||
|
|
||||||
if is_static_file(link['url']):
|
if is_static_file(link['url']):
|
||||||
return {'output': wget_output_path(link), 'status': 'skipped'}
|
return {'output': None, 'status': 'skipped'}
|
||||||
|
|
||||||
output = 'output.html'
|
output = 'output.html'
|
||||||
if os.path.exists(os.path.join(link_dir, output)):
|
output_path = os.path.join(link_dir, output)
|
||||||
|
if os.path.exists(output_path):
|
||||||
return {'output': output, 'status': 'skipped'}
|
return {'output': output, 'status': 'skipped'}
|
||||||
|
|
||||||
CMD = [
|
CMD = [
|
||||||
*chrome_headless(timeout=timeout, **chrome_kwargs),
|
*chrome_args(timeout=timeout),
|
||||||
'--dump-dom',
|
'--dump-dom',
|
||||||
link['url']
|
link['url']
|
||||||
]
|
]
|
||||||
|
@ -372,6 +400,116 @@ def fetch_dom(link_dir, link, timeout=TIMEOUT, **chrome_kwargs):
|
||||||
'output': output,
|
'output': output,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@attach_result_to_link('git')
|
||||||
|
def fetch_git(link_dir, link, timeout=TIMEOUT):
|
||||||
|
"""download full site using git"""
|
||||||
|
|
||||||
|
is_clonable_url = (
|
||||||
|
domain(link['url']) in GIT_DOMAINS
|
||||||
|
or extension(link['url']) == 'git'
|
||||||
|
)
|
||||||
|
if is_static_file(link['url']) or not is_clonable_url:
|
||||||
|
return {'output': None, 'status': 'skipped'}
|
||||||
|
|
||||||
|
output = 'git'
|
||||||
|
output_path = os.path.join(link_dir, 'git')
|
||||||
|
|
||||||
|
if os.path.exists(output_path):
|
||||||
|
return {'output': output, 'status': 'skipped'}
|
||||||
|
|
||||||
|
os.makedirs(output_path, exist_ok=True)
|
||||||
|
CMD = [
|
||||||
|
GIT_BINARY,
|
||||||
|
'clone',
|
||||||
|
'--mirror',
|
||||||
|
'--recursive',
|
||||||
|
*(() if CHECK_SSL_VALIDITY else ('-c', 'http.sslVerify=false')),
|
||||||
|
without_query(without_fragment(link['url'])),
|
||||||
|
]
|
||||||
|
end = progress(timeout, prefix=' ')
|
||||||
|
try:
|
||||||
|
result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)
|
||||||
|
end()
|
||||||
|
|
||||||
|
if result.returncode == 128:
|
||||||
|
# ignore failed re-download when the folder already exists
|
||||||
|
pass
|
||||||
|
elif result.returncode > 0:
|
||||||
|
hints = 'got git response code {}:'.format(result.returncode)
|
||||||
|
raise ArchiveError('Failed git download', hints)
|
||||||
|
except Exception as e:
|
||||||
|
end()
|
||||||
|
output = e
|
||||||
|
print_error_hints(cmd=CMD, pwd=link_dir, err=e)
|
||||||
|
|
||||||
|
return {
|
||||||
|
'cmd': CMD,
|
||||||
|
'output': output,
|
||||||
|
}
|
||||||
|
|
||||||
|
@attach_result_to_link('media')
|
||||||
|
def fetch_media(link_dir, link, timeout=MEDIA_TIMEOUT, overwrite=False):
|
||||||
|
"""Download playlists or individual video, audio, and subtitles using youtube-dl"""
|
||||||
|
|
||||||
|
output = 'media'
|
||||||
|
output_path = os.path.join(link_dir, 'media')
|
||||||
|
|
||||||
|
if os.path.exists(output_path) and not overwrite:
|
||||||
|
return {'output': output, 'status': 'skipped'}
|
||||||
|
|
||||||
|
os.makedirs(output_path, exist_ok=True)
|
||||||
|
CMD = [
|
||||||
|
YOUTUBEDL_BINARY,
|
||||||
|
'--write-description',
|
||||||
|
'--write-info-json',
|
||||||
|
'--write-annotations',
|
||||||
|
'--yes-playlist',
|
||||||
|
'--write-thumbnail',
|
||||||
|
'--no-call-home',
|
||||||
|
'--no-check-certificate',
|
||||||
|
'--user-agent',
|
||||||
|
'--all-subs',
|
||||||
|
'--extract-audio',
|
||||||
|
'--keep-video',
|
||||||
|
'--ignore-errors',
|
||||||
|
'--geo-bypass',
|
||||||
|
'--audio-format', 'mp3',
|
||||||
|
'--audio-quality', '320K',
|
||||||
|
'--embed-thumbnail',
|
||||||
|
'--add-metadata',
|
||||||
|
*(() if CHECK_SSL_VALIDITY else ('--no-check-certificate',)),
|
||||||
|
link['url'],
|
||||||
|
]
|
||||||
|
|
||||||
|
end = progress(timeout, prefix=' ')
|
||||||
|
try:
|
||||||
|
result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)
|
||||||
|
chmod_file(output, cwd=link_dir)
|
||||||
|
end()
|
||||||
|
if result.returncode:
|
||||||
|
if (b'ERROR: Unsupported URL' in result.stderr
|
||||||
|
or b'HTTP Error 404' in result.stderr
|
||||||
|
or b'HTTP Error 403' in result.stderr
|
||||||
|
or b'URL could be a direct video link' in result.stderr
|
||||||
|
or b'Unable to extract container ID' in result.stderr):
|
||||||
|
# These happen too frequently on non-media pages to warrant printing to console
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
hints = (
|
||||||
|
'got youtubedl response code {}:'.format(result.returncode),
|
||||||
|
*result.stderr.decode().split('\n'),
|
||||||
|
)
|
||||||
|
raise ArchiveError('Failed to download media', hints)
|
||||||
|
except Exception as e:
|
||||||
|
end()
|
||||||
|
output = e
|
||||||
|
print_error_hints(cmd=CMD, pwd=link_dir, err=e)
|
||||||
|
|
||||||
|
return {
|
||||||
|
'cmd': CMD,
|
||||||
|
'output': output,
|
||||||
|
}
|
||||||
|
|
||||||
def parse_archive_dot_org_response(response):
|
def parse_archive_dot_org_response(response):
|
||||||
# Parse archive.org response headers
|
# Parse archive.org response headers
|
||||||
headers = defaultdict(list)
|
headers = defaultdict(list)
|
||||||
|
@ -445,226 +583,4 @@ def archive_dot_org(link_dir, link, timeout=TIMEOUT):
|
||||||
'output': output,
|
'output': output,
|
||||||
}
|
}
|
||||||
|
|
||||||
@attach_result_to_link('favicon')
|
|
||||||
def fetch_favicon(link_dir, link, timeout=TIMEOUT):
|
|
||||||
"""download site favicon from google's favicon api"""
|
|
||||||
|
|
||||||
output = 'favicon.ico'
|
|
||||||
if os.path.exists(os.path.join(link_dir, output)):
|
|
||||||
return {'output': output, 'status': 'skipped'}
|
|
||||||
|
|
||||||
CMD = [
|
|
||||||
CURL_BINARY,
|
|
||||||
'--max-time', str(timeout),
|
|
||||||
'--location',
|
|
||||||
'--output', output,
|
|
||||||
*(() if CHECK_SSL_VALIDITY else ('--insecure',)),
|
|
||||||
'https://www.google.com/s2/favicons?domain={}'.format(domain(link['url'])),
|
|
||||||
]
|
|
||||||
end = progress(timeout, prefix=' ')
|
|
||||||
try:
|
|
||||||
run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)
|
|
||||||
end()
|
|
||||||
chmod_file(output, cwd=link_dir)
|
|
||||||
except Exception as e:
|
|
||||||
end()
|
|
||||||
output = e
|
|
||||||
print_error_hints(cmd=CMD, pwd=link_dir, err=e)
|
|
||||||
|
|
||||||
return {
|
|
||||||
'cmd': CMD,
|
|
||||||
'output': output,
|
|
||||||
}
|
|
||||||
|
|
||||||
@attach_result_to_link('title')
|
|
||||||
def fetch_title(link_dir, link, timeout=TIMEOUT):
|
|
||||||
"""try to guess the page's title from its content"""
|
|
||||||
|
|
||||||
# if link already has valid title, skip it
|
|
||||||
if link['title'] and not link['title'].lower().startswith('http'):
|
|
||||||
return {'output': link['title'], 'status': 'skipped'}
|
|
||||||
|
|
||||||
if is_static_file(link['url']):
|
|
||||||
return {'output': None, 'status': 'skipped'}
|
|
||||||
|
|
||||||
end = progress(timeout, prefix=' ')
|
|
||||||
try:
|
|
||||||
title = fetch_page_title(link['url'], timeout=timeout, progress=False)
|
|
||||||
end()
|
|
||||||
output = title
|
|
||||||
except Exception as e:
|
|
||||||
end()
|
|
||||||
output = e
|
|
||||||
print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
|
|
||||||
|
|
||||||
if title and title.strip():
|
|
||||||
link['title'] = title
|
|
||||||
output = title
|
|
||||||
|
|
||||||
return {
|
|
||||||
'cmd': 'fetch_page_title("{}")'.format(link['url']),
|
|
||||||
'output': output,
|
|
||||||
}
|
|
||||||
|
|
||||||
@attach_result_to_link('media')
|
|
||||||
def fetch_media(link_dir, link, timeout=MEDIA_TIMEOUT, overwrite=False):
|
|
||||||
"""Download playlists or individual video, audio, and subtitles using youtube-dl"""
|
|
||||||
|
|
||||||
output = 'media'
|
|
||||||
output_path = os.path.join(link_dir, 'media')
|
|
||||||
|
|
||||||
if os.path.exists(output_path) and not overwrite:
|
|
||||||
return {'output': output, 'status': 'skipped'}
|
|
||||||
|
|
||||||
os.makedirs(output_path, exist_ok=True)
|
|
||||||
CMD = [
|
|
||||||
YOUTUBEDL_BINARY,
|
|
||||||
'--write-description',
|
|
||||||
'--write-info-json',
|
|
||||||
'--write-annotations',
|
|
||||||
'--yes-playlist',
|
|
||||||
'--write-thumbnail',
|
|
||||||
'--no-call-home',
|
|
||||||
'--no-check-certificate',
|
|
||||||
'--user-agent',
|
|
||||||
'--all-subs',
|
|
||||||
'--extract-audio',
|
|
||||||
'--keep-video',
|
|
||||||
'--ignore-errors',
|
|
||||||
'--geo-bypass',
|
|
||||||
'--audio-format', 'mp3',
|
|
||||||
'--audio-quality', '320K',
|
|
||||||
'--embed-thumbnail',
|
|
||||||
'--add-metadata',
|
|
||||||
*(() if CHECK_SSL_VALIDITY else ('--no-check-certificate',)),
|
|
||||||
link['url'],
|
|
||||||
]
|
|
||||||
|
|
||||||
end = progress(timeout, prefix=' ')
|
|
||||||
try:
|
|
||||||
result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)
|
|
||||||
chmod_file(output, cwd=link_dir)
|
|
||||||
end()
|
|
||||||
if result.returncode:
|
|
||||||
if (b'ERROR: Unsupported URL' in result.stderr
|
|
||||||
or b'HTTP Error 404' in result.stderr
|
|
||||||
or b'HTTP Error 403' in result.stderr
|
|
||||||
or b'URL could be a direct video link' in result.stderr
|
|
||||||
or b'Unable to extract container ID' in result.stderr):
|
|
||||||
# These happen too frequently on non-media pages to warrant printing to console
|
|
||||||
pass
|
|
||||||
else:
|
|
||||||
hints = (
|
|
||||||
'got youtubedl response code {}:'.format(result.returncode),
|
|
||||||
*result.stderr.decode().split('\n'),
|
|
||||||
)
|
|
||||||
raise ArchiveError('Failed to download media', hints)
|
|
||||||
except Exception as e:
|
|
||||||
end()
|
|
||||||
output = e
|
|
||||||
print_error_hints(cmd=CMD, pwd=link_dir, err=e)
|
|
||||||
|
|
||||||
return {
|
|
||||||
'cmd': CMD,
|
|
||||||
'output': output,
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@attach_result_to_link('git')
|
|
||||||
def fetch_git(link_dir, link, timeout=TIMEOUT):
|
|
||||||
"""download full site using git"""
|
|
||||||
|
|
||||||
url_is_clonable = (
|
|
||||||
domain(link['url']) in GIT_DOMAINS
|
|
||||||
or link['url'].endswith('.git')
|
|
||||||
)
|
|
||||||
if not url_is_clonable or is_static_file(link['url']):
|
|
||||||
return {'output': None, 'status': 'skipped'}
|
|
||||||
|
|
||||||
output = 'git'
|
|
||||||
output_path = os.path.join(link_dir, 'git')
|
|
||||||
|
|
||||||
if os.path.exists(output_path):
|
|
||||||
return {'output': output, 'status': 'skipped'}
|
|
||||||
|
|
||||||
os.makedirs(output_path, exist_ok=True)
|
|
||||||
CMD = [
|
|
||||||
GIT_BINARY,
|
|
||||||
'clone',
|
|
||||||
'--mirror',
|
|
||||||
'--recursive',
|
|
||||||
*(() if CHECK_SSL_VALIDITY else ('-c', 'http.sslVerify=false')),
|
|
||||||
without_query(without_fragment(link['url'])),
|
|
||||||
]
|
|
||||||
end = progress(timeout, prefix=' ')
|
|
||||||
try:
|
|
||||||
result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)
|
|
||||||
end()
|
|
||||||
|
|
||||||
if result.returncode == 128:
|
|
||||||
# ignore failed re-download when the folder already exists
|
|
||||||
pass
|
|
||||||
elif result.returncode > 0:
|
|
||||||
hints = 'got git response code {}:'.format(result.returncode)
|
|
||||||
raise ArchiveError('Failed git download', hints)
|
|
||||||
except Exception as e:
|
|
||||||
end()
|
|
||||||
output = e
|
|
||||||
print_error_hints(cmd=CMD, pwd=link_dir, err=e)
|
|
||||||
|
|
||||||
return {
|
|
||||||
'cmd': CMD,
|
|
||||||
'output': output,
|
|
||||||
}
|
|
||||||
|
|
||||||
def chrome_headless(binary=CHROME_BINARY, user_data_dir=CHROME_USER_DATA_DIR, headless=CHROME_HEADLESS, sandbox=CHROME_SANDBOX, check_ssl_validity=CHECK_SSL_VALIDITY, user_agent=CHROME_USER_AGENT, resolution=RESOLUTION, timeout=TIMEOUT):
|
|
||||||
global CACHED_USER_DATA_DIR
|
|
||||||
user_data_dir = user_data_dir or CACHED_USER_DATA_DIR
|
|
||||||
cmd_args = [binary]
|
|
||||||
|
|
||||||
if headless:
|
|
||||||
cmd_args += ('--headless',)
|
|
||||||
|
|
||||||
if not sandbox:
|
|
||||||
# don't use GPU or sandbox when running inside docker container
|
|
||||||
cmd_args += ('--no-sandbox', '--disable-gpu')
|
|
||||||
|
|
||||||
if not check_ssl_validity:
|
|
||||||
cmd_args += ('--disable-web-security', '--ignore-certificate-errors')
|
|
||||||
|
|
||||||
if user_agent:
|
|
||||||
cmd_args += ('--user-agent={}'.format(user_agent),)
|
|
||||||
|
|
||||||
if resolution:
|
|
||||||
cmd_args += ('--window-size={}'.format(RESOLUTION),)
|
|
||||||
|
|
||||||
if timeout:
|
|
||||||
cmd_args += ('--timeout={}'.format((timeout) * 1000),)
|
|
||||||
|
|
||||||
# Find chrome user data directory
|
|
||||||
default_profile_paths = (
|
|
||||||
'~/.config/chromium',
|
|
||||||
'~/.config/google-chrome',
|
|
||||||
'~/.config/google-chrome-beta',
|
|
||||||
'~/.config/google-chrome-unstable',
|
|
||||||
'~/Library/Application Support/Chromium',
|
|
||||||
'~/Library/Application Support/Google/Chrome',
|
|
||||||
'~/Library/Application Support/Google/Chrome Canary',
|
|
||||||
'~/AppData/Local/Chromium/User Data',
|
|
||||||
'~/AppData/Local/Google/Chrome/User Data',
|
|
||||||
'~/AppData/Local/Google/Chrome SxS/User Data',
|
|
||||||
)
|
|
||||||
if user_data_dir:
|
|
||||||
cmd_args.append('--user-data-dir={}'.format(user_data_dir))
|
|
||||||
else:
|
|
||||||
for path in default_profile_paths:
|
|
||||||
full_path = os.path.expanduser(path)
|
|
||||||
if os.path.exists(full_path):
|
|
||||||
CACHED_USER_DATA_DIR = full_path
|
|
||||||
cmd_args.append('--user-data-dir={}'.format(full_path))
|
|
||||||
break
|
|
||||||
|
|
||||||
return cmd_args
|
|
||||||
|
|
||||||
|
|
||||||
CACHED_USER_DATA_DIR = CHROME_USER_DATA_DIR
|
|
||||||
|
|
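
The new fetch_pdf/fetch_screenshot/fetch_dom code calls chrome_args() from util, which is not part of this diff. Below is a purely hypothetical sketch of such a helper, reconstructed from the flags the removed chrome_headless() used to build; the real util.chrome_args() in this commit may differ in signature, defaults, and user-data-dir discovery.

def chrome_args(binary='chromium-browser', headless=True, sandbox=False,
                check_ssl_validity=True, user_agent=None,
                resolution='1440,900', timeout=60, user_data_dir=None):
    """Assemble a headless-Chrome argument list (illustrative only)."""
    cmd_args = [binary]
    if headless:
        cmd_args += ('--headless',)
    if not sandbox:
        # don't use GPU or sandbox when running inside a docker container
        cmd_args += ('--no-sandbox', '--disable-gpu')
    if not check_ssl_validity:
        cmd_args += ('--disable-web-security', '--ignore-certificate-errors')
    if user_agent:
        cmd_args += ('--user-agent={}'.format(user_agent),)
    if resolution:
        cmd_args += ('--window-size={}'.format(resolution),)
    if timeout:
        cmd_args += ('--timeout={}'.format(timeout * 1000),)
    if user_data_dir:
        cmd_args += ('--user-data-dir={}'.format(user_data_dir),)
    return cmd_args

print(chrome_args(user_agent='ArchiveBox', timeout=60))
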
|
@ -12,18 +12,24 @@ except ImportError:
|
||||||
from config import (
|
from config import (
|
||||||
OUTPUT_DIR,
|
OUTPUT_DIR,
|
||||||
TEMPLATES_DIR,
|
TEMPLATES_DIR,
|
||||||
ANSI,
|
|
||||||
GIT_SHA,
|
GIT_SHA,
|
||||||
FOOTER_INFO,
|
FOOTER_INFO,
|
||||||
)
|
)
|
||||||
from util import (
|
from util import (
|
||||||
chmod_file,
|
chmod_file,
|
||||||
derived_link_info,
|
derived_link_info,
|
||||||
pretty_path,
|
|
||||||
check_link_structure,
|
check_link_structure,
|
||||||
check_links_structure,
|
check_links_structure,
|
||||||
wget_output_path,
|
wget_output_path,
|
||||||
)
|
)
|
||||||
|
from parse import parse_links
|
||||||
|
from links import validate_links
|
||||||
|
from logs import (
|
||||||
|
log_indexing_started,
|
||||||
|
log_indexing_finished,
|
||||||
|
log_parsing_started,
|
||||||
|
log_parsing_finished,
|
||||||
|
)
|
||||||
|
|
||||||
TITLE_LOADING_MSG = 'Not yet archived...'
|
TITLE_LOADING_MSG = 'Not yet archived...'
|
||||||
|
|
||||||
|
@ -33,21 +39,40 @@ TITLE_LOADING_MSG = 'Not yet archived...'
|
||||||
def write_links_index(out_dir, links, finished=False):
|
def write_links_index(out_dir, links, finished=False):
|
||||||
"""create index.html file for a given list of links"""
|
"""create index.html file for a given list of links"""
|
||||||
|
|
||||||
|
log_indexing_started()
|
||||||
check_links_structure(links)
|
check_links_structure(links)
|
||||||
|
|
||||||
if not os.path.exists(out_dir):
|
|
||||||
os.makedirs(out_dir)
|
|
||||||
|
|
||||||
print('{green}[*] [{}] Saving main index files...{reset}'.format(
|
|
||||||
datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
|
|
||||||
**ANSI,
|
|
||||||
))
|
|
||||||
write_json_links_index(out_dir, links)
|
write_json_links_index(out_dir, links)
|
||||||
print(' > {}/index.json'.format(pretty_path(out_dir)))
|
log_indexing_finished(out_dir, 'index.json')
|
||||||
|
|
||||||
write_html_links_index(out_dir, links, finished=finished)
|
write_html_links_index(out_dir, links, finished=finished)
|
||||||
print(' > {}/index.html'.format(pretty_path(out_dir)))
|
log_indexing_finished(out_dir, 'index.html')
|
||||||
|
|
||||||
|
def load_links_index(out_dir=OUTPUT_DIR, import_path=None):
|
||||||
|
"""parse and load existing index with any new links from import_path merged in"""
|
||||||
|
|
||||||
|
existing_links = []
|
||||||
|
if out_dir:
|
||||||
|
existing_links = parse_json_links_index(out_dir)
|
||||||
|
check_links_structure(existing_links)
|
||||||
|
|
||||||
|
new_links = []
|
||||||
|
if import_path:
|
||||||
|
# parse and validate the import file
|
||||||
|
log_parsing_started(import_path)
|
||||||
|
raw_links, parser_name = parse_links(import_path)
|
||||||
|
new_links = validate_links(raw_links)
|
||||||
|
check_links_structure(new_links)
|
||||||
|
|
||||||
|
# merge existing links in out_dir and new links
|
||||||
|
all_links = validate_links(existing_links + new_links)
|
||||||
|
check_links_structure(all_links)
|
||||||
|
num_new_links = len(all_links) - len(existing_links)
|
||||||
|
|
||||||
|
if import_path and parser_name:
|
||||||
|
log_parsing_finished(num_new_links, parser_name)
|
||||||
|
|
||||||
|
return all_links, new_links
|
||||||
|
|
||||||
def write_json_links_index(out_dir, links):
|
def write_json_links_index(out_dir, links):
|
||||||
"""write the json link index to a given path"""
|
"""write the json link index to a given path"""
|
||||||
|
@ -70,8 +95,8 @@ def write_json_links_index(out_dir, links):
|
||||||
|
|
||||||
chmod_file(path)
|
chmod_file(path)
|
||||||
|
|
||||||
def parse_json_links_index(out_dir):
|
def parse_json_links_index(out_dir=OUTPUT_DIR):
|
||||||
"""load the index in a given directory and merge it with the given link"""
|
"""parse a archive index json file and return the list of links"""
|
||||||
index_path = os.path.join(out_dir, 'index.json')
|
index_path = os.path.join(out_dir, 'index.json')
|
||||||
if os.path.exists(index_path):
|
if os.path.exists(index_path):
|
||||||
with open(index_path, 'r', encoding='utf-8') as f:
|
with open(index_path, 'r', encoding='utf-8') as f:
|
||||||
|
@ -136,31 +161,26 @@ def write_html_links_index(out_dir, links, finished=False):
|
||||||
chmod_file(path)
|
chmod_file(path)
|
||||||
|
|
||||||
|
|
||||||
def update_main_index(link):
|
def patch_links_index(link, out_dir=OUTPUT_DIR):
|
||||||
"""hack to in-place update one row's info in the generated index html"""
|
"""hack to in-place update one row's info in the generated index html"""
|
||||||
|
|
||||||
title = link['latest']['title']
|
title = link['latest']['title']
|
||||||
successful = len([entry for entry in link['latest'].values() if entry])
|
successful = len([entry for entry in link['latest'].values() if entry])
|
||||||
|
|
||||||
# Patch JSON index
|
# Patch JSON index
|
||||||
json_path = os.path.join(OUTPUT_DIR, 'index.json')
|
|
||||||
|
|
||||||
links = parse_json_links_index(OUTPUT_DIR)
|
|
||||||
|
|
||||||
changed = False
|
changed = False
|
||||||
for json_link in links:
|
json_file_links = parse_json_links_index(out_dir)
|
||||||
if json_link['url'] == link['url']:
|
for saved_link in json_file_links:
|
||||||
json_link['title'] = title
|
if saved_link['url'] == link['url']:
|
||||||
json_link['latest'] = link['latest']
|
saved_link['title'] = title
|
||||||
|
saved_link['latest'] = link['latest']
|
||||||
changed = True
|
changed = True
|
||||||
break
|
break
|
||||||
|
|
||||||
if changed:
|
if changed:
|
||||||
write_json_links_index(OUTPUT_DIR, links)
|
write_json_links_index(out_dir, json_file_links)
|
||||||
|
|
||||||
# Patch HTML index
|
# Patch HTML index
|
||||||
html_path = os.path.join(OUTPUT_DIR, 'index.html')
|
html_path = os.path.join(out_dir, 'index.html')
|
||||||
|
|
||||||
html = open(html_path, 'r').read().split('\n')
|
html = open(html_path, 'r').read().split('\n')
|
||||||
for idx, line in enumerate(html):
|
for idx, line in enumerate(html):
|
||||||
if title and ('<span data-title-for="{}"'.format(link['url']) in line):
|
if title and ('<span data-title-for="{}"'.format(link['url']) in line):
|
||||||
|
@ -172,6 +192,7 @@ def update_main_index(link):
|
||||||
with open(html_path, 'w') as f:
|
with open(html_path, 'w') as f:
|
||||||
f.write('\n'.join(html))
|
f.write('\n'.join(html))
|
||||||
|
|
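
As a toy illustration of the in-place HTML patch that patch_links_index() performs, the snippet below locates the row carrying the data-title-for marker for a URL and swaps in a freshly fetched title; the row markup and the whole-line replacement are simplifications, not the template or matching logic ArchiveBox actually uses.

url = 'https://example.com'
new_title = 'Example Domain'

html_lines = [
    '<table>',
    '<tr><td><span data-title-for="https://example.com">Not yet archived...</span></td></tr>',
    '</table>',
]

for idx, line in enumerate(html_lines):
    # find the single row that belongs to this link and rewrite it in place
    if '<span data-title-for="{}"'.format(url) in line:
        html_lines[idx] = '<tr><td><span data-title-for="{}">{}</span></td></tr>'.format(url, new_title)
        break

print('\n'.join(html_lines))
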
||||||
|
|
||||||
### Individual link index
|
### Individual link index
|
||||||
|
|
||||||
def write_link_index(out_dir, link):
|
def write_link_index(out_dir, link):
|
||||||
|
@ -202,6 +223,18 @@ def parse_json_link_index(out_dir):
|
||||||
return link_json
|
return link_json
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
|
def load_json_link_index(out_dir, link):
|
||||||
|
"""check for an existing link archive in the given directory,
|
||||||
|
and load+merge it into the given link dict
|
||||||
|
"""
|
||||||
|
link = {
|
||||||
|
**parse_json_link_index(out_dir),
|
||||||
|
**link,
|
||||||
|
}
|
||||||
|
|
||||||
|
check_link_structure(link)
|
||||||
|
return link
|
||||||
|
|
||||||
def write_html_link_index(out_dir, link):
|
def write_html_link_index(out_dir, link):
|
||||||
check_link_structure(link)
|
check_link_structure(link)
|
||||||
with open(os.path.join(TEMPLATES_DIR, 'link_index.html'), 'r', encoding='utf-8') as f:
|
with open(os.path.join(TEMPLATES_DIR, 'link_index.html'), 'r', encoding='utf-8') as f:
|
||||||
|
@ -224,7 +257,10 @@ def write_html_link_index(out_dir, link):
|
||||||
wget_output_path(link)
|
wget_output_path(link)
|
||||||
or (link['domain'] if link['is_archived'] else 'about:blank')
|
or (link['domain'] if link['is_archived'] else 'about:blank')
|
||||||
),
|
),
|
||||||
'extension': link['extension'] or 'HTML',
|
'extension': link['extension'] or 'html',
|
||||||
|
'tags': link['tags'].strip() or 'untagged',
|
||||||
|
'status': 'Archived' if link['is_archived'] else 'Not yet archived',
|
||||||
|
'status_color': 'success' if link['is_archived'] else 'danger',
|
||||||
}))
|
}))
|
||||||
|
|
||||||
chmod_file(path)
|
chmod_file(path)
|
||||||
|
|
archivebox/logs.py (new file, 161 lines)
|
@ -0,0 +1,161 @@
|
||||||
|
import sys
|
||||||
|
from datetime import datetime
|
||||||
|
from config import ANSI, REPO_DIR, OUTPUT_DIR
|
||||||
|
|
||||||
|
|
||||||
|
# globals are bad, mmkay
|
||||||
|
_LAST_RUN_STATS = {
|
||||||
|
'skipped': 0,
|
||||||
|
'succeded': 0,
|
||||||
|
'failed': 0,
|
||||||
|
|
||||||
|
'parsing_start_ts': 0,
|
||||||
|
'parsing_end_ts': 0,
|
||||||
|
|
||||||
|
'indexing_start_ts': 0,
|
||||||
|
'indexing_end_ts': 0,
|
||||||
|
|
||||||
|
'archiving_start_ts': 0,
|
||||||
|
'archiving_end_ts': 0,
|
||||||
|
|
||||||
|
'links': {},
|
||||||
|
}
|
||||||
|
|
||||||
|
def pretty_path(path):
|
||||||
|
"""convert paths like .../ArchiveBox/archivebox/../output/abc into output/abc"""
|
||||||
|
return path.replace(REPO_DIR + '/', '')
|
||||||
|
|
||||||
|
|
||||||
|
def log_link_archiving_started(link_dir, link, is_new):
|
||||||
|
print('[{symbol_color}{symbol}{reset}] [{now}] "{title}"\n {blue}{url}{reset}'.format(
|
||||||
|
symbol='+' if is_new else '*',
|
||||||
|
symbol_color=ANSI['green' if is_new else 'black'],
|
||||||
|
now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
|
||||||
|
**{**link, 'title': link['title'] or link['url']},
|
||||||
|
**ANSI,
|
||||||
|
))
|
||||||
|
|
||||||
|
print(' > {}{}'.format(pretty_path(link_dir), ' (new)' if is_new else ''))
|
||||||
|
|
||||||
|
|
||||||
|
def log_link_archiving_failed(cmd, pwd, err=None, hints=None, prefix=' '):
|
||||||
|
"""quote the argument with whitespace in a command so the user can
|
||||||
|
copy-paste the outputted string directly to run the cmd
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Prettify CMD string and make it safe to copy-paste by quoting arguments
|
||||||
|
quoted_cmd = ' '.join(
|
||||||
|
'"{}"'.format(arg) if ' ' in arg else arg
|
||||||
|
for arg in cmd
|
||||||
|
)
|
||||||
|
|
||||||
|
# Prettify error output hints string and limit to five lines
|
||||||
|
hints = hints or getattr(err, 'hints', None)
|
||||||
|
if hints:
|
||||||
|
hints = hints if isinstance(hints, (list, tuple)) else hints.split('\n')
|
||||||
|
hints = (
|
||||||
|
' {}{}{}'.format(ANSI['lightyellow'], line.strip(), ANSI['reset'])
|
||||||
|
for line in hints[:5] if line.strip()
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
hints = ()
|
||||||
|
|
||||||
|
output_lines = [
|
||||||
|
'{}Failed: {} {}{}'.format(ANSI['red'], err.__class__.__name__, err, ANSI['reset']),
|
||||||
|
*hints,
|
||||||
|
'Run to see full output:',
|
||||||
|
' cd {};'.format(pwd),
|
||||||
|
' {}'.format(quoted_cmd),
|
||||||
|
]
|
||||||
|
|
||||||
|
return '\n'.join(
|
||||||
|
'{}{}'.format(prefix, line)
|
||||||
|
for line in output_lines
|
||||||
|
if line
|
||||||
|
)
|
||||||
|
|
||||||
|
### Logging Helpers
|
||||||
|
|
||||||
|
def log_parsing_started(source_file):
|
||||||
|
start_ts = datetime.now()
|
||||||
|
_LAST_RUN_STATS['parse_start_ts'] = start_ts
|
||||||
|
print('{green}[*] [{}] Parsing new links from output/sources/{}...{reset}'.format(
|
||||||
|
start_ts.strftime('%Y-%m-%d %H:%M:%S'),
|
||||||
|
source_file.rsplit('/', 1)[-1],
|
||||||
|
**ANSI,
|
||||||
|
))
|
||||||
|
|
||||||
|
def log_parsing_finished(num_new_links, parser_name):
|
||||||
|
print(' > Adding {} new links to index (parsed import as {})'.format(
|
||||||
|
num_new_links,
|
||||||
|
parser_name,
|
||||||
|
))
|
||||||
|
|
||||||
|
def log_indexing_started():
|
||||||
|
start_ts = datetime.now()
|
||||||
|
_LAST_RUN_STATS['index_start_ts'] = start_ts
|
||||||
|
print('{green}[*] [{}] Saving main index files...{reset}'.format(
|
||||||
|
start_ts.strftime('%Y-%m-%d %H:%M:%S'),
|
||||||
|
**ANSI,
|
||||||
|
))
|
||||||
|
|
||||||
|
def log_indexing_finished(out_dir, out_file):
|
||||||
|
end_ts = datetime.now()
|
||||||
|
_LAST_RUN_STATS['index_end_ts'] = end_ts
|
||||||
|
print(' > {}/{}'.format(pretty_path(out_dir), out_file))
|
||||||
|
|
||||||
|
def log_archiving_started(num_links, resume):
|
||||||
|
start_ts = datetime.now()
|
||||||
|
_LAST_RUN_STATS['start_ts'] = start_ts
|
||||||
|
if resume:
|
||||||
|
print('{green}[▶] [{}] Resuming archive updating for {} pages starting from {}...{reset}'.format(
|
||||||
|
start_ts.strftime('%Y-%m-%d %H:%M:%S'),
|
||||||
|
num_links,
|
||||||
|
resume,
|
||||||
|
**ANSI,
|
||||||
|
))
|
||||||
|
else:
|
||||||
|
print('{green}[▶] [{}] Updating content for {} pages in archive...{reset}'.format(
|
||||||
|
start_ts.strftime('%Y-%m-%d %H:%M:%S'),
|
||||||
|
num_links,
|
||||||
|
**ANSI,
|
||||||
|
))
|
||||||
|
|
||||||
|
def log_archiving_paused(num_links, idx, timestamp):
|
||||||
|
end_ts = datetime.now()
|
||||||
|
_LAST_RUN_STATS['end_ts'] = end_ts
|
||||||
|
print()
|
||||||
|
print('\n{lightyellow}[X] [{now}] Downloading paused on link {timestamp} ({idx}/{total}){reset}'.format(
|
||||||
|
**ANSI,
|
||||||
|
now=end_ts.strftime('%Y-%m-%d %H:%M:%S'),
|
||||||
|
idx=idx+1,
|
||||||
|
timestamp=timestamp,
|
||||||
|
total=num_links,
|
||||||
|
))
|
||||||
|
print(' To view your archive, open: {}/index.html'.format(OUTPUT_DIR.replace(REPO_DIR + '/', '')))
|
||||||
|
print(' Continue where you left off by running:')
|
||||||
|
print(' {} {}'.format(
|
||||||
|
pretty_path(sys.argv[0]),
|
||||||
|
timestamp,
|
||||||
|
))
|
||||||
|
|
||||||
|
def log_archiving_finished(num_links):
|
||||||
|
end_ts = datetime.now()
|
||||||
|
_LAST_RUN_STATS['end_ts'] = end_ts
|
||||||
|
seconds = end_ts.timestamp() - _LAST_RUN_STATS['start_ts'].timestamp()
|
||||||
|
if seconds > 60:
|
||||||
|
duration = '{0:.2f} min'.format(seconds / 60, 2)
|
||||||
|
else:
|
||||||
|
duration = '{0:.2f} sec'.format(seconds, 2)
|
||||||
|
|
||||||
|
print('{}[√] [{}] Update of {} pages complete ({}){}'.format(
|
||||||
|
ANSI['green'],
|
||||||
|
end_ts.strftime('%Y-%m-%d %H:%M:%S'),
|
||||||
|
num_links,
|
||||||
|
duration,
|
||||||
|
ANSI['reset'],
|
||||||
|
))
|
||||||
|
print(' - {} entries skipped'.format(_LAST_RUN_STATS['skipped']))
|
||||||
|
print(' - {} entries updated'.format(_LAST_RUN_STATS['succeded']))
|
||||||
|
print(' - {} errors'.format(_LAST_RUN_STATS['failed']))
|
||||||
|
print(' To view your archive, open: {}/index.html'.format(OUTPUT_DIR.replace(REPO_DIR + '/', '')))
|
|
@ -1,17 +1,19 @@
|
||||||
# coding: utf-8
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Everything related to parsing links from bookmark services.
|
Everything related to parsing links from input sources.
|
||||||
|
|
||||||
For a list of supported services, see the README.md.
|
For a list of supported services, see the README.md.
|
||||||
For examples of supported files see examples/.
|
For examples of supported import formats see tests/.
|
||||||
|
|
||||||
Parsed link schema: {
|
Link: {
|
||||||
'url': 'https://example.com/example/?abc=123&xyc=345#lmnop',
|
'url': 'https://example.com/example/?abc=123&xyc=345#lmnop',
|
||||||
'timestamp': '15442123124234',
|
'timestamp': '1544212312.4234',
|
||||||
'title': 'Example.com Page Title',
|
'title': 'Example.com Page Title',
|
||||||
'sources': ['ril_export.html', 'downloads/getpocket.com.txt'],
|
|
||||||
'tags': 'abc,def',
|
'tags': 'abc,def',
|
||||||
|
'sources': [
|
||||||
|
'output/sources/ril_export.html',
|
||||||
|
'output/sources/getpocket.com-1523422111.txt',
|
||||||
|
'output/sources/stdin-234234112312.txt'
|
||||||
|
]
|
||||||
}
|
}
|
||||||
"""
|
"""
|
||||||
|
|
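
To make the new Link schema above concrete, here is a sample link dict matching it, plus a naive structural check; the looks_like_link() helper is only an illustration and is not the check_link_structure() used elsewhere in the codebase.

example_link = {
    'url': 'https://example.com/example/?abc=123&xyc=345#lmnop',
    'timestamp': '1544212312.4234',
    'title': 'Example.com Page Title',
    'tags': 'abc,def',
    'sources': [
        'output/sources/ril_export.html',
        'output/sources/getpocket.com-1523422111.txt',
    ],
}

def looks_like_link(link):
    """Rough structural check: required keys present and sources is a list."""
    required = ('url', 'timestamp', 'title', 'tags', 'sources')
    return (
        isinstance(link, dict)
        and all(key in link for key in required)
        and isinstance(link['sources'], list)
    )

assert looks_like_link(example_link)
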
||||||
|
@ -19,45 +21,59 @@ import re
 import json

 from datetime import datetime
-from collections import OrderedDict
 import xml.etree.ElementTree as etree

-from config import ANSI
+from config import TIMEOUT
 from util import (
     str_between,
     URL_REGEX,
-    check_url_parsing,
+    check_url_parsing_invariants,
+    progress,
 )


-def parse_links(path):
-    """parse a list of links dictionaries from a bookmark export file"""
-
-    check_url_parsing()
-    links = []
-    with open(path, 'r', encoding='utf-8') as file:
-        print('{green}[*] [{}] Parsing new links from output/sources/{}...{reset}'.format(
-            datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
-            path.rsplit('/', 1)[-1],
-            **ANSI,
-        ))
-
-        for parser_name, parser_func in PARSERS.items():
+def parse_links(source_file):
+    """parse a list of URLs with their metadata from an
+       RSS feed, bookmarks export, or text file
+    """
+
+    check_url_parsing_invariants()
+    PARSERS = (
+        # Specialized parsers
+        ('Pocket HTML', parse_pocket_html_export),
+        ('Pinboard RSS', parse_pinboard_rss_export),
+        ('Shaarli RSS', parse_shaarli_rss_export),
+        ('Medium RSS', parse_medium_rss_export),
+
+        # General parsers
+        ('Netscape HTML', parse_netscape_html_export),
+        ('Generic RSS', parse_rss_export),
+        ('Generic JSON', parse_json_export),
+
+        # Fallback parser
+        ('Plain Text', parse_plain_text_export),
+    )
+    end = progress(TIMEOUT * 4, prefix='    ')
+    with open(source_file, 'r', encoding='utf-8') as file:
+        for parser_name, parser_func in PARSERS:
             try:
-                links += list(parser_func(file))
+                links = list(parser_func(file))
                 if links:
-                    break
+                    end()
+                    return links, parser_name
             except Exception as err:
-                # we try each parser one by one, wong parsers will throw exeptions
-                # if unsupported and we accept the first one that passes
-                # uncomment the following line to see why the parser was unsupported for each attempted format
+                # Parsers are tried one by one down the list, and the first one
+                # that succeeds is used. To see why a certain parser was not used
+                # due to error or format incompatibility, uncomment this line:
                 # print('[!] Parser {} failed: {} {}'.format(parser_name, err.__class__.__name__, err))
                 pass

-    return links, parser_name
+    end()
+    return [], 'Plain Text'


+### Import Parser Functions

 def parse_pocket_html_export(html_file):
     """Parse Pocket-format bookmarks export files (produced by getpocket.com/export/)"""

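Note: a minimal usage sketch of the parser-fallback chain added above; the source file path is made up for illustration, but the call signature (returning a (links, parser_name) tuple) matches the new code:

    # Hypothetical call site; 'output/sources/bookmarks_export.html' is a made-up path.
    links, parser_name = parse_links('output/sources/bookmarks_export.html')
    print('Parsed {} links using the {} parser'.format(len(links), parser_name))
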
@ -81,40 +97,57 @@ def parse_pocket_html_export(html_file):
             'sources': [html_file.name],
         }

-def parse_pinboard_json_export(json_file):
+def parse_json_export(json_file):
     """Parse JSON-format bookmarks export files (produced by pinboard.in/export/, or wallabag)"""

     json_file.seek(0)
-    json_content = json.load(json_file)
-    for line in json_content:
+    links = json.load(json_file)
+    json_date = lambda s: datetime.strptime(s, '%Y-%m-%dT%H:%M:%S%z')
+
+    for link in links:
         # example line
         # {"href":"http:\/\/www.reddit.com\/r\/example","description":"title here","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android"}]
-        if line:
-            erg = line
-            if erg.get('timestamp'):
-                timestamp = str(erg['timestamp']/10000000)   # chrome/ff histories use a very precise timestamp
-            elif erg.get('time'):
-                timestamp = str(datetime.strptime(erg['time'].split(',', 1)[0], '%Y-%m-%dT%H:%M:%SZ').timestamp())
-            elif erg.get('created_at'):
-                timestamp = str(datetime.strptime(erg['created_at'], '%Y-%m-%dT%H:%M:%S%z').timestamp())
-            else:
-                timestamp = str(datetime.now().timestamp())
-            if erg.get('href'):
-                url = erg['href']
-            else:
-                url = erg['url']
-            if erg.get('description'):
-                title = (erg.get('description') or '').replace(' — Readability', '')
-            else:
-                title = erg['title'].strip()
-
-            info = {
+        if link:
+            # Parse URL
+            url = link.get('href') or link.get('url') or link.get('URL')
+            if not url:
+                raise Exception('JSON must contain URL in each entry [{"url": "http://...", ...}, ...]')
+
+            # Parse the timestamp
+            ts_str = str(datetime.now().timestamp())
+            if link.get('timestamp'):
+                # chrome/ff histories use a very precise timestamp
+                ts_str = str(link['timestamp'] / 10000000)
+            elif link.get('time'):
+                ts_str = str(json_date(link['time'].split(',', 1)[0]).timestamp())
+            elif link.get('created_at'):
+                ts_str = str(json_date(link['created_at']).timestamp())
+            elif link.get('created'):
+                ts_str = str(json_date(link['created']).timestamp())
+            elif link.get('date'):
+                ts_str = str(json_date(link['date']).timestamp())
+            elif link.get('bookmarked'):
+                ts_str = str(json_date(link['bookmarked']).timestamp())
+            elif link.get('saved'):
+                ts_str = str(json_date(link['saved']).timestamp())
+
+            # Parse the title
+            title = None
+            if link.get('title'):
+                title = link['title'].strip() or None
+            elif link.get('description'):
+                title = link['description'].replace(' — Readability', '').strip() or None
+            elif link.get('name'):
+                title = link['name'].strip() or None
+
+            yield {
                 'url': url,
-                'timestamp': timestamp,
-                'title': title or None,
-                'tags': erg.get('tags') or '',
+                'timestamp': ts_str,
+                'title': title,
+                'tags': link.get('tags') or '',
                 'sources': [json_file.name],
             }
-            yield info


 def parse_rss_export(rss_file):

@ -139,15 +172,15 @@ def parse_rss_export(rss_file):
         def get_row(key):
             return [r for r in rows if r.strip().startswith('<{}>'.format(key))][0]

-        title = str_between(get_row('title'), '<![CDATA[', ']]').strip()
         url = str_between(get_row('link'), '<link>', '</link>')
         ts_str = str_between(get_row('pubDate'), '<pubDate>', '</pubDate>')
         time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %z")
+        title = str_between(get_row('title'), '<![CDATA[', ']]').strip() or None

         yield {
             'url': url,
             'timestamp': str(time.timestamp()),
-            'title': title or None,
+            'title': title,
             'tags': '',
             'sources': [rss_file.name],
         }

@ -224,9 +257,6 @@ def parse_pinboard_rss_export(rss_file):
         tags = item.find("{http://purl.org/dc/elements/1.1/}subject").text if item.find("{http://purl.org/dc/elements/1.1/}subject") else None
         title = item.find("{http://purl.org/rss/1.0/}title").text.strip() if item.find("{http://purl.org/rss/1.0/}title").text.strip() else None
         ts_str = item.find("{http://purl.org/dc/elements/1.1/}date").text if item.find("{http://purl.org/dc/elements/1.1/}date").text else None
-        # = 🌈🌈🌈🌈
-        # = 🌈🌈🌈🌈
-        # = 🏆🏆🏆🏆

         # Pinboard includes a colon in its date stamp timezone offsets, which
         # Python can't parse. Remove it:

@ -254,8 +284,6 @@ def parse_medium_rss_export(rss_file):
     root = etree.parse(rss_file).getroot()
     items = root.find("channel").findall("item")
     for item in items:
-        # for child in item:
-        #     print(child.tag, child.text)
         url = item.find("link").text
         title = item.find("title").text.strip()
         ts_str = item.find("pubDate").text

|
@ -274,31 +302,13 @@ def parse_plain_text_export(text_file):
|
||||||
"""Parse raw links from each line in a text file"""
|
"""Parse raw links from each line in a text file"""
|
||||||
|
|
||||||
text_file.seek(0)
|
text_file.seek(0)
|
||||||
text_content = text_file.readlines()
|
for line in text_file.readlines():
|
||||||
for line in text_content:
|
urls = re.findall(URL_REGEX, line) if line.strip() else ()
|
||||||
if line:
|
for url in urls:
|
||||||
urls = re.findall(URL_REGEX, line)
|
yield {
|
||||||
|
'url': url,
|
||||||
for url in urls:
|
'timestamp': str(datetime.now().timestamp()),
|
||||||
url = url.strip()
|
'title': None,
|
||||||
time = datetime.now()
|
'tags': '',
|
||||||
|
'sources': [text_file.name],
|
||||||
yield {
|
}
|
||||||
'url': url,
|
|
||||||
'timestamp': str(time.timestamp()),
|
|
||||||
'title': None,
|
|
||||||
'tags': '',
|
|
||||||
'sources': [text_file.name],
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
PARSERS = OrderedDict([
|
|
||||||
('Pocket HTML', parse_pocket_html_export),
|
|
||||||
('Pinboard JSON', parse_pinboard_json_export),
|
|
||||||
('Netscape HTML', parse_netscape_html_export),
|
|
||||||
('RSS', parse_rss_export),
|
|
||||||
('Pinboard RSS', parse_pinboard_rss_export),
|
|
||||||
('Shaarli RSS', parse_shaarli_rss_export),
|
|
||||||
('Medium RSS', parse_medium_rss_export),
|
|
||||||
('Plain Text', parse_plain_text_export),
|
|
||||||
])
|
|
||||||
|
|
|
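Note: a small sketch of what the simplified plain-text parser above does with a line of free text; URL_REGEX comes from util.py and the sample line is invented:

    import re
    from util import URL_REGEX

    # Invented sample line; each match found by URL_REGEX becomes one yielded link dict.
    line = 'see https://example.com/a and http://example.org/b for details'
    for url in re.findall(URL_REGEX, line):
        print(url)
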
@ -1,10 +1,64 @@
+"""
+Patches, additions, and shortcuts for Python standard library functions.
+"""
+
+### subprocess
+
+from subprocess import (
+    Popen,
+    PIPE,
+    DEVNULL,
+    CompletedProcess,
+    TimeoutExpired,
+    CalledProcessError,
+)
+
+def run(*popenargs, input=None, capture_output=False, timeout=None, check=False, **kwargs):
+    """Patched version of subprocess.run to fix blocking io making timeout= ineffective"""
+
+    if input is not None:
+        if 'stdin' in kwargs:
+            raise ValueError('stdin and input arguments may not both be used.')
+        kwargs['stdin'] = PIPE
+
+    if capture_output:
+        if ('stdout' in kwargs) or ('stderr' in kwargs):
+            raise ValueError('stdout and stderr arguments may not be used '
+                             'with capture_output.')
+        kwargs['stdout'] = PIPE
+        kwargs['stderr'] = PIPE
+
+    with Popen(*popenargs, **kwargs) as process:
+        try:
+            stdout, stderr = process.communicate(input, timeout=timeout)
+        except TimeoutExpired:
+            process.kill()
+            try:
+                stdout, stderr = process.communicate(input, timeout=2)
+            except:
+                pass
+            raise TimeoutExpired(popenargs[0][0], timeout)
+        except BaseException as err:
+            process.kill()
+            # We don't call process.wait() as .__exit__ does that for us.
+            raise
+        retcode = process.poll()
+        if check and retcode:
+            raise CalledProcessError(retcode, process.args,
+                                     output=stdout, stderr=stderr)
+        return CompletedProcess(process.args, retcode, stdout, stderr)
+
+
+### collections
+
 from sys import maxsize
 from itertools import islice
 from collections import deque

 _marker = object()

-class Peekable(object):
+class PeekableGenerator:
     """Peekable version of a normal python generator.
     Useful when you don't want to evaluate the entire iterable to look at
     a specific item at a given idx.

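Note: a minimal sketch of calling the patched run() that now lives in stdlib_patches.py; the command and timeout value are placeholders, but it behaves like subprocess.run except the child is killed promptly when the timeout expires instead of blocking on pipe reads:

    from stdlib_patches import run, PIPE, DEVNULL

    # Placeholder command for illustration.
    result = run(['echo', 'hello'], stdout=PIPE, stderr=DEVNULL, timeout=5)
    print(result.returncode, result.stdout)
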
@ -74,8 +128,6 @@ class Peekable(object):

         return next(self._it)

-    next = __next__  # For Python 2 compatibility
-
     def _get_slice(self, index):
         # Normalize the slice's arguments
         step = 1 if (index.step is None) else index.step

@ -192,22 +192,27 @@
             Bookmarked: <small title="Timestamp: $timestamp">$bookmarked_date</small>
             |
             Last updated: <small title="Timestamp: $updated">$updated_date</small>
+            |
+            Total files: <small title="Archive methods">🗃 $num_outputs</small>
         </div>
         <div class="col-lg-4 alert well">
             Type:
             <span class="badge badge-default">$extension</span>
             |
             Tags:
-            <span class="badge badge-success">$tags</span>
+            <span class="badge badge-warning">$tags</span>
+            |
+            Status:
+            <span class="badge badge-$status_color">$status</span>
         </div>
         <div class="col-lg-4 alert well">
-            Download:
+            Archive Methods:
             <a href="index.json" title="JSON summary of archived link.">JSON</a> |
             <a href="warc/" title="Any WARC archives for the page">WARC</a> |
             <a href="media/" title="Audio, Video, and Subtitle files.">Media</a> |
             <a href="git/" title="Any git repos at the url">Git Repos</a> |
             <a href="favicon.ico" title="Any git repos at the url">Favicon</a> |
-            <a href="." title="Webserver-provided index of files directory.">More files...</a>
+            <a href="." title="Webserver-provided index of files directory.">See all files...</a>
         </div>
         <hr/>
         <div class="col-lg-2">

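Note: the $-style placeholders in the template above ($timestamp, $tags, $status_color, etc.) suggest Python string substitution. A minimal sketch of how one fragment might be rendered, assuming string.Template-style substitution (the rendering call itself is an assumption; the placeholder names are copied from the template):

    from string import Template

    # Hypothetical rendering of one fragment of the template above.
    fragment = Template('<span class="badge badge-$status_color">$status</span>')
    print(fragment.substitute(status_color='success', status='archived'))
    # -> <span class="badge badge-success">archived</span>
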
@ -8,8 +8,8 @@ from urllib.parse import urlparse, quote
 from decimal import Decimal
 from datetime import datetime
 from multiprocessing import Process
-from subprocess import TimeoutExpired, Popen, PIPE, DEVNULL, CompletedProcess, CalledProcessError

+from stdlib_patches import run, PIPE, DEVNULL
 from config import (
     ANSI,
     TERM_WIDTH,
@ -19,8 +19,6 @@ from config import (
     OUTPUT_PERMISSIONS,
     TIMEOUT,
     SHOW_PROGRESS,
-    CHECK_SSL_VALIDITY,
-    WGET_USER_AGENT,
     CURL_BINARY,
     WGET_BINARY,
     CHROME_BINARY,
@ -37,6 +35,13 @@ from config import (
     FETCH_MEDIA,
     SUBMIT_ARCHIVE_DOT_ORG,
     ARCHIVE_DIR_NAME,
+    RESOLUTION,
+    CHECK_SSL_VALIDITY,
+    WGET_USER_AGENT,
+    CHROME_USER_AGENT,
+    CHROME_USER_DATA_DIR,
+    CHROME_HEADLESS,
+    CHROME_SANDBOX,
 )

 ### Parsing Helpers

@ -56,6 +61,7 @@ extension = lambda url: basename(url).rsplit('.', 1)[-1].lower() if '.' in basen
 base_url = lambda url: without_scheme(url)  # uniq base url used to dedupe links

 short_ts = lambda ts: ts.split('.')[0]
+urlencode = lambda s: quote(s, encoding='utf-8', errors='replace')

 URL_REGEX = re.compile(
     r'http[s]?://'          # start matching from allowed schemes

@ -109,66 +115,74 @@ def check_links_structure(links):
 def check_dependencies():
     """Check that all necessary dependencies are installed, and have valid versions"""

-    python_vers = float('{}.{}'.format(sys.version_info.major, sys.version_info.minor))
-    if python_vers < 3.5:
-        print('{}[X] Python version is not new enough: {} (>3.5 is required){}'.format(ANSI['red'], python_vers, ANSI['reset']))
-        print('    See https://github.com/pirate/ArchiveBox#troubleshooting for help upgrading your Python installation.')
-        raise SystemExit(1)
-
-    if FETCH_FAVICON or SUBMIT_ARCHIVE_DOT_ORG:
-        if run(['which', CURL_BINARY], stdout=DEVNULL).returncode or run([CURL_BINARY, '--version'], stdout=DEVNULL).returncode:
-            print('{red}[X] Missing dependency: curl{reset}'.format(**ANSI))
-            print('    Run ./setup.sh, then confirm it was installed with: {} --version'.format(CURL_BINARY))
-            print('    See https://github.com/pirate/ArchiveBox for help.')
-            raise SystemExit(1)
-
-    if FETCH_WGET or FETCH_WARC:
-        if run(['which', WGET_BINARY], stdout=DEVNULL).returncode or run([WGET_BINARY, '--version'], stdout=DEVNULL).returncode:
-            print('{red}[X] Missing dependency: wget{reset}'.format(**ANSI))
-            print('    Run ./setup.sh, then confirm it was installed with: {} --version'.format(WGET_BINARY))
-            print('    See https://github.com/pirate/ArchiveBox for help.')
-            raise SystemExit(1)
-
-    if FETCH_PDF or FETCH_SCREENSHOT or FETCH_DOM:
-        if run(['which', CHROME_BINARY], stdout=DEVNULL).returncode:
-            print('{}[X] Missing dependency: {}{}'.format(ANSI['red'], CHROME_BINARY, ANSI['reset']))
-            print('    Run ./setup.sh, then confirm it was installed with: {} --version'.format(CHROME_BINARY))
-            print('    See https://github.com/pirate/ArchiveBox for help.')
-            raise SystemExit(1)
-
-        # parse chrome --version e.g. Google Chrome 61.0.3114.0 canary / Chromium 59.0.3029.110 built on Ubuntu, running on Ubuntu 16.04
-        try:
-            result = run([CHROME_BINARY, '--version'], stdout=PIPE)
-            version_str = result.stdout.decode('utf-8')
-            version_lines = re.sub("(Google Chrome|Chromium) (\\d+?)\\.(\\d+?)\\.(\\d+?).*?$", "\\2", version_str).split('\n')
-            version = [l for l in version_lines if l.isdigit()][-1]
-            if int(version) < 59:
-                print(version_lines)
-                print('{red}[X] Chrome version must be 59 or greater for headless PDF, screenshot, and DOM saving{reset}'.format(**ANSI))
-                print('    See https://github.com/pirate/ArchiveBox for help.')
-                raise SystemExit(1)
-        except (IndexError, TypeError, OSError):
-            print('{red}[X] Failed to parse Chrome version, is it installed properly?{reset}'.format(**ANSI))
-            print('    Run ./setup.sh, then confirm it was installed with: {} --version'.format(CHROME_BINARY))
-            print('    See https://github.com/pirate/ArchiveBox for help.')
-            raise SystemExit(1)
-
-    if FETCH_GIT:
-        if run(['which', GIT_BINARY], stdout=DEVNULL).returncode or run([GIT_BINARY, '--version'], stdout=DEVNULL).returncode:
-            print('{red}[X] Missing dependency: git{reset}'.format(**ANSI))
-            print('    Run ./setup.sh, then confirm it was installed with: {} --version'.format(GIT_BINARY))
-            print('    See https://github.com/pirate/ArchiveBox for help.')
-            raise SystemExit(1)
-
-    if FETCH_MEDIA:
-        if run(['which', YOUTUBEDL_BINARY], stdout=DEVNULL).returncode or run([YOUTUBEDL_BINARY, '--version'], stdout=DEVNULL).returncode:
-            print('{red}[X] Missing dependency: youtube-dl{reset}'.format(**ANSI))
-            print('    Run ./setup.sh, then confirm it was installed with: {} --version'.format(YOUTUBEDL_BINARY))
-            print('    See https://github.com/pirate/ArchiveBox for help.')
-            raise SystemExit(1)
-
-
-def check_url_parsing():
+    try:
+        python_vers = float('{}.{}'.format(sys.version_info.major, sys.version_info.minor))
+        if python_vers < 3.5:
+            print('{}[X] Python version is not new enough: {} (>3.5 is required){}'.format(ANSI['red'], python_vers, ANSI['reset']))
+            print('    See https://github.com/pirate/ArchiveBox/wiki/Troubleshooting#python for help upgrading your Python installation.')
+            raise SystemExit(1)
+
+        if FETCH_FAVICON or SUBMIT_ARCHIVE_DOT_ORG:
+            if run(['which', CURL_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([CURL_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode:
+                print('{red}[X] Missing dependency: curl{reset}'.format(**ANSI))
+                print('    Install it, then confirm it works with: {} --version'.format(CURL_BINARY))
+                print('    See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
+                raise SystemExit(1)
+
+        if FETCH_WGET or FETCH_WARC:
+            if run(['which', WGET_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([WGET_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode:
+                print('{red}[X] Missing dependency: wget{reset}'.format(**ANSI))
+                print('    Install it, then confirm it works with: {} --version'.format(WGET_BINARY))
+                print('    See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
+                raise SystemExit(1)
+
+        if FETCH_PDF or FETCH_SCREENSHOT or FETCH_DOM:
+            if run(['which', CHROME_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode:
+                print('{}[X] Missing dependency: {}{}'.format(ANSI['red'], CHROME_BINARY, ANSI['reset']))
+                print('    Install it, then confirm it works with: {} --version'.format(CHROME_BINARY))
+                print('    See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
+                raise SystemExit(1)
+
+            # parse chrome --version e.g. Google Chrome 61.0.3114.0 canary / Chromium 59.0.3029.110 built on Ubuntu, running on Ubuntu 16.04
+            try:
+                result = run([CHROME_BINARY, '--version'], stdout=PIPE)
+                version_str = result.stdout.decode('utf-8')
+                version_lines = re.sub("(Google Chrome|Chromium) (\\d+?)\\.(\\d+?)\\.(\\d+?).*?$", "\\2", version_str).split('\n')
+                version = [l for l in version_lines if l.isdigit()][-1]
+                if int(version) < 59:
+                    print(version_lines)
+                    print('{red}[X] Chrome version must be 59 or greater for headless PDF, screenshot, and DOM saving{reset}'.format(**ANSI))
+                    print('    See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
+                    raise SystemExit(1)
+            except (IndexError, TypeError, OSError):
+                print('{red}[X] Failed to parse Chrome version, is it installed properly?{reset}'.format(**ANSI))
+                print('    Install it, then confirm it works with: {} --version'.format(CHROME_BINARY))
+                print('    See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
+                raise SystemExit(1)
+
+        if FETCH_GIT:
+            if run(['which', GIT_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([GIT_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode:
+                print('{red}[X] Missing dependency: git{reset}'.format(**ANSI))
+                print('    Install it, then confirm it works with: {} --version'.format(GIT_BINARY))
+                print('    See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
+                raise SystemExit(1)
+
+        if FETCH_MEDIA:
+            if run(['which', YOUTUBEDL_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([YOUTUBEDL_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode:
+                print('{red}[X] Missing dependency: youtube-dl{reset}'.format(**ANSI))
+                print('    Install it, then confirm it was installed with: {} --version'.format(YOUTUBEDL_BINARY))
+                print('    See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
+                raise SystemExit(1)
+    except (KeyboardInterrupt, Exception):
+        raise SystemExit(1)
+
+
+def check_url_parsing_invariants():
     """Check that plain text regex URL parsing works as expected"""

+    # this is last-line-of-defense to make sure the URL_REGEX isn't
+    # misbehaving, as the consequences could be disastrous and lead to many
+    # incorrect/badly parsed links being added to the archive
+
     test_urls = '''
     https://example1.com/what/is/happening.html?what=1#how-about-this=1
     https://example2.com/what/is/happening/?what=1#how-about-this=1

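Note: the body of check_url_parsing_invariants() is truncated in this hunk. A minimal sketch of the kind of invariant such a check might assert over the test_urls block above; the expected count and the assert message are invented for illustration:

    import re
    from util import URL_REGEX

    def sketch_check_url_parsing_invariants():
        # Hypothetical: every line of the known-good block should yield exactly one match.
        test_urls = '''
        https://example1.com/what/is/happening.html?what=1#how-about-this=1
        https://example2.com/what/is/happening/?what=1#how-about-this=1
        '''
        urls = re.findall(URL_REGEX, test_urls)
        assert len(urls) == 2, 'URL_REGEX failed to match the known-good test URLs'
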
@ -276,22 +290,9 @@ def wget_output_path(link):
     if link.get('latest', {}).get('wget'):
         return link['latest']['wget']

-    urlencode = lambda s: quote(s, encoding='utf-8', errors='replace')
-
     if is_static_file(link['url']):
         return urlencode(without_scheme(without_fragment(link['url'])))

-    # Since the wget algorithm to for -E (appending .html) is incredibly complex
-    # instead of trying to emulate it here, we just look in the output folder
-    # to see what html file wget actually created as the output
-    link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
-    full_path = without_fragment(without_query(path(link['url']))).strip('/')
-    search_dir = os.path.join(
-        link_dir,
-        domain(link['url']),
-        full_path,
-    )
-
     # Wget downloads can save in a number of different ways depending on the url
     #    https://example.com
     #    > output/archive/<timestamp>/example.com/index.html
@ -304,6 +305,19 @@ def wget_output_path(link):
     # There's also lots of complexity around how the urlencoding and renaming
     # is done for pages with query and hash fragments or extensions like shtml / htm

+    # Since the wget algorithm for -E (appending .html) is incredibly complex
+    # and there's no way to get the computed output path from wget,
+    # instead of trying to reverse-engineer how they calculate it,
+    # we just look in the output folder and read the filename wget used from the filesystem
+    link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
+    full_path = without_fragment(without_query(path(link['url']))).strip('/')
+    search_dir = os.path.join(
+        link_dir,
+        domain(link['url']),
+        full_path,
+    )
+
     for _ in range(4):
         if os.path.exists(search_dir):
             if os.path.isdir(search_dir):

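Note: a small worked example of the search_dir computed by the block above; the timestamp, URL, and hard-coded domain/path below are invented, and the domain()/path() helpers are approximated rather than imported:

    import os

    # Invented example values:
    ARCHIVE_DIR = 'output/archive'
    link = {'timestamp': '1544212312.4234', 'url': 'https://example.com/path/page?id=1'}

    # Mirrors the added lines above, with the URL helpers approximated by literals:
    link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
    search_dir = os.path.join(link_dir, 'example.com', 'path/page')
    print(search_dir)  # output/archive/1544212312.4234/example.com/path/page
    # wget's actual output file (page.html, index.html, ...) is then looked up
    # inside that directory instead of being re-derived from wget's -E rules.
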
@ -356,47 +370,6 @@ def str_between(string, start, end=None):

     return content

-def pretty_path(path):
-    """convert paths like .../ArchiveBox/archivebox/../output/abc into output/abc"""
-    return path.replace(REPO_DIR + '/', '')
-
-
-def print_error_hints(cmd, pwd, err=None, hints=None, prefix='    '):
-    """quote the argument with whitespace in a command so the user can
-       copy-paste the outputted string directly to run the cmd
-    """
-
-    # Prettify CMD string and make it save to copy-paste by quoting arguments
-    quoted_cmd = ' '.join(
-        '"{}"'.format(arg) if ' ' in arg else arg
-        for arg in cmd
-    )
-
-    # Prettify error output hints string and limit to five lines
-    hints = hints or getattr(err, 'hints', None)
-    if hints:
-        hints = hints if isinstance(hints, (list, tuple)) else hints.split('\n')
-        hints = (
-            '    {}{}{}'.format(ANSI['lightyellow'], line.strip(), ANSI['reset'])
-            for line in hints[:5] if line.strip()
-        )
-    else:
-        hints = ()
-
-    output_lines = [
-        '{}Failed: {} {}{}'.format(ANSI['red'], err.__class__.__name__, err, ANSI['reset']),
-        *hints,
-        'Run to see full output:'
-        '    cd {};'.format(pwd),
-        '    {}'.format(quoted_cmd),
-    ]
-
-    return '\n'.join(
-        '{}{}'.format(prefix, line)
-        for line in output_lines
-        if line
-    )
-
-
 ### Link Helpers

@ -571,37 +544,59 @@ def chmod_file(path, cwd='.', permissions=OUTPUT_PERMISSIONS, timeout=30):
         print('     ', chmod_result.stderr.decode())
         raise Exception('Failed to chmod {}/{}'.format(cwd, path))

-def run(*popenargs, input=None, capture_output=False, timeout=None, check=False, **kwargs):
-    """Patched of subprocess.run to fix blocking io making timeout=innefective"""
-
-    if input is not None:
-        if 'stdin' in kwargs:
-            raise ValueError('stdin and input arguments may not both be used.')
-        kwargs['stdin'] = PIPE
-
-    if capture_output:
-        if ('stdout' in kwargs) or ('stderr' in kwargs):
-            raise ValueError('stdout and stderr arguments may not be used '
-                             'with capture_output.')
-        kwargs['stdout'] = PIPE
-        kwargs['stderr'] = PIPE
-
-    with Popen(*popenargs, **kwargs) as process:
-        try:
-            stdout, stderr = process.communicate(input, timeout=timeout)
-        except TimeoutExpired:
-            process.kill()
-            try:
-                stdout, stderr = process.communicate(input, timeout=2)
-            except:
-                pass
-            raise TimeoutExpired(popenargs[0][0], timeout)
-        except BaseException as err:
-            process.kill()
-            # We don't call process.wait() as .__exit__ does that for us.
-            raise
-        retcode = process.poll()
-        if check and retcode:
-            raise CalledProcessError(retcode, process.args,
-                                     output=stdout, stderr=stderr)
-        return CompletedProcess(process.args, retcode, stdout, stderr)
+CACHED_USER_DATA_DIR = CHROME_USER_DATA_DIR
+
+def chrome_args(binary=CHROME_BINARY, user_data_dir=CHROME_USER_DATA_DIR,
+                headless=CHROME_HEADLESS, sandbox=CHROME_SANDBOX,
+                check_ssl_validity=CHECK_SSL_VALIDITY, user_agent=CHROME_USER_AGENT,
+                resolution=RESOLUTION, timeout=TIMEOUT):
+    """helper to build up a chrome shell command with arguments"""
+
+    global CACHED_USER_DATA_DIR
+    user_data_dir = user_data_dir or CACHED_USER_DATA_DIR
+    cmd_args = [binary]
+
+    if headless:
+        cmd_args += ('--headless',)
+
+    if not sandbox:
+        # dont use GPU or sandbox when running inside docker container
+        cmd_args += ('--no-sandbox', '--disable-gpu')
+
+    if not check_ssl_validity:
+        cmd_args += ('--disable-web-security', '--ignore-certificate-errors')
+
+    if user_agent:
+        cmd_args += ('--user-agent={}'.format(user_agent),)
+
+    if resolution:
+        cmd_args += ('--window-size={}'.format(RESOLUTION),)
+
+    if timeout:
+        cmd_args += ('--timeout={}'.format((timeout) * 1000),)
+
+    # Find chrome user data directory
+    default_profile_paths = (
+        '~/.config/chromium',
+        '~/.config/google-chrome',
+        '~/.config/google-chrome-beta',
+        '~/.config/google-chrome-unstable',
+        '~/Library/Application Support/Chromium',
+        '~/Library/Application Support/Google/Chrome',
+        '~/Library/Application Support/Google/Chrome Canary',
+        '~/AppData/Local/Chromium/User Data',
+        '~/AppData/Local/Google/Chrome/User Data',
+        '~/AppData/Local/Google/Chrome SxS/User Data',
+    )
+    if user_data_dir:
+        cmd_args.append('--user-data-dir={}'.format(user_data_dir))
+    else:
+        for path in default_profile_paths:
+            full_path = os.path.expanduser(path)
+            if os.path.exists(full_path):
+                CACHED_USER_DATA_DIR = full_path
+                cmd_args.append('--user-data-dir={}'.format(full_path))
+                break
+
+    return cmd_args

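Note: a minimal sketch of how the new chrome_args() helper could be combined with the patched run() to take a screenshot; the URL, working directory, and flag combination are placeholders for illustration, not the committed fetch_screenshot code:

    from stdlib_patches import run, PIPE

    # Placeholder usage: build the shared chrome flags, then append the
    # subcommand-specific flags for a single archive method.
    cmd = [*chrome_args(timeout=60), '--screenshot', 'https://example.com']
    result = run(cmd, stdout=PIPE, stderr=PIPE, timeout=60, cwd='output/archive/1544212312.4234')
    print(result.returncode)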