mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-14 15:14:31 -04:00
better config system
This commit is contained in:
parent
77ab8ebda6
commit
1249493fcd
2 changed files with 84 additions and 78 deletions
|
@ -7,7 +7,6 @@ from subprocess import run, PIPE, DEVNULL
|
||||||
from index import html_appended_url, parse_json_link_index, write_link_index
|
from index import html_appended_url, parse_json_link_index, write_link_index
|
||||||
from links import links_after_timestamp
|
from links import links_after_timestamp
|
||||||
from config import (
|
from config import (
|
||||||
ARCHIVE_PERMISSIONS,
|
|
||||||
ARCHIVE_DIR,
|
ARCHIVE_DIR,
|
||||||
CHROME_BINARY,
|
CHROME_BINARY,
|
||||||
FETCH_WGET,
|
FETCH_WGET,
|
||||||
|
@ -29,26 +28,90 @@ from util import (
|
||||||
chmod_file,
|
chmod_file,
|
||||||
)
|
)
|
||||||
|
|
||||||
_RESULTS_TOTALS = {
|
|
||||||
|
_RESULTS_TOTALS = { # globals are bad, mmkay
|
||||||
'skipped': 0,
|
'skipped': 0,
|
||||||
'succeded': 0,
|
'succeded': 0,
|
||||||
'failed': 0,
|
'failed': 0,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def archive_links(out_dir, links, export_path, resume=None):
    """Archive every link selected by links_after_timestamp(links, resume).

    Each link is archived by archive_link into its own sub-folder of
    ``out_dir`` named after the link's 'timestamp' value. If the run is
    interrupted or fails, a progress summary and (when a link was being
    processed) a resume hint are printed before the error propagates.

    Args:
        out_dir: base folder that receives one sub-folder per link.
        links: iterable of link dicts; each must carry a 'timestamp' key.
        export_path: echoed in the printed resume command.
        resume: passed through to links_after_timestamp to pick the
            starting point.

    Raises:
        SystemExit: with code 1 after a KeyboardInterrupt. Any other
            exception (including SystemExit) is re-raised unchanged.
    """
    check_dependencies()

    # Materialize up front: the failure report needs a total, and
    # links_after_timestamp may return a one-shot iterator (not visible
    # here) that the loop below would otherwise have consumed already.
    to_archive = list(links_after_timestamp(links, resume))

    # Pre-bind so the except block cannot raise NameError when the failure
    # happens before the first iteration.
    idx, link = 0, None
    try:
        for idx, link in enumerate(to_archive):
            # Use a fresh name: rebinding out_dir here would nest every
            # link's folder inside the previous link's folder.
            link_dir = os.path.join(out_dir, link['timestamp'])
            archive_link(link_dir, link)

    except (KeyboardInterrupt, SystemExit, Exception) as e:
        print('{red}[X] Archive update stopped on #{idx} out of {total} links{reset}'.format(
            **ANSI,
            idx=idx,
            total=len(to_archive),
        ))
        if link is not None:
            # Only offer a resume point when a link was actually reached.
            print(' Continue where you left off by running:')
            print(' ./archive.py {} {}'.format(
                export_path,
                link['timestamp'],
            ))
        if not isinstance(e, KeyboardInterrupt):
            raise  # bare raise re-raises the active exception as-is
        raise SystemExit(1)
|
||||||
|
|
||||||
|
|
||||||
|
def archive_link(out_dir, link, overwrite=False):
    """Run every enabled archive method for one link, then save its index.

    The index read from ``out_dir`` by parse_json_link_index is merged with
    ``link`` (values from ``link`` win). Each enabled method is called with
    the current link dict and its return value becomes the new link dict.
    The final dict is written with write_link_index and returned.
    """
    merged = dict(parse_json_link_index(out_dir))
    merged.update(link)
    log_link_archive(out_dir, merged)

    # (config flag, method) pairs, applied in this order.
    steps = (
        (FETCH_WGET, fetch_wget),
        (FETCH_PDF, fetch_pdf),
        (FETCH_SCREENSHOT, fetch_screenshot),
        (SUBMIT_ARCHIVE_DOT_ORG, archive_dot_org),
        # (FETCH_AUDIO, fetch_audio),
        # (FETCH_VIDEO, fetch_video),
        (FETCH_FAVICON, fetch_favicon),
    )
    for enabled, method in steps:
        if enabled:
            merged = method(out_dir, merged, overwrite=overwrite)

    write_link_index(out_dir, merged)

    return merged
|
||||||
|
|
||||||
|
|
||||||
def attach_result_to_link(method):
|
def attach_result_to_link(method):
|
||||||
|
"""
|
||||||
|
Instead of returning a result={output:'...', status:'success'} object,
|
||||||
|
attach that result to the links's history & latest fields, then return
|
||||||
|
the updated link object.
|
||||||
|
"""
|
||||||
def decorator(fetch_func):
|
def decorator(fetch_func):
|
||||||
@wraps(fetch_func)
|
@wraps(fetch_func)
|
||||||
def timed_fetch_func(out_dir, link, overwrite=False, **kwargs):
|
def timed_fetch_func(out_dir, link, overwrite=False, **kwargs):
|
||||||
# initialize methods and history json field on link
|
# initialize methods and history json field on link
|
||||||
link['methods'] = link.get('methods') or {}
|
link['latest'] = link.get('latest') or {}
|
||||||
link['methods'][method] = link['methods'].get(method) or None
|
link['latest'][method] = link['latest'].get(method) or None
|
||||||
link['history'] = link.get('history') or {}
|
link['history'] = link.get('history') or {}
|
||||||
link['history'][method] = link['history'].get(method) or []
|
link['history'][method] = link['history'].get(method) or []
|
||||||
|
|
||||||
start_ts = datetime.now().timestamp()
|
start_ts = datetime.now().timestamp()
|
||||||
|
|
||||||
# if a valid method output is already present, dont run the fetch function
|
# if a valid method output is already present, dont run the fetch function
|
||||||
if link['methods'][method] and not overwrite:
|
if link['latest'][method] and not overwrite:
|
||||||
print(' √ Skipping: {}'.format(method))
|
print(' √ Skipping: {}'.format(method))
|
||||||
result = None
|
result = None
|
||||||
else:
|
else:
|
||||||
|
@ -74,7 +137,7 @@ def attach_result_to_link(method):
|
||||||
history_entry['duration'] = duration
|
history_entry['duration'] = duration
|
||||||
history_entry.update(result or {})
|
history_entry.update(result or {})
|
||||||
link['history'][method].append(history_entry)
|
link['history'][method].append(history_entry)
|
||||||
link['methods'][method] = result['output']
|
link['latest'][method] = result['output']
|
||||||
|
|
||||||
_RESULTS_TOTALS[history_entry['status']] += 1
|
_RESULTS_TOTALS[history_entry['status']] += 1
|
||||||
|
|
||||||
|
@ -105,7 +168,6 @@ def fetch_wget(out_dir, link, requisites=FETCH_WGET_REQUISITES, timeout=TIMEOUT)
|
||||||
print(' got wget response code {}:'.format(result.returncode))
|
print(' got wget response code {}:'.format(result.returncode))
|
||||||
print('\n'.join(' ' + line for line in (result.stderr or result.stdout).decode().rsplit('\n', 10)[-10:] if line.strip()))
|
print('\n'.join(' ' + line for line in (result.stderr or result.stdout).decode().rsplit('\n', 10)[-10:] if line.strip()))
|
||||||
# raise Exception('Failed to wget download')
|
# raise Exception('Failed to wget download')
|
||||||
chmod_file(link['domain'], cwd=out_dir)
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
end()
|
end()
|
||||||
print(' Run to see full output:', 'cd {}; {}'.format(out_dir, ' '.join(CMD)))
|
print(' Run to see full output:', 'cd {}; {}'.format(out_dir, ' '.join(CMD)))
|
||||||
|
@ -140,7 +202,6 @@ def fetch_pdf(out_dir, link, timeout=TIMEOUT):
|
||||||
if result.returncode:
|
if result.returncode:
|
||||||
print(' ', (result.stderr or result.stdout).decode())
|
print(' ', (result.stderr or result.stdout).decode())
|
||||||
raise Exception('Failed to print PDF')
|
raise Exception('Failed to print PDF')
|
||||||
chmod_file('output.pdf', cwd=out_dir)
|
|
||||||
output = 'output.pdf'
|
output = 'output.pdf'
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
end()
|
end()
|
||||||
|
@ -338,67 +399,11 @@ def fetch_favicon(out_dir, link, timeout=TIMEOUT):
|
||||||
# print(' √ Skipping video download')
|
# print(' √ Skipping video download')
|
||||||
|
|
||||||
|
|
||||||
def archive_links(out_dir, links, export_path, resume=None):
|
|
||||||
check_dependencies()
|
|
||||||
|
|
||||||
to_archive = links_after_timestamp(links, resume)
|
|
||||||
try:
|
|
||||||
for idx, link in enumerate(to_archive):
|
|
||||||
out_dir = os.path.join(out_dir, link['timestamp'])
|
|
||||||
archive_link(out_dir, link)
|
|
||||||
|
|
||||||
except (KeyboardInterrupt, SystemExit, Exception) as e:
|
|
||||||
print('{red}[X] Archive update stopped on #{idx} out of {total} links{reset}'.format(
|
|
||||||
**ANSI,
|
|
||||||
idx=idx,
|
|
||||||
total=len(list(to_archive)),
|
|
||||||
))
|
|
||||||
print(' Continue where you left off by running:')
|
|
||||||
print(' ./archive.py {} {}'.format(
|
|
||||||
export_path,
|
|
||||||
link['timestamp'],
|
|
||||||
))
|
|
||||||
if not isinstance(e, KeyboardInterrupt):
|
|
||||||
raise e
|
|
||||||
raise SystemExit(1)
|
|
||||||
|
|
||||||
|
|
||||||
def archive_link(out_dir, link, overwrite=False, permissions=ARCHIVE_PERMISSIONS):
|
|
||||||
"""download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""
|
|
||||||
|
|
||||||
link = {**parse_json_link_index(out_dir), **link}
|
|
||||||
log_link_archive(out_dir, link)
|
|
||||||
|
|
||||||
if FETCH_WGET:
|
|
||||||
link = fetch_wget(out_dir, link, overwrite=overwrite)
|
|
||||||
|
|
||||||
if FETCH_PDF:
|
|
||||||
link = fetch_pdf(out_dir, link, overwrite=overwrite)
|
|
||||||
|
|
||||||
if FETCH_SCREENSHOT:
|
|
||||||
link = fetch_screenshot(out_dir, link, overwrite=overwrite)
|
|
||||||
|
|
||||||
if SUBMIT_ARCHIVE_DOT_ORG:
|
|
||||||
link = archive_dot_org(out_dir, link, overwrite=overwrite)
|
|
||||||
|
|
||||||
# if FETCH_AUDIO:
|
|
||||||
# link = fetch_audio(out_dir, link, overwrite=overwrite)
|
|
||||||
|
|
||||||
# if FETCH_VIDEO:
|
|
||||||
# link = fetch_video(out_dir, link, overwrite=overwrite)
|
|
||||||
|
|
||||||
if FETCH_FAVICON:
|
|
||||||
link = fetch_favicon(out_dir, link, overwrite=overwrite)
|
|
||||||
|
|
||||||
write_link_index(out_dir, link)
|
|
||||||
|
|
||||||
return link
|
|
||||||
|
|
||||||
def log_link_archive(out_dir, link):
|
def log_link_archive(out_dir, link):
|
||||||
update_existing = os.path.exists(out_dir)
|
update_existing = os.path.exists(out_dir)
|
||||||
if not update_existing:
|
if not update_existing:
|
||||||
os.makedirs(out_dir)
|
os.makedirs(out_dir)
|
||||||
run(['chmod', ARCHIVE_PERMISSIONS, out_dir], timeout=5)
|
|
||||||
|
|
||||||
print('[{symbol_color}{symbol}{reset}] [{timestamp}] "{title}": {blue}{base_url}{reset}'.format(
|
print('[{symbol_color}{symbol}{reset}] [{timestamp}] "{title}": {blue}{base_url}{reset}'.format(
|
||||||
symbol='*' if update_existing else '+',
|
symbol='*' if update_existing else '+',
|
||||||
|
|
29
config.py
29
config.py
|
@ -4,14 +4,11 @@ import shutil
|
||||||
|
|
||||||
from subprocess import run, PIPE
|
from subprocess import run, PIPE
|
||||||
|
|
||||||
# os.getenv('VARIABLE', 'DEFAULT') gets the value of environment
|
# ******************************************************************************
|
||||||
# variable "VARIABLE" and if it is not set, sets it to 'DEFAULT'
|
# * TO SET YOUR CONFIGURATION, EDIT THE VALUES BELOW, or use the 'env' command *
|
||||||
|
# * e.g. *
|
||||||
# for boolean values, check to see if the string is 'true', and
|
# * env USE_COLOR=True CHROME_BINARY=google-chrome ./archive.py export.html *
|
||||||
# if so, the python variable will be True
|
# ******************************************************************************
|
||||||
|
|
||||||
# *******************************************************************************
|
|
||||||
# *** TO SET YOUR PREFERENCES, EDIT THE VALUES HERE, or use the 'env' command ***
|
|
||||||
|
|
||||||
IS_TTY = sys.stdout.isatty()
|
IS_TTY = sys.stdout.isatty()
|
||||||
USE_COLOR = os.getenv('USE_COLOR', str(IS_TTY) ).lower() == 'true'
|
USE_COLOR = os.getenv('USE_COLOR', str(IS_TTY) ).lower() == 'true'
|
||||||
|
@ -35,8 +32,16 @@ LINK_INDEX_TEMPLATE = os.getenv('LINK_INDEX_TEMPLATE', 'templates/link_ind
|
||||||
INDEX_TEMPLATE = os.getenv('INDEX_TEMPLATE', 'templates/index.html')
|
INDEX_TEMPLATE = os.getenv('INDEX_TEMPLATE', 'templates/index.html')
|
||||||
INDEX_ROW_TEMPLATE = os.getenv('INDEX_ROW_TEMPLATE', 'templates/index_row.html')
|
INDEX_ROW_TEMPLATE = os.getenv('INDEX_ROW_TEMPLATE', 'templates/index_row.html')
|
||||||
|
|
||||||
# *******************************************************************************
|
### Output Paths
|
||||||
|
ROOT_FOLDER = os.path.dirname(os.path.abspath(__file__))
|
||||||
|
HTML_FOLDER = os.path.join(ARCHIVE_DIR, 'html')
|
||||||
|
ARCHIVE_FOLDER = os.path.join(HTML_FOLDER, 'archive')
|
||||||
|
|
||||||
|
# ******************************************************************************
|
||||||
|
# ********************** Do not edit below this point **************************
|
||||||
|
# ******************************************************************************
|
||||||
|
|
||||||
|
### Terminal Configuration
|
||||||
TERM_WIDTH = shutil.get_terminal_size((100, 10)).columns
|
TERM_WIDTH = shutil.get_terminal_size((100, 10)).columns
|
||||||
ANSI = {
|
ANSI = {
|
||||||
'reset': '\033[00;00m',
|
'reset': '\033[00;00m',
|
||||||
|
@ -53,17 +58,13 @@ if not USE_COLOR:
|
||||||
# dont show colors if USE_COLOR is False
|
# dont show colors if USE_COLOR is False
|
||||||
ANSI = {k: '' for k in ANSI.keys()}
|
ANSI = {k: '' for k in ANSI.keys()}
|
||||||
|
|
||||||
|
### Confirm Environment Setup
|
||||||
ROOT_FOLDER = os.path.dirname(os.path.abspath(__file__))
|
|
||||||
HTML_FOLDER = os.path.join(ARCHIVE_DIR, 'html')
|
|
||||||
ARCHIVE_FOLDER = os.path.join(HTML_FOLDER, 'archive')
|
|
||||||
try:
|
try:
|
||||||
GIT_SHA = run(["git", "rev-list", "-1", "HEAD", "./"], stdout=PIPE, cwd=ROOT_FOLDER).stdout.strip().decode()
|
GIT_SHA = run(["git", "rev-list", "-1", "HEAD", "./"], stdout=PIPE, cwd=ROOT_FOLDER).stdout.strip().decode()
|
||||||
except Exception:
|
except Exception:
|
||||||
GIT_SHA = None
|
GIT_SHA = None
|
||||||
print('[!] Warning, you need git installed for some archiving features to save correct version numbers!')
|
print('[!] Warning, you need git installed for some archiving features to save correct version numbers!')
|
||||||
|
|
||||||
|
|
||||||
if sys.stdout.encoding.upper() != 'UTF-8':
|
if sys.stdout.encoding.upper() != 'UTF-8':
|
||||||
print('[X] Your system is running python3 scripts with a bad locale setting: {} (it should be UTF-8).'.format(sys.stdout.encoding))
|
print('[X] Your system is running python3 scripts with a bad locale setting: {} (it should be UTF-8).'.format(sys.stdout.encoding))
|
||||||
print(' To fix it, add the line "export PYTHONIOENCODING=utf8" to your ~/.bashrc file (without quotes)')
|
print(' To fix it, add the line "export PYTHONIOENCODING=utf8" to your ~/.bashrc file (without quotes)')
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue