Mirror of https://github.com/ArchiveBox/ArchiveBox.git (synced 2025-05-13 22:54:27 -04:00)

Commit bd9f3e313f: better logging during long output
Parent: 1c5732d5c6

6 changed files with 63 additions and 72 deletions
@@ -94,7 +94,7 @@ def main(*args):


 def update_archive_data(import_path=None, resume=None):
     """The main ArchiveBox entrancepoint. Everything starts here."""
     check_dependencies()

     # Step 1: Load list of links from the existing index
@@ -1,6 +1,5 @@
 import os

-from functools import wraps
 from collections import defaultdict
 from datetime import datetime

@@ -50,10 +49,9 @@ from util import (
     run, PIPE, DEVNULL
 )
 from logs import (
-    _LAST_RUN_STATS,
     log_link_archiving_started,
     log_link_archiving_finished,
-    log_archive_method_starting,
+    log_archive_method_started,
     log_archive_method_finished,
 )

@@ -94,6 +92,7 @@ def archive_link(link_dir, link):
             link['history'][method_name] = []
         if method_name not in link['latest']:
             link['latest'][method_name] = None

         if not should_run(link_dir, link):
             continue
+
@@ -101,7 +100,7 @@ def archive_link(link_dir, link):
         skipped_entirely = False
         print()

-        log_archive_method_starting(method_name)
+        log_archive_method_started(method_name)
         result = method_function(link_dir, link)
         log_archive_method_finished(result)

@@ -109,11 +108,6 @@ def archive_link(link_dir, link):
         if result['status'] == 'succeeded':
             link['latest'][method_name] = result['output']

-        if result['status'] != 'skipped':
-            made_changes = True
-
-        _LAST_RUN_STATS[result['status']] += 1
-
         write_link_index(link_dir, link)
         patch_links_index(link)

@@ -126,6 +120,7 @@ def archive_link(link_dir, link):
     return link


+### Archive Method Functions

 def should_fetch_title(link_dir, link):
     # if link already has valid title, skip it
@@ -428,8 +423,8 @@ def should_fetch_git(link_dir, link):
         return False

     is_clonable_url = (
-        domain(link['url']) in GIT_DOMAINS
-        or extension(link['url']) == 'git'
+        (domain(link['url']) in GIT_DOMAINS)
+        or (extension(link['url']) == 'git')
     )
     if not is_clonable_url:
         return False

@@ -477,6 +472,7 @@ def fetch_git(link_dir, link, timeout=TIMEOUT):
         **timer.stats,
     }

+
 def should_fetch_media(link_dir, link):
     if is_static_file(link['url']):
         return False
@@ -547,21 +543,6 @@ def fetch_media(link_dir, link, timeout=MEDIA_TIMEOUT):
         **timer.stats,
     }

-def parse_archive_dot_org_response(response):
-    # Parse archive.org response headers
-    headers = defaultdict(list)
-
-    # lowercase all the header names and store in dict
-    for header in response.splitlines():
-        if b':' not in header or not header.strip():
-            continue
-        name, val = header.decode().split(':', 1)
-        headers[name.lower().strip()].append(val.strip())
-
-    # Get successful archive url in "content-location" header or any errors
-    content_location = headers['content-location']
-    errors = headers['x-archive-wayback-runtime-error']
-    return content_location, errors

 def should_fetch_archive_dot_org(link_dir, link):
     if is_static_file(link['url']):

@@ -627,4 +608,18 @@ def archive_dot_org(link_dir, link, timeout=TIMEOUT):
         **timer.stats,
     }

+def parse_archive_dot_org_response(response):
+    # Parse archive.org response headers
+    headers = defaultdict(list)
+
+    # lowercase all the header names and store in dict
+    for header in response.splitlines():
+        if b':' not in header or not header.strip():
+            continue
+        name, val = header.decode().split(':', 1)
+        headers[name.lower().strip()].append(val.strip())
+
+    # Get successful archive url in "content-location" header or any errors
+    content_location = headers['content-location']
+    errors = headers['x-archive-wayback-runtime-error']
+    return content_location, errors
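Note: parse_archive_dot_org_response is only moved below archive_dot_org here, not changed. A self-contained sketch of how it behaves; the sample response bytes are illustrative, not a real capture from archive.org:

from collections import defaultdict

def parse_archive_dot_org_response(response):
    # same logic as the function relocated in the hunk above
    headers = defaultdict(list)
    for header in response.splitlines():
        if b':' not in header or not header.strip():
            continue
        name, val = header.decode().split(':', 1)
        headers[name.lower().strip()].append(val.strip())
    content_location = headers['content-location']
    errors = headers['x-archive-wayback-runtime-error']
    return content_location, errors

# Illustrative input only: a minimal slice of what a Wayback submission response can contain.
sample = (
    b'HTTP/2 200\r\n'
    b'Content-Location: /web/20190324000000/https://example.com\r\n'
    b'X-App-Server: wwwb-app42\r\n'
)
content_location, errors = parse_archive_dot_org_response(sample)
# content_location == ['/web/20190324000000/https://example.com'], errors == []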
@@ -26,6 +26,7 @@ from util import (
 from parse import parse_links
 from links import validate_links
 from logs import (
+    log_indexing_process_started,
     log_indexing_started,
     log_indexing_finished,
     log_parsing_started,

@@ -40,12 +41,14 @@ TITLE_LOADING_MSG = 'Not yet archived...'
 def write_links_index(out_dir, links, finished=False):
     """create index.html file for a given list of links"""

-    log_indexing_started()
+    log_indexing_process_started()
     check_links_structure(links)

+    log_indexing_started(out_dir, 'index.json')
     write_json_links_index(out_dir, links)
     log_indexing_finished(out_dir, 'index.json')

+    log_indexing_started(out_dir, 'index.html')
     write_html_links_index(out_dir, links, finished=finished)
     log_indexing_finished(out_dir, 'index.html')

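Combined with the logging changes further down, write_links_index now reports each index file separately instead of emitting one summary line. A minimal runnable sketch of the new call order, with the writers and loggers stubbed out; the real log_indexing_started/log_indexing_finished (see the logs hunks below) overwrite their "> path" line in place rather than printing a second line:

def log_indexing_process_started():
    print('[*] Saving main index files...')

def log_indexing_started(out_dir, out_file):
    print(' > {}/{}'.format(out_dir, out_file))       # placeholder for the in-progress line

def log_indexing_finished(out_dir, out_file):
    print(' √ {}/{}'.format(out_dir, out_file))       # placeholder for the completed line

def write_links_index(out_dir, links, finished=False):
    log_indexing_process_started()

    log_indexing_started(out_dir, 'index.json')
    # write_json_links_index(out_dir, links) runs here in the real module
    log_indexing_finished(out_dir, 'index.json')

    log_indexing_started(out_dir, 'index.html')
    # write_html_links_index(out_dir, links, finished=finished) runs here
    log_indexing_finished(out_dir, 'index.html')

write_links_index('output', links=[])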
@@ -3,33 +3,26 @@ In ArchiveBox, a Link represents a single entry that we track in the
 json index. All links pass through all archiver functions and the latest,
 most up-to-date canonical output for each is stored in "latest".


 Link {
-    timestamp: str,  (how we uniquely id links)     _   _  _ _  ___
-    url: str,                                      | \ / \ |\| ' |
-    base_url: str,                                 |_/ \_/ | |   |
-    domain: str,                                    _   _ _ _ _  _
-    tags: str,                                     |_) /| |\| | / `
-    type: str,                                     |   /"| | | | \_,
-    title: str,                                       ,-'"`-.
-    sources: [str],                                /// /  @ @  \ \\\
-    latest: {                                      \ :=| ,._,. |=: /
-        ...,                                       || ,\  \_../ /. ||
-        pdf: 'output.pdf',                         ||','`-._))'`.`||
-        wget: 'example.com/1234/index.html'        `-'     (/    `-'
+    timestamp: str,  (how we uniquely id links)
+    url: str,
+    title: str,
+    tags: str,
+    sources: [str],
+    latest: {
+        ...,
+        pdf: 'output.pdf',
+        wget: 'example.com/1234/index.html',
+        screenshot: null,
     },
     history: {
-        ...
         pdf: [
-            {timestamp: 15444234325, status: 'skipped', result='output.pdf'},
+            {start_ts, end_ts, duration, cmd, pwd, status, output},
             ...
         ],
-        wget: [
-            {timestamp: 11534435345, status: 'succeded', result='donuts.com/eat/them.html'}
-        ]
+        ...
     },
 }

 """

 from html import unescape
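For reference, a single Link entry matching the revised docstring might look like the dict below. All values are made up, and the concrete types for start_ts, duration, and cmd are assumptions, since the schema only names the fields:

example_link = {
    'timestamp': '1553376605',                 # how links are uniquely identified
    'url': 'https://example.com/some/page',
    'title': 'Example Page',
    'tags': 'news,example',
    'sources': ['downloads/bookmarks.html'],
    'latest': {
        'pdf': 'output.pdf',
        'wget': 'example.com/some/page/index.html',
        'screenshot': None,                    # null in the docstring == not archived yet
    },
    'history': {
        'pdf': [
            {
                'start_ts': '2019-03-23 21:30:05',
                'end_ts': '2019-03-23 21:30:08',
                'duration': 3000,
                'cmd': ['chromium-browser', '--headless', '--print-to-pdf', 'https://example.com/some/page'],
                'pwd': 'output/archive/1553376605',
                'status': 'succeeded',
                'output': 'output.pdf',
            },
        ],
    },
}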
@@ -45,13 +45,21 @@ def log_link_archiving_started(link_dir, link, is_new):
     ))

 def log_link_archiving_finished(link_dir, link, is_new, skipped_entirely):
+    if all(output == 'succeeded' for output in link['latest']):
+        _LAST_RUN_STATS['succeeded'] += 1
+    elif skipped_entirely or all(output == 'skipped' for output in link['latest']):
+        _LAST_RUN_STATS['skipped'] += 1
+    else:
+        _LAST_RUN_STATS['failed'] += 1
+        # import ipdb; ipdb.set_trace()
+
     if skipped_entirely:
         print('\r √ {}{}'.format(
             pretty_path(link_dir),
             ' (new)' if is_new else '',
         ))

-def log_archive_method_starting(method):
+def log_archive_method_started(method):
     print(' > {}'.format(method))

 def log_archive_method_finished(result):
@@ -117,7 +125,7 @@ def log_parsing_finished(num_new_links, parser_name):
         parser_name,
     ))

-def log_indexing_started():
+def log_indexing_process_started():
     start_ts = datetime.now()
     _LAST_RUN_STATS['index_start_ts'] = start_ts
     print('{green}[*] [{}] Saving main index files...{reset}'.format(

@@ -125,10 +133,13 @@ def log_indexing_started():
         **ANSI,
     ))

+def log_indexing_started(out_dir, out_file):
+    sys.stdout.write(' > {}/{}'.format(pretty_path(out_dir), out_file))
+
 def log_indexing_finished(out_dir, out_file):
     end_ts = datetime.now()
     _LAST_RUN_STATS['index_end_ts'] = end_ts
-    print(' √ {}/{}'.format(pretty_path(out_dir), out_file))
+    print('\r √ {}/{}'.format(pretty_path(out_dir), out_file))

 def log_archiving_started(num_links, resume):
     start_ts = datetime.now()
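The recurring trick in these logging changes is to write the in-progress line without a trailing newline and then overwrite it with a carriage return once the step finishes, so long runs don't leave dangling "> ..." lines. A tiny standalone demo of that pattern; the strings are simplified stand-ins for the real log output:

import sys
import time

link_dir = 'output/archive/1553376605'

sys.stdout.write(' > {}'.format(link_dir))   # in-progress marker, no trailing newline
sys.stdout.flush()
time.sleep(0.5)                              # pretend an archive method or index write ran here
print('\r √ {}'.format(link_dir))            # '\r' rewinds to column 0 and replaces the line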
@@ -314,10 +314,20 @@ def wget_output_path(link):
     # Wget downloads can save in a number of different ways depending on the url:
     #    https://example.com
     #       > output/archive/<timestamp>/example.com/index.html
+    #    https://example.com?v=zzVa_tX1OiI
+    #       > output/archive/<timestamp>/example.com/index.html?v=zzVa_tX1OiI.html
+    #    https://www.example.com/?v=zzVa_tX1OiI
+    #       > output/archive/<timestamp>/example.com/index.html?v=zzVa_tX1OiI.html
+
     #    https://example.com/abc
     #       > output/archive/<timestamp>/example.com/abc.html
     #    https://example.com/abc/
     #       > output/archive/<timestamp>/example.com/abc/index.html
+    #    https://example.com/abc?v=zzVa_tX1OiI.html
+    #       > output/archive/<timestamp>/example.com/abc?v=zzVa_tX1OiI.html
+    #    https://example.com/abc/?v=zzVa_tX1OiI.html
+    #       > output/archive/<timestamp>/example.com/abc/index.html?v=zzVa_tX1OiI.html
+
     #    https://example.com/abc/test.html
     #       > output/archive/<timestamp>/example.com/abc/test.html
     #    https://example.com/abc/test?v=zzVa_tX1OiI
@@ -326,7 +336,7 @@ def wget_output_path(link):
     #       > output/archive/<timestamp>/example.com/abc/test/index.html?v=zzVa_tX1OiI.html

     # There's also lots of complexity around how the urlencoding and renaming
-    # is done for pages with query and hash fragments or extensions like shtml / htm
+    # is done for pages with query and hash fragments or extensions like shtml / htm / php / etc

     # Since the wget algorithm for -E (appending .html) is incredibly complex
     # and there's no way to get the computed output path from wget
@@ -359,27 +369,6 @@ def wget_output_path(link):

     return None

-    # If finding the actual output file didn't work, fall back to the buggy
-    # implementation of the wget .html appending algorithm
-    # split_url = link['url'].split('#', 1)
-    # query = ('%3F' + link['url'].split('?', 1)[-1]) if '?' in link['url'] else ''
-
-    # if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", split_url[0], re.I | re.M):
-    #     # already ends in .html
-    #     return urlencode(base_url(link['url']))
-    # else:
-    #     # .html needs to be appended
-    #     without_scheme = split_url[0].split('://', 1)[-1].split('?', 1)[0]
-    #     if without_scheme.endswith('/'):
-    #         if query:
-    #             return urlencode('#'.join([without_scheme + 'index.html' + query + '.html', *split_url[1:]]))
-    #         return urlencode('#'.join([without_scheme + 'index.html', *split_url[1:]]))
-    #     else:
-    #         if query:
-    #             return urlencode('#'.join([without_scheme + '/index.html' + query + '.html', *split_url[1:]]))
-    #         elif '/' in without_scheme:
-    #             return urlencode('#'.join([without_scheme + '.html', *split_url[1:]]))
-    #         return urlencode(base_url(link['url']) + '/index.html')

 ### String Manipulation & Logging Helpers

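With the commented-out path-guessing fallback deleted, wget_output_path is left relying on locating the file wget actually wrote, as the surviving comments describe. A rough sketch of that strategy under stated assumptions: the directory layout follows the output/archive/<timestamp>/<domain>/ convention from the comments above, and the helper name and scan below are illustrative, not the real implementation:

import os

def find_wget_output(link_dir, url_domain):
    """Look for the HTML file wget actually produced instead of guessing its name."""
    search_dir = os.path.join(link_dir, url_domain)
    if not os.path.isdir(search_dir):
        return None

    html_files = sorted(
        name for name in os.listdir(search_dir)
        if name.endswith('.html') or name.endswith('.htm')
    )
    if not html_files:
        return None

    # prefer index.html when present, otherwise take whatever wget saved
    chosen = 'index.html' if 'index.html' in html_files else html_files[0]
    return os.path.join(url_domain, chosen)

# e.g. find_wget_output('output/archive/1553376605', 'example.com')
# might return 'example.com/index.html'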