split up utils into separate files

Nick Sweeting 2019-04-30 23:13:04 -04:00
parent daf5951897
commit 95007d9137
23 changed files with 820 additions and 759 deletions

View file

@ -1,14 +1,28 @@
__package__ = 'archivebox.cli'
import re
import os
import sys
import time
import argparse
from datetime import datetime
from dataclasses import dataclass
from typing import Optional, List
from multiprocessing import Process
from typing import Optional, List, Dict, Union, IO
from ..index.schema import Link, ArchiveResult
from ..config import ANSI, OUTPUT_DIR, IS_TTY
from ..index.json import to_json
from ..index.csv import links_to_csv
from ..util import enforce_types
from ..config import (
ConfigDict,
ANSI,
OUTPUT_DIR,
IS_TTY,
SHOW_PROGRESS,
TERM_WIDTH,
)
@dataclass
@ -32,11 +46,104 @@ class RuntimeStats:
_LAST_RUN_STATS = RuntimeStats()
def pretty_path(path: str) -> str:
"""convert paths like .../ArchiveBox/archivebox/../output/abc into output/abc"""
pwd = os.path.abspath('.')
# parent = os.path.abspath(os.path.join(pwd, os.path.pardir))
return path.replace(pwd + '/', './')
class SmartFormatter(argparse.HelpFormatter):
"""Patched formatter that prints newlines in argparse help strings"""
def _split_lines(self, text, width):
if '\n' in text:
return text.splitlines()
return argparse.HelpFormatter._split_lines(self, text, width)
def reject_stdin(caller: str, stdin: Optional[IO]=sys.stdin) -> None:
"""Tell the user they passed stdin to a command that doesn't accept it"""
if stdin and not stdin.isatty():
stdin_raw_text = stdin.read().strip()
if stdin_raw_text:
print(
'{red}[X] The "{}" command does not accept stdin.{reset}\n'.format(
caller,
**ANSI,
)
)
print(' Run archivebox "{} --help" to see usage and examples.'.format(
caller,
))
print()
raise SystemExit(1)
def accept_stdin(stdin: Optional[IO]=sys.stdin) -> Optional[str]:
if stdin and not stdin.isatty():
return stdin.read()
return None
class TimedProgress:
"""Show a progress bar and measure elapsed time until .end() is called"""
def __init__(self, seconds, prefix=''):
if SHOW_PROGRESS:
self.p = Process(target=progress_bar, args=(seconds, prefix))
self.p.start()
self.stats = {'start_ts': datetime.now(), 'end_ts': None}
def end(self):
"""immediately end progress, clear the progressbar line, and save end_ts"""
end_ts = datetime.now()
self.stats['end_ts'] = end_ts
if SHOW_PROGRESS:
# protect from double termination
#if p is None or not hasattr(p, 'kill'):
# return
if self.p is not None:
self.p.terminate()
self.p = None
sys.stdout.write('\r{}{}\r'.format((' ' * TERM_WIDTH()), ANSI['reset'])) # clear whole terminal line
@enforce_types
def progress_bar(seconds: int, prefix: str='') -> None:
"""show timer in the form of progress bar, with percentage and seconds remaining"""
chunk = '█' if sys.stdout.encoding == 'UTF-8' else '#'
chunks = TERM_WIDTH() - len(prefix) - 20 # number of progress chunks to show (aka max bar width)
try:
for s in range(seconds * chunks):
chunks = TERM_WIDTH() - len(prefix) - 20
progress = s / chunks / seconds * 100
bar_width = round(progress/(100/chunks))
# ████████████████████ 0.9% (1/60sec)
sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)'.format(
prefix,
ANSI['green'],
(chunk * bar_width).ljust(chunks),
ANSI['reset'],
round(progress, 1),
round(s/chunks),
seconds,
))
sys.stdout.flush()
time.sleep(1 / chunks)
# ██████████████████████████████████ 100.0% (60/60sec)
sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)\n'.format(
prefix,
ANSI['red'],
chunk * chunks,
ANSI['reset'],
100.0,
seconds,
seconds,
))
sys.stdout.flush()
except KeyboardInterrupt:
print()
pass
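For reference, the extractor modules later in this diff wrap each long-running step in TimedProgress roughly like the sketch below; work() and the 60-second timeout are illustrative placeholders and not part of this commit, and SHOW_PROGRESS/TERM_WIDTH still come from the loaded config:

    from archivebox.cli.logging import TimedProgress

    timer = TimedProgress(60, prefix='      ')
    try:
        work()          # placeholder for the long-running step being timed
    finally:
        timer.end()     # stop the bar, clear the terminal line, record end_ts
    print(timer.stats)  # {'start_ts': datetime(...), 'end_ts': datetime(...)}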
### Parsing Stage
@ -223,10 +330,9 @@ def log_list_started(filter_patterns: Optional[List[str]], filter_type: str):
print(' {}'.format(' '.join(filter_patterns or ())))
def log_list_finished(links):
from ..util import links_to_csv
print()
print('---------------------------------------------------------------------------------------------------')
print(links_to_csv(links, csv_cols=['timestamp', 'is_archived', 'num_outputs', 'url'], header=True, ljust=16, separator=' | '))
print(links_to_csv(links, cols=['timestamp', 'is_archived', 'num_outputs', 'url'], header=True, ljust=16, separator=' | '))
print('---------------------------------------------------------------------------------------------------')
print()
@ -266,3 +372,129 @@ def log_removal_finished(all_links: int, to_keep: int):
**ANSI,
))
print(' Index now contains {} links.'.format(to_keep))
def log_shell_welcome_msg():
from . import list_subcommands
print('{green}# ArchiveBox Imports{reset}'.format(**ANSI))
print('{green}from archivebox.core.models import Page, User{reset}'.format(**ANSI))
print('{green}from archivebox import *\n {}{reset}'.format("\n ".join(list_subcommands().keys()), **ANSI))
print()
print('[i] Welcome to the ArchiveBox Shell!')
print(' https://github.com/pirate/ArchiveBox/wiki/Usage#Shell-Usage')
print()
print(' {lightred}Hint:{reset} Example use:'.format(**ANSI))
print(' print(Page.objects.filter(is_archived=True).count())')
print(' Page.objects.get(url="https://example.com").as_json()')
print(' add("https://example.com/some/new/url")')
### Helpers
@enforce_types
def pretty_path(path: str) -> str:
"""convert paths like .../ArchiveBox/archivebox/../output/abc into output/abc"""
pwd = os.path.abspath('.')
# parent = os.path.abspath(os.path.join(pwd, os.path.pardir))
return path.replace(pwd + '/', './')
@enforce_types
def printable_filesize(num_bytes: Union[int, float]) -> str:
for count in ['Bytes','KB','MB','GB']:
if num_bytes > -1024.0 and num_bytes < 1024.0:
return '%3.1f %s' % (num_bytes, count)
num_bytes /= 1024.0
return '%3.1f %s' % (num_bytes, 'TB')
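printable_filesize just keeps dividing by 1024 until the value drops below 1024, so the expected outputs look roughly like this quick illustrative check (not part of the commit):

    printable_filesize(512)          # '512.0 Bytes'
    printable_filesize(2048)         # '2.0 KB'
    printable_filesize(5 * 1024**2)  # '5.0 MB'
    printable_filesize(3 * 1024**4)  # '3.0 TB'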
@enforce_types
def printable_folders(folders: Dict[str, Optional[Link]],
json: bool=False,
csv: Optional[str]=None) -> str:
if json:
return to_json(folders.values(), indent=4, sort_keys=True)
elif csv:
return links_to_csv(folders.values(), cols=csv.split(','), header=True)
return '\n'.join(f'{folder} {link}' for folder, link in folders.items())
@enforce_types
def printable_config(config: ConfigDict, prefix: str='') -> str:
return f'\n{prefix}'.join(
f'{key}={val}'
for key, val in config.items()
if not (isinstance(val, dict) or callable(val))
)
@enforce_types
def printable_folder_status(name: str, folder: Dict) -> str:
if folder['enabled']:
if folder['is_valid']:
color, symbol, note = 'green', '√', 'valid'
else:
color, symbol, note, num_files = 'red', 'X', 'invalid', '?'
else:
color, symbol, note, num_files = 'lightyellow', '-', 'disabled', '-'
if folder['path']:
if os.path.exists(folder['path']):
num_files = (
f'{len(os.listdir(folder["path"]))} files'
if os.path.isdir(folder['path']) else
printable_filesize(os.path.getsize(folder['path']))
)
else:
num_files = 'missing'
if ' ' in folder['path']:
folder['path'] = f'"{folder["path"]}"'
return ' '.join((
ANSI[color],
symbol,
ANSI['reset'],
name.ljust(22),
(folder["path"] or '').ljust(76),
num_files.ljust(14),
ANSI[color],
note,
ANSI['reset'],
))
@enforce_types
def printable_dependency_version(name: str, dependency: Dict) -> str:
if dependency['enabled']:
if dependency['is_valid']:
color, symbol, note, version = 'green', '√', 'valid', ''
parsed_version_num = re.search(r'[\d\.]+', dependency['version'])
if parsed_version_num:
version = f'v{parsed_version_num[0]}'
if not version:
color, symbol, note, version = 'red', 'X', 'invalid', '?'
else:
color, symbol, note, version = 'lightyellow', '-', 'disabled', '-'
if ' ' in dependency["path"]:
dependency["path"] = f'"{dependency["path"]}"'
return ' '.join((
ANSI[color],
symbol,
ANSI['reset'],
name.ljust(22),
(dependency["path"] or '').ljust(76),
version.ljust(14),
ANSI[color],
note,
ANSI['reset'],
))
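The version detection in printable_dependency_version above keeps only the first run of digits and dots from the binary's version banner; a standalone sketch of that step (the curl banner is just an example input):

    import re

    raw = 'curl 7.64.1 (x86_64-apple-darwin18.0) libcurl/7.64.1'
    match = re.search(r'[\d\.]+', raw)
    version = f'v{match[0]}' if match else '?'   # -> 'v7.64.1'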

View file

@ -119,6 +119,27 @@ DEFAULT_CLI_COLORS = {
}
ANSI = {k: '' for k in DEFAULT_CLI_COLORS.keys()}
STATICFILE_EXTENSIONS = {
# 99.999% of the time, URLs ending in these extensions are static files
# that can be downloaded as-is, not html pages that need to be rendered
'gif', 'jpeg', 'jpg', 'png', 'tif', 'tiff', 'wbmp', 'ico', 'jng', 'bmp',
'svg', 'svgz', 'webp', 'ps', 'eps', 'ai',
'mp3', 'mp4', 'm4a', 'mpeg', 'mpg', 'mkv', 'mov', 'webm', 'm4v',
'flv', 'wmv', 'avi', 'ogg', 'ts', 'm3u8',
'pdf', 'txt', 'rtf', 'rtfd', 'doc', 'docx', 'ppt', 'pptx', 'xls', 'xlsx',
'atom', 'rss', 'css', 'js', 'json',
'dmg', 'iso', 'img',
'rar', 'war', 'hqx', 'zip', 'gz', 'bz2', '7z',
# Less common extensions to consider adding later
# jar, swf, bin, com, exe, dll, deb
# ear, hqx, eot, wmlc, kml, kmz, cco, jardiff, jnlp, run, msi, msp, msm,
# pl pm, prc pdb, rar, rpm, sea, sit, tcl tk, der, pem, crt, xpi, xspf,
# ra, mng, asx, asf, 3gpp, 3gp, mid, midi, kar, jad, wml, htc, mml
# These are always treated as pages, not as static files, never add them:
# html, htm, shtml, xhtml, xml, aspx, php, cgi
}
VERSION_FILENAME = 'VERSION'
PYTHON_DIR_NAME = 'archivebox'
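STATICFILE_EXTENSIONS now lives in config.py and is consumed by the is_static_file helper that stays in util.py (shown further down in this diff). A rough sketch of how that check behaves, with extension() here as a simplified stand-in for the real URL helper:

    STATICFILE_EXTENSIONS = {'pdf', 'png', 'mp4'}   # abridged copy of the set above

    def extension(url: str) -> str:
        # simplified stand-in: everything after the last '.' in the last path segment
        last_segment = url.rsplit('/', 1)[-1]
        return last_segment.rsplit('.', 1)[-1] if '.' in last_segment else ''

    def is_static_file(url: str) -> bool:
        return extension(url).lower() in STATICFILE_EXTENSIONS

    is_static_file('https://example.com/report.PDF')    # True: downloaded as-is
    is_static_file('https://example.com/post/index')    # False: rendered as a page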

View file

@ -64,3 +64,7 @@ class Page(models.Model):
@property
def base_url(self):
return self.as_link().base_url
@property
def link_dir(self):
return self.as_link().link_dir

View file

@ -4,17 +4,19 @@ import os
import sys
SECRET_KEY = '---------------- not a valid secret key ! ----------------'
DEBUG = True
DEBUG = os.getenv('DEBUG', 'False').lower() == 'true'
ALLOWED_HOSTS = ['*']
REPO_DIR = os.path.abspath(os.path.join(os.path.abspath(__file__), os.path.pardir, os.path.pardir))
OUTPUT_DIR = os.path.abspath(os.getenv('OUTPUT_DIR', os.curdir))
ARCHIVE_DIR = os.path.join(OUTPUT_DIR, 'archive')
DATABASE_FILE = os.path.join(OUTPUT_DIR, 'index.sqlite3')
ACTIVE_THEME = 'default'
IS_SHELL = 'shell' in sys.argv[:3] or 'shell_plus' in sys.argv[:3]
APPEND_SLASH = True
INSTALLED_APPS = [
'django.contrib.auth',

View file

@ -1,17 +1,6 @@
from cli import list_subcommands
from .config import ANSI
from cli.logging import log_shell_welcome_msg
if __name__ == '__main__':
print('{green}# ArchiveBox Imports{reset}'.format(**ANSI))
# print('from archivebox.core.models import Page, User')
print('{green}from archivebox.cli import\narchivebox_{}{reset}'.format("\narchivebox_".join(list_subcommands().keys()), **ANSI))
print()
print('[i] Welcome to the ArchiveBox Shell! Example use:')
print(' print(Page.objects.filter(is_archived=True).count())')
print(' Page.objects.get(url="https://example.com").as_json()')
print(' Page.objects.get(url="https://example.com").as_json()')
print(' from archivebox.main import get_invalid_folders')
from main import *
log_shell_welcome_msg()

View file

@ -5,16 +5,11 @@ import os
from typing import Optional, List, Dict, Tuple
from collections import defaultdict
from ..index.schema import Link, ArchiveResult, ArchiveOutput
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..system import run, PIPE, DEVNULL, chmod_file
from ..util import (
enforce_types,
TimedProgress,
run,
PIPE,
DEVNULL,
is_static_file,
ArchiveError,
chmod_file,
)
from ..config import (
VERSION,
@ -24,6 +19,7 @@ from ..config import (
CURL_VERSION,
CHECK_SSL_VALIDITY
)
from ..cli.logging import TimedProgress

View file

@ -4,22 +4,19 @@ import os
from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..system import run, PIPE, chmod_file
from ..util import (
enforce_types,
TimedProgress,
run,
PIPE,
is_static_file,
ArchiveError,
chrome_args,
chmod_file,
)
from ..config import (
TIMEOUT,
SAVE_DOM,
CHROME_VERSION,
)
from ..cli.logging import TimedProgress

View file

@ -5,14 +5,8 @@ import os
from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput
from ..util import (
enforce_types,
TimedProgress,
domain,
run,
PIPE,
chmod_file,
)
from ..system import chmod_file, run, PIPE
from ..util import enforce_types, domain
from ..config import (
TIMEOUT,
SAVE_FAVICON,
@ -20,6 +14,7 @@ from ..config import (
CURL_VERSION,
CHECK_SSL_VALIDITY,
)
from ..cli.logging import TimedProgress
@enforce_types

View file

@ -4,15 +4,11 @@ import os
from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..system import run, PIPE, chmod_file
from ..util import (
enforce_types,
TimedProgress,
run,
PIPE,
is_static_file,
ArchiveError,
chmod_file,
domain,
extension,
without_query,
@ -26,6 +22,7 @@ from ..config import (
GIT_DOMAINS,
CHECK_SSL_VALIDITY
)
from ..cli.logging import TimedProgress

View file

@ -4,15 +4,11 @@ import os
from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..system import run, PIPE, chmod_file
from ..util import (
enforce_types,
TimedProgress,
run,
PIPE,
is_static_file,
ArchiveError,
chmod_file,
)
from ..config import (
MEDIA_TIMEOUT,
@ -21,6 +17,7 @@ from ..config import (
YOUTUBEDL_VERSION,
CHECK_SSL_VALIDITY
)
from ..cli.logging import TimedProgress
@enforce_types

View file

@ -4,23 +4,19 @@ import os
from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..system import run, PIPE, chmod_file
from ..util import (
enforce_types,
TimedProgress,
run,
PIPE,
is_static_file,
ArchiveError,
chrome_args,
chmod_file,
)
from ..config import (
TIMEOUT,
SAVE_PDF,
CHROME_VERSION,
)
from ..cli.logging import TimedProgress
@enforce_types

View file

@ -4,22 +4,19 @@ import os
from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..system import run, PIPE, chmod_file
from ..util import (
enforce_types,
TimedProgress,
run,
PIPE,
is_static_file,
ArchiveError,
chrome_args,
chmod_file,
)
from ..config import (
TIMEOUT,
SAVE_SCREENSHOT,
CHROME_VERSION,
)
from ..cli.logging import TimedProgress

View file

@ -1,14 +1,14 @@
__package__ = 'archivebox.extractors'
import re
from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..util import (
enforce_types,
TimedProgress,
is_static_file,
ArchiveError,
fetch_page_title,
download_url,
htmldecode,
)
from ..config import (
TIMEOUT,
@ -16,6 +16,14 @@ from ..config import (
CURL_BINARY,
CURL_VERSION,
)
from ..cli.logging import TimedProgress
HTML_TITLE_REGEX = re.compile(
r'<title.*?>' # start matching text after <title> tag
r'(.[^<>]+)', # get everything up to these symbols
re.IGNORECASE | re.MULTILINE | re.DOTALL | re.UNICODE,
)
@enforce_types
@ -44,7 +52,9 @@ def save_title(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) ->
status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
try:
output = fetch_page_title(link.url, timeout=timeout, progress=False)
html = download_url(link.url, timeout=timeout)
match = re.search(HTML_TITLE_REGEX, html)
output = htmldecode(match.group(1).strip()) if match else None
if not output:
raise ArchiveError('Unable to detect page title')
except Exception as err:
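save_title now inlines the old fetch_page_title logic using download_url plus HTML_TITLE_REGEX. A quick sanity check of that regex against a made-up snippet (html.unescape stands in for the htmldecode helper, which wraps it):

    import re
    from html import unescape

    HTML_TITLE_REGEX = re.compile(
        r'<title.*?>'   # start matching text after the <title ...> tag
        r'(.[^<>]+)',   # capture everything up to the next < or >
        re.IGNORECASE | re.MULTILINE | re.DOTALL | re.UNICODE,
    )

    html = '<head><title lang="en">Example Domain &amp; More</title></head>'
    match = re.search(HTML_TITLE_REGEX, html)
    title = unescape(match.group(1).strip()) if match else None   # 'Example Domain & More'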

View file

@ -1,18 +1,22 @@
__package__ = 'archivebox.extractors'
import os
import re
from typing import Optional
from datetime import datetime
from ..index.schema import Link, ArchiveResult, ArchiveOutput
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..system import run, PIPE
from ..util import (
enforce_types,
TimedProgress,
run,
PIPE,
wget_output_path,
ArchiveError,
is_static_file,
without_scheme,
without_fragment,
without_query,
path,
domain,
urldecode,
)
from ..config import (
TIMEOUT,
@ -26,7 +30,7 @@ from ..config import (
WGET_USER_AGENT,
COOKIES_FILE,
)
from ..cli.logging import TimedProgress
@enforce_types
@ -121,3 +125,72 @@ def save_wget(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) ->
status=status,
**timer.stats,
)
@enforce_types
def wget_output_path(link: Link) -> Optional[str]:
"""calculate the path to the wgetted .html file, since wget may
adjust some paths to be different than the base_url path.
See docs on wget --adjust-extension (-E)
"""
if is_static_file(link.url):
return without_scheme(without_fragment(link.url))
# Wget downloads can save in a number of different ways depending on the url:
# https://example.com
# > example.com/index.html
# https://example.com?v=zzVa_tX1OiI
# > example.com/index.html?v=zzVa_tX1OiI.html
# https://www.example.com/?v=zzVa_tX1OiI
# > example.com/index.html?v=zzVa_tX1OiI.html
# https://example.com/abc
# > example.com/abc.html
# https://example.com/abc/
# > example.com/abc/index.html
# https://example.com/abc?v=zzVa_tX1OiI.html
# > example.com/abc?v=zzVa_tX1OiI.html
# https://example.com/abc/?v=zzVa_tX1OiI.html
# > example.com/abc/index.html?v=zzVa_tX1OiI.html
# https://example.com/abc/test.html
# > example.com/abc/test.html
# https://example.com/abc/test?v=zzVa_tX1OiI
# > example.com/abc/test?v=zzVa_tX1OiI.html
# https://example.com/abc/test/?v=zzVa_tX1OiI
# > example.com/abc/test/index.html?v=zzVa_tX1OiI.html
# There's also a lot of complexity around how the urlencoding and renaming
# is done for pages with query and hash fragments or extensions like shtml / htm / php / etc
# Since the wget algorithm for -E (appending .html) is incredibly complex
# and there's no way to get the computed output path from wget itself,
# rather than trying to reverse-engineer how it calculates the filename,
# we just look in the output folder and read the filename wget actually used from the filesystem
full_path = without_fragment(without_query(path(link.url))).strip('/')
search_dir = os.path.join(
link.link_dir,
domain(link.url),
urldecode(full_path),
)
for _ in range(4):
if os.path.exists(search_dir):
if os.path.isdir(search_dir):
html_files = [
f for f in os.listdir(search_dir)
if re.search(".+\\.[Ss]?[Hh][Tt][Mm][Ll]?$", f, re.I | re.M)
]
if html_files:
path_from_link_dir = search_dir.split(link.link_dir)[-1].strip('/')
return os.path.join(path_from_link_dir, html_files[0])
# Move up one directory level
search_dir = search_dir.rsplit('/', 1)[0]
if search_dir == link.link_dir:
break
return None
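The search at the end of wget_output_path boils down to: start at <link_dir>/<domain>/<url path> and walk up at most four directory levels looking for any .html/.shtml/.htm file, returning its path relative to link_dir. A simplified standalone sketch of just that loop (find_wgetted_html is an illustrative name, not part of this commit):

    import os
    import re
    from typing import Optional

    def find_wgetted_html(link_dir: str, search_dir: str) -> Optional[str]:
        for _ in range(4):
            if os.path.isdir(search_dir):
                html_files = [
                    f for f in os.listdir(search_dir)
                    if re.search(r'\.[Ss]?[Hh][Tt][Mm][Ll]?$', f)
                ]
                if html_files:
                    return os.path.relpath(os.path.join(search_dir, html_files[0]), link_dir)
            search_dir = search_dir.rsplit('/', 1)[0]   # move up one directory level
            if search_dir == link_dir:
                break
        return None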

View file

@ -10,12 +10,10 @@ from typing import List, Tuple, Dict, Optional, Iterable
from collections import OrderedDict
from contextlib import contextmanager
from ..parsers import parse_links
from ..system import atomic_write
from ..util import (
scheme,
enforce_types,
TimedProgress,
atomic_write,
ExtendedEncoder,
)
from ..config import (
@ -30,6 +28,7 @@ from ..config import (
stderr,
)
from ..cli.logging import (
TimedProgress,
log_indexing_process_started,
log_indexing_process_finished,
log_indexing_started,
@ -278,6 +277,8 @@ def import_new_links(existing_links: List[Link],
import_path: str,
out_dir: str=OUTPUT_DIR) -> Tuple[List[Link], List[Link]]:
from ..parsers import parse_links
new_links: List[Link] = []
# parse and validate the import file
@ -584,9 +585,9 @@ def fix_invalid_folder_locations(out_dir: str=OUTPUT_DIR) -> Tuple[List[str], Li
else:
shutil.move(entry.path, dest)
fixed.append(dest)
if link.link_dir != entry.path:
link = link.overwrite(link_dir=entry.path)
write_json_link_details(link, out_dir=entry.path)
timestamp = entry.path.rsplit('/', 1)[-1]
assert link.link_dir == entry.path
assert link.timestamp == timestamp
write_json_link_details(link, out_dir=entry.path)
return fixed, cant_fix

archivebox/index/csv.py (new file, 37 lines added)
View file

@ -0,0 +1,37 @@
__package__ = 'archivebox.index'
from typing import List, Optional, Any
from ..util import enforce_types
from .schema import Link
@enforce_types
def links_to_csv(links: List[Link],
cols: Optional[List[str]]=None,
header: bool=True,
separator: str=',',
ljust: int=0) -> str:
cols = cols or ['timestamp', 'is_archived', 'url']
header_str = ''
if header:
header_str = separator.join(col.ljust(ljust) for col in cols)
row_strs = (
link.to_csv(cols=cols, ljust=ljust, separator=separator)
for link in links
)
return '\n'.join((header_str, *row_strs))
@enforce_types
def to_csv(obj: Any, cols: List[str], separator: str=',', ljust: int=0) -> str:
from .json import to_json
return separator.join(
to_json(getattr(obj, col), indent=None).ljust(ljust)
for col in cols
)
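The list output printed by log_list_finished in cli/logging.py comes from this module; usage looks roughly like the sketch below, assuming links is a list of Link objects already loaded from the index:

    from archivebox.index.csv import links_to_csv

    print(links_to_csv(
        links,                # assumed: List[Link] loaded from the main index
        cols=['timestamp', 'is_archived', 'url'],
        header=True,
        ljust=16,
        separator=' | ',
    ))
    # roughly:
    # timestamp        | is_archived      | url
    # "1556678400.0"   | true             | "https://example.com"

Each cell is JSON-encoded via to_json(..., indent=None), which is why strings come out quoted and booleans lowercase.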

View file

@ -2,20 +2,18 @@ __package__ = 'archivebox.index'
import os
from string import Template
from datetime import datetime
from typing import List, Optional, Iterator
from typing import List, Optional, Iterator, Mapping
from .schema import Link
from ..system import atomic_write, copy_and_overwrite
from ..util import (
enforce_types,
ts_to_date,
urlencode,
htmlencode,
urldecode,
wget_output_path,
render_template,
atomic_write,
copy_and_overwrite,
)
from ..config import (
OUTPUT_DIR,
@ -67,7 +65,7 @@ def write_html_main_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished:
def main_index_template(links: List[Link], finished: bool=True) -> str:
"""render the template for the entire main index"""
return render_template(MAIN_INDEX_TEMPLATE, {
return render_legacy_template(MAIN_INDEX_TEMPLATE, {
'version': VERSION,
'git_sha': GIT_SHA,
'num_links': str(len(links)),
@ -86,7 +84,9 @@ def main_index_template(links: List[Link], finished: bool=True) -> str:
def main_index_row_template(link: Link) -> str:
"""render the template for an individual link row of the main index"""
return render_template(MAIN_INDEX_ROW_TEMPLATE, {
from ..extractors.wget import wget_output_path
return render_legacy_template(MAIN_INDEX_ROW_TEMPLATE, {
**link._asdict(extended=True),
# before pages are finished archiving, show loading msg instead of title
@ -122,9 +122,11 @@ def write_html_link_details(link: Link, out_dir: Optional[str]=None) -> None:
@enforce_types
def link_details_template(link: Link) -> str:
from ..extractors.wget import wget_output_path
link_info = link._asdict(extended=True)
return render_template(LINK_DETAILS_TEMPLATE, {
return render_legacy_template(LINK_DETAILS_TEMPLATE, {
**link_info,
**link_info['canonical'],
'title': (
@ -142,3 +144,13 @@ def link_details_template(link: Link) -> str:
'status_color': 'success' if link.is_archived else 'danger',
'oldest_archive_date': ts_to_date(link.oldest_archive_date),
})
@enforce_types
def render_legacy_template(template_path: str, context: Mapping[str, str]) -> str:
"""render a given html template string with the given template content"""
# will be replaced by django templates in the future
with open(template_path, 'r', encoding='utf-8') as template:
template_str = template.read()
return Template(template_str).substitute(**context)
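render_legacy_template is plain string.Template substitution over the legacy html template files (hence the note about Django templates); the stdlib behaviour it relies on, shown with a made-up template string:

    from string import Template

    template_str = '<tr><td>$title</td><td><a href="$url">$url</a></td></tr>'
    row = Template(template_str).substitute(title='Example', url='https://example.com')
    # '<tr><td>Example</td><td><a href="https://example.com">https://example.com</a></td></tr>'

Note that substitute() raises KeyError if any $placeholder in the template is missing from the context.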

View file

@ -2,13 +2,14 @@ __package__ = 'archivebox.index'
import os
import sys
import json
import json as pyjson
from datetime import datetime
from typing import List, Optional, Iterator
from typing import List, Optional, Iterator, Any
from .schema import Link, ArchiveResult
from ..util import enforce_types, atomic_write
from ..system import atomic_write
from ..util import enforce_types
from ..config import (
VERSION,
OUTPUT_DIR,
@ -46,7 +47,7 @@ def parse_json_main_index(out_dir: str=OUTPUT_DIR) -> Iterator[Link]:
index_path = os.path.join(out_dir, JSON_INDEX_FILENAME)
if os.path.exists(index_path):
with open(index_path, 'r', encoding='utf-8') as f:
links = json.load(f)['links']
links = pyjson.load(f)['links']
for link_json in links:
yield Link.from_json(link_json)
@ -95,12 +96,13 @@ def parse_json_link_details(out_dir: str) -> Optional[Link]:
if os.path.exists(existing_index):
with open(existing_index, 'r', encoding='utf-8') as f:
try:
link_json = json.load(f)
link_json = pyjson.load(f)
return Link.from_json(link_json)
except json.JSONDecodeError:
except pyjson.JSONDecodeError:
pass
return None
@enforce_types
def parse_json_links_details(out_dir: str) -> Iterator[Link]:
"""read through all the archive data folders and return the parsed links"""
@ -111,3 +113,41 @@ def parse_json_links_details(out_dir: str) -> Iterator[Link]:
link = parse_json_link_details(entry.path)
if link:
yield link
### Helpers
class ExtendedEncoder(pyjson.JSONEncoder):
"""
Extended json serializer that supports serializing several model
fields and objects
"""
def default(self, obj):
cls_name = obj.__class__.__name__
if hasattr(obj, '_asdict'):
return obj._asdict()
elif isinstance(obj, bytes):
return obj.decode()
elif isinstance(obj, datetime):
return obj.isoformat()
elif isinstance(obj, Exception):
return '{}: {}'.format(obj.__class__.__name__, obj)
elif cls_name in ('dict_items', 'dict_keys', 'dict_values'):
return tuple(obj)
return pyjson.JSONEncoder.default(self, obj)
@enforce_types
def to_json(obj: Any, indent: Optional[int]=4, sort_keys: bool=True, cls=ExtendedEncoder) -> str:
return pyjson.dumps(obj, indent=indent, sort_keys=sort_keys, cls=ExtendedEncoder)
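ExtendedEncoder mostly exists so datetimes, bytes, exceptions, and objects with _asdict survive serialization; a quick illustration using the module-level to_json (the values are arbitrary):

    from datetime import datetime
    from archivebox.index.json import to_json

    print(to_json({
        'fetched_at': datetime(2019, 4, 30, 23, 13),
        'raw': b'abc',
        'error': ValueError('bad url'),
    }, indent=None))
    # {"error": "ValueError: bad url", "fetched_at": "2019-04-30T23:13:00", "raw": "abc"}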

View file

@ -61,19 +61,20 @@ class ArchiveResult:
info['end_ts'] = parse_date(info['end_ts'])
return cls(**info)
def to_json(self, indent=4, sort_keys=True):
from ..util import to_json
def to_dict(self, *keys) -> dict:
if keys:
return {k: v for k, v in asdict(self).items() if k in keys}
return asdict(self)
def to_json(self, indent=4, sort_keys=True) -> str:
from .json import to_json
return to_json(self, indent=indent, sort_keys=sort_keys)
def to_csv(self, cols=None, ljust: int=0, separator: str=','):
from ..util import to_json
def to_csv(self, cols: Optional[List[str]]=None, separator: str=',', ljust: int=0) -> str:
from .csv import to_csv
cols = cols or self.field_names()
return separator.join(
to_json(getattr(self, col), indent=None).ljust(ljust)
for col in cols
)
return to_csv(self, cols=cols or self.field_names(), separator=separator, ljust=ljust)
@classmethod
def field_names(cls):
@ -201,18 +202,15 @@ class Link:
info['history'] = cast_history
return cls(**info)
def to_json(self, indent=4, sort_keys=True):
from ..util import to_json
def to_json(self, indent=4, sort_keys=True) -> str:
from .json import to_json
return to_json(self, indent=indent, sort_keys=sort_keys)
def to_csv(self, csv_cols: List[str], ljust: int=0, separator: str=','):
from ..util import to_json
def to_csv(self, cols: Optional[List[str]]=None, separator: str=',', ljust: int=0) -> str:
from .csv import to_csv
return separator.join(
to_json(getattr(self, col), indent=None).ljust(ljust)
for col in csv_cols
)
return to_csv(self, cols=cols or self.field_names(), separator=separator, ljust=ljust)
@classmethod
def field_names(cls):
@ -354,7 +352,7 @@ class Link:
def canonical_outputs(self) -> Dict[str, Optional[str]]:
"""predict the expected output paths that should be present after archiving"""
from ..util import wget_output_path
from ..extractors.wget import wget_output_path
canonical = {
'index_path': 'index.html',
'favicon_path': 'favicon.ico',
@ -382,3 +380,5 @@ class Link:
'dom_path': static_path,
})
return canonical

View file

@ -1,11 +1,10 @@
__package__ = 'archivebox'
import re
import os
import sys
import shutil
from typing import Dict, List, Optional, Set, Tuple, Iterable, IO
from typing import Dict, List, Optional, Iterable, IO
from crontab import CronTab, CronSlices
@ -17,18 +16,13 @@ from .cli import (
main_cmds,
archive_cmds,
)
from .index.schema import Link
from .util import (
enforce_types,
TimedProgress,
get_dir_size,
human_readable_size,
from .parsers import (
save_stdin_to_sources,
save_file_to_sources,
links_to_csv,
to_json,
folders_to_str,
)
from .index.schema import Link
from .util import enforce_types, docstring
from .system import get_dir_size, dedupe_cron_jobs, CRON_COMMENT
from .index import (
links_after_timestamp,
load_main_index,
@ -51,7 +45,11 @@ from .index.json import (
parse_json_main_index,
parse_json_links_details,
)
from .index.sql import parse_sql_main_index, get_admins, apply_migrations
from .index.sql import (
parse_sql_main_index,
get_admins,
apply_migrations,
)
from .index.html import parse_html_main_index
from .extractors import archive_link
from .config import (
@ -91,6 +89,7 @@ from .config import (
get_real_name,
)
from .cli.logging import (
TimedProgress,
log_archiving_started,
log_archiving_paused,
log_archiving_finished,
@ -98,6 +97,11 @@ from .cli.logging import (
log_removal_finished,
log_list_started,
log_list_finished,
printable_config,
printable_folders,
printable_filesize,
printable_folder_status,
printable_dependency_version,
)
@ -387,7 +391,7 @@ def info(out_dir: str=OUTPUT_DIR) -> None:
print('{green}[*] Scanning archive collection main index...{reset}'.format(**ANSI))
print(f' {out_dir}/*')
num_bytes, num_dirs, num_files = get_dir_size(out_dir, recursive=False, pattern='index.')
size = human_readable_size(num_bytes)
size = printable_filesize(num_bytes)
print(f' Size: {size} across {num_files} files')
print()
@ -419,7 +423,7 @@ def info(out_dir: str=OUTPUT_DIR) -> None:
print(f' {ARCHIVE_DIR}/*')
num_bytes, num_dirs, num_files = get_dir_size(ARCHIVE_DIR)
size = human_readable_size(num_bytes)
size = printable_filesize(num_bytes)
print(f' Size: {size} across {num_files} files in {num_dirs} directories')
print()
@ -712,13 +716,8 @@ def list_all(filter_patterns_str: Optional[str]=None,
out_dir=out_dir,
)
if csv:
print(links_to_csv(folders.values(), csv_cols=csv.split(','), header=True))
elif json:
print(to_json(folders.values(), indent=4, sort_keys=True))
else:
print(folders_to_str(folders))
raise SystemExit(not folders)
print(printable_folders(folders, json=json, csv=csv))
return folders
@enforce_types
@ -749,7 +748,7 @@ def list_folders(links: List[Link],
status: str,
out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
check_data_folder()
check_data_folder(out_dir=out_dir)
if status == 'indexed':
return get_indexed_folders(links, out_dir=out_dir)
@ -796,7 +795,7 @@ def config(config_options_str: Optional[str]=None,
)
raise SystemExit(2)
elif config_options_str:
config_options = stdin_raw_text.split('\n')
config_options = config_options_str.split('\n')
config_options = config_options or []
@ -865,7 +864,6 @@ def config(config_options_str: Optional[str]=None,
stderr(' Please manually remove the relevant lines from your config file:')
stderr(f' {CONFIG_FILE}')
raise SystemExit(2)
else:
stderr('[X] You must pass either --get or --set, or no arguments to get the whole config.', color='red')
stderr(' archivebox config')
@ -874,8 +872,6 @@ def config(config_options_str: Optional[str]=None,
raise SystemExit(2)
CRON_COMMENT = 'archivebox_schedule'
@enforce_types
def schedule(add: bool=False,
show: bool=False,
@ -893,7 +889,7 @@ def schedule(add: bool=False,
os.makedirs(os.path.join(out_dir, LOGS_DIR_NAME), exist_ok=True)
cron = CronTab(user=True)
cron = dedupe_jobs(cron)
cron = dedupe_cron_jobs(cron)
existing_jobs = list(cron.find_comment(CRON_COMMENT))
if foreground or run_all:
@ -962,7 +958,7 @@ def schedule(add: bool=False,
stderr(' archivebox init --every="0/5 * * * *" https://example.com/some/rss/feed.xml')
raise SystemExit(1)
cron = dedupe_jobs(cron)
cron = dedupe_cron_jobs(cron)
cron.write()
total_runs = sum(j.frequency_per_year() for j in cron)
@ -1025,95 +1021,13 @@ def manage(args: Optional[List[str]]=None, out_dir: str=OUTPUT_DIR) -> None:
execute_from_command_line([f'{ARCHIVEBOX_BINARY} manage', *(args or ['help'])])
@enforce_types
def shell(out_dir: str=OUTPUT_DIR) -> None:
"""Enter an interactive ArchiveBox Django shell"""
check_data_folder(out_dir=out_dir)
setup_django(OUTPUT_DIR)
from django.core.management import call_command
call_command("shell_plus")
# Helpers
def printable_config(config: ConfigDict, prefix: str='') -> str:
return f'\n{prefix}'.join(
f'{key}={val}'
for key, val in config.items()
if not (isinstance(val, dict) or callable(val))
)
def dedupe_jobs(cron: CronTab) -> CronTab:
deduped: Set[Tuple[str, str]] = set()
for job in list(cron):
unique_tuple = (str(job.slices), job.command)
if unique_tuple not in deduped:
deduped.add(unique_tuple)
cron.remove(job)
for schedule, command in deduped:
job = cron.new(command=command, comment=CRON_COMMENT)
job.setall(schedule)
job.enable()
return cron
def print_folder_status(name, folder):
if folder['enabled']:
if folder['is_valid']:
color, symbol, note = 'green', '', 'valid'
else:
color, symbol, note, num_files = 'red', 'X', 'invalid', '?'
else:
color, symbol, note, num_files = 'lightyellow', '-', 'disabled', '-'
if folder['path']:
if os.path.exists(folder['path']):
num_files = (
f'{len(os.listdir(folder["path"]))} files'
if os.path.isdir(folder['path']) else
human_readable_size(os.path.getsize(folder['path']))
)
else:
num_files = 'missing'
if ' ' in folder['path']:
folder['path'] = f'"{folder["path"]}"'
print(
ANSI[color],
symbol,
ANSI['reset'],
name.ljust(22),
(folder["path"] or '').ljust(76),
num_files.ljust(14),
ANSI[color],
note,
ANSI['reset'],
)
def print_dependency_version(name, dependency):
if dependency['enabled']:
if dependency['is_valid']:
color, symbol, note = 'green', '', 'valid'
version = 'v' + re.search(r'[\d\.]+', dependency['version'])[0]
else:
color, symbol, note, version = 'red', 'X', 'invalid', '?'
else:
color, symbol, note, version = 'lightyellow', '-', 'disabled', '-'
if ' ' in dependency["path"]:
dependency["path"] = f'"{dependency["path"]}"'
print(
ANSI[color],
symbol,
ANSI['reset'],
name.ljust(22),
(dependency["path"] or '').ljust(76),
version.ljust(14),
ANSI[color],
note,
ANSI['reset'],
)

View file

@ -7,16 +7,29 @@ For examples of supported import formats see tests/.
__package__ = 'archivebox.parsers'
import re
import os
from typing import Tuple, List
from datetime import datetime
from ..config import TIMEOUT
from ..util import (
check_url_parsing_invariants,
TimedProgress,
Link,
enforce_types,
from ..index.schema import Link
from ..system import atomic_write
from ..config import (
ANSI,
OUTPUT_DIR,
SOURCES_DIR_NAME,
TIMEOUT,
check_data_folder,
)
from ..util import (
basename,
domain,
download_url,
enforce_types,
URL_REGEX,
)
from ..cli.logging import pretty_path, TimedProgress
from .pocket_html import parse_pocket_html_export
from .pinboard_rss import parse_pinboard_rss_export
from .shaarli_rss import parse_shaarli_rss_export
@ -66,3 +79,95 @@ def parse_links(source_file: str) -> Tuple[List[Link], str]:
timer.end()
return [], 'Failed to parse'
@enforce_types
def save_stdin_to_sources(raw_text: str, out_dir: str=OUTPUT_DIR) -> str:
check_data_folder(out_dir=out_dir)
sources_dir = os.path.join(out_dir, SOURCES_DIR_NAME)
if not os.path.exists(sources_dir):
os.makedirs(sources_dir)
ts = str(datetime.now().timestamp()).split('.', 1)[0]
source_path = os.path.join(sources_dir, '{}-{}.txt'.format('stdin', ts))
atomic_write(raw_text, source_path)
return source_path
@enforce_types
def save_file_to_sources(path: str, timeout: int=TIMEOUT, out_dir: str=OUTPUT_DIR) -> str:
"""download a given url's content into output/sources/domain-<timestamp>.txt"""
check_data_folder(out_dir=out_dir)
sources_dir = os.path.join(out_dir, SOURCES_DIR_NAME)
if not os.path.exists(sources_dir):
os.makedirs(sources_dir)
ts = str(datetime.now().timestamp()).split('.', 1)[0]
source_path = os.path.join(sources_dir, '{}-{}.txt'.format(basename(path), ts))
if any(path.startswith(s) for s in ('http://', 'https://', 'ftp://')):
source_path = os.path.join(sources_dir, '{}-{}.txt'.format(domain(path), ts))
print('{}[*] [{}] Downloading {}{}'.format(
ANSI['green'],
datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
path,
ANSI['reset'],
))
timer = TimedProgress(timeout, prefix=' ')
try:
raw_source_text = download_url(path, timeout=timeout)
timer.end()
except Exception as e:
timer.end()
print('{}[!] Failed to download {}{}\n'.format(
ANSI['red'],
path,
ANSI['reset'],
))
print(' ', e)
raise SystemExit(1)
else:
with open(path, 'r') as f:
raw_source_text = f.read()
atomic_write(raw_source_text, source_path)
print(' > {}'.format(pretty_path(source_path)))
return source_path
def check_url_parsing_invariants() -> None:
"""Check that plain text regex URL parsing works as expected"""
# this is last-line-of-defense to make sure the URL_REGEX isn't
# misbehaving, as the consequences could be disastrous and lead to many
# incorrect/badly parsed links being added to the archive
test_urls = '''
https://example1.com/what/is/happening.html?what=1#how-about-this=1
https://example2.com/what/is/happening/?what=1#how-about-this=1
HTtpS://example3.com/what/is/happening/?what=1#how-about-this=1f
https://example4.com/what/is/happening.html
https://example5.com/
https://example6.com
<test>http://example7.com</test>
[https://example8.com/what/is/this.php?what=1]
[and http://example9.com?what=1&other=3#and-thing=2]
<what>https://example10.com#and-thing=2 "</about>
abc<this["https://example11.com/what/is#and-thing=2?whoami=23&where=1"]that>def
sdflkf[what](https://example12.com/who/what.php?whoami=1#whatami=2)?am=hi
example13.bada
and example14.badb
<or>htt://example15.badc</that>
'''
# print('\n'.join(re.findall(URL_REGEX, test_urls)))
assert len(re.findall(URL_REGEX, test_urls)) == 12

archivebox/system.py (new file, 150 lines added)
View file

@ -0,0 +1,150 @@
__package__ = 'archivebox'
import os
import shutil
import json as pyjson
from typing import Optional, Union, Set, Tuple
from crontab import CronTab
from subprocess import (
Popen,
PIPE,
DEVNULL,
CompletedProcess,
TimeoutExpired,
CalledProcessError,
)
from .util import enforce_types, ExtendedEncoder
from .config import OUTPUT_PERMISSIONS
def run(*popenargs, input=None, capture_output=False, timeout=None, check=False, **kwargs):
"""Patched of subprocess.run to fix blocking io making timeout=innefective"""
if input is not None:
if 'stdin' in kwargs:
raise ValueError('stdin and input arguments may not both be used.')
kwargs['stdin'] = PIPE
if capture_output:
if ('stdout' in kwargs) or ('stderr' in kwargs):
raise ValueError('stdout and stderr arguments may not be used '
'with capture_output.')
kwargs['stdout'] = PIPE
kwargs['stderr'] = PIPE
with Popen(*popenargs, **kwargs) as process:
try:
stdout, stderr = process.communicate(input, timeout=timeout)
except TimeoutExpired:
process.kill()
try:
stdout, stderr = process.communicate(input, timeout=2)
except:
pass
raise TimeoutExpired(popenargs[0][0], timeout)
except BaseException:
process.kill()
# We don't call process.wait() as .__exit__ does that for us.
raise
retcode = process.poll()
if check and retcode:
raise CalledProcessError(retcode, process.args,
output=stdout, stderr=stderr)
return CompletedProcess(process.args, retcode, stdout, stderr)
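The patched run() keeps subprocess.run's interface, so callers like chmod_file below can pass capture/timeout arguments unchanged; the difference is that a hung child gets killed instead of blocking forever in communicate(). An illustrative call (the sleep command is arbitrary):

    from subprocess import TimeoutExpired
    from archivebox.system import run

    try:
        result = run(['sleep', '10'], capture_output=True, timeout=1)
    except TimeoutExpired:
        print('child process exceeded the timeout and was killed')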
def atomic_write(contents: Union[dict, str, bytes], path: str) -> None:
"""Safe atomic write to filesystem by writing to temp file + atomic rename"""
try:
tmp_file = '{}.tmp'.format(path)
if isinstance(contents, bytes):
args = {'mode': 'wb+'}
else:
args = {'mode': 'w+', 'encoding': 'utf-8'}
with open(tmp_file, **args) as f:
if isinstance(contents, dict):
pyjson.dump(contents, f, indent=4, sort_keys=True, cls=ExtendedEncoder)
else:
f.write(contents)
os.fsync(f.fileno())
os.rename(tmp_file, path)
chmod_file(path)
finally:
if os.path.exists(tmp_file):
os.remove(tmp_file)
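atomic_write replaces bare open()/write() calls throughout the codebase: contents go to a .tmp file first and are moved into place with os.rename, so readers never observe a half-written index. A usage sketch, assuming an initialized config (the paths are examples; dict contents are serialized with ExtendedEncoder, str/bytes are written verbatim):

    from archivebox.system import atomic_write

    atomic_write({'version': '0.4.0', 'links': []}, '/tmp/index.json')   # written as JSON
    atomic_write('plain text contents\n', '/tmp/notes.txt')              # written verbatim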
@enforce_types
def chmod_file(path: str, cwd: str='.', permissions: str=OUTPUT_PERMISSIONS, timeout: int=30) -> None:
"""chmod -R <permissions> <cwd>/<path>"""
if not os.path.exists(os.path.join(cwd, path)):
raise Exception('Failed to chmod: {} does not exist (did the previous step fail?)'.format(path))
chmod_result = run(['chmod', '-R', permissions, path], cwd=cwd, stdout=DEVNULL, stderr=PIPE, timeout=timeout)
if chmod_result.returncode == 1:
print(' ', chmod_result.stderr.decode())
raise Exception('Failed to chmod {}/{}'.format(cwd, path))
@enforce_types
def copy_and_overwrite(from_path: str, to_path: str):
"""copy a given file or directory to a given path, overwriting the destination"""
if os.path.isdir(from_path):
shutil.rmtree(to_path, ignore_errors=True)
shutil.copytree(from_path, to_path)
else:
with open(from_path, 'rb') as src:
atomic_write(src.read(), to_path)
@enforce_types
def get_dir_size(path: str, recursive: bool=True, pattern: Optional[str]=None) -> Tuple[int, int, int]:
"""get the total disk size of a given directory, optionally summing up
recursively and limiting to a given filter list
"""
num_bytes, num_dirs, num_files = 0, 0, 0
for entry in os.scandir(path):
if (pattern is not None) and (pattern not in entry.path):
continue
if entry.is_dir(follow_symlinks=False):
if not recursive:
continue
num_dirs += 1
bytes_inside, dirs_inside, files_inside = get_dir_size(entry.path)
num_bytes += bytes_inside
num_dirs += dirs_inside
num_files += files_inside
else:
num_bytes += entry.stat(follow_symlinks=False).st_size
num_files += 1
return num_bytes, num_dirs, num_files
CRON_COMMENT = 'archivebox_schedule'
@enforce_types
def dedupe_cron_jobs(cron: CronTab) -> CronTab:
deduped: Set[Tuple[str, str]] = set()
for job in list(cron):
unique_tuple = (str(job.slices), job.command)
if unique_tuple not in deduped:
deduped.add(unique_tuple)
cron.remove(job)
for schedule, command in deduped:
job = cron.new(command=command, comment=CRON_COMMENT)
job.setall(schedule)
job.enable()
return cron
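dedupe_cron_jobs keys jobs on the (schedule, command) pair, so re-running archivebox schedule with the same arguments does not stack duplicate crontab entries. A small sketch using python-crontab, mirroring how schedule() in main.py calls it (the command string is illustrative):

    from crontab import CronTab
    from archivebox.system import CRON_COMMENT, dedupe_cron_jobs

    cron = CronTab(user=True)
    job = cron.new(command='archivebox add https://example.com/feed.xml', comment=CRON_COMMENT)
    job.setall('0 * * * *')          # hourly
    cron = dedupe_cron_jobs(cron)    # collapses identical (schedule, command) pairs
    cron.write()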

View file

@ -1,15 +1,8 @@
import os
import re
import sys
import ssl
import json
import time
import shutil
import argparse
from string import Template
from json import JSONEncoder
from typing import List, Dict, Optional, Any, Union, IO, Mapping, Tuple
from typing import List, Optional, Any
from inspect import signature
from functools import wraps
from hashlib import sha256
@ -17,34 +10,17 @@ from urllib.request import Request, urlopen
from urllib.parse import urlparse, quote, unquote
from html import escape, unescape
from datetime import datetime
from multiprocessing import Process
from subprocess import (
Popen,
PIPE,
DEVNULL,
CompletedProcess,
TimeoutExpired,
CalledProcessError,
)
from base32_crockford import encode as base32_encode # type: ignore
import json as pyjson
from .index.schema import Link
from .config import (
ANSI,
TERM_WIDTH,
OUTPUT_DIR,
SOURCES_DIR_NAME,
OUTPUT_PERMISSIONS,
TIMEOUT,
SHOW_PROGRESS,
SAVE_TITLE,
STATICFILE_EXTENSIONS,
CHECK_SSL_VALIDITY,
WGET_USER_AGENT,
CHROME_OPTIONS,
check_data_folder,
)
from .cli.logging import pretty_path
### Parsing Helpers
@ -66,6 +42,7 @@ base_url = lambda url: without_scheme(url) # uniq base url used to dedupe links
without_www = lambda url: url.replace('://www.', '://', 1)
without_trailing_slash = lambda url: url[:-1] if url[-1] == '/' else url.replace('/?', '?')
hashurl = lambda url: base32_encode(int(sha256(base_url(url).encode('utf-8')).hexdigest(), 16))[:20]
is_static_file = lambda url: extension(url).lower() in STATICFILE_EXTENSIONS # TODO: the proper way is with MIME type detection, not using extension
urlencode = lambda s: s and quote(s, encoding='utf-8', errors='replace')
urldecode = lambda s: s and unquote(s)
@ -85,36 +62,7 @@ URL_REGEX = re.compile(
r'[^\]\[\(\)<>\""\'\s]+', # stop parsing at these symbols
re.IGNORECASE,
)
HTML_TITLE_REGEX = re.compile(
r'<title.*?>' # start matching text after <title> tag
r'(.[^<>]+)', # get everything up to these symbols
re.IGNORECASE | re.MULTILINE | re.DOTALL | re.UNICODE,
)
STATICFILE_EXTENSIONS = {
# 99.999% of the time, URLs ending in these extentions are static files
# that can be downloaded as-is, not html pages that need to be rendered
'gif', 'jpeg', 'jpg', 'png', 'tif', 'tiff', 'wbmp', 'ico', 'jng', 'bmp',
'svg', 'svgz', 'webp', 'ps', 'eps', 'ai',
'mp3', 'mp4', 'm4a', 'mpeg', 'mpg', 'mkv', 'mov', 'webm', 'm4v',
'flv', 'wmv', 'avi', 'ogg', 'ts', 'm3u8',
'pdf', 'txt', 'rtf', 'rtfd', 'doc', 'docx', 'ppt', 'pptx', 'xls', 'xlsx',
'atom', 'rss', 'css', 'js', 'json',
'dmg', 'iso', 'img',
'rar', 'war', 'hqx', 'zip', 'gz', 'bz2', '7z',
# Less common extensions to consider adding later
# jar, swf, bin, com, exe, dll, deb
# ear, hqx, eot, wmlc, kml, kmz, cco, jardiff, jnlp, run, msi, msp, msm,
# pl pm, prc pdb, rar, rpm, sea, sit, tcl tk, der, pem, crt, xpi, xspf,
# ra, mng, asx, asf, 3gpp, 3gp, mid, midi, kar, jad, wml, htc, mml
# Thse are always treated as pages, not as static files, never add them:
# html, htm, shtml, xhtml, xml, aspx, php, cgi
}
### Checks & Tests
def enforce_types(func):
"""
@ -158,189 +106,14 @@ def enforce_types(func):
return typechecked_function
def check_url_parsing_invariants() -> None:
"""Check that plain text regex URL parsing works as expected"""
def docstring(text: Optional[str]):
"""attach the given docstring to the decorated function"""
def decorator(func):
if text:
func.__doc__ = text
return func
return decorator
# this is last-line-of-defense to make sure the URL_REGEX isn't
# misbehaving, as the consequences could be disastrous and lead to many
# incorrect/badly parsed links being added to the archive
test_urls = '''
https://example1.com/what/is/happening.html?what=1#how-about-this=1
https://example2.com/what/is/happening/?what=1#how-about-this=1
HTtpS://example3.com/what/is/happening/?what=1#how-about-this=1f
https://example4.com/what/is/happening.html
https://example5.com/
https://example6.com
<test>http://example7.com</test>
[https://example8.com/what/is/this.php?what=1]
[and http://example9.com?what=1&other=3#and-thing=2]
<what>https://example10.com#and-thing=2 "</about>
abc<this["https://example11.com/what/is#and-thing=2?whoami=23&where=1"]that>def
sdflkf[what](https://example12.com/who/what.php?whoami=1#whatami=2)?am=hi
example13.bada
and example14.badb
<or>htt://example15.badc</that>
'''
# print('\n'.join(re.findall(URL_REGEX, test_urls)))
assert len(re.findall(URL_REGEX, test_urls)) == 12
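The new docstring decorator (added above, interleaved with the removed check_url_parsing_invariants) replaces the heavier set_docstring wrapper removed further down; it simply attaches the given text as __doc__ so a function's help text can be reused elsewhere. Example (do_nothing is an arbitrary placeholder):

    from archivebox.util import docstring

    @docstring('Add a new URL or list of URLs to your archive')
    def do_nothing():
        ...

    print(do_nothing.__doc__)   # 'Add a new URL or list of URLs to your archive'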
### Random Helpers
@enforce_types
def save_stdin_to_sources(raw_text: str, out_dir: str=OUTPUT_DIR) -> str:
check_data_folder(out_dir=out_dir)
sources_dir = os.path.join(out_dir, SOURCES_DIR_NAME)
if not os.path.exists(sources_dir):
os.makedirs(sources_dir)
ts = str(datetime.now().timestamp()).split('.', 1)[0]
source_path = os.path.join(sources_dir, '{}-{}.txt'.format('stdin', ts))
atomic_write(raw_text, source_path)
return source_path
@enforce_types
def save_file_to_sources(path: str, timeout: int=TIMEOUT, out_dir: str=OUTPUT_DIR) -> str:
"""download a given url's content into output/sources/domain-<timestamp>.txt"""
check_data_folder(out_dir=out_dir)
sources_dir = os.path.join(out_dir, SOURCES_DIR_NAME)
if not os.path.exists(sources_dir):
os.makedirs(sources_dir)
ts = str(datetime.now().timestamp()).split('.', 1)[0]
source_path = os.path.join(sources_dir, '{}-{}.txt'.format(basename(path), ts))
if any(path.startswith(s) for s in ('http://', 'https://', 'ftp://')):
source_path = os.path.join(sources_dir, '{}-{}.txt'.format(domain(path), ts))
print('{}[*] [{}] Downloading {}{}'.format(
ANSI['green'],
datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
path,
ANSI['reset'],
))
timer = TimedProgress(timeout, prefix=' ')
try:
raw_source_text = download_url(path, timeout=timeout)
timer.end()
except Exception as e:
timer.end()
print('{}[!] Failed to download {}{}\n'.format(
ANSI['red'],
path,
ANSI['reset'],
))
print(' ', e)
raise SystemExit(1)
else:
with open(path, 'r') as f:
raw_source_text = f.read()
atomic_write(raw_source_text, source_path)
print(' > {}'.format(pretty_path(source_path)))
return source_path
@enforce_types
def fetch_page_title(url: str, timeout: int=10, progress: bool=SHOW_PROGRESS) -> Optional[str]:
"""Attempt to guess a page's title by downloading the html"""
if not SAVE_TITLE:
return None
try:
html = download_url(url, timeout=timeout)
match = re.search(HTML_TITLE_REGEX, html)
return htmldecode(match.group(1).strip()) if match else None
except Exception as err: # noqa
# print('[!] Failed to fetch title because of {}: {}'.format(
# err.__class__.__name__,
# err,
# ))
return None
@enforce_types
def wget_output_path(link: Link) -> Optional[str]:
"""calculate the path to the wgetted .html file, since wget may
adjust some paths to be different than the base_url path.
See docs on wget --adjust-extension (-E)
"""
if is_static_file(link.url):
return without_scheme(without_fragment(link.url))
# Wget downloads can save in a number of different ways depending on the url:
# https://example.com
# > output/archive/<timestamp>/example.com/index.html
# https://example.com?v=zzVa_tX1OiI
# > output/archive/<timestamp>/example.com/index.html?v=zzVa_tX1OiI.html
# https://www.example.com/?v=zzVa_tX1OiI
# > output/archive/<timestamp>/example.com/index.html?v=zzVa_tX1OiI.html
# https://example.com/abc
# > output/archive/<timestamp>/example.com/abc.html
# https://example.com/abc/
# > output/archive/<timestamp>/example.com/abc/index.html
# https://example.com/abc?v=zzVa_tX1OiI.html
# > output/archive/<timestamp>/example.com/abc?v=zzVa_tX1OiI.html
# https://example.com/abc/?v=zzVa_tX1OiI.html
# > output/archive/<timestamp>/example.com/abc/index.html?v=zzVa_tX1OiI.html
# https://example.com/abc/test.html
# > output/archive/<timestamp>/example.com/abc/test.html
# https://example.com/abc/test?v=zzVa_tX1OiI
# > output/archive/<timestamp>/example.com/abc/test?v=zzVa_tX1OiI.html
# https://example.com/abc/test/?v=zzVa_tX1OiI
# > output/archive/<timestamp>/example.com/abc/test/index.html?v=zzVa_tX1OiI.html
# There's also lots of complexity around how the urlencoding and renaming
# is done for pages with query and hash fragments or extensions like shtml / htm / php / etc
# Since the wget algorithm for -E (appending .html) is incredibly complex
# and there's no way to get the computed output path from wget
# in order to avoid having to reverse-engineer how they calculate it,
# we just look in the output folder read the filename wget used from the filesystem
full_path = without_fragment(without_query(path(link.url))).strip('/')
search_dir = os.path.join(
link.link_dir,
domain(link.url),
urldecode(full_path),
)
for _ in range(4):
if os.path.exists(search_dir):
if os.path.isdir(search_dir):
html_files = [
f for f in os.listdir(search_dir)
if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", f, re.I | re.M)
]
if html_files:
path_from_link_dir = search_dir.split(link.link_dir)[-1].strip('/')
return os.path.join(path_from_link_dir, html_files[0])
# Move up one directory level
search_dir = search_dir.rsplit('/', 1)[0]
if search_dir == link.link_dir:
break
return None
### String Manipulation & Logging Helpers
@enforce_types
def str_between(string: str, start: str, end: str=None) -> str:
@ -415,122 +188,6 @@ def parse_date(date: Any) -> Optional[datetime]:
raise ValueError('Tried to parse invalid date! {}'.format(date))
@enforce_types
def is_static_file(url: str) -> bool:
"""Certain URLs just point to a single static file, and
don't need to be re-archived in many formats
"""
# TODO: the proper way is with MIME type detection, not using extension
return extension(url) in STATICFILE_EXTENSIONS
### Python / System Helpers
def run(*popenargs, input=None, capture_output=False, timeout=None, check=False, **kwargs):
"""Patched of subprocess.run to fix blocking io making timeout=innefective"""
if input is not None:
if 'stdin' in kwargs:
raise ValueError('stdin and input arguments may not both be used.')
kwargs['stdin'] = PIPE
if capture_output:
if ('stdout' in kwargs) or ('stderr' in kwargs):
raise ValueError('stdout and stderr arguments may not be used '
'with capture_output.')
kwargs['stdout'] = PIPE
kwargs['stderr'] = PIPE
with Popen(*popenargs, **kwargs) as process:
try:
stdout, stderr = process.communicate(input, timeout=timeout)
except TimeoutExpired:
process.kill()
try:
stdout, stderr = process.communicate(input, timeout=2)
except:
pass
raise TimeoutExpired(popenargs[0][0], timeout)
except BaseException:
process.kill()
# We don't call process.wait() as .__exit__ does that for us.
raise
retcode = process.poll()
if check and retcode:
raise CalledProcessError(retcode, process.args,
output=stdout, stderr=stderr)
return CompletedProcess(process.args, retcode, stdout, stderr)
class TimedProgress:
"""Show a progress bar and measure elapsed time until .end() is called"""
def __init__(self, seconds, prefix=''):
if SHOW_PROGRESS:
self.p = Process(target=progress_bar, args=(seconds, prefix))
self.p.start()
self.stats = {'start_ts': datetime.now(), 'end_ts': None}
def end(self):
"""immediately end progress, clear the progressbar line, and save end_ts"""
end_ts = datetime.now()
self.stats['end_ts'] = end_ts
if SHOW_PROGRESS:
# protect from double termination
#if p is None or not hasattr(p, 'kill'):
# return
if self.p is not None:
self.p.terminate()
self.p = None
sys.stdout.write('\r{}{}\r'.format((' ' * TERM_WIDTH()), ANSI['reset'])) # clear whole terminal line
@enforce_types
def progress_bar(seconds: int, prefix: str='') -> None:
"""show timer in the form of progress bar, with percentage and seconds remaining"""
chunk = '█' if sys.stdout.encoding == 'UTF-8' else '#'
chunks = TERM_WIDTH() - len(prefix) - 20 # number of progress chunks to show (aka max bar width)
try:
for s in range(seconds * chunks):
chunks = TERM_WIDTH() - len(prefix) - 20
progress = s / chunks / seconds * 100
bar_width = round(progress/(100/chunks))
# ████████████████████ 0.9% (1/60sec)
sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)'.format(
prefix,
ANSI['green'],
(chunk * bar_width).ljust(chunks),
ANSI['reset'],
round(progress, 1),
round(s/chunks),
seconds,
))
sys.stdout.flush()
time.sleep(1 / chunks)
# ██████████████████████████████████ 100.0% (60/60sec)
sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)\n'.format(
prefix,
ANSI['red'],
chunk * chunks,
ANSI['reset'],
100.0,
seconds,
seconds,
))
sys.stdout.flush()
except KeyboardInterrupt:
print()
pass
@enforce_types
def download_url(url: str, timeout: int=TIMEOUT) -> str:
"""Download the contents of a remote url and return the text"""
@ -547,58 +204,6 @@ def download_url(url: str, timeout: int=TIMEOUT) -> str:
return resp.read().decode(encoding)
@enforce_types
def chmod_file(path: str, cwd: str='.', permissions: str=OUTPUT_PERMISSIONS, timeout: int=30) -> None:
"""chmod -R <permissions> <cwd>/<path>"""
if not os.path.exists(os.path.join(cwd, path)):
raise Exception('Failed to chmod: {} does not exist (did the previous step fail?)'.format(path))
chmod_result = run(['chmod', '-R', permissions, path], cwd=cwd, stdout=DEVNULL, stderr=PIPE, timeout=timeout)
if chmod_result.returncode == 1:
print(' ', chmod_result.stderr.decode())
raise Exception('Failed to chmod {}/{}'.format(cwd, path))
@enforce_types
def copy_and_overwrite(from_path: str, to_path: str):
if os.path.isdir(from_path):
shutil.rmtree(to_path, ignore_errors=True)
shutil.copytree(from_path, to_path)
else:
with open(from_path, 'rb') as src:
atomic_write(src.read(), to_path)
@enforce_types
def get_dir_size(path: str, recursive: bool=True, pattern: Optional[str]=None) -> Tuple[int, int, int]:
num_bytes, num_dirs, num_files = 0, 0, 0
for entry in os.scandir(path):
if (pattern is not None) and (pattern not in entry.path):
continue
if entry.is_dir(follow_symlinks=False):
if not recursive:
continue
num_dirs += 1
bytes_inside, dirs_inside, files_inside = get_dir_size(entry.path)
num_bytes += bytes_inside
num_dirs += dirs_inside
num_files += files_inside
else:
num_bytes += entry.stat(follow_symlinks=False).st_size
num_files += 1
return num_bytes, num_dirs, num_files
@enforce_types
def human_readable_size(num_bytes: Union[int, float]) -> str:
for count in ['Bytes','KB','MB','GB']:
if num_bytes > -1024.0 and num_bytes < 1024.0:
return '%3.1f %s' % (num_bytes, count)
num_bytes /= 1024.0
return '%3.1f %s' % (num_bytes, 'TB')
@enforce_types
def chrome_args(**options) -> List[str]:
"""helper to build up a chrome shell command with arguments"""
@ -632,7 +237,7 @@ def chrome_args(**options) -> List[str]:
return cmd_args
class ExtendedEncoder(JSONEncoder):
class ExtendedEncoder(pyjson.JSONEncoder):
"""
Extended json serializer that supports serializing several model
fields and objects
@ -656,114 +261,5 @@ class ExtendedEncoder(JSONEncoder):
elif cls_name in ('dict_items', 'dict_keys', 'dict_values'):
return tuple(obj)
return JSONEncoder.default(self, obj)
return pyjson.JSONEncoder.default(self, obj)
def to_json(obj: Any, file: IO=None, indent: Optional[int]=4, sort_keys: bool=True, cls=ExtendedEncoder) -> str:
if file:
path = os.path.realpath(file.name)
contents = json.dumps(obj, indent=indent, sort_keys=sort_keys, cls=ExtendedEncoder)
atomic_write(contents, path)
return contents
else:
return json.dumps(obj, indent=indent, sort_keys=sort_keys, cls=ExtendedEncoder)
def links_to_csv(links: List[Link], csv_cols: Optional[List[str]]=None,
header: bool=True, ljust: int=0, separator: str=',') -> str:
csv_cols = csv_cols or ['timestamp', 'is_archived', 'url']
header_str = ''
if header:
header_str = separator.join(col.ljust(ljust) for col in csv_cols)
row_strs = (
link.to_csv(csv_cols=csv_cols, ljust=ljust, separator=separator)
for link in links
)
return '\n'.join((header_str, *row_strs))
def folders_to_str(folders: Dict[str, Optional[Link]]) -> str:
return '\n'.join(f'{folder} {link}' for folder, link in folders.items())
@enforce_types
def render_template(template_path: str, context: Mapping[str, str]) -> str:
"""render a given html template string with the given template content"""
# will be replaced by django templates in the future
with open(template_path, 'r', encoding='utf-8') as template:
template_str = template.read()
return Template(template_str).substitute(**context)
def atomic_write(contents: Union[dict, str, bytes], path: str) -> None:
"""Safe atomic write to filesystem by writing to temp file + atomic rename"""
try:
tmp_file = '{}.tmp'.format(path)
if isinstance(contents, bytes):
args = {'mode': 'wb+'}
else:
args = {'mode': 'w+', 'encoding': 'utf-8'}
with open(tmp_file, **args) as f:
if isinstance(contents, dict):
to_json(contents, file=f)
else:
f.write(contents)
os.fsync(f.fileno())
os.rename(tmp_file, path)
chmod_file(path)
finally:
if os.path.exists(tmp_file):
os.remove(tmp_file)
def reject_stdin(caller: str, stdin: Optional[IO]=sys.stdin) -> None:
"""Tell the user they passed stdin to a command that doesn't accept it"""
if stdin and not stdin.isatty():
stdin_raw_text = stdin.read().strip()
if stdin_raw_text:
print(
'{red}[X] The "{}" command does not accept stdin.{reset}\n'.format(
caller,
**ANSI,
)
)
print(' Run archivebox "{} --help" to see usage and examples.'.format(
caller,
))
print()
raise SystemExit(1)
def accept_stdin(stdin: Optional[IO]=sys.stdin) -> Optional[str]:
if stdin and not stdin.isatty():
return stdin.read()
return None
def set_docstring(text: str):
def decorator(func):
@wraps(func)
def wrapper_with_docstring(*args, **kwargs):
return func(*args, **kwargs)
wrapper_with_docstring.__doc__ = text
return wrapper_with_docstring
return decorator
class SmartFormatter(argparse.HelpFormatter):
def _split_lines(self, text, width):
if '\n' in text:
return text.splitlines()
return argparse.HelpFormatter._split_lines(self, text, width)
class ArchiveError(Exception):
def __init__(self, message, hints=None):
super().__init__(message)
self.hints = hints