diff --git a/archivebox/cli/logging.py b/archivebox/cli/logging.py index 87a7fab1..e07b0719 100644 --- a/archivebox/cli/logging.py +++ b/archivebox/cli/logging.py @@ -1,14 +1,28 @@ __package__ = 'archivebox.cli' +import re import os import sys +import time +import argparse from datetime import datetime from dataclasses import dataclass -from typing import Optional, List +from multiprocessing import Process +from typing import Optional, List, Dict, Union, IO from ..index.schema import Link, ArchiveResult -from ..config import ANSI, OUTPUT_DIR, IS_TTY +from ..index.json import to_json +from ..index.csv import links_to_csv +from ..util import enforce_types +from ..config import ( + ConfigDict, + ANSI, + OUTPUT_DIR, + IS_TTY, + SHOW_PROGRESS, + TERM_WIDTH, +) @dataclass @@ -32,11 +46,104 @@ class RuntimeStats: _LAST_RUN_STATS = RuntimeStats() -def pretty_path(path: str) -> str: - """convert paths like .../ArchiveBox/archivebox/../output/abc into output/abc""" - pwd = os.path.abspath('.') - # parent = os.path.abspath(os.path.join(pwd, os.path.pardir)) - return path.replace(pwd + '/', './') + +class SmartFormatter(argparse.HelpFormatter): + """Patched formatter that prints newlines in argparse help strings""" + def _split_lines(self, text, width): + if '\n' in text: + return text.splitlines() + return argparse.HelpFormatter._split_lines(self, text, width) + + +def reject_stdin(caller: str, stdin: Optional[IO]=sys.stdin) -> None: + """Tell the user they passed stdin to a command that doesn't accept it""" + + if stdin and not stdin.isatty(): + stdin_raw_text = stdin.read().strip() + if stdin_raw_text: + print( + '{red}[X] The "{}" command does not accept stdin.{reset}\n'.format( + caller, + **ANSI, + ) + ) + print(' Run archivebox "{} --help" to see usage and examples.'.format( + caller, + )) + print() + raise SystemExit(1) + +def accept_stdin(stdin: Optional[IO]=sys.stdin) -> Optional[str]: + if stdin and not stdin.isatty(): + return stdin.read() + return None + + +class TimedProgress: + """Show a progress bar and measure elapsed time until .end() is called""" + + def __init__(self, seconds, prefix=''): + if SHOW_PROGRESS: + self.p = Process(target=progress_bar, args=(seconds, prefix)) + self.p.start() + + self.stats = {'start_ts': datetime.now(), 'end_ts': None} + + def end(self): + """immediately end progress, clear the progressbar line, and save end_ts""" + + end_ts = datetime.now() + self.stats['end_ts'] = end_ts + if SHOW_PROGRESS: + # protect from double termination + #if p is None or not hasattr(p, 'kill'): + # return + if self.p is not None: + self.p.terminate() + + self.p = None + + sys.stdout.write('\r{}{}\r'.format((' ' * TERM_WIDTH()), ANSI['reset'])) # clear whole terminal line + + +@enforce_types +def progress_bar(seconds: int, prefix: str='') -> None: + """show timer in the form of progress bar, with percentage and seconds remaining""" + chunk = '█' if sys.stdout.encoding == 'UTF-8' else '#' + chunks = TERM_WIDTH() - len(prefix) - 20 # number of progress chunks to show (aka max bar width) + try: + for s in range(seconds * chunks): + chunks = TERM_WIDTH() - len(prefix) - 20 + progress = s / chunks / seconds * 100 + bar_width = round(progress/(100/chunks)) + + # ████████████████████ 0.9% (1/60sec) + sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)'.format( + prefix, + ANSI['green'], + (chunk * bar_width).ljust(chunks), + ANSI['reset'], + round(progress, 1), + round(s/chunks), + seconds, + )) + sys.stdout.flush() + time.sleep(1 / chunks) + + # ██████████████████████████████████ 
100.0% (60/60sec) + sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)\n'.format( + prefix, + ANSI['red'], + chunk * chunks, + ANSI['reset'], + 100.0, + seconds, + seconds, + )) + sys.stdout.flush() + except KeyboardInterrupt: + print() + pass ### Parsing Stage @@ -223,10 +330,9 @@ def log_list_started(filter_patterns: Optional[List[str]], filter_type: str): print(' {}'.format(' '.join(filter_patterns or ()))) def log_list_finished(links): - from ..util import links_to_csv print() print('---------------------------------------------------------------------------------------------------') - print(links_to_csv(links, csv_cols=['timestamp', 'is_archived', 'num_outputs', 'url'], header=True, ljust=16, separator=' | ')) + print(links_to_csv(links, cols=['timestamp', 'is_archived', 'num_outputs', 'url'], header=True, ljust=16, separator=' | ')) print('---------------------------------------------------------------------------------------------------') print() @@ -266,3 +372,129 @@ def log_removal_finished(all_links: int, to_keep: int): **ANSI, )) print(' Index now contains {} links.'.format(to_keep)) + + +def log_shell_welcome_msg(): + from . import list_subcommands + + print('{green}# ArchiveBox Imports{reset}'.format(**ANSI)) + print('{green}from archivebox.core.models import Page, User{reset}'.format(**ANSI)) + print('{green}from archivebox import *\n {}{reset}'.format("\n ".join(list_subcommands().keys()), **ANSI)) + print() + print('[i] Welcome to the ArchiveBox Shell!') + print(' https://github.com/pirate/ArchiveBox/wiki/Usage#Shell-Usage') + print() + print(' {lightred}Hint:{reset} Example use:'.format(**ANSI)) + print(' print(Page.objects.filter(is_archived=True).count())') + print(' Page.objects.get(url="https://example.com").as_json()') + print(' add("https://example.com/some/new/url")') + + + +### Helpers + +@enforce_types +def pretty_path(path: str) -> str: + """convert paths like .../ArchiveBox/archivebox/../output/abc into output/abc""" + pwd = os.path.abspath('.') + # parent = os.path.abspath(os.path.join(pwd, os.path.pardir)) + return path.replace(pwd + '/', './') + + +@enforce_types +def printable_filesize(num_bytes: Union[int, float]) -> str: + for count in ['Bytes','KB','MB','GB']: + if num_bytes > -1024.0 and num_bytes < 1024.0: + return '%3.1f %s' % (num_bytes, count) + num_bytes /= 1024.0 + return '%3.1f %s' % (num_bytes, 'TB') + + +@enforce_types +def printable_folders(folders: Dict[str, Optional[Link]], + json: bool=False, + csv: Optional[str]=None) -> str: + if json: + return to_json(folders.values(), indent=4, sort_keys=True) + + elif csv: + return links_to_csv(folders.values(), cols=csv.split(','), header=True) + + return '\n'.join(f'{folder} {link}' for folder, link in folders.items()) + + + +@enforce_types +def printable_config(config: ConfigDict, prefix: str='') -> str: + return f'\n{prefix}'.join( + f'{key}={val}' + for key, val in config.items() + if not (isinstance(val, dict) or callable(val)) + ) + + +@enforce_types +def printable_folder_status(name: str, folder: Dict) -> str: + if folder['enabled']: + if folder['is_valid']: + color, symbol, note = 'green', '√', 'valid' + else: + color, symbol, note, num_files = 'red', 'X', 'invalid', '?' 
+ else: + color, symbol, note, num_files = 'lightyellow', '-', 'disabled', '-' + + if folder['path']: + if os.path.exists(folder['path']): + num_files = ( + f'{len(os.listdir(folder["path"]))} files' + if os.path.isdir(folder['path']) else + printable_filesize(os.path.getsize(folder['path'])) + ) + else: + num_files = 'missing' + + if ' ' in folder['path']: + folder['path'] = f'"{folder["path"]}"' + + return ' '.join(( + ANSI[color], + symbol, + ANSI['reset'], + name.ljust(22), + (folder["path"] or '').ljust(76), + num_files.ljust(14), + ANSI[color], + note, + ANSI['reset'], + )) + + +@enforce_types +def printable_dependency_version(name: str, dependency: Dict) -> str: + if dependency['enabled']: + if dependency['is_valid']: + color, symbol, note, version = 'green', '√', 'valid', '' + + parsed_version_num = re.search(r'[\d\.]+', dependency['version']) + if parsed_version_num: + version = f'v{parsed_version_num[0]}' + + if not version: + color, symbol, note, version = 'red', 'X', 'invalid', '?' + else: + color, symbol, note, version = 'lightyellow', '-', 'disabled', '-' + + if ' ' in dependency["path"]: + dependency["path"] = f'"{dependency["path"]}"' + + return ' '.join(( + ANSI[color], + symbol, + ANSI['reset'], + name.ljust(22), + (dependency["path"] or '').ljust(76), + version.ljust(14), + ANSI[color], + note, + ANSI['reset'], + )) diff --git a/archivebox/config/__init__.py b/archivebox/config/__init__.py index fdaf9ca4..72baec64 100644 --- a/archivebox/config/__init__.py +++ b/archivebox/config/__init__.py @@ -119,6 +119,27 @@ DEFAULT_CLI_COLORS = { } ANSI = {k: '' for k in DEFAULT_CLI_COLORS.keys()} +STATICFILE_EXTENSIONS = { + # 99.999% of the time, URLs ending in these extentions are static files + # that can be downloaded as-is, not html pages that need to be rendered + 'gif', 'jpeg', 'jpg', 'png', 'tif', 'tiff', 'wbmp', 'ico', 'jng', 'bmp', + 'svg', 'svgz', 'webp', 'ps', 'eps', 'ai', + 'mp3', 'mp4', 'm4a', 'mpeg', 'mpg', 'mkv', 'mov', 'webm', 'm4v', + 'flv', 'wmv', 'avi', 'ogg', 'ts', 'm3u8', + 'pdf', 'txt', 'rtf', 'rtfd', 'doc', 'docx', 'ppt', 'pptx', 'xls', 'xlsx', + 'atom', 'rss', 'css', 'js', 'json', + 'dmg', 'iso', 'img', + 'rar', 'war', 'hqx', 'zip', 'gz', 'bz2', '7z', + + # Less common extensions to consider adding later + # jar, swf, bin, com, exe, dll, deb + # ear, hqx, eot, wmlc, kml, kmz, cco, jardiff, jnlp, run, msi, msp, msm, + # pl pm, prc pdb, rar, rpm, sea, sit, tcl tk, der, pem, crt, xpi, xspf, + # ra, mng, asx, asf, 3gpp, 3gp, mid, midi, kar, jad, wml, htc, mml + + # Thse are always treated as pages, not as static files, never add them: + # html, htm, shtml, xhtml, xml, aspx, php, cgi +} VERSION_FILENAME = 'VERSION' PYTHON_DIR_NAME = 'archivebox' diff --git a/archivebox/core/models.py b/archivebox/core/models.py index a41f3d1c..2900f798 100644 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -64,3 +64,7 @@ class Page(models.Model): @property def base_url(self): return self.as_link().base_url + + @property + def link_dir(self): + return self.as_link().link_dir diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py index b225a899..e128f8d0 100644 --- a/archivebox/core/settings.py +++ b/archivebox/core/settings.py @@ -4,17 +4,19 @@ import os import sys SECRET_KEY = '---------------- not a valid secret key ! 
----------------' -DEBUG = True +DEBUG = os.getenv('DEBUG', 'False').lower() == 'true' ALLOWED_HOSTS = ['*'] REPO_DIR = os.path.abspath(os.path.join(os.path.abspath(__file__), os.path.pardir, os.path.pardir)) OUTPUT_DIR = os.path.abspath(os.getenv('OUTPUT_DIR', os.curdir)) +ARCHIVE_DIR = os.path.join(OUTPUT_DIR, 'archive') DATABASE_FILE = os.path.join(OUTPUT_DIR, 'index.sqlite3') ACTIVE_THEME = 'default' IS_SHELL = 'shell' in sys.argv[:3] or 'shell_plus' in sys.argv[:3] +APPEND_SLASH = True INSTALLED_APPS = [ 'django.contrib.auth', diff --git a/archivebox/core/welcome_message.py b/archivebox/core/welcome_message.py index 70410c75..b257b7d6 100644 --- a/archivebox/core/welcome_message.py +++ b/archivebox/core/welcome_message.py @@ -1,17 +1,6 @@ -from cli import list_subcommands - -from .config import ANSI +from cli.logging import log_shell_welcome_msg if __name__ == '__main__': - print('{green}# ArchiveBox Imports{reset}'.format(**ANSI)) - # print('from archivebox.core.models import Page, User') - print('{green}from archivebox.cli import\narchivebox_{}{reset}'.format("\narchivebox_".join(list_subcommands().keys()), **ANSI)) - print() - print('[i] Welcome to the ArchiveBox Shell! Example use:') - print(' print(Page.objects.filter(is_archived=True).count())') - print(' Page.objects.get(url="https://example.com").as_json()') - - print(' Page.objects.get(url="https://example.com").as_json()') - - print(' from archivebox.main import get_invalid_folders') + from main import * + log_shell_welcome_msg() diff --git a/archivebox/extractors/archive_org.py b/archivebox/extractors/archive_org.py index ad6d409b..7522ddb8 100644 --- a/archivebox/extractors/archive_org.py +++ b/archivebox/extractors/archive_org.py @@ -5,16 +5,11 @@ import os from typing import Optional, List, Dict, Tuple from collections import defaultdict -from ..index.schema import Link, ArchiveResult, ArchiveOutput +from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError +from ..system import run, PIPE, DEVNULL, chmod_file from ..util import ( enforce_types, - TimedProgress, - run, - PIPE, - DEVNULL, is_static_file, - ArchiveError, - chmod_file, ) from ..config import ( VERSION, @@ -24,6 +19,7 @@ from ..config import ( CURL_VERSION, CHECK_SSL_VALIDITY ) +from ..cli.logging import TimedProgress diff --git a/archivebox/extractors/dom.py b/archivebox/extractors/dom.py index a002302f..331531c0 100644 --- a/archivebox/extractors/dom.py +++ b/archivebox/extractors/dom.py @@ -4,22 +4,19 @@ import os from typing import Optional -from ..index.schema import Link, ArchiveResult, ArchiveOutput +from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError +from ..system import run, PIPE, chmod_file from ..util import ( enforce_types, - TimedProgress, - run, - PIPE, is_static_file, - ArchiveError, chrome_args, - chmod_file, ) from ..config import ( TIMEOUT, SAVE_DOM, CHROME_VERSION, ) +from ..cli.logging import TimedProgress diff --git a/archivebox/extractors/favicon.py b/archivebox/extractors/favicon.py index 0dff3900..ab5485c8 100644 --- a/archivebox/extractors/favicon.py +++ b/archivebox/extractors/favicon.py @@ -5,14 +5,8 @@ import os from typing import Optional from ..index.schema import Link, ArchiveResult, ArchiveOutput -from ..util import ( - enforce_types, - TimedProgress, - domain, - run, - PIPE, - chmod_file, -) +from ..system import chmod_file, run, PIPE +from ..util import enforce_types, domain from ..config import ( TIMEOUT, SAVE_FAVICON, @@ -20,6 +14,7 @@ from ..config import ( CURL_VERSION, 
CHECK_SSL_VALIDITY, ) +from ..cli.logging import TimedProgress @enforce_types diff --git a/archivebox/extractors/git.py b/archivebox/extractors/git.py index 21a86f5e..54e67d8b 100644 --- a/archivebox/extractors/git.py +++ b/archivebox/extractors/git.py @@ -4,15 +4,11 @@ import os from typing import Optional -from ..index.schema import Link, ArchiveResult, ArchiveOutput +from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError +from ..system import run, PIPE, chmod_file from ..util import ( enforce_types, - TimedProgress, - run, - PIPE, is_static_file, - ArchiveError, - chmod_file, domain, extension, without_query, @@ -26,6 +22,7 @@ from ..config import ( GIT_DOMAINS, CHECK_SSL_VALIDITY ) +from ..cli.logging import TimedProgress diff --git a/archivebox/extractors/media.py b/archivebox/extractors/media.py index 9fd9a9be..ece47f0a 100644 --- a/archivebox/extractors/media.py +++ b/archivebox/extractors/media.py @@ -4,15 +4,11 @@ import os from typing import Optional -from ..index.schema import Link, ArchiveResult, ArchiveOutput +from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError +from ..system import run, PIPE, chmod_file from ..util import ( enforce_types, - TimedProgress, - run, - PIPE, is_static_file, - ArchiveError, - chmod_file, ) from ..config import ( MEDIA_TIMEOUT, @@ -21,6 +17,7 @@ from ..config import ( YOUTUBEDL_VERSION, CHECK_SSL_VALIDITY ) +from ..cli.logging import TimedProgress @enforce_types diff --git a/archivebox/extractors/pdf.py b/archivebox/extractors/pdf.py index e7ade948..c29f3b22 100644 --- a/archivebox/extractors/pdf.py +++ b/archivebox/extractors/pdf.py @@ -4,23 +4,19 @@ import os from typing import Optional -from ..index.schema import Link, ArchiveResult, ArchiveOutput +from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError +from ..system import run, PIPE, chmod_file from ..util import ( enforce_types, - TimedProgress, - run, - PIPE, is_static_file, - ArchiveError, chrome_args, - chmod_file, ) from ..config import ( TIMEOUT, SAVE_PDF, CHROME_VERSION, ) - +from ..cli.logging import TimedProgress @enforce_types diff --git a/archivebox/extractors/screenshot.py b/archivebox/extractors/screenshot.py index 3e211939..d2879c95 100644 --- a/archivebox/extractors/screenshot.py +++ b/archivebox/extractors/screenshot.py @@ -4,22 +4,19 @@ import os from typing import Optional -from ..index.schema import Link, ArchiveResult, ArchiveOutput +from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError +from ..system import run, PIPE, chmod_file from ..util import ( enforce_types, - TimedProgress, - run, - PIPE, is_static_file, - ArchiveError, chrome_args, - chmod_file, ) from ..config import ( TIMEOUT, SAVE_SCREENSHOT, CHROME_VERSION, ) +from ..cli.logging import TimedProgress diff --git a/archivebox/extractors/title.py b/archivebox/extractors/title.py index c8ba9dd3..497c0ffb 100644 --- a/archivebox/extractors/title.py +++ b/archivebox/extractors/title.py @@ -1,14 +1,14 @@ __package__ = 'archivebox.extractors' +import re from typing import Optional -from ..index.schema import Link, ArchiveResult, ArchiveOutput +from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError from ..util import ( enforce_types, - TimedProgress, is_static_file, - ArchiveError, - fetch_page_title, + download_url, + htmldecode, ) from ..config import ( TIMEOUT, @@ -16,6 +16,14 @@ from ..config import ( CURL_BINARY, CURL_VERSION, ) +from ..cli.logging import TimedProgress + + +HTML_TITLE_REGEX = re.compile( + r'' 
# start matching text after tag + r'(.[^<>]+)', # get everything up to these symbols + re.IGNORECASE | re.MULTILINE | re.DOTALL | re.UNICODE, +) @enforce_types @@ -44,7 +52,9 @@ def save_title(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> status = 'succeeded' timer = TimedProgress(timeout, prefix=' ') try: - output = fetch_page_title(link.url, timeout=timeout, progress=False) + html = download_url(link.url, timeout=timeout) + match = re.search(HTML_TITLE_REGEX, html) + output = htmldecode(match.group(1).strip()) if match else None if not output: raise ArchiveError('Unable to detect page title') except Exception as err: diff --git a/archivebox/extractors/wget.py b/archivebox/extractors/wget.py index 7a0568f1..782d6d31 100644 --- a/archivebox/extractors/wget.py +++ b/archivebox/extractors/wget.py @@ -1,18 +1,22 @@ __package__ = 'archivebox.extractors' import os +import re from typing import Optional from datetime import datetime -from ..index.schema import Link, ArchiveResult, ArchiveOutput +from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError +from ..system import run, PIPE from ..util import ( enforce_types, - TimedProgress, - run, - PIPE, - wget_output_path, - ArchiveError, + is_static_file, + without_scheme, + without_fragment, + without_query, + path, + domain, + urldecode, ) from ..config import ( TIMEOUT, @@ -26,7 +30,7 @@ from ..config import ( WGET_USER_AGENT, COOKIES_FILE, ) - +from ..cli.logging import TimedProgress @enforce_types @@ -121,3 +125,72 @@ def save_wget(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> status=status, **timer.stats, ) + + +@enforce_types +def wget_output_path(link: Link) -> Optional[str]: + """calculate the path to the wgetted .html file, since wget may + adjust some paths to be different than the base_url path. 
+ + See docs on wget --adjust-extension (-E) + """ + + if is_static_file(link.url): + return without_scheme(without_fragment(link.url)) + + # Wget downloads can save in a number of different ways depending on the url: + # https://example.com + # > example.com/index.html + # https://example.com?v=zzVa_tX1OiI + # > example.com/index.html?v=zzVa_tX1OiI.html + # https://www.example.com/?v=zzVa_tX1OiI + # > example.com/index.html?v=zzVa_tX1OiI.html + + # https://example.com/abc + # > example.com/abc.html + # https://example.com/abc/ + # > example.com/abc/index.html + # https://example.com/abc?v=zzVa_tX1OiI.html + # > example.com/abc?v=zzVa_tX1OiI.html + # https://example.com/abc/?v=zzVa_tX1OiI.html + # > example.com/abc/index.html?v=zzVa_tX1OiI.html + + # https://example.com/abc/test.html + # > example.com/abc/test.html + # https://example.com/abc/test?v=zzVa_tX1OiI + # > example.com/abc/test?v=zzVa_tX1OiI.html + # https://example.com/abc/test/?v=zzVa_tX1OiI + # > example.com/abc/test/index.html?v=zzVa_tX1OiI.html + + # There's also lots of complexity around how the urlencoding and renaming + # is done for pages with query and hash fragments or extensions like shtml / htm / php / etc + + # Since the wget algorithm for -E (appending .html) is incredibly complex + # and there's no way to get the computed output path from wget + # in order to avoid having to reverse-engineer how they calculate it, + # we just look in the output folder read the filename wget used from the filesystem + full_path = without_fragment(without_query(path(link.url))).strip('/') + search_dir = os.path.join( + link.link_dir, + domain(link.url), + urldecode(full_path), + ) + + for _ in range(4): + if os.path.exists(search_dir): + if os.path.isdir(search_dir): + html_files = [ + f for f in os.listdir(search_dir) + if re.search(".+\\.[Ss]?[Hh][Tt][Mm][Ll]?$", f, re.I | re.M) + ] + if html_files: + path_from_link_dir = search_dir.split(link.link_dir)[-1].strip('/') + return os.path.join(path_from_link_dir, html_files[0]) + + # Move up one directory level + search_dir = search_dir.rsplit('/', 1)[0] + + if search_dir == link.link_dir: + break + + return None diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py index d7b6b43e..e86d3336 100644 --- a/archivebox/index/__init__.py +++ b/archivebox/index/__init__.py @@ -10,12 +10,10 @@ from typing import List, Tuple, Dict, Optional, Iterable from collections import OrderedDict from contextlib import contextmanager -from ..parsers import parse_links +from ..system import atomic_write from ..util import ( scheme, enforce_types, - TimedProgress, - atomic_write, ExtendedEncoder, ) from ..config import ( @@ -30,6 +28,7 @@ from ..config import ( stderr, ) from ..cli.logging import ( + TimedProgress, log_indexing_process_started, log_indexing_process_finished, log_indexing_started, @@ -278,6 +277,8 @@ def import_new_links(existing_links: List[Link], import_path: str, out_dir: str=OUTPUT_DIR) -> Tuple[List[Link], List[Link]]: + from ..parsers import parse_links + new_links: List[Link] = [] # parse and validate the import file @@ -584,9 +585,9 @@ def fix_invalid_folder_locations(out_dir: str=OUTPUT_DIR) -> Tuple[List[str], Li else: shutil.move(entry.path, dest) fixed.append(dest) - - if link.link_dir != entry.path: - link = link.overwrite(link_dir=entry.path) - write_json_link_details(link, out_dir=entry.path) + timestamp = entry.path.rsplit('/', 1)[-1] + assert link.link_dir == entry.path + assert link.timestamp == timestamp + write_json_link_details(link, 
out_dir=entry.path) return fixed, cant_fix diff --git a/archivebox/index/csv.py b/archivebox/index/csv.py new file mode 100644 index 00000000..804e6461 --- /dev/null +++ b/archivebox/index/csv.py @@ -0,0 +1,37 @@ +__package__ = 'archivebox.index' + +from typing import List, Optional, Any + +from ..util import enforce_types +from .schema import Link + + +@enforce_types +def links_to_csv(links: List[Link], + cols: Optional[List[str]]=None, + header: bool=True, + separator: str=',', + ljust: int=0) -> str: + + cols = cols or ['timestamp', 'is_archived', 'url'] + + header_str = '' + if header: + header_str = separator.join(col.ljust(ljust) for col in cols) + + row_strs = ( + link.to_csv(cols=cols, ljust=ljust, separator=separator) + for link in links + ) + + return '\n'.join((header_str, *row_strs)) + + +@enforce_types +def to_csv(obj: Any, cols: List[str], separator: str=',', ljust: int=0) -> str: + from .json import to_json + + return separator.join( + to_json(getattr(obj, col), indent=None).ljust(ljust) + for col in cols + ) diff --git a/archivebox/index/html.py b/archivebox/index/html.py index 3cba2bf0..ea890276 100644 --- a/archivebox/index/html.py +++ b/archivebox/index/html.py @@ -2,20 +2,18 @@ __package__ = 'archivebox.index' import os +from string import Template from datetime import datetime -from typing import List, Optional, Iterator +from typing import List, Optional, Iterator, Mapping from .schema import Link +from ..system import atomic_write, copy_and_overwrite from ..util import ( enforce_types, ts_to_date, urlencode, htmlencode, urldecode, - wget_output_path, - render_template, - atomic_write, - copy_and_overwrite, ) from ..config import ( OUTPUT_DIR, @@ -67,7 +65,7 @@ def write_html_main_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: def main_index_template(links: List[Link], finished: bool=True) -> str: """render the template for the entire main index""" - return render_template(MAIN_INDEX_TEMPLATE, { + return render_legacy_template(MAIN_INDEX_TEMPLATE, { 'version': VERSION, 'git_sha': GIT_SHA, 'num_links': str(len(links)), @@ -86,7 +84,9 @@ def main_index_template(links: List[Link], finished: bool=True) -> str: def main_index_row_template(link: Link) -> str: """render the template for an individual link row of the main index""" - return render_template(MAIN_INDEX_ROW_TEMPLATE, { + from ..extractors.wget import wget_output_path + + return render_legacy_template(MAIN_INDEX_ROW_TEMPLATE, { **link._asdict(extended=True), # before pages are finished archiving, show loading msg instead of title @@ -122,9 +122,11 @@ def write_html_link_details(link: Link, out_dir: Optional[str]=None) -> None: @enforce_types def link_details_template(link: Link) -> str: + from ..extractors.wget import wget_output_path + link_info = link._asdict(extended=True) - return render_template(LINK_DETAILS_TEMPLATE, { + return render_legacy_template(LINK_DETAILS_TEMPLATE, { **link_info, **link_info['canonical'], 'title': ( @@ -142,3 +144,13 @@ def link_details_template(link: Link) -> str: 'status_color': 'success' if link.is_archived else 'danger', 'oldest_archive_date': ts_to_date(link.oldest_archive_date), }) + + +@enforce_types +def render_legacy_template(template_path: str, context: Mapping[str, str]) -> str: + """render a given html template string with the given template content""" + + # will be replaced by django templates in the future + with open(template_path, 'r', encoding='utf-8') as template: + template_str = template.read() + return Template(template_str).substitute(**context) 
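
The render_legacy_template() helper added above is just a thin wrapper around the stdlib string.Template. A minimal sketch of the substitution it performs, with a made-up inline template standing in for the real MAIN_INDEX_ROW_TEMPLATE file:

    from string import Template

    # stand-in for the contents read from template_path; the real templates use the same $placeholder syntax
    template_str = '<tr><td>$title</td><td><a href="$url">$url</a></td></tr>'

    row_html = Template(template_str).substitute(title='Example Page', url='https://example.com')
    # substitute() raises KeyError if any $placeholder is missing from the context

render_legacy_template() does the same thing, except it reads template_str from the template_path file first.
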
diff --git a/archivebox/index/json.py b/archivebox/index/json.py index 4d75d095..a11dba5d 100644 --- a/archivebox/index/json.py +++ b/archivebox/index/json.py @@ -2,13 +2,14 @@ __package__ = 'archivebox.index' import os import sys -import json +import json as pyjson from datetime import datetime -from typing import List, Optional, Iterator +from typing import List, Optional, Iterator, Any from .schema import Link, ArchiveResult -from ..util import enforce_types, atomic_write +from ..system import atomic_write +from ..util import enforce_types from ..config import ( VERSION, OUTPUT_DIR, @@ -46,7 +47,7 @@ def parse_json_main_index(out_dir: str=OUTPUT_DIR) -> Iterator[Link]: index_path = os.path.join(out_dir, JSON_INDEX_FILENAME) if os.path.exists(index_path): with open(index_path, 'r', encoding='utf-8') as f: - links = json.load(f)['links'] + links = pyjson.load(f)['links'] for link_json in links: yield Link.from_json(link_json) @@ -95,12 +96,13 @@ def parse_json_link_details(out_dir: str) -> Optional[Link]: if os.path.exists(existing_index): with open(existing_index, 'r', encoding='utf-8') as f: try: - link_json = json.load(f) + link_json = pyjson.load(f) return Link.from_json(link_json) - except json.JSONDecodeError: + except pyjson.JSONDecodeError: pass return None + @enforce_types def parse_json_links_details(out_dir: str) -> Iterator[Link]: """read through all the archive data folders and return the parsed links""" @@ -111,3 +113,41 @@ def parse_json_links_details(out_dir: str) -> Iterator[Link]: link = parse_json_link_details(entry.path) if link: yield link + + + +### Helpers + +class ExtendedEncoder(pyjson.JSONEncoder): + """ + Extended json serializer that supports serializing several model + fields and objects + """ + + def default(self, obj): + cls_name = obj.__class__.__name__ + + if hasattr(obj, '_asdict'): + return obj._asdict() + + elif isinstance(obj, bytes): + return obj.decode() + + elif isinstance(obj, datetime): + return obj.isoformat() + + elif isinstance(obj, Exception): + return '{}: {}'.format(obj.__class__.__name__, obj) + + elif cls_name in ('dict_items', 'dict_keys', 'dict_values'): + return tuple(obj) + + return pyjson.JSONEncoder.default(self, obj) + + +@enforce_types +def to_json(obj: Any, indent: Optional[int]=4, sort_keys: bool=True, cls=ExtendedEncoder) -> str: + return pyjson.dumps(obj, indent=indent, sort_keys=sort_keys, cls=ExtendedEncoder) + + + diff --git a/archivebox/index/schema.py b/archivebox/index/schema.py index 1cec34b1..f8d81e34 100644 --- a/archivebox/index/schema.py +++ b/archivebox/index/schema.py @@ -61,19 +61,20 @@ class ArchiveResult: info['end_ts'] = parse_date(info['end_ts']) return cls(**info) - def to_json(self, indent=4, sort_keys=True): - from ..util import to_json + def to_dict(self, *keys) -> dict: + if keys: + return {k: v for k, v in asdict(self).items() if k in keys} + return asdict(self) + + def to_json(self, indent=4, sort_keys=True) -> str: + from .json import to_json return to_json(self, indent=indent, sort_keys=sort_keys) - def to_csv(self, cols=None, ljust: int=0, separator: str=','): - from ..util import to_json + def to_csv(self, cols: Optional[List[str]]=None, separator: str=',', ljust: int=0) -> str: + from .csv import to_csv - cols = cols or self.field_names() - return separator.join( - to_json(getattr(self, col), indent=None).ljust(ljust) - for col in cols - ) + return to_csv(self, csv_col=cols or self.field_names(), separator=separator, ljust=ljust) @classmethod def field_names(cls): @@ -201,18 +202,15 @@ class Link: 
info['history'] = cast_history return cls(**info) - def to_json(self, indent=4, sort_keys=True): - from ..util import to_json + def to_json(self, indent=4, sort_keys=True) -> str: + from .json import to_json return to_json(self, indent=indent, sort_keys=sort_keys) - def to_csv(self, csv_cols: List[str], ljust: int=0, separator: str=','): - from ..util import to_json + def to_csv(self, cols: Optional[List[str]]=None, separator: str=',', ljust: int=0) -> str: + from .csv import to_csv - return separator.join( - to_json(getattr(self, col), indent=None).ljust(ljust) - for col in csv_cols - ) + return to_csv(self, cols=cols or self.field_names(), separator=separator, ljust=ljust) @classmethod def field_names(cls): @@ -354,7 +352,7 @@ class Link: def canonical_outputs(self) -> Dict[str, Optional[str]]: """predict the expected output paths that should be present after archiving""" - from ..util import wget_output_path + from ..extractors.wget import wget_output_path canonical = { 'index_path': 'index.html', 'favicon_path': 'favicon.ico', @@ -382,3 +380,5 @@ class Link: 'dom_path': static_path, }) return canonical + + diff --git a/archivebox/main.py b/archivebox/main.py index 47c0a66d..231d27e0 100644 --- a/archivebox/main.py +++ b/archivebox/main.py @@ -1,11 +1,10 @@ __package__ = 'archivebox' -import re import os import sys import shutil -from typing import Dict, List, Optional, Set, Tuple, Iterable, IO +from typing import Dict, List, Optional, Iterable, IO from crontab import CronTab, CronSlices @@ -17,18 +16,13 @@ from .cli import ( main_cmds, archive_cmds, ) -from .index.schema import Link -from .util import ( - enforce_types, - TimedProgress, - get_dir_size, - human_readable_size, +from .parsers import ( save_stdin_to_sources, save_file_to_sources, - links_to_csv, - to_json, - folders_to_str, ) +from .index.schema import Link +from .util import enforce_types, docstring +from .system import get_dir_size, dedupe_cron_jobs, CRON_COMMENT from .index import ( links_after_timestamp, load_main_index, @@ -51,7 +45,11 @@ from .index.json import ( parse_json_main_index, parse_json_links_details, ) -from .index.sql import parse_sql_main_index, get_admins, apply_migrations +from .index.sql import ( + parse_sql_main_index, + get_admins, + apply_migrations, +) from .index.html import parse_html_main_index from .extractors import archive_link from .config import ( @@ -91,6 +89,7 @@ from .config import ( get_real_name, ) from .cli.logging import ( + TimedProgress, log_archiving_started, log_archiving_paused, log_archiving_finished, @@ -98,6 +97,11 @@ from .cli.logging import ( log_removal_finished, log_list_started, log_list_finished, + printable_config, + printable_folders, + printable_filesize, + printable_folder_status, + printable_dependency_version, ) @@ -387,7 +391,7 @@ def info(out_dir: str=OUTPUT_DIR) -> None: print('{green}[*] Scanning archive collection main index...{reset}'.format(**ANSI)) print(f' {out_dir}/*') num_bytes, num_dirs, num_files = get_dir_size(out_dir, recursive=False, pattern='index.') - size = human_readable_size(num_bytes) + size = printable_filesize(num_bytes) print(f' Size: {size} across {num_files} files') print() @@ -419,7 +423,7 @@ def info(out_dir: str=OUTPUT_DIR) -> None: print(f' {ARCHIVE_DIR}/*') num_bytes, num_dirs, num_files = get_dir_size(ARCHIVE_DIR) - size = human_readable_size(num_bytes) + size = printable_filesize(num_bytes) print(f' Size: {size} across {num_files} files in {num_dirs} directories') print() @@ -712,13 +716,8 @@ def list_all(filter_patterns_str: 
Optional[str]=None, out_dir=out_dir, ) - if csv: - print(links_to_csv(folders.values(), csv_cols=csv.split(','), header=True)) - elif json: - print(to_json(folders.values(), indent=4, sort_keys=True)) - else: - print(folders_to_str(folders)) - raise SystemExit(not folders) + print(printable_folders(folders, json=json, csv=csv)) + return folders @enforce_types @@ -749,7 +748,7 @@ def list_folders(links: List[Link], status: str, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]: - check_data_folder() + check_data_folder(out_dir=out_dir) if status == 'indexed': return get_indexed_folders(links, out_dir=out_dir) @@ -796,7 +795,7 @@ def config(config_options_str: Optional[str]=None, ) raise SystemExit(2) elif config_options_str: - config_options = stdin_raw_text.split('\n') + config_options = config_options_str.split('\n') config_options = config_options or [] @@ -865,7 +864,6 @@ def config(config_options_str: Optional[str]=None, stderr(' Please manually remove the relevant lines from your config file:') stderr(f' {CONFIG_FILE}') raise SystemExit(2) - else: stderr('[X] You must pass either --get or --set, or no arguments to get the whole config.', color='red') stderr(' archivebox config') @@ -874,8 +872,6 @@ def config(config_options_str: Optional[str]=None, raise SystemExit(2) -CRON_COMMENT = 'archivebox_schedule' - @enforce_types def schedule(add: bool=False, show: bool=False, @@ -893,7 +889,7 @@ def schedule(add: bool=False, os.makedirs(os.path.join(out_dir, LOGS_DIR_NAME), exist_ok=True) cron = CronTab(user=True) - cron = dedupe_jobs(cron) + cron = dedupe_cron_jobs(cron) existing_jobs = list(cron.find_comment(CRON_COMMENT)) if foreground or run_all: @@ -962,7 +958,7 @@ def schedule(add: bool=False, stderr(' archivebox init --every="0/5 * * * *" https://example.com/some/rss/feed.xml') raise SystemExit(1) - cron = dedupe_jobs(cron) + cron = dedupe_cron_jobs(cron) cron.write() total_runs = sum(j.frequency_per_year() for j in cron) @@ -1025,95 +1021,13 @@ def manage(args: Optional[List[str]]=None, out_dir: str=OUTPUT_DIR) -> None: execute_from_command_line([f'{ARCHIVEBOX_BINARY} manage', *(args or ['help'])]) + +@enforce_types def shell(out_dir: str=OUTPUT_DIR) -> None: + """Enter an interactive ArchiveBox Django shell""" + check_data_folder(out_dir=out_dir) setup_django(OUTPUT_DIR) from django.core.management import call_command call_command("shell_plus") - -# Helpers - -def printable_config(config: ConfigDict, prefix: str='') -> str: - return f'\n{prefix}'.join( - f'{key}={val}' - for key, val in config.items() - if not (isinstance(val, dict) or callable(val)) - ) - -def dedupe_jobs(cron: CronTab) -> CronTab: - deduped: Set[Tuple[str, str]] = set() - - for job in list(cron): - unique_tuple = (str(job.slices), job.command) - if unique_tuple not in deduped: - deduped.add(unique_tuple) - cron.remove(job) - - for schedule, command in deduped: - job = cron.new(command=command, comment=CRON_COMMENT) - job.setall(schedule) - job.enable() - - return cron - - -def print_folder_status(name, folder): - if folder['enabled']: - if folder['is_valid']: - color, symbol, note = 'green', '√', 'valid' - else: - color, symbol, note, num_files = 'red', 'X', 'invalid', '?' 
- else: - color, symbol, note, num_files = 'lightyellow', '-', 'disabled', '-' - - if folder['path']: - if os.path.exists(folder['path']): - num_files = ( - f'{len(os.listdir(folder["path"]))} files' - if os.path.isdir(folder['path']) else - human_readable_size(os.path.getsize(folder['path'])) - ) - else: - num_files = 'missing' - - if ' ' in folder['path']: - folder['path'] = f'"{folder["path"]}"' - - print( - ANSI[color], - symbol, - ANSI['reset'], - name.ljust(22), - (folder["path"] or '').ljust(76), - num_files.ljust(14), - ANSI[color], - note, - ANSI['reset'], - ) - - -def print_dependency_version(name, dependency): - if dependency['enabled']: - if dependency['is_valid']: - color, symbol, note = 'green', '√', 'valid' - version = 'v' + re.search(r'[\d\.]+', dependency['version'])[0] - else: - color, symbol, note, version = 'red', 'X', 'invalid', '?' - else: - color, symbol, note, version = 'lightyellow', '-', 'disabled', '-' - - if ' ' in dependency["path"]: - dependency["path"] = f'"{dependency["path"]}"' - - print( - ANSI[color], - symbol, - ANSI['reset'], - name.ljust(22), - (dependency["path"] or '').ljust(76), - version.ljust(14), - ANSI[color], - note, - ANSI['reset'], - ) diff --git a/archivebox/parsers/__init__.py b/archivebox/parsers/__init__.py index 2a20ff6d..e3e19c45 100644 --- a/archivebox/parsers/__init__.py +++ b/archivebox/parsers/__init__.py @@ -7,16 +7,29 @@ For examples of supported import formats see tests/. __package__ = 'archivebox.parsers' +import re +import os from typing import Tuple, List +from datetime import datetime -from ..config import TIMEOUT -from ..util import ( - check_url_parsing_invariants, - TimedProgress, - Link, - enforce_types, +from ..index.schema import Link +from ..system import atomic_write +from ..config import ( + ANSI, + OUTPUT_DIR, + SOURCES_DIR_NAME, + TIMEOUT, + check_data_folder, ) +from ..util import ( + basename, + domain, + download_url, + enforce_types, + URL_REGEX, +) +from ..cli.logging import pretty_path, TimedProgress from .pocket_html import parse_pocket_html_export from .pinboard_rss import parse_pinboard_rss_export from .shaarli_rss import parse_shaarli_rss_export @@ -66,3 +79,95 @@ def parse_links(source_file: str) -> Tuple[List[Link], str]: timer.end() return [], 'Failed to parse' + + +@enforce_types +def save_stdin_to_sources(raw_text: str, out_dir: str=OUTPUT_DIR) -> str: + check_data_folder(out_dir=out_dir) + + sources_dir = os.path.join(out_dir, SOURCES_DIR_NAME) + if not os.path.exists(sources_dir): + os.makedirs(sources_dir) + + ts = str(datetime.now().timestamp()).split('.', 1)[0] + + source_path = os.path.join(sources_dir, '{}-{}.txt'.format('stdin', ts)) + + atomic_write(raw_text, source_path) + return source_path + + +@enforce_types +def save_file_to_sources(path: str, timeout: int=TIMEOUT, out_dir: str=OUTPUT_DIR) -> str: + """download a given url's content into output/sources/domain-<timestamp>.txt""" + check_data_folder(out_dir=out_dir) + + sources_dir = os.path.join(out_dir, SOURCES_DIR_NAME) + if not os.path.exists(sources_dir): + os.makedirs(sources_dir) + + ts = str(datetime.now().timestamp()).split('.', 1)[0] + + source_path = os.path.join(sources_dir, '{}-{}.txt'.format(basename(path), ts)) + + if any(path.startswith(s) for s in ('http://', 'https://', 'ftp://')): + source_path = os.path.join(sources_dir, '{}-{}.txt'.format(domain(path), ts)) + print('{}[*] [{}] Downloading {}{}'.format( + ANSI['green'], + datetime.now().strftime('%Y-%m-%d %H:%M:%S'), + path, + ANSI['reset'], + )) + timer = 
TimedProgress(timeout, prefix=' ') + try: + raw_source_text = download_url(path, timeout=timeout) + timer.end() + except Exception as e: + timer.end() + print('{}[!] Failed to download {}{}\n'.format( + ANSI['red'], + path, + ANSI['reset'], + )) + print(' ', e) + raise SystemExit(1) + + else: + with open(path, 'r') as f: + raw_source_text = f.read() + + atomic_write(raw_source_text, source_path) + + print(' > {}'.format(pretty_path(source_path))) + + return source_path + + +def check_url_parsing_invariants() -> None: + """Check that plain text regex URL parsing works as expected""" + + # this is last-line-of-defense to make sure the URL_REGEX isn't + # misbehaving, as the consequences could be disastrous and lead to many + # incorrect/badly parsed links being added to the archive + + test_urls = ''' + https://example1.com/what/is/happening.html?what=1#how-about-this=1 + https://example2.com/what/is/happening/?what=1#how-about-this=1 + HTtpS://example3.com/what/is/happening/?what=1#how-about-this=1f + https://example4.com/what/is/happening.html + https://example5.com/ + https://example6.com + + <test>http://example7.com</test> + [https://example8.com/what/is/this.php?what=1] + [and http://example9.com?what=1&other=3#and-thing=2] + <what>https://example10.com#and-thing=2 "</about> + abc<this["https://example11.com/what/is#and-thing=2?whoami=23&where=1"]that>def + sdflkf[what](https://example12.com/who/what.php?whoami=1#whatami=2)?am=hi + example13.bada + and example14.badb + <or>htt://example15.badc</that> + ''' + # print('\n'.join(re.findall(URL_REGEX, test_urls))) + assert len(re.findall(URL_REGEX, test_urls)) == 12 + diff --git a/archivebox/system.py b/archivebox/system.py new file mode 100644 index 00000000..aa6263e9 --- /dev/null +++ b/archivebox/system.py @@ -0,0 +1,150 @@ +__package__ = 'archivebox' + + +import os +import shutil + +import json as pyjson +from typing import Optional, Union, Set, Tuple + +from crontab import CronTab + +from subprocess import ( + Popen, + PIPE, + DEVNULL, + CompletedProcess, + TimeoutExpired, + CalledProcessError, +) + +from .util import enforce_types, ExtendedEncoder +from .config import OUTPUT_PERMISSIONS + + +def run(*popenargs, input=None, capture_output=False, timeout=None, check=False, **kwargs): + """Patched of subprocess.run to fix blocking io making timeout=innefective""" + + if input is not None: + if 'stdin' in kwargs: + raise ValueError('stdin and input arguments may not both be used.') + kwargs['stdin'] = PIPE + + if capture_output: + if ('stdout' in kwargs) or ('stderr' in kwargs): + raise ValueError('stdout and stderr arguments may not be used ' + 'with capture_output.') + kwargs['stdout'] = PIPE + kwargs['stderr'] = PIPE + + with Popen(*popenargs, **kwargs) as process: + try: + stdout, stderr = process.communicate(input, timeout=timeout) + except TimeoutExpired: + process.kill() + try: + stdout, stderr = process.communicate(input, timeout=2) + except: + pass + raise TimeoutExpired(popenargs[0][0], timeout) + except BaseException: + process.kill() + # We don't call process.wait() as .__exit__ does that for us. 
+ raise + retcode = process.poll() + if check and retcode: + raise CalledProcessError(retcode, process.args, + output=stdout, stderr=stderr) + return CompletedProcess(process.args, retcode, stdout, stderr) + + +def atomic_write(contents: Union[dict, str, bytes], path: str) -> None: + """Safe atomic write to filesystem by writing to temp file + atomic rename""" + try: + tmp_file = '{}.tmp'.format(path) + + if isinstance(contents, bytes): + args = {'mode': 'wb+'} + else: + args = {'mode': 'w+', 'encoding': 'utf-8'} + + with open(tmp_file, **args) as f: + if isinstance(contents, dict): + pyjson.dump(contents, f, indent=4, sort_keys=True, cls=ExtendedEncoder) + else: + f.write(contents) + + os.fsync(f.fileno()) + + os.rename(tmp_file, path) + chmod_file(path) + finally: + if os.path.exists(tmp_file): + os.remove(tmp_file) + + +@enforce_types +def chmod_file(path: str, cwd: str='.', permissions: str=OUTPUT_PERMISSIONS, timeout: int=30) -> None: + """chmod -R <permissions> <cwd>/<path>""" + + if not os.path.exists(os.path.join(cwd, path)): + raise Exception('Failed to chmod: {} does not exist (did the previous step fail?)'.format(path)) + + chmod_result = run(['chmod', '-R', permissions, path], cwd=cwd, stdout=DEVNULL, stderr=PIPE, timeout=timeout) + if chmod_result.returncode == 1: + print(' ', chmod_result.stderr.decode()) + raise Exception('Failed to chmod {}/{}'.format(cwd, path)) + + +@enforce_types +def copy_and_overwrite(from_path: str, to_path: str): + """copy a given file or directory to a given path, overwriting the destination""" + if os.path.isdir(from_path): + shutil.rmtree(to_path, ignore_errors=True) + shutil.copytree(from_path, to_path) + else: + with open(from_path, 'rb') as src: + atomic_write(src.read(), to_path) + + +@enforce_types +def get_dir_size(path: str, recursive: bool=True, pattern: Optional[str]=None) -> Tuple[int, int, int]: + """get the total disk size of a given directory, optionally summing up + recursively and limiting to a given filter list + """ + num_bytes, num_dirs, num_files = 0, 0, 0 + for entry in os.scandir(path): + if (pattern is not None) and (pattern not in entry.path): + continue + if entry.is_dir(follow_symlinks=False): + if not recursive: + continue + num_dirs += 1 + bytes_inside, dirs_inside, files_inside = get_dir_size(entry.path) + num_bytes += bytes_inside + num_dirs += dirs_inside + num_files += files_inside + else: + num_bytes += entry.stat(follow_symlinks=False).st_size + num_files += 1 + return num_bytes, num_dirs, num_files + + +CRON_COMMENT = 'archivebox_schedule' + +@enforce_types +def dedupe_cron_jobs(cron: CronTab) -> CronTab: + deduped: Set[Tuple[str, str]] = set() + + for job in list(cron): + unique_tuple = (str(job.slices), job.command) + if unique_tuple not in deduped: + deduped.add(unique_tuple) + cron.remove(job) + + for schedule, command in deduped: + job = cron.new(command=command, comment=CRON_COMMENT) + job.setall(schedule) + job.enable() + + return cron diff --git a/archivebox/util.py b/archivebox/util.py index 447b9eff..8b606db1 100644 --- a/archivebox/util.py +++ b/archivebox/util.py @@ -1,15 +1,8 @@ -import os import re -import sys import ssl -import json -import time -import shutil -import argparse -from string import Template -from json import JSONEncoder -from typing import List, Dict, Optional, Any, Union, IO, Mapping, Tuple + +from typing import List, Optional, Any from inspect import signature from functools import wraps from hashlib import sha256 @@ -17,34 +10,17 @@ from urllib.request import Request, urlopen from 
urllib.parse import urlparse, quote, unquote from html import escape, unescape from datetime import datetime -from multiprocessing import Process -from subprocess import ( - Popen, - PIPE, - DEVNULL, - CompletedProcess, - TimeoutExpired, - CalledProcessError, -) from base32_crockford import encode as base32_encode # type: ignore +import json as pyjson -from .index.schema import Link from .config import ( - ANSI, - TERM_WIDTH, - OUTPUT_DIR, - SOURCES_DIR_NAME, - OUTPUT_PERMISSIONS, TIMEOUT, - SHOW_PROGRESS, - SAVE_TITLE, + STATICFILE_EXTENSIONS, CHECK_SSL_VALIDITY, WGET_USER_AGENT, CHROME_OPTIONS, - check_data_folder, ) -from .cli.logging import pretty_path ### Parsing Helpers @@ -66,6 +42,7 @@ base_url = lambda url: without_scheme(url) # uniq base url used to dedupe links without_www = lambda url: url.replace('://www.', '://', 1) without_trailing_slash = lambda url: url[:-1] if url[-1] == '/' else url.replace('/?', '?') hashurl = lambda url: base32_encode(int(sha256(base_url(url).encode('utf-8')).hexdigest(), 16))[:20] +is_static_file = lambda url: extension(url).lower() in STATICFILE_EXTENSIONS # TODO: the proper way is with MIME type detection, not using extension urlencode = lambda s: s and quote(s, encoding='utf-8', errors='replace') urldecode = lambda s: s and unquote(s) @@ -85,36 +62,7 @@ URL_REGEX = re.compile( r'[^\]\[\(\)<>\""\'\s]+', # stop parsing at these symbols re.IGNORECASE, ) -HTML_TITLE_REGEX = re.compile( - r'<title.*?>' # start matching text after <title> tag - r'(.[^<>]+)', # get everything up to these symbols - re.IGNORECASE | re.MULTILINE | re.DOTALL | re.UNICODE, -) -STATICFILE_EXTENSIONS = { - # 99.999% of the time, URLs ending in these extentions are static files - # that can be downloaded as-is, not html pages that need to be rendered - 'gif', 'jpeg', 'jpg', 'png', 'tif', 'tiff', 'wbmp', 'ico', 'jng', 'bmp', - 'svg', 'svgz', 'webp', 'ps', 'eps', 'ai', - 'mp3', 'mp4', 'm4a', 'mpeg', 'mpg', 'mkv', 'mov', 'webm', 'm4v', - 'flv', 'wmv', 'avi', 'ogg', 'ts', 'm3u8', - 'pdf', 'txt', 'rtf', 'rtfd', 'doc', 'docx', 'ppt', 'pptx', 'xls', 'xlsx', - 'atom', 'rss', 'css', 'js', 'json', - 'dmg', 'iso', 'img', - 'rar', 'war', 'hqx', 'zip', 'gz', 'bz2', '7z', - # Less common extensions to consider adding later - # jar, swf, bin, com, exe, dll, deb - # ear, hqx, eot, wmlc, kml, kmz, cco, jardiff, jnlp, run, msi, msp, msm, - # pl pm, prc pdb, rar, rpm, sea, sit, tcl tk, der, pem, crt, xpi, xspf, - # ra, mng, asx, asf, 3gpp, 3gp, mid, midi, kar, jad, wml, htc, mml - - # Thse are always treated as pages, not as static files, never add them: - # html, htm, shtml, xhtml, xml, aspx, php, cgi -} - - - -### Checks & Tests def enforce_types(func): """ @@ -158,189 +106,14 @@ def enforce_types(func): return typechecked_function -def check_url_parsing_invariants() -> None: - """Check that plain text regex URL parsing works as expected""" +def docstring(text: Optional[str]): + """attach the given docstring to the decorated function""" + def decorator(func): + if text: + func.__doc__ = text + return func + return decorator - # this is last-line-of-defense to make sure the URL_REGEX isn't - # misbehaving, as the consequences could be disastrous and lead to many - # incorrect/badly parsed links being added to the archive - - test_urls = ''' - https://example1.com/what/is/happening.html?what=1#how-about-this=1 - https://example2.com/what/is/happening/?what=1#how-about-this=1 - HTtpS://example3.com/what/is/happening/?what=1#how-about-this=1f - https://example4.com/what/is/happening.html - 
https://example5.com/ - https://example6.com - - <test>http://example7.com</test> - [https://example8.com/what/is/this.php?what=1] - [and http://example9.com?what=1&other=3#and-thing=2] - <what>https://example10.com#and-thing=2 "</about> - abc<this["https://example11.com/what/is#and-thing=2?whoami=23&where=1"]that>def - sdflkf[what](https://example12.com/who/what.php?whoami=1#whatami=2)?am=hi - example13.bada - and example14.badb - <or>htt://example15.badc</that> - ''' - # print('\n'.join(re.findall(URL_REGEX, test_urls))) - assert len(re.findall(URL_REGEX, test_urls)) == 12 - - -### Random Helpers - -@enforce_types -def save_stdin_to_sources(raw_text: str, out_dir: str=OUTPUT_DIR) -> str: - check_data_folder(out_dir=out_dir) - - sources_dir = os.path.join(out_dir, SOURCES_DIR_NAME) - if not os.path.exists(sources_dir): - os.makedirs(sources_dir) - - ts = str(datetime.now().timestamp()).split('.', 1)[0] - - source_path = os.path.join(sources_dir, '{}-{}.txt'.format('stdin', ts)) - - atomic_write(raw_text, source_path) - return source_path - - -@enforce_types -def save_file_to_sources(path: str, timeout: int=TIMEOUT, out_dir: str=OUTPUT_DIR) -> str: - """download a given url's content into output/sources/domain-<timestamp>.txt""" - check_data_folder(out_dir=out_dir) - - sources_dir = os.path.join(out_dir, SOURCES_DIR_NAME) - if not os.path.exists(sources_dir): - os.makedirs(sources_dir) - - ts = str(datetime.now().timestamp()).split('.', 1)[0] - - source_path = os.path.join(sources_dir, '{}-{}.txt'.format(basename(path), ts)) - - if any(path.startswith(s) for s in ('http://', 'https://', 'ftp://')): - source_path = os.path.join(sources_dir, '{}-{}.txt'.format(domain(path), ts)) - print('{}[*] [{}] Downloading {}{}'.format( - ANSI['green'], - datetime.now().strftime('%Y-%m-%d %H:%M:%S'), - path, - ANSI['reset'], - )) - timer = TimedProgress(timeout, prefix=' ') - try: - raw_source_text = download_url(path, timeout=timeout) - timer.end() - except Exception as e: - timer.end() - print('{}[!] Failed to download {}{}\n'.format( - ANSI['red'], - path, - ANSI['reset'], - )) - print(' ', e) - raise SystemExit(1) - - else: - with open(path, 'r') as f: - raw_source_text = f.read() - - atomic_write(raw_source_text, source_path) - - print(' > {}'.format(pretty_path(source_path))) - - return source_path - - -@enforce_types -def fetch_page_title(url: str, timeout: int=10, progress: bool=SHOW_PROGRESS) -> Optional[str]: - """Attempt to guess a page's title by downloading the html""" - - if not SAVE_TITLE: - return None - - try: - html = download_url(url, timeout=timeout) - - match = re.search(HTML_TITLE_REGEX, html) - return htmldecode(match.group(1).strip()) if match else None - except Exception as err: # noqa - # print('[!] Failed to fetch title because of {}: {}'.format( - # err.__class__.__name__, - # err, - # )) - return None - - -@enforce_types -def wget_output_path(link: Link) -> Optional[str]: - """calculate the path to the wgetted .html file, since wget may - adjust some paths to be different than the base_url path. 
- - See docs on wget --adjust-extension (-E) - """ - - if is_static_file(link.url): - return without_scheme(without_fragment(link.url)) - - # Wget downloads can save in a number of different ways depending on the url: - # https://example.com - # > output/archive/<timestamp>/example.com/index.html - # https://example.com?v=zzVa_tX1OiI - # > output/archive/<timestamp>/example.com/index.html?v=zzVa_tX1OiI.html - # https://www.example.com/?v=zzVa_tX1OiI - # > output/archive/<timestamp>/example.com/index.html?v=zzVa_tX1OiI.html - - # https://example.com/abc - # > output/archive/<timestamp>/example.com/abc.html - # https://example.com/abc/ - # > output/archive/<timestamp>/example.com/abc/index.html - # https://example.com/abc?v=zzVa_tX1OiI.html - # > output/archive/<timestamp>/example.com/abc?v=zzVa_tX1OiI.html - # https://example.com/abc/?v=zzVa_tX1OiI.html - # > output/archive/<timestamp>/example.com/abc/index.html?v=zzVa_tX1OiI.html - - # https://example.com/abc/test.html - # > output/archive/<timestamp>/example.com/abc/test.html - # https://example.com/abc/test?v=zzVa_tX1OiI - # > output/archive/<timestamp>/example.com/abc/test?v=zzVa_tX1OiI.html - # https://example.com/abc/test/?v=zzVa_tX1OiI - # > output/archive/<timestamp>/example.com/abc/test/index.html?v=zzVa_tX1OiI.html - - # There's also lots of complexity around how the urlencoding and renaming - # is done for pages with query and hash fragments or extensions like shtml / htm / php / etc - - # Since the wget algorithm for -E (appending .html) is incredibly complex - # and there's no way to get the computed output path from wget - # in order to avoid having to reverse-engineer how they calculate it, - # we just look in the output folder read the filename wget used from the filesystem - full_path = without_fragment(without_query(path(link.url))).strip('/') - search_dir = os.path.join( - link.link_dir, - domain(link.url), - urldecode(full_path), - ) - - for _ in range(4): - if os.path.exists(search_dir): - if os.path.isdir(search_dir): - html_files = [ - f for f in os.listdir(search_dir) - if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", f, re.I | re.M) - ] - if html_files: - path_from_link_dir = search_dir.split(link.link_dir)[-1].strip('/') - return os.path.join(path_from_link_dir, html_files[0]) - - # Move up one directory level - search_dir = search_dir.rsplit('/', 1)[0] - - if search_dir == link.link_dir: - break - - return None - - -### String Manipulation & Logging Helpers @enforce_types def str_between(string: str, start: str, end: str=None) -> str: @@ -415,122 +188,6 @@ def parse_date(date: Any) -> Optional[datetime]: raise ValueError('Tried to parse invalid date! 
{}'.format(date)) -@enforce_types -def is_static_file(url: str) -> bool: - """Certain URLs just point to a single static file, and - don't need to be re-archived in many formats - """ - - # TODO: the proper way is with MIME type detection, not using extension - return extension(url) in STATICFILE_EXTENSIONS - - - -### Python / System Helpers - -def run(*popenargs, input=None, capture_output=False, timeout=None, check=False, **kwargs): - """Patched of subprocess.run to fix blocking io making timeout=innefective""" - - if input is not None: - if 'stdin' in kwargs: - raise ValueError('stdin and input arguments may not both be used.') - kwargs['stdin'] = PIPE - - if capture_output: - if ('stdout' in kwargs) or ('stderr' in kwargs): - raise ValueError('stdout and stderr arguments may not be used ' - 'with capture_output.') - kwargs['stdout'] = PIPE - kwargs['stderr'] = PIPE - - with Popen(*popenargs, **kwargs) as process: - try: - stdout, stderr = process.communicate(input, timeout=timeout) - except TimeoutExpired: - process.kill() - try: - stdout, stderr = process.communicate(input, timeout=2) - except: - pass - raise TimeoutExpired(popenargs[0][0], timeout) - except BaseException: - process.kill() - # We don't call process.wait() as .__exit__ does that for us. - raise - retcode = process.poll() - if check and retcode: - raise CalledProcessError(retcode, process.args, - output=stdout, stderr=stderr) - return CompletedProcess(process.args, retcode, stdout, stderr) - - -class TimedProgress: - """Show a progress bar and measure elapsed time until .end() is called""" - - def __init__(self, seconds, prefix=''): - if SHOW_PROGRESS: - self.p = Process(target=progress_bar, args=(seconds, prefix)) - self.p.start() - - self.stats = {'start_ts': datetime.now(), 'end_ts': None} - - def end(self): - """immediately end progress, clear the progressbar line, and save end_ts""" - - end_ts = datetime.now() - self.stats['end_ts'] = end_ts - if SHOW_PROGRESS: - # protect from double termination - #if p is None or not hasattr(p, 'kill'): - # return - if self.p is not None: - self.p.terminate() - - self.p = None - - sys.stdout.write('\r{}{}\r'.format((' ' * TERM_WIDTH()), ANSI['reset'])) # clear whole terminal line - - -@enforce_types -def progress_bar(seconds: int, prefix: str='') -> None: - """show timer in the form of progress bar, with percentage and seconds remaining""" - chunk = '█' if sys.stdout.encoding == 'UTF-8' else '#' - chunks = TERM_WIDTH() - len(prefix) - 20 # number of progress chunks to show (aka max bar width) - try: - for s in range(seconds * chunks): - chunks = TERM_WIDTH() - len(prefix) - 20 - progress = s / chunks / seconds * 100 - bar_width = round(progress/(100/chunks)) - - # ████████████████████ 0.9% (1/60sec) - sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)'.format( - prefix, - ANSI['green'], - (chunk * bar_width).ljust(chunks), - ANSI['reset'], - round(progress, 1), - round(s/chunks), - seconds, - )) - sys.stdout.flush() - time.sleep(1 / chunks) - - # ██████████████████████████████████ 100.0% (60/60sec) - sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)\n'.format( - prefix, - ANSI['red'], - chunk * chunks, - ANSI['reset'], - 100.0, - seconds, - seconds, - )) - sys.stdout.flush() - except KeyboardInterrupt: - print() - pass - - @enforce_types def download_url(url: str, timeout: int=TIMEOUT) -> str: """Download the contents of a remote url and return the text""" @@ -547,58 +204,6 @@ def download_url(url: str, timeout: int=TIMEOUT) -> str: return resp.read().decode(encoding) 
@@ -547,58 +204,6 @@ def download_url(url: str, timeout: int=TIMEOUT) -> str:
     return resp.read().decode(encoding)
 
 
-@enforce_types
-def chmod_file(path: str, cwd: str='.', permissions: str=OUTPUT_PERMISSIONS, timeout: int=30) -> None:
-    """chmod -R <permissions> <cwd>/<path>"""
-
-    if not os.path.exists(os.path.join(cwd, path)):
-        raise Exception('Failed to chmod: {} does not exist (did the previous step fail?)'.format(path))
-
-    chmod_result = run(['chmod', '-R', permissions, path], cwd=cwd, stdout=DEVNULL, stderr=PIPE, timeout=timeout)
-    if chmod_result.returncode == 1:
-        print(' ', chmod_result.stderr.decode())
-        raise Exception('Failed to chmod {}/{}'.format(cwd, path))
-
-
-@enforce_types
-def copy_and_overwrite(from_path: str, to_path: str):
-    if os.path.isdir(from_path):
-        shutil.rmtree(to_path, ignore_errors=True)
-        shutil.copytree(from_path, to_path)
-    else:
-        with open(from_path, 'rb') as src:
-            atomic_write(src.read(), to_path)
-
-
-@enforce_types
-def get_dir_size(path: str, recursive: bool=True, pattern: Optional[str]=None) -> Tuple[int, int, int]:
-    num_bytes, num_dirs, num_files = 0, 0, 0
-    for entry in os.scandir(path):
-        if (pattern is not None) and (pattern not in entry.path):
-            continue
-        if entry.is_dir(follow_symlinks=False):
-            if not recursive:
-                continue
-            num_dirs += 1
-            bytes_inside, dirs_inside, files_inside = get_dir_size(entry.path)
-            num_bytes += bytes_inside
-            num_dirs += dirs_inside
-            num_files += files_inside
-        else:
-            num_bytes += entry.stat(follow_symlinks=False).st_size
-            num_files += 1
-    return num_bytes, num_dirs, num_files
-
-
-@enforce_types
-def human_readable_size(num_bytes: Union[int, float]) -> str:
-    for count in ['Bytes','KB','MB','GB']:
-        if num_bytes > -1024.0 and num_bytes < 1024.0:
-            return '%3.1f %s' % (num_bytes, count)
-        num_bytes /= 1024.0
-    return '%3.1f %s' % (num_bytes, 'TB')
-
-
 @enforce_types
 def chrome_args(**options) -> List[str]:
     """helper to build up a chrome shell command with arguments"""
@@ -632,7 +237,7 @@ def chrome_args(**options) -> List[str]:
     return cmd_args
 
 
-class ExtendedEncoder(JSONEncoder):
+class ExtendedEncoder(pyjson.JSONEncoder):
     """
     Extended json serializer that supports serializing several model
     fields and objects
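
As a quick illustration of what the rebased pyjson.JSONEncoder subclass is for: passing cls=ExtendedEncoder to json.dumps lets it serialize values the stock encoder rejects, such as the dict_keys/dict_items/dict_values views handled in default() below. A hedged sketch, assuming the class keeps living at archivebox.util:

    import json
    from archivebox.util import ExtendedEncoder   # assumed import path

    stats = {'cols': {'url': 1, 'timestamp': 2}.keys()}   # dict_keys is not JSON-serializable by default

    print(json.dumps(stats, cls=ExtendedEncoder, indent=4, sort_keys=True))
    # dict_keys is coerced to a tuple by ExtendedEncoder.default(), so this prints a JSON array
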
@@ -656,114 +261,5 @@ class ExtendedEncoder(JSONEncoder):
         elif cls_name in ('dict_items', 'dict_keys', 'dict_values'):
             return tuple(obj)
 
-        return JSONEncoder.default(self, obj)
+        return pyjson.JSONEncoder.default(self, obj)
 
-
-def to_json(obj: Any, file: IO=None, indent: Optional[int]=4, sort_keys: bool=True, cls=ExtendedEncoder) -> str:
-    if file:
-        path = os.path.realpath(file.name)
-        contents = json.dumps(obj, indent=indent, sort_keys=sort_keys, cls=ExtendedEncoder)
-        atomic_write(contents, path)
-        return contents
-    else:
-        return json.dumps(obj, indent=indent, sort_keys=sort_keys, cls=ExtendedEncoder)
-
-
-def links_to_csv(links: List[Link], csv_cols: Optional[List[str]]=None,
-                 header: bool=True, ljust: int=0, separator: str=',') -> str:
-    csv_cols = csv_cols or ['timestamp', 'is_archived', 'url']
-
-    header_str = ''
-    if header:
-        header_str = separator.join(col.ljust(ljust) for col in csv_cols)
-
-    row_strs = (
-        link.to_csv(csv_cols=csv_cols, ljust=ljust, separator=separator)
-        for link in links
-    )
-
-    return '\n'.join((header_str, *row_strs))
-
-def folders_to_str(folders: Dict[str, Optional[Link]]) -> str:
-    return '\n'.join(f'{folder} {link}' for folder, link in folders.items())
-
-@enforce_types
-def render_template(template_path: str, context: Mapping[str, str]) -> str:
-    """render a given html template string with the given template content"""
-
-    # will be replaced by django templates in the future
-    with open(template_path, 'r', encoding='utf-8') as template:
-        template_str = template.read()
-    return Template(template_str).substitute(**context)
-
-
-def atomic_write(contents: Union[dict, str, bytes], path: str) -> None:
-    """Safe atomic write to filesystem by writing to temp file + atomic rename"""
-    try:
-        tmp_file = '{}.tmp'.format(path)
-
-        if isinstance(contents, bytes):
-            args = {'mode': 'wb+'}
-        else:
-            args = {'mode': 'w+', 'encoding': 'utf-8'}
-
-        with open(tmp_file, **args) as f:
-            if isinstance(contents, dict):
-                to_json(contents, file=f)
-            else:
-                f.write(contents)
-
-            os.fsync(f.fileno())
-
-        os.rename(tmp_file, path)
-        chmod_file(path)
-    finally:
-        if os.path.exists(tmp_file):
-            os.remove(tmp_file)
-
-
-def reject_stdin(caller: str, stdin: Optional[IO]=sys.stdin) -> None:
-    """Tell the user they passed stdin to a command that doesn't accept it"""
-
-    if stdin and not stdin.isatty():
-        stdin_raw_text = stdin.read().strip()
-        if stdin_raw_text:
-            print(
-                '{red}[X] The "{}" command does not accept stdin.{reset}\n'.format(
-                    caller,
-                    **ANSI,
-                )
-            )
-            print(' Run archivebox "{} --help" to see usage and examples.'.format(
-                caller,
-            ))
-            print()
-            raise SystemExit(1)
-
-def accept_stdin(stdin: Optional[IO]=sys.stdin) -> Optional[str]:
-    if stdin and not stdin.isatty():
-        return stdin.read()
-    return None
-
-
-def set_docstring(text: str):
-    def decorator(func):
-        @wraps(func)
-        def wrapper_with_docstring(*args, **kwargs):
-            return func(*args, **kwargs)
-        wrapper_with_docstring.__doc__ = text
-        return wrapper_with_docstring
-    return decorator
-
-
-class SmartFormatter(argparse.HelpFormatter):
-    def _split_lines(self, text, width):
-        if '\n' in text:
-            return text.splitlines()
-        return argparse.HelpFormatter._split_lines(self, text, width)
-
-
-class ArchiveError(Exception):
-    def __init__(self, message, hints=None):
-        super().__init__(message)
-        self.hints = hints
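
The removed atomic_write above relies on the classic temp-file-then-rename trick: the data is fully written and fsynced before the destination name is swapped in, so readers never see a half-written file. A minimal standalone sketch of the same pattern, not ArchiveBox's implementation (os.replace is used here so the swap also overwrites atomically on Windows; atomic_write_json is an illustrative name):

    import json
    import os

    def atomic_write_json(obj, path: str) -> None:
        """Write obj as JSON to path without ever exposing a partially-written file."""
        tmp_file = '{}.tmp'.format(path)
        try:
            with open(tmp_file, 'w+', encoding='utf-8') as f:
                json.dump(obj, f, indent=4, sort_keys=True)
                f.flush()
                os.fsync(f.fileno())        # force contents to disk before the rename
            os.replace(tmp_file, path)      # atomic swap on POSIX and Windows
        finally:
            if os.path.exists(tmp_file):
                os.remove(tmp_file)

    atomic_write_json({'status': 'succeeded'}, 'index.json')
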