diff --git a/archivebox/cli/logging.py b/archivebox/cli/logging.py
index 87a7fab1..e07b0719 100644
--- a/archivebox/cli/logging.py
+++ b/archivebox/cli/logging.py
@@ -1,14 +1,28 @@
__package__ = 'archivebox.cli'
+import re
import os
import sys
+import time
+import argparse
from datetime import datetime
from dataclasses import dataclass
-from typing import Optional, List
+from multiprocessing import Process
+from typing import Optional, List, Dict, Union, IO
from ..index.schema import Link, ArchiveResult
-from ..config import ANSI, OUTPUT_DIR, IS_TTY
+from ..index.json import to_json
+from ..index.csv import links_to_csv
+from ..util import enforce_types
+from ..config import (
+ ConfigDict,
+ ANSI,
+ OUTPUT_DIR,
+ IS_TTY,
+ SHOW_PROGRESS,
+ TERM_WIDTH,
+)
@dataclass
@@ -32,11 +46,104 @@ class RuntimeStats:
_LAST_RUN_STATS = RuntimeStats()
-def pretty_path(path: str) -> str:
- """convert paths like .../ArchiveBox/archivebox/../output/abc into output/abc"""
- pwd = os.path.abspath('.')
- # parent = os.path.abspath(os.path.join(pwd, os.path.pardir))
- return path.replace(pwd + '/', './')
+
+class SmartFormatter(argparse.HelpFormatter):
+ """Patched formatter that prints newlines in argparse help strings"""
+ def _split_lines(self, text, width):
+ if '\n' in text:
+ return text.splitlines()
+ return argparse.HelpFormatter._split_lines(self, text, width)
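+
+# Example usage (illustrative, with a made-up flag):
+#   parser = argparse.ArgumentParser(formatter_class=SmartFormatter)
+#   parser.add_argument('--mode', help='Options:\n  fast\n  thorough')   # newlines are preserved in --help output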
+
+
+def reject_stdin(caller: str, stdin: Optional[IO]=sys.stdin) -> None:
+ """Tell the user they passed stdin to a command that doesn't accept it"""
+
+ if stdin and not stdin.isatty():
+ stdin_raw_text = stdin.read().strip()
+ if stdin_raw_text:
+ print(
+ '{red}[X] The "{}" command does not accept stdin.{reset}\n'.format(
+ caller,
+ **ANSI,
+ )
+ )
+ print(' Run archivebox "{} --help" to see usage and examples.'.format(
+ caller,
+ ))
+ print()
+ raise SystemExit(1)
+
+def accept_stdin(stdin: Optional[IO]=sys.stdin) -> Optional[str]:
+ if stdin and not stdin.isatty():
+ return stdin.read()
+ return None
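+
+# Example usage (illustrative): a subcommand that takes piped URLs vs. one that takes none:
+#   stdin_text = accept_stdin()        # returns the piped text, or None when attached to a TTY
+#   reject_stdin('version')            # prints an error and exits 1 if anything was piped in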
+
+
+class TimedProgress:
+ """Show a progress bar and measure elapsed time until .end() is called"""
+
+ def __init__(self, seconds, prefix=''):
+ if SHOW_PROGRESS:
+ self.p = Process(target=progress_bar, args=(seconds, prefix))
+ self.p.start()
+
+ self.stats = {'start_ts': datetime.now(), 'end_ts': None}
+
+ def end(self):
+ """immediately end progress, clear the progressbar line, and save end_ts"""
+
+ end_ts = datetime.now()
+ self.stats['end_ts'] = end_ts
+ if SHOW_PROGRESS:
+ # protect from double termination
+ #if p is None or not hasattr(p, 'kill'):
+ # return
+ if self.p is not None:
+ self.p.terminate()
+
+ self.p = None
+
+ sys.stdout.write('\r{}{}\r'.format((' ' * TERM_WIDTH()), ANSI['reset'])) # clear whole terminal line
+
+
+@enforce_types
+def progress_bar(seconds: int, prefix: str='') -> None:
+ """show timer in the form of progress bar, with percentage and seconds remaining"""
+ chunk = '█' if sys.stdout.encoding == 'UTF-8' else '#'
+ chunks = TERM_WIDTH() - len(prefix) - 20 # number of progress chunks to show (aka max bar width)
+ try:
+ for s in range(seconds * chunks):
+ chunks = TERM_WIDTH() - len(prefix) - 20
+ progress = s / chunks / seconds * 100
+ bar_width = round(progress/(100/chunks))
+
+ # ████████████████████ 0.9% (1/60sec)
+ sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)'.format(
+ prefix,
+ ANSI['green'],
+ (chunk * bar_width).ljust(chunks),
+ ANSI['reset'],
+ round(progress, 1),
+ round(s/chunks),
+ seconds,
+ ))
+ sys.stdout.flush()
+ time.sleep(1 / chunks)
+
+ # ██████████████████████████████████ 100.0% (60/60sec)
+ sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)\n'.format(
+ prefix,
+ ANSI['red'],
+ chunk * chunks,
+ ANSI['reset'],
+ 100.0,
+ seconds,
+ seconds,
+ ))
+ sys.stdout.flush()
+ except KeyboardInterrupt:
+ print()
+ pass
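+
+# Example usage (illustrative): extractors wrap long-running steps roughly like this:
+#   timer = TimedProgress(timeout, prefix='      ')
+#   try:
+#       do_slow_step()                       # e.g. a wget/chrome subprocess call
+#   finally:
+#       timer.end()                          # clears the bar; timer.stats holds start_ts/end_ts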
### Parsing Stage
@@ -223,10 +330,9 @@ def log_list_started(filter_patterns: Optional[List[str]], filter_type: str):
print(' {}'.format(' '.join(filter_patterns or ())))
def log_list_finished(links):
- from ..util import links_to_csv
print()
print('---------------------------------------------------------------------------------------------------')
- print(links_to_csv(links, csv_cols=['timestamp', 'is_archived', 'num_outputs', 'url'], header=True, ljust=16, separator=' | '))
+ print(links_to_csv(links, cols=['timestamp', 'is_archived', 'num_outputs', 'url'], header=True, ljust=16, separator=' | '))
print('---------------------------------------------------------------------------------------------------')
print()
@@ -266,3 +372,129 @@ def log_removal_finished(all_links: int, to_keep: int):
**ANSI,
))
print(' Index now contains {} links.'.format(to_keep))
+
+
+def log_shell_welcome_msg():
+ from . import list_subcommands
+
+ print('{green}# ArchiveBox Imports{reset}'.format(**ANSI))
+ print('{green}from archivebox.core.models import Page, User{reset}'.format(**ANSI))
+ print('{green}from archivebox import *\n {}{reset}'.format("\n ".join(list_subcommands().keys()), **ANSI))
+ print()
+ print('[i] Welcome to the ArchiveBox Shell!')
+ print(' https://github.com/pirate/ArchiveBox/wiki/Usage#Shell-Usage')
+ print()
+ print(' {lightred}Hint:{reset} Example use:'.format(**ANSI))
+ print(' print(Page.objects.filter(is_archived=True).count())')
+ print(' Page.objects.get(url="https://example.com").as_json()')
+ print(' add("https://example.com/some/new/url")')
+
+
+
+### Helpers
+
+@enforce_types
+def pretty_path(path: str) -> str:
+ """convert paths like .../ArchiveBox/archivebox/../output/abc into output/abc"""
+ pwd = os.path.abspath('.')
+ # parent = os.path.abspath(os.path.join(pwd, os.path.pardir))
+ return path.replace(pwd + '/', './')
+
+
+@enforce_types
+def printable_filesize(num_bytes: Union[int, float]) -> str:
+ for count in ['Bytes','KB','MB','GB']:
+ if num_bytes > -1024.0 and num_bytes < 1024.0:
+ return '%3.1f %s' % (num_bytes, count)
+ num_bytes /= 1024.0
+ return '%3.1f %s' % (num_bytes, 'TB')
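+
+# Example (illustrative):
+#   >>> printable_filesize(3 * 1024 * 1024)
+#   '3.0 MB'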
+
+
+@enforce_types
+def printable_folders(folders: Dict[str, Optional[Link]],
+ json: bool=False,
+ csv: Optional[str]=None) -> str:
+ if json:
+ return to_json(folders.values(), indent=4, sort_keys=True)
+
+ elif csv:
+ return links_to_csv(folders.values(), cols=csv.split(','), header=True)
+
+ return '\n'.join(f'{folder} {link}' for folder, link in folders.items())
+
+
+
+@enforce_types
+def printable_config(config: ConfigDict, prefix: str='') -> str:
+ return f'\n{prefix}'.join(
+ f'{key}={val}'
+ for key, val in config.items()
+ if not (isinstance(val, dict) or callable(val))
+ )
+
+
+@enforce_types
+def printable_folder_status(name: str, folder: Dict) -> str:
+    num_files = '-'
+    if folder['enabled']:
+ if folder['is_valid']:
+ color, symbol, note = 'green', '√', 'valid'
+ else:
+ color, symbol, note, num_files = 'red', 'X', 'invalid', '?'
+ else:
+ color, symbol, note, num_files = 'lightyellow', '-', 'disabled', '-'
+
+ if folder['path']:
+ if os.path.exists(folder['path']):
+ num_files = (
+ f'{len(os.listdir(folder["path"]))} files'
+ if os.path.isdir(folder['path']) else
+ printable_filesize(os.path.getsize(folder['path']))
+ )
+ else:
+ num_files = 'missing'
+
+ if ' ' in folder['path']:
+ folder['path'] = f'"{folder["path"]}"'
+
+ return ' '.join((
+ ANSI[color],
+ symbol,
+ ANSI['reset'],
+ name.ljust(22),
+ (folder["path"] or '').ljust(76),
+ num_files.ljust(14),
+ ANSI[color],
+ note,
+ ANSI['reset'],
+ ))
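+
+# Example usage (illustrative, with a hypothetical folder dict):
+#   print(printable_folder_status('ARCHIVE_DIR', {'path': '/data/archive', 'enabled': True, 'is_valid': True}))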
+
+
+@enforce_types
+def printable_dependency_version(name: str, dependency: Dict) -> str:
+    version = None
+    if dependency['enabled']:
+ if dependency['is_valid']:
+ color, symbol, note, version = 'green', '√', 'valid', ''
+
+ parsed_version_num = re.search(r'[\d\.]+', dependency['version'])
+ if parsed_version_num:
+ version = f'v{parsed_version_num[0]}'
+
+ if not version:
+ color, symbol, note, version = 'red', 'X', 'invalid', '?'
+ else:
+ color, symbol, note, version = 'lightyellow', '-', 'disabled', '-'
+
+ if ' ' in dependency["path"]:
+ dependency["path"] = f'"{dependency["path"]}"'
+
+ return ' '.join((
+ ANSI[color],
+ symbol,
+ ANSI['reset'],
+ name.ljust(22),
+ (dependency["path"] or '').ljust(76),
+ version.ljust(14),
+ ANSI[color],
+ note,
+ ANSI['reset'],
+ ))
diff --git a/archivebox/config/__init__.py b/archivebox/config/__init__.py
index fdaf9ca4..72baec64 100644
--- a/archivebox/config/__init__.py
+++ b/archivebox/config/__init__.py
@@ -119,6 +119,27 @@ DEFAULT_CLI_COLORS = {
}
ANSI = {k: '' for k in DEFAULT_CLI_COLORS.keys()}
+STATICFILE_EXTENSIONS = {
+    # 99.999% of the time, URLs ending in these extensions are static files
+ # that can be downloaded as-is, not html pages that need to be rendered
+ 'gif', 'jpeg', 'jpg', 'png', 'tif', 'tiff', 'wbmp', 'ico', 'jng', 'bmp',
+ 'svg', 'svgz', 'webp', 'ps', 'eps', 'ai',
+ 'mp3', 'mp4', 'm4a', 'mpeg', 'mpg', 'mkv', 'mov', 'webm', 'm4v',
+ 'flv', 'wmv', 'avi', 'ogg', 'ts', 'm3u8',
+ 'pdf', 'txt', 'rtf', 'rtfd', 'doc', 'docx', 'ppt', 'pptx', 'xls', 'xlsx',
+ 'atom', 'rss', 'css', 'js', 'json',
+ 'dmg', 'iso', 'img',
+ 'rar', 'war', 'hqx', 'zip', 'gz', 'bz2', '7z',
+
+ # Less common extensions to consider adding later
+ # jar, swf, bin, com, exe, dll, deb
+ # ear, hqx, eot, wmlc, kml, kmz, cco, jardiff, jnlp, run, msi, msp, msm,
+ # pl pm, prc pdb, rar, rpm, sea, sit, tcl tk, der, pem, crt, xpi, xspf,
+ # ra, mng, asx, asf, 3gpp, 3gp, mid, midi, kar, jad, wml, htc, mml
+
+    # These are always treated as pages, not static files; never add them:
+ # html, htm, shtml, xhtml, xml, aspx, php, cgi
+}
VERSION_FILENAME = 'VERSION'
PYTHON_DIR_NAME = 'archivebox'
diff --git a/archivebox/core/models.py b/archivebox/core/models.py
index a41f3d1c..2900f798 100644
--- a/archivebox/core/models.py
+++ b/archivebox/core/models.py
@@ -64,3 +64,7 @@ class Page(models.Model):
@property
def base_url(self):
return self.as_link().base_url
+
+ @property
+ def link_dir(self):
+ return self.as_link().link_dir
diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py
index b225a899..e128f8d0 100644
--- a/archivebox/core/settings.py
+++ b/archivebox/core/settings.py
@@ -4,17 +4,19 @@ import os
import sys
SECRET_KEY = '---------------- not a valid secret key ! ----------------'
-DEBUG = True
+DEBUG = os.getenv('DEBUG', 'False').lower() == 'true'
ALLOWED_HOSTS = ['*']
REPO_DIR = os.path.abspath(os.path.join(os.path.abspath(__file__), os.path.pardir, os.path.pardir))
OUTPUT_DIR = os.path.abspath(os.getenv('OUTPUT_DIR', os.curdir))
+ARCHIVE_DIR = os.path.join(OUTPUT_DIR, 'archive')
DATABASE_FILE = os.path.join(OUTPUT_DIR, 'index.sqlite3')
ACTIVE_THEME = 'default'
IS_SHELL = 'shell' in sys.argv[:3] or 'shell_plus' in sys.argv[:3]
+APPEND_SLASH = True
INSTALLED_APPS = [
'django.contrib.auth',
diff --git a/archivebox/core/welcome_message.py b/archivebox/core/welcome_message.py
index 70410c75..b257b7d6 100644
--- a/archivebox/core/welcome_message.py
+++ b/archivebox/core/welcome_message.py
@@ -1,17 +1,6 @@
-from cli import list_subcommands
-
-from .config import ANSI
+from cli.logging import log_shell_welcome_msg
if __name__ == '__main__':
- print('{green}# ArchiveBox Imports{reset}'.format(**ANSI))
- # print('from archivebox.core.models import Page, User')
- print('{green}from archivebox.cli import\narchivebox_{}{reset}'.format("\narchivebox_".join(list_subcommands().keys()), **ANSI))
- print()
- print('[i] Welcome to the ArchiveBox Shell! Example use:')
- print(' print(Page.objects.filter(is_archived=True).count())')
- print(' Page.objects.get(url="https://example.com").as_json()')
-
- print(' Page.objects.get(url="https://example.com").as_json()')
-
- print(' from archivebox.main import get_invalid_folders')
+ from main import *
+ log_shell_welcome_msg()
diff --git a/archivebox/extractors/archive_org.py b/archivebox/extractors/archive_org.py
index ad6d409b..7522ddb8 100644
--- a/archivebox/extractors/archive_org.py
+++ b/archivebox/extractors/archive_org.py
@@ -5,16 +5,11 @@ import os
from typing import Optional, List, Dict, Tuple
from collections import defaultdict
-from ..index.schema import Link, ArchiveResult, ArchiveOutput
+from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
+from ..system import run, PIPE, DEVNULL, chmod_file
from ..util import (
enforce_types,
- TimedProgress,
- run,
- PIPE,
- DEVNULL,
is_static_file,
- ArchiveError,
- chmod_file,
)
from ..config import (
VERSION,
@@ -24,6 +19,7 @@ from ..config import (
CURL_VERSION,
CHECK_SSL_VALIDITY
)
+from ..cli.logging import TimedProgress
diff --git a/archivebox/extractors/dom.py b/archivebox/extractors/dom.py
index a002302f..331531c0 100644
--- a/archivebox/extractors/dom.py
+++ b/archivebox/extractors/dom.py
@@ -4,22 +4,19 @@ import os
from typing import Optional
-from ..index.schema import Link, ArchiveResult, ArchiveOutput
+from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
+from ..system import run, PIPE, chmod_file
from ..util import (
enforce_types,
- TimedProgress,
- run,
- PIPE,
is_static_file,
- ArchiveError,
chrome_args,
- chmod_file,
)
from ..config import (
TIMEOUT,
SAVE_DOM,
CHROME_VERSION,
)
+from ..cli.logging import TimedProgress
diff --git a/archivebox/extractors/favicon.py b/archivebox/extractors/favicon.py
index 0dff3900..ab5485c8 100644
--- a/archivebox/extractors/favicon.py
+++ b/archivebox/extractors/favicon.py
@@ -5,14 +5,8 @@ import os
from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput
-from ..util import (
- enforce_types,
- TimedProgress,
- domain,
- run,
- PIPE,
- chmod_file,
-)
+from ..system import chmod_file, run, PIPE
+from ..util import enforce_types, domain
from ..config import (
TIMEOUT,
SAVE_FAVICON,
@@ -20,6 +14,7 @@ from ..config import (
CURL_VERSION,
CHECK_SSL_VALIDITY,
)
+from ..cli.logging import TimedProgress
@enforce_types
diff --git a/archivebox/extractors/git.py b/archivebox/extractors/git.py
index 21a86f5e..54e67d8b 100644
--- a/archivebox/extractors/git.py
+++ b/archivebox/extractors/git.py
@@ -4,15 +4,11 @@ import os
from typing import Optional
-from ..index.schema import Link, ArchiveResult, ArchiveOutput
+from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
+from ..system import run, PIPE, chmod_file
from ..util import (
enforce_types,
- TimedProgress,
- run,
- PIPE,
is_static_file,
- ArchiveError,
- chmod_file,
domain,
extension,
without_query,
@@ -26,6 +22,7 @@ from ..config import (
GIT_DOMAINS,
CHECK_SSL_VALIDITY
)
+from ..cli.logging import TimedProgress
diff --git a/archivebox/extractors/media.py b/archivebox/extractors/media.py
index 9fd9a9be..ece47f0a 100644
--- a/archivebox/extractors/media.py
+++ b/archivebox/extractors/media.py
@@ -4,15 +4,11 @@ import os
from typing import Optional
-from ..index.schema import Link, ArchiveResult, ArchiveOutput
+from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
+from ..system import run, PIPE, chmod_file
from ..util import (
enforce_types,
- TimedProgress,
- run,
- PIPE,
is_static_file,
- ArchiveError,
- chmod_file,
)
from ..config import (
MEDIA_TIMEOUT,
@@ -21,6 +17,7 @@ from ..config import (
YOUTUBEDL_VERSION,
CHECK_SSL_VALIDITY
)
+from ..cli.logging import TimedProgress
@enforce_types
diff --git a/archivebox/extractors/pdf.py b/archivebox/extractors/pdf.py
index e7ade948..c29f3b22 100644
--- a/archivebox/extractors/pdf.py
+++ b/archivebox/extractors/pdf.py
@@ -4,23 +4,19 @@ import os
from typing import Optional
-from ..index.schema import Link, ArchiveResult, ArchiveOutput
+from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
+from ..system import run, PIPE, chmod_file
from ..util import (
enforce_types,
- TimedProgress,
- run,
- PIPE,
is_static_file,
- ArchiveError,
chrome_args,
- chmod_file,
)
from ..config import (
TIMEOUT,
SAVE_PDF,
CHROME_VERSION,
)
-
+from ..cli.logging import TimedProgress
@enforce_types
diff --git a/archivebox/extractors/screenshot.py b/archivebox/extractors/screenshot.py
index 3e211939..d2879c95 100644
--- a/archivebox/extractors/screenshot.py
+++ b/archivebox/extractors/screenshot.py
@@ -4,22 +4,19 @@ import os
from typing import Optional
-from ..index.schema import Link, ArchiveResult, ArchiveOutput
+from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
+from ..system import run, PIPE, chmod_file
from ..util import (
enforce_types,
- TimedProgress,
- run,
- PIPE,
is_static_file,
- ArchiveError,
chrome_args,
- chmod_file,
)
from ..config import (
TIMEOUT,
SAVE_SCREENSHOT,
CHROME_VERSION,
)
+from ..cli.logging import TimedProgress
diff --git a/archivebox/extractors/title.py b/archivebox/extractors/title.py
index c8ba9dd3..497c0ffb 100644
--- a/archivebox/extractors/title.py
+++ b/archivebox/extractors/title.py
@@ -1,14 +1,14 @@
__package__ = 'archivebox.extractors'
+import re
from typing import Optional
-from ..index.schema import Link, ArchiveResult, ArchiveOutput
+from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..util import (
enforce_types,
- TimedProgress,
is_static_file,
- ArchiveError,
- fetch_page_title,
+ download_url,
+ htmldecode,
)
from ..config import (
TIMEOUT,
@@ -16,6 +16,14 @@ from ..config import (
CURL_BINARY,
CURL_VERSION,
)
+from ..cli.logging import TimedProgress
+
+
+HTML_TITLE_REGEX = re.compile(
+    r'<title.*?>' # start matching text after <title> tag
+ r'(.[^<>]+)', # get everything up to these symbols
+ re.IGNORECASE | re.MULTILINE | re.DOTALL | re.UNICODE,
+)
@enforce_types
@@ -44,7 +52,9 @@ def save_title(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) ->
status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
try:
- output = fetch_page_title(link.url, timeout=timeout, progress=False)
+ html = download_url(link.url, timeout=timeout)
+ match = re.search(HTML_TITLE_REGEX, html)
+ output = htmldecode(match.group(1).strip()) if match else None
if not output:
raise ArchiveError('Unable to detect page title')
except Exception as err:
diff --git a/archivebox/extractors/wget.py b/archivebox/extractors/wget.py
index 7a0568f1..782d6d31 100644
--- a/archivebox/extractors/wget.py
+++ b/archivebox/extractors/wget.py
@@ -1,18 +1,22 @@
__package__ = 'archivebox.extractors'
import os
+import re
from typing import Optional
from datetime import datetime
-from ..index.schema import Link, ArchiveResult, ArchiveOutput
+from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
+from ..system import run, PIPE
from ..util import (
enforce_types,
- TimedProgress,
- run,
- PIPE,
- wget_output_path,
- ArchiveError,
+ is_static_file,
+ without_scheme,
+ without_fragment,
+ without_query,
+ path,
+ domain,
+ urldecode,
)
from ..config import (
TIMEOUT,
@@ -26,7 +30,7 @@ from ..config import (
WGET_USER_AGENT,
COOKIES_FILE,
)
-
+from ..cli.logging import TimedProgress
@enforce_types
@@ -121,3 +125,72 @@ def save_wget(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) ->
status=status,
**timer.stats,
)
+
+
+@enforce_types
+def wget_output_path(link: Link) -> Optional[str]:
+ """calculate the path to the wgetted .html file, since wget may
+ adjust some paths to be different than the base_url path.
+
+ See docs on wget --adjust-extension (-E)
+ """
+
+ if is_static_file(link.url):
+ return without_scheme(without_fragment(link.url))
+
+ # Wget downloads can save in a number of different ways depending on the url:
+ # https://example.com
+ # > example.com/index.html
+ # https://example.com?v=zzVa_tX1OiI
+ # > example.com/index.html?v=zzVa_tX1OiI.html
+ # https://www.example.com/?v=zzVa_tX1OiI
+ # > example.com/index.html?v=zzVa_tX1OiI.html
+
+ # https://example.com/abc
+ # > example.com/abc.html
+ # https://example.com/abc/
+ # > example.com/abc/index.html
+ # https://example.com/abc?v=zzVa_tX1OiI.html
+ # > example.com/abc?v=zzVa_tX1OiI.html
+ # https://example.com/abc/?v=zzVa_tX1OiI.html
+ # > example.com/abc/index.html?v=zzVa_tX1OiI.html
+
+ # https://example.com/abc/test.html
+ # > example.com/abc/test.html
+ # https://example.com/abc/test?v=zzVa_tX1OiI
+ # > example.com/abc/test?v=zzVa_tX1OiI.html
+ # https://example.com/abc/test/?v=zzVa_tX1OiI
+ # > example.com/abc/test/index.html?v=zzVa_tX1OiI.html
+
+ # There's also lots of complexity around how the urlencoding and renaming
+ # is done for pages with query and hash fragments or extensions like shtml / htm / php / etc
+
+ # Since the wget algorithm for -E (appending .html) is incredibly complex
+ # and there's no way to get the computed output path from wget
+ # in order to avoid having to reverse-engineer how they calculate it,
+ # we just look in the output folder read the filename wget used from the filesystem
+ full_path = without_fragment(without_query(path(link.url))).strip('/')
+ search_dir = os.path.join(
+ link.link_dir,
+ domain(link.url),
+ urldecode(full_path),
+ )
+
+ for _ in range(4):
+ if os.path.exists(search_dir):
+ if os.path.isdir(search_dir):
+ html_files = [
+ f for f in os.listdir(search_dir)
+ if re.search(".+\\.[Ss]?[Hh][Tt][Mm][Ll]?$", f, re.I | re.M)
+ ]
+ if html_files:
+ path_from_link_dir = search_dir.split(link.link_dir)[-1].strip('/')
+ return os.path.join(path_from_link_dir, html_files[0])
+
+ # Move up one directory level
+ search_dir = search_dir.rsplit('/', 1)[0]
+
+ if search_dir == link.link_dir:
+ break
+
+ return None
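+
+# Example usage (illustrative): callers join the returned path onto the link's snapshot dir:
+#   rel_path = wget_output_path(link)                  # e.g. 'example.com/index.html', or None if nothing was saved
+#   if rel_path:
+#       abs_path = os.path.join(link.link_dir, rel_path)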
diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py
index d7b6b43e..e86d3336 100644
--- a/archivebox/index/__init__.py
+++ b/archivebox/index/__init__.py
@@ -10,12 +10,10 @@ from typing import List, Tuple, Dict, Optional, Iterable
from collections import OrderedDict
from contextlib import contextmanager
-from ..parsers import parse_links
+from ..system import atomic_write
from ..util import (
scheme,
enforce_types,
- TimedProgress,
- atomic_write,
ExtendedEncoder,
)
from ..config import (
@@ -30,6 +28,7 @@ from ..config import (
stderr,
)
from ..cli.logging import (
+ TimedProgress,
log_indexing_process_started,
log_indexing_process_finished,
log_indexing_started,
@@ -278,6 +277,8 @@ def import_new_links(existing_links: List[Link],
import_path: str,
out_dir: str=OUTPUT_DIR) -> Tuple[List[Link], List[Link]]:
+ from ..parsers import parse_links
+
new_links: List[Link] = []
# parse and validate the import file
@@ -584,9 +585,9 @@ def fix_invalid_folder_locations(out_dir: str=OUTPUT_DIR) -> Tuple[List[str], Li
else:
shutil.move(entry.path, dest)
fixed.append(dest)
-
- if link.link_dir != entry.path:
- link = link.overwrite(link_dir=entry.path)
- write_json_link_details(link, out_dir=entry.path)
+ timestamp = entry.path.rsplit('/', 1)[-1]
+ assert link.link_dir == entry.path
+ assert link.timestamp == timestamp
+ write_json_link_details(link, out_dir=entry.path)
return fixed, cant_fix
diff --git a/archivebox/index/csv.py b/archivebox/index/csv.py
new file mode 100644
index 00000000..804e6461
--- /dev/null
+++ b/archivebox/index/csv.py
@@ -0,0 +1,37 @@
+__package__ = 'archivebox.index'
+
+from typing import List, Optional, Any
+
+from ..util import enforce_types
+from .schema import Link
+
+
+@enforce_types
+def links_to_csv(links: List[Link],
+ cols: Optional[List[str]]=None,
+ header: bool=True,
+ separator: str=',',
+ ljust: int=0) -> str:
+
+ cols = cols or ['timestamp', 'is_archived', 'url']
+
+ header_str = ''
+ if header:
+ header_str = separator.join(col.ljust(ljust) for col in cols)
+
+ row_strs = (
+ link.to_csv(cols=cols, ljust=ljust, separator=separator)
+ for link in links
+ )
+
+ return '\n'.join((header_str, *row_strs))
+
+
+@enforce_types
+def to_csv(obj: Any, cols: List[str], separator: str=',', ljust: int=0) -> str:
+ from .json import to_json
+
+ return separator.join(
+ to_json(getattr(obj, col), indent=None).ljust(ljust)
+ for col in cols
+ )
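+
+# Example (illustrative): each cell is JSON-encoded, so strings keep their quotes:
+#   links_to_csv(links, cols=['timestamp', 'url'], header=True, separator=' | ')
+#   # -> 'timestamp | url\n"1544212312.1" | "https://example.com"'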
diff --git a/archivebox/index/html.py b/archivebox/index/html.py
index 3cba2bf0..ea890276 100644
--- a/archivebox/index/html.py
+++ b/archivebox/index/html.py
@@ -2,20 +2,18 @@ __package__ = 'archivebox.index'
import os
+from string import Template
from datetime import datetime
-from typing import List, Optional, Iterator
+from typing import List, Optional, Iterator, Mapping
from .schema import Link
+from ..system import atomic_write, copy_and_overwrite
from ..util import (
enforce_types,
ts_to_date,
urlencode,
htmlencode,
urldecode,
- wget_output_path,
- render_template,
- atomic_write,
- copy_and_overwrite,
)
from ..config import (
OUTPUT_DIR,
@@ -67,7 +65,7 @@ def write_html_main_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished:
def main_index_template(links: List[Link], finished: bool=True) -> str:
"""render the template for the entire main index"""
- return render_template(MAIN_INDEX_TEMPLATE, {
+ return render_legacy_template(MAIN_INDEX_TEMPLATE, {
'version': VERSION,
'git_sha': GIT_SHA,
'num_links': str(len(links)),
@@ -86,7 +84,9 @@ def main_index_template(links: List[Link], finished: bool=True) -> str:
def main_index_row_template(link: Link) -> str:
"""render the template for an individual link row of the main index"""
- return render_template(MAIN_INDEX_ROW_TEMPLATE, {
+ from ..extractors.wget import wget_output_path
+
+ return render_legacy_template(MAIN_INDEX_ROW_TEMPLATE, {
**link._asdict(extended=True),
# before pages are finished archiving, show loading msg instead of title
@@ -122,9 +122,11 @@ def write_html_link_details(link: Link, out_dir: Optional[str]=None) -> None:
@enforce_types
def link_details_template(link: Link) -> str:
+ from ..extractors.wget import wget_output_path
+
link_info = link._asdict(extended=True)
- return render_template(LINK_DETAILS_TEMPLATE, {
+ return render_legacy_template(LINK_DETAILS_TEMPLATE, {
**link_info,
**link_info['canonical'],
'title': (
@@ -142,3 +144,13 @@ def link_details_template(link: Link) -> str:
'status_color': 'success' if link.is_archived else 'danger',
'oldest_archive_date': ts_to_date(link.oldest_archive_date),
})
+
+
+@enforce_types
+def render_legacy_template(template_path: str, context: Mapping[str, str]) -> str:
+    """render a given html template file with the given context dict"""
+
+ # will be replaced by django templates in the future
+ with open(template_path, 'r', encoding='utf-8') as template:
+ template_str = template.read()
+ return Template(template_str).substitute(**context)
diff --git a/archivebox/index/json.py b/archivebox/index/json.py
index 4d75d095..a11dba5d 100644
--- a/archivebox/index/json.py
+++ b/archivebox/index/json.py
@@ -2,13 +2,14 @@ __package__ = 'archivebox.index'
import os
import sys
-import json
+import json as pyjson
from datetime import datetime
-from typing import List, Optional, Iterator
+from typing import List, Optional, Iterator, Any
from .schema import Link, ArchiveResult
-from ..util import enforce_types, atomic_write
+from ..system import atomic_write
+from ..util import enforce_types
from ..config import (
VERSION,
OUTPUT_DIR,
@@ -46,7 +47,7 @@ def parse_json_main_index(out_dir: str=OUTPUT_DIR) -> Iterator[Link]:
index_path = os.path.join(out_dir, JSON_INDEX_FILENAME)
if os.path.exists(index_path):
with open(index_path, 'r', encoding='utf-8') as f:
- links = json.load(f)['links']
+ links = pyjson.load(f)['links']
for link_json in links:
yield Link.from_json(link_json)
@@ -95,12 +96,13 @@ def parse_json_link_details(out_dir: str) -> Optional[Link]:
if os.path.exists(existing_index):
with open(existing_index, 'r', encoding='utf-8') as f:
try:
- link_json = json.load(f)
+ link_json = pyjson.load(f)
return Link.from_json(link_json)
- except json.JSONDecodeError:
+ except pyjson.JSONDecodeError:
pass
return None
+
@enforce_types
def parse_json_links_details(out_dir: str) -> Iterator[Link]:
"""read through all the archive data folders and return the parsed links"""
@@ -111,3 +113,41 @@ def parse_json_links_details(out_dir: str) -> Iterator[Link]:
link = parse_json_link_details(entry.path)
if link:
yield link
+
+
+
+### Helpers
+
+class ExtendedEncoder(pyjson.JSONEncoder):
+ """
+ Extended json serializer that supports serializing several model
+ fields and objects
+ """
+
+ def default(self, obj):
+ cls_name = obj.__class__.__name__
+
+ if hasattr(obj, '_asdict'):
+ return obj._asdict()
+
+ elif isinstance(obj, bytes):
+ return obj.decode()
+
+ elif isinstance(obj, datetime):
+ return obj.isoformat()
+
+ elif isinstance(obj, Exception):
+ return '{}: {}'.format(obj.__class__.__name__, obj)
+
+ elif cls_name in ('dict_items', 'dict_keys', 'dict_values'):
+ return tuple(obj)
+
+ return pyjson.JSONEncoder.default(self, obj)
+
+
+@enforce_types
+def to_json(obj: Any, indent: Optional[int]=4, sort_keys: bool=True, cls=ExtendedEncoder) -> str:
+    return pyjson.dumps(obj, indent=indent, sort_keys=sort_keys, cls=cls)
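+
+# Example (illustrative):
+#   >>> to_json({'fetched_at': datetime(2019, 1, 1)}, indent=None)
+#   '{"fetched_at": "2019-01-01T00:00:00"}'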
+
+
+
diff --git a/archivebox/index/schema.py b/archivebox/index/schema.py
index 1cec34b1..f8d81e34 100644
--- a/archivebox/index/schema.py
+++ b/archivebox/index/schema.py
@@ -61,19 +61,20 @@ class ArchiveResult:
info['end_ts'] = parse_date(info['end_ts'])
return cls(**info)
- def to_json(self, indent=4, sort_keys=True):
- from ..util import to_json
+ def to_dict(self, *keys) -> dict:
+ if keys:
+ return {k: v for k, v in asdict(self).items() if k in keys}
+ return asdict(self)
+
+ def to_json(self, indent=4, sort_keys=True) -> str:
+ from .json import to_json
return to_json(self, indent=indent, sort_keys=sort_keys)
- def to_csv(self, cols=None, ljust: int=0, separator: str=','):
- from ..util import to_json
+ def to_csv(self, cols: Optional[List[str]]=None, separator: str=',', ljust: int=0) -> str:
+ from .csv import to_csv
- cols = cols or self.field_names()
- return separator.join(
- to_json(getattr(self, col), indent=None).ljust(ljust)
- for col in cols
- )
+        return to_csv(self, cols=cols or self.field_names(), separator=separator, ljust=ljust)
@classmethod
def field_names(cls):
@@ -201,18 +202,15 @@ class Link:
info['history'] = cast_history
return cls(**info)
- def to_json(self, indent=4, sort_keys=True):
- from ..util import to_json
+ def to_json(self, indent=4, sort_keys=True) -> str:
+ from .json import to_json
return to_json(self, indent=indent, sort_keys=sort_keys)
- def to_csv(self, csv_cols: List[str], ljust: int=0, separator: str=','):
- from ..util import to_json
+ def to_csv(self, cols: Optional[List[str]]=None, separator: str=',', ljust: int=0) -> str:
+ from .csv import to_csv
- return separator.join(
- to_json(getattr(self, col), indent=None).ljust(ljust)
- for col in csv_cols
- )
+ return to_csv(self, cols=cols or self.field_names(), separator=separator, ljust=ljust)
@classmethod
def field_names(cls):
@@ -354,7 +352,7 @@ class Link:
def canonical_outputs(self) -> Dict[str, Optional[str]]:
"""predict the expected output paths that should be present after archiving"""
- from ..util import wget_output_path
+ from ..extractors.wget import wget_output_path
canonical = {
'index_path': 'index.html',
'favicon_path': 'favicon.ico',
@@ -382,3 +380,5 @@ class Link:
'dom_path': static_path,
})
return canonical
+
+
diff --git a/archivebox/main.py b/archivebox/main.py
index 47c0a66d..231d27e0 100644
--- a/archivebox/main.py
+++ b/archivebox/main.py
@@ -1,11 +1,10 @@
__package__ = 'archivebox'
-import re
import os
import sys
import shutil
-from typing import Dict, List, Optional, Set, Tuple, Iterable, IO
+from typing import Dict, List, Optional, Iterable, IO
from crontab import CronTab, CronSlices
@@ -17,18 +16,13 @@ from .cli import (
main_cmds,
archive_cmds,
)
-from .index.schema import Link
-from .util import (
- enforce_types,
- TimedProgress,
- get_dir_size,
- human_readable_size,
+from .parsers import (
save_stdin_to_sources,
save_file_to_sources,
- links_to_csv,
- to_json,
- folders_to_str,
)
+from .index.schema import Link
+from .util import enforce_types, docstring
+from .system import get_dir_size, dedupe_cron_jobs, CRON_COMMENT
from .index import (
links_after_timestamp,
load_main_index,
@@ -51,7 +45,11 @@ from .index.json import (
parse_json_main_index,
parse_json_links_details,
)
-from .index.sql import parse_sql_main_index, get_admins, apply_migrations
+from .index.sql import (
+ parse_sql_main_index,
+ get_admins,
+ apply_migrations,
+)
from .index.html import parse_html_main_index
from .extractors import archive_link
from .config import (
@@ -91,6 +89,7 @@ from .config import (
get_real_name,
)
from .cli.logging import (
+ TimedProgress,
log_archiving_started,
log_archiving_paused,
log_archiving_finished,
@@ -98,6 +97,11 @@ from .cli.logging import (
log_removal_finished,
log_list_started,
log_list_finished,
+ printable_config,
+ printable_folders,
+ printable_filesize,
+ printable_folder_status,
+ printable_dependency_version,
)
@@ -387,7 +391,7 @@ def info(out_dir: str=OUTPUT_DIR) -> None:
print('{green}[*] Scanning archive collection main index...{reset}'.format(**ANSI))
print(f' {out_dir}/*')
num_bytes, num_dirs, num_files = get_dir_size(out_dir, recursive=False, pattern='index.')
- size = human_readable_size(num_bytes)
+ size = printable_filesize(num_bytes)
print(f' Size: {size} across {num_files} files')
print()
@@ -419,7 +423,7 @@ def info(out_dir: str=OUTPUT_DIR) -> None:
print(f' {ARCHIVE_DIR}/*')
num_bytes, num_dirs, num_files = get_dir_size(ARCHIVE_DIR)
- size = human_readable_size(num_bytes)
+ size = printable_filesize(num_bytes)
print(f' Size: {size} across {num_files} files in {num_dirs} directories')
print()
@@ -712,13 +716,8 @@ def list_all(filter_patterns_str: Optional[str]=None,
out_dir=out_dir,
)
- if csv:
- print(links_to_csv(folders.values(), csv_cols=csv.split(','), header=True))
- elif json:
- print(to_json(folders.values(), indent=4, sort_keys=True))
- else:
- print(folders_to_str(folders))
- raise SystemExit(not folders)
+ print(printable_folders(folders, json=json, csv=csv))
+ return folders
@enforce_types
@@ -749,7 +748,7 @@ def list_folders(links: List[Link],
status: str,
out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
- check_data_folder()
+ check_data_folder(out_dir=out_dir)
if status == 'indexed':
return get_indexed_folders(links, out_dir=out_dir)
@@ -796,7 +795,7 @@ def config(config_options_str: Optional[str]=None,
)
raise SystemExit(2)
elif config_options_str:
- config_options = stdin_raw_text.split('\n')
+ config_options = config_options_str.split('\n')
config_options = config_options or []
@@ -865,7 +864,6 @@ def config(config_options_str: Optional[str]=None,
stderr(' Please manually remove the relevant lines from your config file:')
stderr(f' {CONFIG_FILE}')
raise SystemExit(2)
-
else:
stderr('[X] You must pass either --get or --set, or no arguments to get the whole config.', color='red')
stderr(' archivebox config')
@@ -874,8 +872,6 @@ def config(config_options_str: Optional[str]=None,
raise SystemExit(2)
-CRON_COMMENT = 'archivebox_schedule'
-
@enforce_types
def schedule(add: bool=False,
show: bool=False,
@@ -893,7 +889,7 @@ def schedule(add: bool=False,
os.makedirs(os.path.join(out_dir, LOGS_DIR_NAME), exist_ok=True)
cron = CronTab(user=True)
- cron = dedupe_jobs(cron)
+ cron = dedupe_cron_jobs(cron)
existing_jobs = list(cron.find_comment(CRON_COMMENT))
if foreground or run_all:
@@ -962,7 +958,7 @@ def schedule(add: bool=False,
stderr(' archivebox init --every="0/5 * * * *" https://example.com/some/rss/feed.xml')
raise SystemExit(1)
- cron = dedupe_jobs(cron)
+ cron = dedupe_cron_jobs(cron)
cron.write()
total_runs = sum(j.frequency_per_year() for j in cron)
@@ -1025,95 +1021,13 @@ def manage(args: Optional[List[str]]=None, out_dir: str=OUTPUT_DIR) -> None:
execute_from_command_line([f'{ARCHIVEBOX_BINARY} manage', *(args or ['help'])])
+
+@enforce_types
def shell(out_dir: str=OUTPUT_DIR) -> None:
+ """Enter an interactive ArchiveBox Django shell"""
+
check_data_folder(out_dir=out_dir)
setup_django(OUTPUT_DIR)
from django.core.management import call_command
call_command("shell_plus")
-
-# Helpers
-
-def printable_config(config: ConfigDict, prefix: str='') -> str:
- return f'\n{prefix}'.join(
- f'{key}={val}'
- for key, val in config.items()
- if not (isinstance(val, dict) or callable(val))
- )
-
-def dedupe_jobs(cron: CronTab) -> CronTab:
- deduped: Set[Tuple[str, str]] = set()
-
- for job in list(cron):
- unique_tuple = (str(job.slices), job.command)
- if unique_tuple not in deduped:
- deduped.add(unique_tuple)
- cron.remove(job)
-
- for schedule, command in deduped:
- job = cron.new(command=command, comment=CRON_COMMENT)
- job.setall(schedule)
- job.enable()
-
- return cron
-
-
-def print_folder_status(name, folder):
- if folder['enabled']:
- if folder['is_valid']:
- color, symbol, note = 'green', '√', 'valid'
- else:
- color, symbol, note, num_files = 'red', 'X', 'invalid', '?'
- else:
- color, symbol, note, num_files = 'lightyellow', '-', 'disabled', '-'
-
- if folder['path']:
- if os.path.exists(folder['path']):
- num_files = (
- f'{len(os.listdir(folder["path"]))} files'
- if os.path.isdir(folder['path']) else
- human_readable_size(os.path.getsize(folder['path']))
- )
- else:
- num_files = 'missing'
-
- if ' ' in folder['path']:
- folder['path'] = f'"{folder["path"]}"'
-
- print(
- ANSI[color],
- symbol,
- ANSI['reset'],
- name.ljust(22),
- (folder["path"] or '').ljust(76),
- num_files.ljust(14),
- ANSI[color],
- note,
- ANSI['reset'],
- )
-
-
-def print_dependency_version(name, dependency):
- if dependency['enabled']:
- if dependency['is_valid']:
- color, symbol, note = 'green', '√', 'valid'
- version = 'v' + re.search(r'[\d\.]+', dependency['version'])[0]
- else:
- color, symbol, note, version = 'red', 'X', 'invalid', '?'
- else:
- color, symbol, note, version = 'lightyellow', '-', 'disabled', '-'
-
- if ' ' in dependency["path"]:
- dependency["path"] = f'"{dependency["path"]}"'
-
- print(
- ANSI[color],
- symbol,
- ANSI['reset'],
- name.ljust(22),
- (dependency["path"] or '').ljust(76),
- version.ljust(14),
- ANSI[color],
- note,
- ANSI['reset'],
- )
diff --git a/archivebox/parsers/__init__.py b/archivebox/parsers/__init__.py
index 2a20ff6d..e3e19c45 100644
--- a/archivebox/parsers/__init__.py
+++ b/archivebox/parsers/__init__.py
@@ -7,16 +7,29 @@ For examples of supported import formats see tests/.
__package__ = 'archivebox.parsers'
+import re
+import os
from typing import Tuple, List
+from datetime import datetime
-from ..config import TIMEOUT
-from ..util import (
- check_url_parsing_invariants,
- TimedProgress,
- Link,
- enforce_types,
+from ..index.schema import Link
+from ..system import atomic_write
+from ..config import (
+ ANSI,
+ OUTPUT_DIR,
+ SOURCES_DIR_NAME,
+ TIMEOUT,
+ check_data_folder,
)
+from ..util import (
+ basename,
+ domain,
+ download_url,
+ enforce_types,
+ URL_REGEX,
+)
+from ..cli.logging import pretty_path, TimedProgress
from .pocket_html import parse_pocket_html_export
from .pinboard_rss import parse_pinboard_rss_export
from .shaarli_rss import parse_shaarli_rss_export
@@ -66,3 +79,95 @@ def parse_links(source_file: str) -> Tuple[List[Link], str]:
timer.end()
return [], 'Failed to parse'
+
+
+@enforce_types
+def save_stdin_to_sources(raw_text: str, out_dir: str=OUTPUT_DIR) -> str:
+ check_data_folder(out_dir=out_dir)
+
+ sources_dir = os.path.join(out_dir, SOURCES_DIR_NAME)
+ if not os.path.exists(sources_dir):
+ os.makedirs(sources_dir)
+
+ ts = str(datetime.now().timestamp()).split('.', 1)[0]
+
+ source_path = os.path.join(sources_dir, '{}-{}.txt'.format('stdin', ts))
+
+ atomic_write(raw_text, source_path)
+ return source_path
+
+
+@enforce_types
+def save_file_to_sources(path: str, timeout: int=TIMEOUT, out_dir: str=OUTPUT_DIR) -> str:
+    """download a given url's content into output/sources/domain-<timestamp>.txt"""
+ check_data_folder(out_dir=out_dir)
+
+ sources_dir = os.path.join(out_dir, SOURCES_DIR_NAME)
+ if not os.path.exists(sources_dir):
+ os.makedirs(sources_dir)
+
+ ts = str(datetime.now().timestamp()).split('.', 1)[0]
+
+ source_path = os.path.join(sources_dir, '{}-{}.txt'.format(basename(path), ts))
+
+ if any(path.startswith(s) for s in ('http://', 'https://', 'ftp://')):
+ source_path = os.path.join(sources_dir, '{}-{}.txt'.format(domain(path), ts))
+ print('{}[*] [{}] Downloading {}{}'.format(
+ ANSI['green'],
+ datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
+ path,
+ ANSI['reset'],
+ ))
+ timer = TimedProgress(timeout, prefix=' ')
+ try:
+ raw_source_text = download_url(path, timeout=timeout)
+ timer.end()
+ except Exception as e:
+ timer.end()
+ print('{}[!] Failed to download {}{}\n'.format(
+ ANSI['red'],
+ path,
+ ANSI['reset'],
+ ))
+ print(' ', e)
+ raise SystemExit(1)
+
+ else:
+ with open(path, 'r') as f:
+ raw_source_text = f.read()
+
+ atomic_write(raw_source_text, source_path)
+
+ print(' > {}'.format(pretty_path(source_path)))
+
+ return source_path
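+
+# Example usage (illustrative): how an import is staged before parsing:
+#   source_path = save_stdin_to_sources('https://example.com\nhttps://example.org')
+#   links, parser_name = parse_links(source_path)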
+
+
+def check_url_parsing_invariants() -> None:
+ """Check that plain text regex URL parsing works as expected"""
+
+ # this is last-line-of-defense to make sure the URL_REGEX isn't
+ # misbehaving, as the consequences could be disastrous and lead to many
+ # incorrect/badly parsed links being added to the archive
+
+ test_urls = '''
+ https://example1.com/what/is/happening.html?what=1#how-about-this=1
+ https://example2.com/what/is/happening/?what=1#how-about-this=1
+ HTtpS://example3.com/what/is/happening/?what=1#how-about-this=1f
+ https://example4.com/what/is/happening.html
+ https://example5.com/
+ https://example6.com
+
+ http://example7.com
+ [https://example8.com/what/is/this.php?what=1]
+ [and http://example9.com?what=1&other=3#and-thing=2]
+ https://example10.com#and-thing=2 "
+ abcdef
+ sdflkf[what](https://example12.com/who/what.php?whoami=1#whatami=2)?am=hi
+ example13.bada
+ and example14.badb
+ htt://example15.badc
+ '''
+ # print('\n'.join(re.findall(URL_REGEX, test_urls)))
+ assert len(re.findall(URL_REGEX, test_urls)) == 12
+
diff --git a/archivebox/system.py b/archivebox/system.py
new file mode 100644
index 00000000..aa6263e9
--- /dev/null
+++ b/archivebox/system.py
@@ -0,0 +1,150 @@
+__package__ = 'archivebox'
+
+
+import os
+import shutil
+
+import json as pyjson
+from typing import Optional, Union, Set, Tuple
+
+from crontab import CronTab
+
+from subprocess import (
+ Popen,
+ PIPE,
+ DEVNULL,
+ CompletedProcess,
+ TimeoutExpired,
+ CalledProcessError,
+)
+
+from .util import enforce_types, ExtendedEncoder
+from .config import OUTPUT_PERMISSIONS
+
+
+def run(*popenargs, input=None, capture_output=False, timeout=None, check=False, **kwargs):
+    """Patched version of subprocess.run to fix blocking IO making timeout ineffective"""
+
+ if input is not None:
+ if 'stdin' in kwargs:
+ raise ValueError('stdin and input arguments may not both be used.')
+ kwargs['stdin'] = PIPE
+
+ if capture_output:
+ if ('stdout' in kwargs) or ('stderr' in kwargs):
+ raise ValueError('stdout and stderr arguments may not be used '
+ 'with capture_output.')
+ kwargs['stdout'] = PIPE
+ kwargs['stderr'] = PIPE
+
+ with Popen(*popenargs, **kwargs) as process:
+ try:
+ stdout, stderr = process.communicate(input, timeout=timeout)
+ except TimeoutExpired:
+ process.kill()
+ try:
+ stdout, stderr = process.communicate(input, timeout=2)
+ except:
+ pass
+ raise TimeoutExpired(popenargs[0][0], timeout)
+ except BaseException:
+ process.kill()
+ # We don't call process.wait() as .__exit__ does that for us.
+ raise
+ retcode = process.poll()
+ if check and retcode:
+ raise CalledProcessError(retcode, process.args,
+ output=stdout, stderr=stderr)
+ return CompletedProcess(process.args, retcode, stdout, stderr)
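+
+# Example usage (illustrative): same call shape as subprocess.run:
+#   result = run(['wget', '--version'], stdout=PIPE, stderr=PIPE, timeout=10)
+#   print(result.returncode, result.stdout.decode())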
+
+
+def atomic_write(contents: Union[dict, str, bytes], path: str) -> None:
+ """Safe atomic write to filesystem by writing to temp file + atomic rename"""
+ try:
+ tmp_file = '{}.tmp'.format(path)
+
+ if isinstance(contents, bytes):
+ args = {'mode': 'wb+'}
+ else:
+ args = {'mode': 'w+', 'encoding': 'utf-8'}
+
+ with open(tmp_file, **args) as f:
+ if isinstance(contents, dict):
+ pyjson.dump(contents, f, indent=4, sort_keys=True, cls=ExtendedEncoder)
+ else:
+ f.write(contents)
+
+ os.fsync(f.fileno())
+
+ os.rename(tmp_file, path)
+ chmod_file(path)
+ finally:
+ if os.path.exists(tmp_file):
+ os.remove(tmp_file)
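+
+# Example usage (illustrative):
+#   atomic_write('hello world', '/tmp/example.txt')       # text is written to /tmp/example.txt.tmp, then renamed
+#   atomic_write({'hello': 'world'}, '/tmp/example.json')  # dicts are JSON-dumped with ExtendedEncoder first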
+
+
+@enforce_types
+def chmod_file(path: str, cwd: str='.', permissions: str=OUTPUT_PERMISSIONS, timeout: int=30) -> None:
+    """chmod -R <permissions> <cwd>/<path>"""
+
+ if not os.path.exists(os.path.join(cwd, path)):
+ raise Exception('Failed to chmod: {} does not exist (did the previous step fail?)'.format(path))
+
+ chmod_result = run(['chmod', '-R', permissions, path], cwd=cwd, stdout=DEVNULL, stderr=PIPE, timeout=timeout)
+ if chmod_result.returncode == 1:
+ print(' ', chmod_result.stderr.decode())
+ raise Exception('Failed to chmod {}/{}'.format(cwd, path))
+
+
+@enforce_types
+def copy_and_overwrite(from_path: str, to_path: str):
+ """copy a given file or directory to a given path, overwriting the destination"""
+ if os.path.isdir(from_path):
+ shutil.rmtree(to_path, ignore_errors=True)
+ shutil.copytree(from_path, to_path)
+ else:
+ with open(from_path, 'rb') as src:
+ atomic_write(src.read(), to_path)
+
+
+@enforce_types
+def get_dir_size(path: str, recursive: bool=True, pattern: Optional[str]=None) -> Tuple[int, int, int]:
+ """get the total disk size of a given directory, optionally summing up
+ recursively and limiting to a given filter list
+ """
+ num_bytes, num_dirs, num_files = 0, 0, 0
+ for entry in os.scandir(path):
+ if (pattern is not None) and (pattern not in entry.path):
+ continue
+ if entry.is_dir(follow_symlinks=False):
+ if not recursive:
+ continue
+ num_dirs += 1
+ bytes_inside, dirs_inside, files_inside = get_dir_size(entry.path)
+ num_bytes += bytes_inside
+ num_dirs += dirs_inside
+ num_files += files_inside
+ else:
+ num_bytes += entry.stat(follow_symlinks=False).st_size
+ num_files += 1
+ return num_bytes, num_dirs, num_files
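+
+# Example usage (illustrative), mirroring how `archivebox info` reports index size:
+#   num_bytes, num_dirs, num_files = get_dir_size(out_dir, recursive=False, pattern='index.')
+#   print(f'Size: {printable_filesize(num_bytes)} across {num_files} files')   # printable_filesize lives in cli/logging.py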
+
+
+CRON_COMMENT = 'archivebox_schedule'
+
+@enforce_types
+def dedupe_cron_jobs(cron: CronTab) -> CronTab:
+ deduped: Set[Tuple[str, str]] = set()
+
+ for job in list(cron):
+ unique_tuple = (str(job.slices), job.command)
+ if unique_tuple not in deduped:
+ deduped.add(unique_tuple)
+ cron.remove(job)
+
+ for schedule, command in deduped:
+ job = cron.new(command=command, comment=CRON_COMMENT)
+ job.setall(schedule)
+ job.enable()
+
+ return cron
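+
+# Example usage (illustrative): `archivebox schedule` dedupes before and after adding a job:
+#   cron = CronTab(user=True)
+#   cron = dedupe_cron_jobs(cron)
+#   cron.write()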
diff --git a/archivebox/util.py b/archivebox/util.py
index 447b9eff..8b606db1 100644
--- a/archivebox/util.py
+++ b/archivebox/util.py
@@ -1,15 +1,8 @@
-import os
import re
-import sys
import ssl
-import json
-import time
-import shutil
-import argparse
-from string import Template
-from json import JSONEncoder
-from typing import List, Dict, Optional, Any, Union, IO, Mapping, Tuple
+
+from typing import List, Optional, Any
from inspect import signature
from functools import wraps
from hashlib import sha256
@@ -17,34 +10,17 @@ from urllib.request import Request, urlopen
from urllib.parse import urlparse, quote, unquote
from html import escape, unescape
from datetime import datetime
-from multiprocessing import Process
-from subprocess import (
- Popen,
- PIPE,
- DEVNULL,
- CompletedProcess,
- TimeoutExpired,
- CalledProcessError,
-)
from base32_crockford import encode as base32_encode # type: ignore
+import json as pyjson
-from .index.schema import Link
from .config import (
- ANSI,
- TERM_WIDTH,
- OUTPUT_DIR,
- SOURCES_DIR_NAME,
- OUTPUT_PERMISSIONS,
TIMEOUT,
- SHOW_PROGRESS,
- SAVE_TITLE,
+ STATICFILE_EXTENSIONS,
CHECK_SSL_VALIDITY,
WGET_USER_AGENT,
CHROME_OPTIONS,
- check_data_folder,
)
-from .cli.logging import pretty_path
### Parsing Helpers
@@ -66,6 +42,7 @@ base_url = lambda url: without_scheme(url) # uniq base url used to dedupe links
without_www = lambda url: url.replace('://www.', '://', 1)
without_trailing_slash = lambda url: url[:-1] if url[-1] == '/' else url.replace('/?', '?')
hashurl = lambda url: base32_encode(int(sha256(base_url(url).encode('utf-8')).hexdigest(), 16))[:20]
+is_static_file = lambda url: extension(url).lower() in STATICFILE_EXTENSIONS # TODO: the proper way is with MIME type detection, not using extension
urlencode = lambda s: s and quote(s, encoding='utf-8', errors='replace')
urldecode = lambda s: s and unquote(s)
@@ -85,36 +62,7 @@ URL_REGEX = re.compile(
r'[^\]\[\(\)<>\""\'\s]+', # stop parsing at these symbols
re.IGNORECASE,
)
-HTML_TITLE_REGEX = re.compile(
-    r'<title.*?>' # start matching text after <title> tag
- r'(.[^<>]+)', # get everything up to these symbols
- re.IGNORECASE | re.MULTILINE | re.DOTALL | re.UNICODE,
-)
-STATICFILE_EXTENSIONS = {
- # 99.999% of the time, URLs ending in these extentions are static files
- # that can be downloaded as-is, not html pages that need to be rendered
- 'gif', 'jpeg', 'jpg', 'png', 'tif', 'tiff', 'wbmp', 'ico', 'jng', 'bmp',
- 'svg', 'svgz', 'webp', 'ps', 'eps', 'ai',
- 'mp3', 'mp4', 'm4a', 'mpeg', 'mpg', 'mkv', 'mov', 'webm', 'm4v',
- 'flv', 'wmv', 'avi', 'ogg', 'ts', 'm3u8',
- 'pdf', 'txt', 'rtf', 'rtfd', 'doc', 'docx', 'ppt', 'pptx', 'xls', 'xlsx',
- 'atom', 'rss', 'css', 'js', 'json',
- 'dmg', 'iso', 'img',
- 'rar', 'war', 'hqx', 'zip', 'gz', 'bz2', '7z',
- # Less common extensions to consider adding later
- # jar, swf, bin, com, exe, dll, deb
- # ear, hqx, eot, wmlc, kml, kmz, cco, jardiff, jnlp, run, msi, msp, msm,
- # pl pm, prc pdb, rar, rpm, sea, sit, tcl tk, der, pem, crt, xpi, xspf,
- # ra, mng, asx, asf, 3gpp, 3gp, mid, midi, kar, jad, wml, htc, mml
-
- # Thse are always treated as pages, not as static files, never add them:
- # html, htm, shtml, xhtml, xml, aspx, php, cgi
-}
-
-
-
-### Checks & Tests
def enforce_types(func):
"""
@@ -158,189 +106,14 @@ def enforce_types(func):
return typechecked_function
-def check_url_parsing_invariants() -> None:
- """Check that plain text regex URL parsing works as expected"""
+def docstring(text: Optional[str]):
+ """attach the given docstring to the decorated function"""
+ def decorator(func):
+ if text:
+ func.__doc__ = text
+ return func
+ return decorator
- # this is last-line-of-defense to make sure the URL_REGEX isn't
- # misbehaving, as the consequences could be disastrous and lead to many
- # incorrect/badly parsed links being added to the archive
-
- test_urls = '''
- https://example1.com/what/is/happening.html?what=1#how-about-this=1
- https://example2.com/what/is/happening/?what=1#how-about-this=1
- HTtpS://example3.com/what/is/happening/?what=1#how-about-this=1f
- https://example4.com/what/is/happening.html
- https://example5.com/
- https://example6.com
-
- http://example7.com
- [https://example8.com/what/is/this.php?what=1]
- [and http://example9.com?what=1&other=3#and-thing=2]
- https://example10.com#and-thing=2 "
- abcdef
- sdflkf[what](https://example12.com/who/what.php?whoami=1#whatami=2)?am=hi
- example13.bada
- and example14.badb
- htt://example15.badc
- '''
- # print('\n'.join(re.findall(URL_REGEX, test_urls)))
- assert len(re.findall(URL_REGEX, test_urls)) == 12
-
-
-### Random Helpers
-
-@enforce_types
-def save_stdin_to_sources(raw_text: str, out_dir: str=OUTPUT_DIR) -> str:
- check_data_folder(out_dir=out_dir)
-
- sources_dir = os.path.join(out_dir, SOURCES_DIR_NAME)
- if not os.path.exists(sources_dir):
- os.makedirs(sources_dir)
-
- ts = str(datetime.now().timestamp()).split('.', 1)[0]
-
- source_path = os.path.join(sources_dir, '{}-{}.txt'.format('stdin', ts))
-
- atomic_write(raw_text, source_path)
- return source_path
-
-
-@enforce_types
-def save_file_to_sources(path: str, timeout: int=TIMEOUT, out_dir: str=OUTPUT_DIR) -> str:
-    """download a given url's content into output/sources/domain-<timestamp>.txt"""
- check_data_folder(out_dir=out_dir)
-
- sources_dir = os.path.join(out_dir, SOURCES_DIR_NAME)
- if not os.path.exists(sources_dir):
- os.makedirs(sources_dir)
-
- ts = str(datetime.now().timestamp()).split('.', 1)[0]
-
- source_path = os.path.join(sources_dir, '{}-{}.txt'.format(basename(path), ts))
-
- if any(path.startswith(s) for s in ('http://', 'https://', 'ftp://')):
- source_path = os.path.join(sources_dir, '{}-{}.txt'.format(domain(path), ts))
- print('{}[*] [{}] Downloading {}{}'.format(
- ANSI['green'],
- datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
- path,
- ANSI['reset'],
- ))
- timer = TimedProgress(timeout, prefix=' ')
- try:
- raw_source_text = download_url(path, timeout=timeout)
- timer.end()
- except Exception as e:
- timer.end()
- print('{}[!] Failed to download {}{}\n'.format(
- ANSI['red'],
- path,
- ANSI['reset'],
- ))
- print(' ', e)
- raise SystemExit(1)
-
- else:
- with open(path, 'r') as f:
- raw_source_text = f.read()
-
- atomic_write(raw_source_text, source_path)
-
- print(' > {}'.format(pretty_path(source_path)))
-
- return source_path
-
-
-@enforce_types
-def fetch_page_title(url: str, timeout: int=10, progress: bool=SHOW_PROGRESS) -> Optional[str]:
- """Attempt to guess a page's title by downloading the html"""
-
- if not SAVE_TITLE:
- return None
-
- try:
- html = download_url(url, timeout=timeout)
-
- match = re.search(HTML_TITLE_REGEX, html)
- return htmldecode(match.group(1).strip()) if match else None
- except Exception as err: # noqa
- # print('[!] Failed to fetch title because of {}: {}'.format(
- # err.__class__.__name__,
- # err,
- # ))
- return None
-
-
-@enforce_types
-def wget_output_path(link: Link) -> Optional[str]:
- """calculate the path to the wgetted .html file, since wget may
- adjust some paths to be different than the base_url path.
-
- See docs on wget --adjust-extension (-E)
- """
-
- if is_static_file(link.url):
- return without_scheme(without_fragment(link.url))
-
- # Wget downloads can save in a number of different ways depending on the url:
-    # https://example.com
-    # > output/archive/<timestamp>/example.com/index.html
-    # https://example.com?v=zzVa_tX1OiI
-    # > output/archive/<timestamp>/example.com/index.html?v=zzVa_tX1OiI.html
-    # https://www.example.com/?v=zzVa_tX1OiI
-    # > output/archive/<timestamp>/example.com/index.html?v=zzVa_tX1OiI.html
-
-    # https://example.com/abc
-    # > output/archive/<timestamp>/example.com/abc.html
-    # https://example.com/abc/
-    # > output/archive/<timestamp>/example.com/abc/index.html
-    # https://example.com/abc?v=zzVa_tX1OiI.html
-    # > output/archive/<timestamp>/example.com/abc?v=zzVa_tX1OiI.html
-    # https://example.com/abc/?v=zzVa_tX1OiI.html
-    # > output/archive/<timestamp>/example.com/abc/index.html?v=zzVa_tX1OiI.html
-
-    # https://example.com/abc/test.html
-    # > output/archive/<timestamp>/example.com/abc/test.html
-    # https://example.com/abc/test?v=zzVa_tX1OiI
-    # > output/archive/<timestamp>/example.com/abc/test?v=zzVa_tX1OiI.html
-    # https://example.com/abc/test/?v=zzVa_tX1OiI
-    # > output/archive/<timestamp>/example.com/abc/test/index.html?v=zzVa_tX1OiI.html
-
- # There's also lots of complexity around how the urlencoding and renaming
- # is done for pages with query and hash fragments or extensions like shtml / htm / php / etc
-
- # Since the wget algorithm for -E (appending .html) is incredibly complex
- # and there's no way to get the computed output path from wget
- # in order to avoid having to reverse-engineer how they calculate it,
- # we just look in the output folder read the filename wget used from the filesystem
- full_path = without_fragment(without_query(path(link.url))).strip('/')
- search_dir = os.path.join(
- link.link_dir,
- domain(link.url),
- urldecode(full_path),
- )
-
- for _ in range(4):
- if os.path.exists(search_dir):
- if os.path.isdir(search_dir):
- html_files = [
- f for f in os.listdir(search_dir)
- if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", f, re.I | re.M)
- ]
- if html_files:
- path_from_link_dir = search_dir.split(link.link_dir)[-1].strip('/')
- return os.path.join(path_from_link_dir, html_files[0])
-
- # Move up one directory level
- search_dir = search_dir.rsplit('/', 1)[0]
-
- if search_dir == link.link_dir:
- break
-
- return None
-
-
-### String Manipulation & Logging Helpers
@enforce_types
def str_between(string: str, start: str, end: str=None) -> str:
@@ -415,122 +188,6 @@ def parse_date(date: Any) -> Optional[datetime]:
raise ValueError('Tried to parse invalid date! {}'.format(date))
-@enforce_types
-def is_static_file(url: str) -> bool:
- """Certain URLs just point to a single static file, and
- don't need to be re-archived in many formats
- """
-
- # TODO: the proper way is with MIME type detection, not using extension
- return extension(url) in STATICFILE_EXTENSIONS
-
-
-
-### Python / System Helpers
-
-def run(*popenargs, input=None, capture_output=False, timeout=None, check=False, **kwargs):
- """Patched of subprocess.run to fix blocking io making timeout=innefective"""
-
- if input is not None:
- if 'stdin' in kwargs:
- raise ValueError('stdin and input arguments may not both be used.')
- kwargs['stdin'] = PIPE
-
- if capture_output:
- if ('stdout' in kwargs) or ('stderr' in kwargs):
- raise ValueError('stdout and stderr arguments may not be used '
- 'with capture_output.')
- kwargs['stdout'] = PIPE
- kwargs['stderr'] = PIPE
-
- with Popen(*popenargs, **kwargs) as process:
- try:
- stdout, stderr = process.communicate(input, timeout=timeout)
- except TimeoutExpired:
- process.kill()
- try:
- stdout, stderr = process.communicate(input, timeout=2)
- except:
- pass
- raise TimeoutExpired(popenargs[0][0], timeout)
- except BaseException:
- process.kill()
- # We don't call process.wait() as .__exit__ does that for us.
- raise
- retcode = process.poll()
- if check and retcode:
- raise CalledProcessError(retcode, process.args,
- output=stdout, stderr=stderr)
- return CompletedProcess(process.args, retcode, stdout, stderr)
-
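# Usage sketch for the patched run() above: it mirrors the subprocess.run()
# interface, so call sites stay the same, but slow or chatty child processes can no
# longer render the timeout ineffective. The command below is just an example.
from subprocess import TimeoutExpired

try:
    result = run(['wget', '--version'], capture_output=True, timeout=10)
    print(result.returncode, result.stdout.decode().splitlines()[0])
except TimeoutExpired:
    print('wget did not finish within 10s and was killed')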
-
-class TimedProgress:
- """Show a progress bar and measure elapsed time until .end() is called"""
-
- def __init__(self, seconds, prefix=''):
- if SHOW_PROGRESS:
- self.p = Process(target=progress_bar, args=(seconds, prefix))
- self.p.start()
-
- self.stats = {'start_ts': datetime.now(), 'end_ts': None}
-
- def end(self):
- """immediately end progress, clear the progressbar line, and save end_ts"""
-
- end_ts = datetime.now()
- self.stats['end_ts'] = end_ts
- if SHOW_PROGRESS:
- # protect from double termination
- #if p is None or not hasattr(p, 'kill'):
- # return
- if self.p is not None:
- self.p.terminate()
-
- self.p = None
-
- sys.stdout.write('\r{}{}\r'.format((' ' * TERM_WIDTH()), ANSI['reset'])) # clear whole terminal line
-
-
-@enforce_types
-def progress_bar(seconds: int, prefix: str='') -> None:
- """show timer in the form of progress bar, with percentage and seconds remaining"""
- chunk = '█' if sys.stdout.encoding == 'UTF-8' else '#'
- chunks = TERM_WIDTH() - len(prefix) - 20 # number of progress chunks to show (aka max bar width)
- try:
- for s in range(seconds * chunks):
- chunks = TERM_WIDTH() - len(prefix) - 20
- progress = s / chunks / seconds * 100
- bar_width = round(progress/(100/chunks))
-
- # ████████████████████ 0.9% (1/60sec)
- sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)'.format(
- prefix,
- ANSI['green'],
- (chunk * bar_width).ljust(chunks),
- ANSI['reset'],
- round(progress, 1),
- round(s/chunks),
- seconds,
- ))
- sys.stdout.flush()
- time.sleep(1 / chunks)
-
- # ██████████████████████████████████ 100.0% (60/60sec)
- sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)\n'.format(
- prefix,
- ANSI['red'],
- chunk * chunks,
- ANSI['reset'],
- 100.0,
- seconds,
- seconds,
- ))
- sys.stdout.flush()
- except KeyboardInterrupt:
- print()
- pass
-
-
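# Usage sketch for TimedProgress: always call .end() in a finally block so the
# background progress_bar process is terminated and the terminal line is cleared
# even if the wrapped call raises. fetch_page() is a hypothetical stand-in for any
# long-running step.
timer = TimedProgress(60, prefix='      ')
try:
    fetch_page()          # hypothetical slow work, e.g. one archive method
finally:
    timer.end()

elapsed = timer.stats['end_ts'] - timer.stats['start_ts']
print('finished in {}s'.format(round(elapsed.total_seconds(), 2)))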
@enforce_types
def download_url(url: str, timeout: int=TIMEOUT) -> str:
"""Download the contents of a remote url and return the text"""
@@ -547,58 +204,6 @@ def download_url(url: str, timeout: int=TIMEOUT) -> str:
return resp.read().decode(encoding)
-@enforce_types
-def chmod_file(path: str, cwd: str='.', permissions: str=OUTPUT_PERMISSIONS, timeout: int=30) -> None:
- """chmod -R /"""
-
- if not os.path.exists(os.path.join(cwd, path)):
- raise Exception('Failed to chmod: {} does not exist (did the previous step fail?)'.format(path))
-
- chmod_result = run(['chmod', '-R', permissions, path], cwd=cwd, stdout=DEVNULL, stderr=PIPE, timeout=timeout)
- if chmod_result.returncode == 1:
- print(' ', chmod_result.stderr.decode())
- raise Exception('Failed to chmod {}/{}'.format(cwd, path))
-
-
-@enforce_types
-def copy_and_overwrite(from_path: str, to_path: str):
- if os.path.isdir(from_path):
- shutil.rmtree(to_path, ignore_errors=True)
- shutil.copytree(from_path, to_path)
- else:
- with open(from_path, 'rb') as src:
- atomic_write(src.read(), to_path)
-
-
-@enforce_types
-def get_dir_size(path: str, recursive: bool=True, pattern: Optional[str]=None) -> Tuple[int, int, int]:
- num_bytes, num_dirs, num_files = 0, 0, 0
- for entry in os.scandir(path):
- if (pattern is not None) and (pattern not in entry.path):
- continue
- if entry.is_dir(follow_symlinks=False):
- if not recursive:
- continue
- num_dirs += 1
- bytes_inside, dirs_inside, files_inside = get_dir_size(entry.path)
- num_bytes += bytes_inside
- num_dirs += dirs_inside
- num_files += files_inside
- else:
- num_bytes += entry.stat(follow_symlinks=False).st_size
- num_files += 1
- return num_bytes, num_dirs, num_files
-
-
-@enforce_types
-def human_readable_size(num_bytes: Union[int, float]) -> str:
- for count in ['Bytes','KB','MB','GB']:
- if num_bytes > -1024.0 and num_bytes < 1024.0:
- return '%3.1f %s' % (num_bytes, count)
- num_bytes /= 1024.0
- return '%3.1f %s' % (num_bytes, 'TB')
-
-
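# A small sketch combining the two helpers above: total up an output folder and
# print a human-friendly summary. OUTPUT_DIR is assumed to be in scope here, as it
# is imported from ..config elsewhere in this codebase.
num_bytes, num_dirs, num_files = get_dir_size(OUTPUT_DIR)
print('{} across {} files in {} directories'.format(
    human_readable_size(num_bytes),
    num_files,
    num_dirs,
))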
@enforce_types
def chrome_args(**options) -> List[str]:
"""helper to build up a chrome shell command with arguments"""
@@ -632,7 +237,7 @@ def chrome_args(**options) -> List[str]:
return cmd_args
-class ExtendedEncoder(JSONEncoder):
+class ExtendedEncoder(pyjson.JSONEncoder):
"""
Extended json serializer that supports serializing several model
fields and objects
@@ -656,114 +261,5 @@ class ExtendedEncoder(JSONEncoder):
elif cls_name in ('dict_items', 'dict_keys', 'dict_values'):
return tuple(obj)
- return JSONEncoder.default(self, obj)
+ return pyjson.JSONEncoder.default(self, obj)
-
-def to_json(obj: Any, file: IO=None, indent: Optional[int]=4, sort_keys: bool=True, cls=ExtendedEncoder) -> str:
- if file:
- path = os.path.realpath(file.name)
- contents = json.dumps(obj, indent=indent, sort_keys=sort_keys, cls=cls)
- atomic_write(contents, path)
- return contents
- else:
- return json.dumps(obj, indent=indent, sort_keys=sort_keys, cls=cls)
-
-
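# Usage sketch for to_json() with ExtendedEncoder: a plain json.dumps() raises
# TypeError on dict view objects, but the encoder branch shown above converts
# dict_keys/dict_values/dict_items to tuples first.
snapshot = {'url': 'https://example.com', 'is_archived': True}
print(to_json({'cols': snapshot.keys()}))
# -> pretty-printed JSON where the dict_keys view is emitted as a normal JSON list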
-def links_to_csv(links: List[Link], csv_cols: Optional[List[str]]=None,
- header: bool=True, ljust: int=0, separator: str=',') -> str:
- csv_cols = csv_cols or ['timestamp', 'is_archived', 'url']
-
- header_str = ''
- if header:
- header_str = separator.join(col.ljust(ljust) for col in csv_cols)
-
- row_strs = (
- link.to_csv(csv_cols=csv_cols, ljust=ljust, separator=separator)
- for link in links
- )
-
- return '\n'.join((header_str, *row_strs))
-
-def folders_to_str(folders: Dict[str, Optional[Link]]) -> str:
- return '\n'.join(f'{folder} {link}' for folder, link in folders.items())
-
-@enforce_types
-def render_template(template_path: str, context: Mapping[str, str]) -> str:
- """render a given html template string with the given template content"""
-
- # will be replaced by django templates in the future
- with open(template_path, 'r', encoding='utf-8') as template:
- template_str = template.read()
- return Template(template_str).substitute(**context)
-
-
-def atomic_write(contents: Union[dict, str, bytes], path: str) -> None:
- """Safe atomic write to filesystem by writing to temp file + atomic rename"""
- try:
- tmp_file = '{}.tmp'.format(path)
-
- if isinstance(contents, bytes):
- args = {'mode': 'wb+'}
- else:
- args = {'mode': 'w+', 'encoding': 'utf-8'}
-
- with open(tmp_file, **args) as f:
- if isinstance(contents, dict):
- to_json(contents, file=f)
- else:
- f.write(contents)
-
- os.fsync(f.fileno())
-
- os.rename(tmp_file, path)
- chmod_file(path)
- finally:
- if os.path.exists(tmp_file):
- os.remove(tmp_file)
-
-
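# Usage sketch for atomic_write(): readers of the destination path never observe a
# half-written file, because content goes to '<path>.tmp' first and a single
# os.rename() (atomic within one filesystem) swaps it into place. The filenames
# below are illustrative only.
atomic_write('<html>...</html>', 'index.html')          # str   -> written in text mode
atomic_write({'status': 'succeeded'}, 'result.json')    # dict  -> serialized via to_json()
atomic_write(b'\x89PNG...', 'screenshot.png')           # bytes -> written in binary mode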
-def reject_stdin(caller: str, stdin: Optional[IO]=sys.stdin) -> None:
- """Tell the user they passed stdin to a command that doesn't accept it"""
-
- if stdin and not stdin.isatty():
- stdin_raw_text = stdin.read().strip()
- if stdin_raw_text:
- print(
- '{red}[X] The "{}" command does not accept stdin.{reset}\n'.format(
- caller,
- **ANSI,
- )
- )
- print(' Run archivebox "{} --help" to see usage and examples.'.format(
- caller,
- ))
- print()
- raise SystemExit(1)
-
-def accept_stdin(stdin: Optional[IO]=sys.stdin) -> Optional[str]:
- if stdin and not stdin.isatty():
- return stdin.read()
- return None
-
-
-def set_docstring(text: str):
- def decorator(func):
- @wraps(func)
- def wrapper_with_docstring(*args, **kwargs):
- return func(*args, **kwargs)
- wrapper_with_docstring.__doc__ = text
- return wrapper_with_docstring
- return decorator
-
-
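# Usage sketch for set_docstring(): it preserves the wrapped function's metadata via
# functools.wraps and then overrides __doc__, so help text can be assigned from a
# variable at decoration time instead of a literal docstring. `main` here is a
# hypothetical example command.
@set_docstring('Add a new URL to the archive')
def main(args=None):
    ...

print(main.__doc__)    # -> 'Add a new URL to the archive'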
-class SmartFormatter(argparse.HelpFormatter):
- def _split_lines(self, text, width):
- if '\n' in text:
- return text.splitlines()
- return argparse.HelpFormatter._split_lines(self, text, width)
-
-
-class ArchiveError(Exception):
- def __init__(self, message, hints=None):
- super().__init__(message)
- self.hints = hints