full type-hinting coverage

Nick Sweeting 2019-03-26 23:25:07 -04:00
parent ab09560f14
commit c9c5b04df0
7 changed files with 94 additions and 13 deletions

View file

@@ -24,6 +24,7 @@ from config import (
     GIT_SHA,
 )
 from util import (
+    enforce_types,
     save_remote_source,
     save_stdin_source,
 )
@@ -100,7 +101,8 @@ def main(*args) -> List[Link]:
     return update_archive_data(import_path=import_path, resume=resume)
 
-def update_archive_data(import_path: str=None, resume: float=None) -> List[Link]:
+@enforce_types
+def update_archive_data(import_path: Optional[str]=None, resume: Optional[float]=None) -> List[Link]:
    """The main ArchiveBox entrancepoint. Everything starts here."""
 
     # Step 1: Load list of links from the existing index
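A note on why the Optional[...] widening rides along with the decorator: once argument values are checked at runtime against the literal hints, a bare str annotation would conflict with the None default. A minimal illustration with a hypothetical function (not from this commit):

    from typing import Optional

    def greet(name: Optional[str] = None) -> str:
        # with a bare `str` hint, a runtime checker keyed off the
        # annotations could reject the None default; Optional[str]
        # makes the accepted types explicit
        return 'hello {}'.format(name or 'world')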

View file

@@ -42,6 +42,7 @@ from config import (
     YOUTUBEDL_VERSION,
 )
 from util import (
+    enforce_types,
     domain,
     extension,
     without_query,
@@ -63,6 +64,7 @@ from logs import (
 )
 
+@enforce_types
 def archive_link(link: Link, page=None) -> Link:
     """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""
@@ -126,6 +128,7 @@ def archive_link(link: Link, page=None) -> Link:
 ### Archive Method Functions
 
+@enforce_types
 def should_fetch_title(link_dir: str, link: Link) -> bool:
     # if link already has valid title, skip it
     if link.title and not link.title.lower().startswith('http'):
@@ -136,6 +139,7 @@ def should_fetch_title(link_dir: str, link: Link) -> bool:
     return FETCH_TITLE
 
+@enforce_types
 def fetch_title(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResult:
     """try to guess the page's title from its content"""
@@ -169,12 +173,14 @@ def fetch_title(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResul
     )
 
+@enforce_types
 def should_fetch_favicon(link_dir: str, link: Link) -> bool:
     if os.path.exists(os.path.join(link_dir, 'favicon.ico')):
         return False
     return FETCH_FAVICON
 
+@enforce_types
 def fetch_favicon(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResult:
     """download site favicon from google's favicon api"""
@@ -207,6 +213,7 @@ def fetch_favicon(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveRes
         **timer.stats,
     )
 
+@enforce_types
 def should_fetch_wget(link_dir: str, link: Link) -> bool:
     output_path = wget_output_path(link)
     if output_path and os.path.exists(os.path.join(link_dir, output_path)):
@@ -215,6 +222,7 @@ def should_fetch_wget(link_dir: str, link: Link) -> bool:
     return FETCH_WGET
 
+@enforce_types
 def fetch_wget(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResult:
     """download full site using wget"""
@@ -294,6 +302,7 @@ def fetch_wget(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResult
         **timer.stats,
     )
 
+@enforce_types
 def should_fetch_pdf(link_dir: str, link: Link) -> bool:
     if is_static_file(link.url):
         return False
@@ -304,6 +313,7 @@ def should_fetch_pdf(link_dir: str, link: Link) -> bool:
     return FETCH_PDF
 
+@enforce_types
 def fetch_pdf(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResult:
     """print PDF of site to file using chrome --headless"""
@@ -338,6 +348,7 @@ def fetch_pdf(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResult:
         **timer.stats,
     )
 
+@enforce_types
 def should_fetch_screenshot(link_dir: str, link: Link) -> bool:
     if is_static_file(link.url):
         return False
@@ -347,6 +358,7 @@ def should_fetch_screenshot(link_dir: str, link: Link) -> bool:
     return FETCH_SCREENSHOT
 
+@enforce_types
 def fetch_screenshot(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResult:
     """take screenshot of site using chrome --headless"""
@@ -381,6 +393,7 @@ def fetch_screenshot(link_dir: str, link: Link, timeout: int=TIMEOUT) -> Archive
         **timer.stats,
     )
 
+@enforce_types
 def should_fetch_dom(link_dir: str, link: Link) -> bool:
     if is_static_file(link.url):
         return False
@@ -390,6 +403,7 @@ def should_fetch_dom(link_dir: str, link: Link) -> bool:
     return FETCH_DOM
 
+@enforce_types
 def fetch_dom(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResult:
     """print HTML of site to file using chrome --dump-html"""
@@ -426,6 +440,7 @@ def fetch_dom(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResult:
         **timer.stats,
     )
 
+@enforce_types
 def should_fetch_git(link_dir: str, link: Link) -> bool:
     if is_static_file(link.url):
         return False
@@ -443,6 +458,7 @@ def should_fetch_git(link_dir: str, link: Link) -> bool:
     return FETCH_GIT
 
+@enforce_types
 def fetch_git(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResult:
     """download full site using git"""
@@ -485,6 +501,7 @@ def fetch_git(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResult:
     )
 
+@enforce_types
 def should_fetch_media(link_dir: str, link: Link) -> bool:
     if is_static_file(link.url):
         return False
@@ -494,6 +511,7 @@ def should_fetch_media(link_dir: str, link: Link) -> bool:
     return FETCH_MEDIA
 
+@enforce_types
 def fetch_media(link_dir: str, link: Link, timeout: int=MEDIA_TIMEOUT) -> ArchiveResult:
     """Download playlists or individual video, audio, and subtitles using youtube-dl"""
@@ -557,6 +575,7 @@ def fetch_media(link_dir: str, link: Link, timeout: int=MEDIA_TIMEOUT) -> Archiv
     )
 
+@enforce_types
 def should_fetch_archive_dot_org(link_dir: str, link: Link) -> bool:
     if is_static_file(link.url):
         return False
@@ -567,6 +586,7 @@ def should_fetch_archive_dot_org(link_dir: str, link: Link) -> bool:
     return SUBMIT_ARCHIVE_DOT_ORG
 
+@enforce_types
 def archive_dot_org(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResult:
     """submit site to archive.org for archiving via their service, save returned archive url"""
@@ -622,6 +642,7 @@ def archive_dot_org(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveR
         **timer.stats,
     )
 
+@enforce_types
 def parse_archive_dot_org_response(response: bytes) -> Tuple[List[str], List[str]]:
     # Parse archive.org response headers
     headers: Dict[str, List[str]] = defaultdict(list)

View file

@@ -46,6 +46,10 @@ CHROME_USER_DATA_DIR = os.getenv('CHROME_USER_DATA_DIR', None)
 CHROME_HEADLESS = os.getenv('CHROME_HEADLESS', 'True' ).lower() == 'true'
 CHROME_USER_AGENT = os.getenv('CHROME_USER_AGENT', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36')
+USE_CURL = os.getenv('USE_CURL', 'True' ).lower() == 'true'
+USE_WGET = os.getenv('USE_WGET', 'True' ).lower() == 'true'
+USE_CHROME = os.getenv('USE_CHROME', 'True' ).lower() == 'true'
+
 CURL_BINARY = os.getenv('CURL_BINARY', 'curl')
 GIT_BINARY = os.getenv('GIT_BINARY', 'git')
 WGET_BINARY = os.getenv('WGET_BINARY', 'wget')
@@ -195,13 +199,19 @@ try:
         print(' env PYTHONIOENCODING=UTF-8 ./archive.py export.html')
 
     ### Make sure curl is installed
-    USE_CURL = FETCH_FAVICON or SUBMIT_ARCHIVE_DOT_ORG
+    if USE_CURL:
+        USE_CURL = FETCH_FAVICON or SUBMIT_ARCHIVE_DOT_ORG
+    else:
+        FETCH_FAVICON = SUBMIT_ARCHIVE_DOT_ORG = False
     CURL_VERSION = None
     if USE_CURL:
         CURL_VERSION = check_version(CURL_BINARY)
 
     ### Make sure wget is installed and calculate version
-    USE_WGET = FETCH_WGET or FETCH_WARC
+    if USE_WGET:
+        USE_WGET = FETCH_WGET or FETCH_WARC
+    else:
+        FETCH_WGET = FETCH_WARC = False
     WGET_VERSION = None
     if USE_WGET:
         WGET_VERSION = check_version(WGET_BINARY)
@@ -222,17 +232,21 @@ try:
         check_version(YOUTUBEDL_BINARY)
 
     ### Make sure chrome is installed and calculate version
-    USE_CHROME = FETCH_PDF or FETCH_SCREENSHOT or FETCH_DOM
+    if USE_CHROME:
+        USE_CHROME = FETCH_PDF or FETCH_SCREENSHOT or FETCH_DOM
+    else:
+        FETCH_PDF = FETCH_SCREENSHOT = FETCH_DOM = False
     CHROME_VERSION = None
     if USE_CHROME:
         if CHROME_BINARY is None:
             CHROME_BINARY = find_chrome_binary()
-        CHROME_VERSION = check_version(CHROME_BINARY)
-        # print('[i] Using Chrome binary: {}'.format(shutil.which(CHROME_BINARY) or CHROME_BINARY))
+        if CHROME_BINARY:
+            CHROME_VERSION = check_version(CHROME_BINARY)
+            # print('[i] Using Chrome binary: {}'.format(shutil.which(CHROME_BINARY) or CHROME_BINARY))
 
         if CHROME_USER_DATA_DIR is None:
             CHROME_USER_DATA_DIR = find_chrome_data_dir()
         # print('[i] Using Chrome data dir: {}'.format(os.path.abspath(CHROME_USER_DATA_DIR)))
 
     CHROME_OPTIONS = {
         'TIMEOUT': TIMEOUT,
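The pattern above layers a user-facing USE_X kill switch on top of the derived feature flags: setting e.g. USE_CHROME=False force-disables every Chrome-dependent method, while leaving it on still narrows USE_CHROME to whether any feature actually needs the binary. A standalone sketch of the same idea (simplified, illustrative names only):

    import os

    # user-facing kill switch, enabled by default
    USE_CURL = os.getenv('USE_CURL', 'True').lower() == 'true'
    # a feature flag that depends on the curl binary
    FETCH_FAVICON = os.getenv('FETCH_FAVICON', 'True').lower() == 'true'

    if USE_CURL:
        # keep curl enabled only if some curl-backed feature needs it
        USE_CURL = FETCH_FAVICON
    else:
        # kill switch wins: force-disable every curl-dependent feature
        FETCH_FAVICON = False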

View file

@@ -58,7 +58,7 @@ def write_links_index(out_dir: str, links: List[Link], finished: bool=False) ->
 @enforce_types
-def load_links_index(out_dir: str=OUTPUT_DIR, import_path: str=None) -> Tuple[List[Link], List[Link]]:
+def load_links_index(out_dir: str=OUTPUT_DIR, import_path: Optional[str]=None) -> Tuple[List[Link], List[Link]]:
     """parse and load existing index with any new links from import_path merged in"""
 
     existing_links: List[Link] = []

View file

@@ -32,9 +32,11 @@ from util import (
     check_url_parsing_invariants,
     TimedProgress,
     Link,
+    enforce_types,
 )
 
+@enforce_types
 def parse_links(source_file: str) -> Tuple[List[Link], str]:
     """parse a list of URLs with their metadata from an
        RSS feed, bookmarks export, or text file
@@ -77,6 +79,7 @@ def parse_links(source_file: str) -> Tuple[List[Link], str]:
 ### Import Parser Functions
 
+@enforce_types
 def parse_pocket_html_export(html_file: IO[str]) -> Iterable[Link]:
     """Parse Pocket-format bookmarks export files (produced by getpocket.com/export/)"""
@@ -101,6 +104,7 @@ def parse_pocket_html_export(html_file: IO[str]) -> Iterable[Link]:
     )
 
+@enforce_types
 def parse_json_export(json_file: IO[str]) -> Iterable[Link]:
     """Parse JSON-format bookmarks export files (produced by pinboard.in/export/, or wallabag)"""
@@ -153,6 +157,7 @@ def parse_json_export(json_file: IO[str]) -> Iterable[Link]:
     )
 
+@enforce_types
 def parse_rss_export(rss_file: IO[str]) -> Iterable[Link]:
     """Parse RSS XML-format files into links"""
@@ -190,6 +195,7 @@ def parse_rss_export(rss_file: IO[str]) -> Iterable[Link]:
     )
 
+@enforce_types
 def parse_shaarli_rss_export(rss_file: IO[str]) -> Iterable[Link]:
     """Parse Shaarli-specific RSS XML-format files into links"""
@@ -227,6 +233,7 @@ def parse_shaarli_rss_export(rss_file: IO[str]) -> Iterable[Link]:
     )
 
+@enforce_types
 def parse_netscape_html_export(html_file: IO[str]) -> Iterable[Link]:
     """Parse netscape-format bookmarks export files (produced by all browsers)"""
@@ -251,6 +258,7 @@ def parse_netscape_html_export(html_file: IO[str]) -> Iterable[Link]:
     )
 
+@enforce_types
 def parse_pinboard_rss_export(rss_file: IO[str]) -> Iterable[Link]:
     """Parse Pinboard RSS feed files into links"""
@@ -282,6 +290,7 @@ def parse_pinboard_rss_export(rss_file: IO[str]) -> Iterable[Link]:
     )
 
+@enforce_types
 def parse_medium_rss_export(rss_file: IO[str]) -> Iterable[Link]:
     """Parse Medium RSS feed files into links"""
@@ -303,6 +312,7 @@ def parse_medium_rss_export(rss_file: IO[str]) -> Iterable[Link]:
     )
 
+@enforce_types
 def parse_plain_text_export(text_file: IO[str]) -> Iterable[Link]:
     """Parse raw links from each line in a text file"""

View file

@@ -23,6 +23,10 @@ class ArchiveResult:
     status: str
     start_ts: datetime
     end_ts: datetime
+    schema: str = 'ArchiveResult'
+
+    def __post_init__(self):
+        assert self.schema == self.__class__.__name__
 
     def _asdict(self):
         return asdict(self)
@@ -40,9 +44,11 @@ class Link:
     sources: List[str]
     history: Dict[str, List[ArchiveResult]] = field(default_factory=lambda: {})
     updated: Optional[datetime] = None
+    schema: str = 'Link'
 
     def __post_init__(self):
         """fix any history result items to be type-checked ArchiveResults"""
+        assert self.schema == self.__class__.__name__
         cast_history = {}
         for method, method_history in self.history.items():
             cast_history[method] = []
@@ -67,6 +73,7 @@ class Link:
     def _asdict(self, extended=False):
         info = {
+            'schema': 'Link',
             'url': self.url,
             'title': self.title or None,
             'timestamp': self.timestamp,
@@ -234,12 +241,18 @@ class ArchiveIndex:
     num_links: int
     updated: str
     links: List[Link]
+    schema: str = 'ArchiveIndex'
+
+    def __post_init__(self):
+        assert self.schema == self.__class__.__name__
 
     def _asdict(self):
         return asdict(self)
 
 @dataclass
 class RuntimeStats:
+    """mutable stats counter for logging archiving timing info to CLI output"""
+
     skipped: int
     succeeded: int
     failed: int
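The new schema fields tag every serialized record with its class name, so a round-trip through JSON can be sanity-checked at construction time. A minimal standalone sketch of the pattern (hypothetical class, not the full ArchiveBox schema):

    from dataclasses import dataclass, asdict

    @dataclass
    class Link:
        url: str
        schema: str = 'Link'   # tag travels with the serialized record

        def __post_init__(self):
            # refuse to construct from a record of another type
            assert self.schema == self.__class__.__name__

    data = asdict(Link(url='https://example.com'))  # includes 'schema': 'Link'
    restored = Link(**data)                         # round-trips cleanly
    # Link(url='x', schema='ArchiveResult') would raise AssertionError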

View file

@@ -91,7 +91,7 @@ STATICFILE_EXTENSIONS = {
     'gif', 'jpeg', 'jpg', 'png', 'tif', 'tiff', 'wbmp', 'ico', 'jng', 'bmp',
     'svg', 'svgz', 'webp', 'ps', 'eps', 'ai',
     'mp3', 'mp4', 'm4a', 'mpeg', 'mpg', 'mkv', 'mov', 'webm', 'm4v',
-    'flv', 'wmv', 'avi', 'ogg', 'ts', 'm3u8'
+    'flv', 'wmv', 'avi', 'ogg', 'ts', 'm3u8',
     'pdf', 'txt', 'rtf', 'rtfd', 'doc', 'docx', 'ppt', 'pptx', 'xls', 'xlsx',
     'atom', 'rss', 'css', 'js', 'json',
     'dmg', 'iso', 'img',
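The added trailing comma after 'm3u8' is a genuine bug fix, not just style: without it, Python's implicit string-literal concatenation silently merges the two adjacent items across the line break:

    broken = {'m3u8'
              'pdf'}     # -> {'m3u8pdf'}: neither extension ever matches
    fixed = {'m3u8',
             'pdf'}      # -> {'m3u8', 'pdf'}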
@@ -113,8 +113,9 @@ STATICFILE_EXTENSIONS = {
 def enforce_types(func):
     """
-    Checks parameters type signatures against arg and kwarg type hints.
+    Enforce function arg and kwarg types at runtime using its python3 type hints
     """
+    # TODO: check return type as well
 
     @wraps(func)
     def typechecked_function(*args, **kwargs):
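For context, a decorator like this can be built from inspect.signature plus the function's annotations. The following is only a sketch of the technique under simplifying assumptions (plain-class hints only, no typing generics), not ArchiveBox's actual implementation:

    from functools import wraps
    from inspect import signature

    def enforce_types(func):
        sig = signature(func)

        @wraps(func)
        def typechecked_function(*args, **kwargs):
            bound = sig.bind(*args, **kwargs)
            for name, value in bound.arguments.items():
                hint = func.__annotations__.get(name)
                # only plain classes are handled here; typing generics
                # like Optional[str] would need unwrapping first
                if isinstance(hint, type) and not isinstance(value, hint):
                    raise TypeError('{}: expected {}, got {}'.format(
                        name, hint.__name__, type(value).__name__))
            return func(*args, **kwargs)
        return typechecked_function

    @enforce_types
    def fetch_title(url: str, timeout: int=60) -> str:
        return url

    fetch_title('https://example.com')   # ok
    # fetch_title(None)                  # raises TypeError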
@@ -183,6 +184,7 @@ def check_url_parsing_invariants() -> None:
 ### Random Helpers
 
+@enforce_types
 def save_stdin_source(raw_text: str) -> str:
     if not os.path.exists(SOURCES_DIR):
         os.makedirs(SOURCES_DIR)
@@ -196,6 +198,8 @@ def save_stdin_source(raw_text: str) -> str:
     return source_path
 
+
+@enforce_types
 def save_remote_source(url: str, timeout: int=TIMEOUT) -> str:
     """download a given url's content into output/sources/domain-<timestamp>.txt"""
@@ -233,6 +237,8 @@ def save_remote_source(url: str, timeout: int=TIMEOUT) -> str:
     return source_path
 
+
+@enforce_types
 def fetch_page_title(url: str, timeout: int=10, progress: bool=SHOW_PROGRESS) -> Optional[str]:
     """Attempt to guess a page's title by downloading the html"""
@@ -255,6 +261,8 @@ def fetch_page_title(url: str, timeout: int=10, progress: bool=SHOW_PROGRESS) ->
     # ))
     return None
 
+
+@enforce_types
 def wget_output_path(link: Link) -> Optional[str]:
     """calculate the path to the wgetted .html file, since wget may
     adjust some paths to be different than the base_url path.
@@ -323,14 +331,17 @@ def wget_output_path(link: Link) -> Optional[str]:
     return None
 
+
+@enforce_types
 def read_js_script(script_name: str) -> str:
     script_path = os.path.join(PYTHON_PATH, 'scripts', script_name)
 
     with open(script_path, 'r') as f:
         return f.read().split('// INFO BELOW HERE')[0].strip()
 
 ### String Manipulation & Logging Helpers
 
+@enforce_types
 def str_between(string: str, start: str, end: str=None) -> str:
     """(<abc>12345</def>, <abc>, </def>) -> 12345"""
@@ -341,6 +352,7 @@ def str_between(string: str, start: str, end: str=None) -> str:
     return content
 
+@enforce_types
 def parse_date(date: Any) -> Optional[datetime]:
     """Parse unix timestamps, iso format, and human-readable strings"""
@@ -435,6 +447,8 @@ def merge_links(a: Link, b: Link) -> Link:
         history=history,
     )
 
+
+@enforce_types
 def is_static_file(url: str) -> bool:
     """Certain URLs just point to a single static file, and
        don't need to be re-archived in many formats
@@ -443,6 +457,8 @@ def is_static_file(url: str) -> bool:
 
     # TODO: the proper way is with MIME type detection, not using extension
     return extension(url) in STATICFILE_EXTENSIONS
 
+
+@enforce_types
 def derived_link_info(link: Link) -> dict:
     """extend link info with the archive urls and other derived data"""
@@ -518,6 +534,7 @@ class TimedProgress:
             sys.stdout.flush()
 
+@enforce_types
 def progress_bar(seconds: int, prefix: str='') -> None:
     """show timer in the form of progress bar, with percentage and seconds remaining"""
     chunk = '█' if sys.stdout.encoding == 'UTF-8' else '#'
@@ -557,6 +574,7 @@ def progress_bar(seconds: int, prefix: str='') -> None:
     pass
 
+@enforce_types
 def download_url(url: str, timeout: int=TIMEOUT) -> str:
     """Download the contents of a remote url and return the text"""
@@ -572,6 +590,8 @@ def download_url(url: str, timeout: int=TIMEOUT) -> str:
     encoding = resp.headers.get_content_charset() or 'utf-8'
     return resp.read().decode(encoding)
 
+
+@enforce_types
 def chmod_file(path: str, cwd: str='.', permissions: str=OUTPUT_PERMISSIONS, timeout: int=30) -> None:
     """chmod -R <permissions> <cwd>/<path>"""
@@ -584,6 +604,7 @@ def chmod_file(path: str, cwd: str='.', permissions: str=OUTPUT_PERMISSIONS, tim
         raise Exception('Failed to chmod {}/{}'.format(cwd, path))
 
+@enforce_types
 def chrome_args(**options) -> List[str]:
     """helper to build up a chrome shell command with arguments"""