diff --git a/archivebox/archive.py b/archivebox/archive.py
index c6e10bd2..ff4128c9 100755
--- a/archivebox/archive.py
+++ b/archivebox/archive.py
@@ -24,6 +24,7 @@ from config import (
     GIT_SHA,
 )
 from util import (
+    enforce_types,
     save_remote_source,
     save_stdin_source,
 )
@@ -100,7 +101,8 @@ def main(*args) -> List[Link]:
 
     return update_archive_data(import_path=import_path, resume=resume)
 
-def update_archive_data(import_path: str=None, resume: float=None) -> List[Link]:
+@enforce_types
+def update_archive_data(import_path: Optional[str]=None, resume: Optional[float]=None) -> List[Link]:
     """The main ArchiveBox entrancepoint. Everything starts here."""
 
     # Step 1: Load list of links from the existing index
diff --git a/archivebox/archive_methods.py b/archivebox/archive_methods.py
index 76153e70..3bfc15a7 100644
--- a/archivebox/archive_methods.py
+++ b/archivebox/archive_methods.py
@@ -42,6 +42,7 @@ from config import (
     YOUTUBEDL_VERSION,
 )
 from util import (
+    enforce_types,
     domain,
     extension,
     without_query,
@@ -63,6 +64,7 @@ from logs import (
 )
 
 
+@enforce_types
 def archive_link(link: Link, page=None) -> Link:
     """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""
 
@@ -126,6 +128,7 @@ def archive_link(link: Link, page=None) -> Link:
 
 ### Archive Method Functions
 
+@enforce_types
 def should_fetch_title(link_dir: str, link: Link) -> bool:
     # if link already has valid title, skip it
     if link.title and not link.title.lower().startswith('http'):
@@ -136,6 +139,7 @@ def should_fetch_title(link_dir: str, link: Link) -> bool:
 
     return FETCH_TITLE
 
+@enforce_types
 def fetch_title(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResult:
     """try to guess the page's title from its content"""
 
@@ -169,12 +173,14 @@ def fetch_title(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResul
     )
 
 
+@enforce_types
 def should_fetch_favicon(link_dir: str, link: Link) -> bool:
     if os.path.exists(os.path.join(link_dir, 'favicon.ico')):
         return False
 
     return FETCH_FAVICON
-    
+
+@enforce_types
 def fetch_favicon(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResult:
     """download site favicon from google's favicon api"""
 
@@ -207,6 +213,7 @@ def fetch_favicon(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveRes
         **timer.stats,
     )
 
+@enforce_types
 def should_fetch_wget(link_dir: str, link: Link) -> bool:
     output_path = wget_output_path(link)
     if output_path and os.path.exists(os.path.join(link_dir, output_path)):
@@ -215,6 +222,7 @@ def should_fetch_wget(link_dir: str, link: Link) -> bool:
 
     return FETCH_WGET
 
+@enforce_types
 def fetch_wget(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResult:
     """download full site using wget"""
 
@@ -294,6 +302,7 @@ def fetch_wget(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResult
         **timer.stats,
     )
 
+@enforce_types
 def should_fetch_pdf(link_dir: str, link: Link) -> bool:
     if is_static_file(link.url):
         return False
@@ -304,6 +313,7 @@ def should_fetch_pdf(link_dir: str, link: Link) -> bool:
 
     return FETCH_PDF
 
+@enforce_types
 def fetch_pdf(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResult:
     """print PDF of site to file using chrome --headless"""
 
@@ -338,6 +348,7 @@ def fetch_pdf(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResult:
         **timer.stats,
     )
 
+@enforce_types
 def should_fetch_screenshot(link_dir: str, link: Link) -> bool:
     if is_static_file(link.url):
         return False
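Reviewer note: every `should_fetch_*`/`fetch_*` pair above now carries `@enforce_types`, so a transposed or mistyped argument fails at the call boundary instead of deep inside an archiving run. For context on how these pairs are consumed, here is a rough sketch of the dispatch pattern in `archive_link`; the actual method table is not shown in this diff, so treat the list contents and the helper name as illustrative, not the project's literal code.

```python
# Hypothetical reconstruction of archive_link's dispatch over method pairs.
ARCHIVE_METHODS = [
    ('title',      should_fetch_title,      fetch_title),
    ('favicon',    should_fetch_favicon,    fetch_favicon),
    ('wget',       should_fetch_wget,       fetch_wget),
    ('pdf',        should_fetch_pdf,        fetch_pdf),
    ('screenshot', should_fetch_screenshot, fetch_screenshot),
]

def run_archive_methods(link_dir: str, link: Link) -> Link:
    for method_name, should_run, run in ARCHIVE_METHODS:
        if not should_run(link_dir, link):
            continue
        result = run(link_dir, link)  # each fetch_* returns an ArchiveResult
        link.history.setdefault(method_name, []).append(result)
    return link
```

Since both the predicate and the method are decorated, passing the `Link` before the directory raises a `TypeError` immediately rather than producing a half-written archive folder.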
@@ -347,6 +358,7 @@ def should_fetch_screenshot(link_dir: str, link: Link) -> bool:
 
     return FETCH_SCREENSHOT
 
+@enforce_types
 def fetch_screenshot(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResult:
     """take screenshot of site using chrome --headless"""
 
@@ -381,6 +393,7 @@ def fetch_screenshot(link_dir: str, link: Link, timeout: int=TIMEOUT) -> Archive
         **timer.stats,
     )
 
+@enforce_types
 def should_fetch_dom(link_dir: str, link: Link) -> bool:
     if is_static_file(link.url):
         return False
@@ -390,6 +403,7 @@ def should_fetch_dom(link_dir: str, link: Link) -> bool:
 
     return FETCH_DOM
 
+@enforce_types
 def fetch_dom(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResult:
     """print HTML of site to file using chrome --dump-html"""
 
@@ -426,6 +440,7 @@ def fetch_dom(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResult:
         **timer.stats,
     )
 
+@enforce_types
 def should_fetch_git(link_dir: str, link: Link) -> bool:
     if is_static_file(link.url):
         return False
@@ -443,6 +458,7 @@ def should_fetch_git(link_dir: str, link: Link) -> bool:
 
     return FETCH_GIT
 
+@enforce_types
 def fetch_git(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResult:
     """download full site using git"""
 
@@ -485,6 +501,7 @@ def fetch_git(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResult
     )
 
 
+@enforce_types
 def should_fetch_media(link_dir: str, link: Link) -> bool:
     if is_static_file(link.url):
         return False
@@ -494,6 +511,7 @@ def should_fetch_media(link_dir: str, link: Link) -> bool:
 
     return FETCH_MEDIA
 
+@enforce_types
 def fetch_media(link_dir: str, link: Link, timeout: int=MEDIA_TIMEOUT) -> ArchiveResult:
     """Download playlists or individual video, audio, and subtitles using youtube-dl"""
 
@@ -557,6 +575,7 @@ def fetch_media(link_dir: str, link: Link, timeout: int=MEDIA_TIMEOUT) -> Archiv
     )
 
 
+@enforce_types
 def should_fetch_archive_dot_org(link_dir: str, link: Link) -> bool:
     if is_static_file(link.url):
         return False
@@ -567,6 +586,7 @@ def should_fetch_archive_dot_org(link_dir: str, link: Link) -> bool:
 
     return SUBMIT_ARCHIVE_DOT_ORG
 
+@enforce_types
 def archive_dot_org(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveResult:
     """submit site to archive.org for archiving via their service, save returned archive url"""
 
@@ -622,6 +642,7 @@ def archive_dot_org(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveR
         **timer.stats,
     )
 
+@enforce_types
 def parse_archive_dot_org_response(response: bytes) -> Tuple[List[str], List[str]]:
     # Parse archive.org response headers
     headers: Dict[str, List[str]] = defaultdict(list)
diff --git a/archivebox/config.py b/archivebox/config.py
index ec38b367..38a12d4a 100644
--- a/archivebox/config.py
+++ b/archivebox/config.py
@@ -46,6 +46,10 @@ CHROME_USER_DATA_DIR = os.getenv('CHROME_USER_DATA_DIR', None)
 CHROME_HEADLESS = os.getenv('CHROME_HEADLESS', 'True' ).lower() == 'true'
 CHROME_USER_AGENT = os.getenv('CHROME_USER_AGENT', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36')
 
+USE_CURL = os.getenv('USE_CURL', 'True' ).lower() == 'true'
+USE_WGET = os.getenv('USE_WGET', 'True' ).lower() == 'true'
+USE_CHROME = os.getenv('USE_CHROME', 'True' ).lower() == 'true'
+
 CURL_BINARY = os.getenv('CURL_BINARY', 'curl')
 GIT_BINARY = os.getenv('GIT_BINARY', 'git')
 WGET_BINARY = os.getenv('WGET_BINARY', 'wget')
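The three new `USE_*` flags reuse this file's existing convention for boolean environment variables: compare the lowercased string against `'true'`. One user-facing caveat worth documenting: only the literal string `true` (in any casing) enables a flag, so `USE_CHROME=1` actually disables Chrome. A minimal self-contained demonstration (the `env_flag` helper is just for illustration; config.py inlines the expression):

```python
import os

def env_flag(name: str, default: str = 'True') -> bool:
    # Same parsing convention as config.py's USE_CURL/USE_WGET/USE_CHROME
    return os.getenv(name, default).lower() == 'true'

os.environ['USE_CHROME'] = 'False'
assert env_flag('USE_CHROME') is False

os.environ['USE_CHROME'] = 'TRUE'   # casing is ignored
assert env_flag('USE_CHROME') is True

os.environ['USE_CHROME'] = '1'      # caveat: '1' is not 'true', so this disables
assert env_flag('USE_CHROME') is False
```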
@@ -195,13 +199,19 @@ try:
     print('    env PYTHONIOENCODING=UTF-8 ./archive.py export.html')
 
     ### Make sure curl is installed
-    USE_CURL = FETCH_FAVICON or SUBMIT_ARCHIVE_DOT_ORG
+    if USE_CURL:
+        USE_CURL = FETCH_FAVICON or SUBMIT_ARCHIVE_DOT_ORG
+    else:
+        FETCH_FAVICON = SUBMIT_ARCHIVE_DOT_ORG = False
     CURL_VERSION = None
     if USE_CURL:
         CURL_VERSION = check_version(CURL_BINARY)
 
     ### Make sure wget is installed and calculate version
-    USE_WGET = FETCH_WGET or FETCH_WARC
+    if USE_WGET:
+        USE_WGET = FETCH_WGET or FETCH_WARC
+    else:
+        FETCH_WGET = FETCH_WARC = False
     WGET_VERSION = None
     if USE_WGET:
         WGET_VERSION = check_version(WGET_BINARY)
@@ -222,17 +232,21 @@ try:
         check_version(YOUTUBEDL_BINARY)
 
     ### Make sure chrome is installed and calculate version
-    USE_CHROME = FETCH_PDF or FETCH_SCREENSHOT or FETCH_DOM
+    if USE_CHROME:
+        USE_CHROME = FETCH_PDF or FETCH_SCREENSHOT or FETCH_DOM
+    else:
+        FETCH_PDF = FETCH_SCREENSHOT = FETCH_DOM = False
     CHROME_VERSION = None
     if USE_CHROME:
         if CHROME_BINARY is None:
             CHROME_BINARY = find_chrome_binary()
-        CHROME_VERSION = check_version(CHROME_BINARY)
-        # print('[i] Using Chrome binary: {}'.format(shutil.which(CHROME_BINARY) or CHROME_BINARY))
+        if CHROME_BINARY:
+            CHROME_VERSION = check_version(CHROME_BINARY)
+            # print('[i] Using Chrome binary: {}'.format(shutil.which(CHROME_BINARY) or CHROME_BINARY))
 
-        if CHROME_USER_DATA_DIR is None:
-            CHROME_USER_DATA_DIR = find_chrome_data_dir()
-        # print('[i] Using Chrome data dir: {}'.format(os.path.abspath(CHROME_USER_DATA_DIR)))
+            if CHROME_USER_DATA_DIR is None:
+                CHROME_USER_DATA_DIR = find_chrome_data_dir()
+            # print('[i] Using Chrome data dir: {}'.format(os.path.abspath(CHROME_USER_DATA_DIR)))
 
     CHROME_OPTIONS = {
         'TIMEOUT': TIMEOUT,
diff --git a/archivebox/index.py b/archivebox/index.py
index f0cd46af..2bf2b5eb 100644
--- a/archivebox/index.py
+++ b/archivebox/index.py
@@ -58,7 +58,7 @@ def write_links_index(out_dir: str, links: List[Link], finished: bool=False) ->
 
 
 @enforce_types
-def load_links_index(out_dir: str=OUTPUT_DIR, import_path: str=None) -> Tuple[List[Link], List[Link]]:
+def load_links_index(out_dir: str=OUTPUT_DIR, import_path: Optional[str]=None) -> Tuple[List[Link], List[Link]]:
     """parse and load existing index with any new links from import_path merged in"""
 
     existing_links: List[Link] = []
diff --git a/archivebox/parse.py b/archivebox/parse.py
index ba200ff3..093d4a92 100644
--- a/archivebox/parse.py
+++ b/archivebox/parse.py
@@ -32,9 +32,11 @@ from util import (
     check_url_parsing_invariants,
     TimedProgress,
     Link,
+    enforce_types,
 )
 
 
+@enforce_types
 def parse_links(source_file: str) -> Tuple[List[Link], str]:
     """parse a list of URLs with their metadata from an
        RSS feed, bookmarks export, or text file
@@ -77,6 +79,7 @@ def parse_links(source_file: str) -> Tuple[List[Link], str]:
 
 ### Import Parser Functions
 
+@enforce_types
 def parse_pocket_html_export(html_file: IO[str]) -> Iterable[Link]:
     """Parse Pocket-format bookmarks export files (produced by getpocket.com/export/)"""
 
@@ -101,6 +104,7 @@ def parse_pocket_html_export(html_file: IO[str]) -> Iterable[Link]:
         )
 
 
+@enforce_types
 def parse_json_export(json_file: IO[str]) -> Iterable[Link]:
     """Parse JSON-format bookmarks export files (produced by pinboard.in/export/, or wallabag)"""
 
@@ -153,6 +157,7 @@ def parse_json_export(json_file: IO[str]) -> Iterable[Link]:
         )
 
 
+@enforce_types
 def parse_rss_export(rss_file: IO[str]) -> Iterable[Link]:
     """Parse RSS XML-format files into links"""
 
@@ -190,6 +195,7 @@ def parse_rss_export(rss_file: IO[str]) -> Iterable[Link]:
         )
 
 
+@enforce_types
 def parse_shaarli_rss_export(rss_file: IO[str]) -> Iterable[Link]:
     """Parse Shaarli-specific RSS XML-format files into links"""
 
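The `Optional[str]=None` signature fixes above (in archive.py and index.py) are required, not cosmetic: once `@enforce_types` validates arguments at runtime, a parameter annotated as plain `str` could not legally receive its own `None` default. A small sketch of the distinction, using a made-up `load` function rather than project code:

```python
from typing import Optional

# `import_path: str = None` contradicts itself: None is not a str, and both
# mypy --strict and a runtime checker will flag it. Optional[str] is
# shorthand for Union[str, None] and admits both cases.
def load(import_path: Optional[str] = None) -> str:
    return import_path if import_path is not None else 'index.json'

assert load() == 'index.json'
assert load('bookmarks.html') == 'bookmarks.html'
```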
"""Parse netscape-format bookmarks export files (produced by all browsers)""" @@ -251,6 +258,7 @@ def parse_netscape_html_export(html_file: IO[str]) -> Iterable[Link]: ) +@enforce_types def parse_pinboard_rss_export(rss_file: IO[str]) -> Iterable[Link]: """Parse Pinboard RSS feed files into links""" @@ -282,6 +290,7 @@ def parse_pinboard_rss_export(rss_file: IO[str]) -> Iterable[Link]: ) +@enforce_types def parse_medium_rss_export(rss_file: IO[str]) -> Iterable[Link]: """Parse Medium RSS feed files into links""" @@ -303,6 +312,7 @@ def parse_medium_rss_export(rss_file: IO[str]) -> Iterable[Link]: ) +@enforce_types def parse_plain_text_export(text_file: IO[str]) -> Iterable[Link]: """Parse raw links from each line in a text file""" diff --git a/archivebox/schema.py b/archivebox/schema.py index e02d69c7..fa110653 100644 --- a/archivebox/schema.py +++ b/archivebox/schema.py @@ -23,6 +23,10 @@ class ArchiveResult: status: str start_ts: datetime end_ts: datetime + schema: str = 'ArchiveResult' + + def __post_init__(self): + assert self.schema == self.__class__.__name__ def _asdict(self): return asdict(self) @@ -40,9 +44,11 @@ class Link: sources: List[str] history: Dict[str, List[ArchiveResult]] = field(default_factory=lambda: {}) updated: Optional[datetime] = None + schema: str = 'Link' def __post_init__(self): """fix any history result items to be type-checked ArchiveResults""" + assert self.schema == self.__class__.__name__ cast_history = {} for method, method_history in self.history.items(): cast_history[method] = [] @@ -67,6 +73,7 @@ class Link: def _asdict(self, extended=False): info = { + 'schema': 'Link', 'url': self.url, 'title': self.title or None, 'timestamp': self.timestamp, @@ -234,12 +241,18 @@ class ArchiveIndex: num_links: int updated: str links: List[Link] + schema: str = 'ArchiveIndex' + + def __post_init__(self): + assert self.schema == self.__class__.__name__ def _asdict(self): return asdict(self) @dataclass class RuntimeStats: + """mutable stats counter for logging archiving timing info to CLI output""" + skipped: int succeeded: int failed: int diff --git a/archivebox/util.py b/archivebox/util.py index 5097ec76..dc5590c5 100644 --- a/archivebox/util.py +++ b/archivebox/util.py @@ -91,7 +91,7 @@ STATICFILE_EXTENSIONS = { 'gif', 'jpeg', 'jpg', 'png', 'tif', 'tiff', 'wbmp', 'ico', 'jng', 'bmp', 'svg', 'svgz', 'webp', 'ps', 'eps', 'ai', 'mp3', 'mp4', 'm4a', 'mpeg', 'mpg', 'mkv', 'mov', 'webm', 'm4v', - 'flv', 'wmv', 'avi', 'ogg', 'ts', 'm3u8' + 'flv', 'wmv', 'avi', 'ogg', 'ts', 'm3u8', 'pdf', 'txt', 'rtf', 'rtfd', 'doc', 'docx', 'ppt', 'pptx', 'xls', 'xlsx', 'atom', 'rss', 'css', 'js', 'json', 'dmg', 'iso', 'img', @@ -113,8 +113,9 @@ STATICFILE_EXTENSIONS = { def enforce_types(func): """ - Checks parameters type signatures against arg and kwarg type hints. 
@@ -183,6 +184,7 @@ def check_url_parsing_invariants() -> None:
 
 ### Random Helpers
 
+@enforce_types
 def save_stdin_source(raw_text: str) -> str:
     if not os.path.exists(SOURCES_DIR):
         os.makedirs(SOURCES_DIR)
@@ -196,6 +198,8 @@ def save_stdin_source(raw_text: str) -> str:
 
     return source_path
 
+
+@enforce_types
 def save_remote_source(url: str, timeout: int=TIMEOUT) -> str:
     """download a given url's content into output/sources/domain-<timestamp>.txt"""
 
@@ -233,6 +237,8 @@ def save_remote_source(url: str, timeout: int=TIMEOUT) -> str:
 
     return source_path
 
+
+@enforce_types
 def fetch_page_title(url: str, timeout: int=10, progress: bool=SHOW_PROGRESS) -> Optional[str]:
     """Attempt to guess a page's title by downloading the html"""
 
@@ -255,6 +261,8 @@ def fetch_page_title(url: str, timeout: int=10, progress: bool=SHOW_PROGRESS) ->
         #     ))
         return None
 
+
+@enforce_types
 def wget_output_path(link: Link) -> Optional[str]:
     """calculate the path to the wgetted .html file, since wget may
     adjust some paths to be different than the base_url path.
 
@@ -323,14 +331,17 @@ def wget_output_path(link: Link) -> Optional[str]:
 
     return None
 
+@enforce_types
 def read_js_script(script_name: str) -> str:
     script_path = os.path.join(PYTHON_PATH, 'scripts', script_name)
 
     with open(script_path, 'r') as f:
         return f.read().split('// INFO BELOW HERE')[0].strip()
 
+
 ### String Manipulation & Logging Helpers
 
+@enforce_types
 def str_between(string: str, start: str, end: str=None) -> str:
     """(<abc>12345</def>, <abc>, </def>)  ->  12345"""
 
@@ -341,6 +352,7 @@ def str_between(string: str, start: str, end: str=None) -> str:
 
     return content
 
+@enforce_types
 def parse_date(date: Any) -> Optional[datetime]:
     """Parse unix timestamps, iso format, and human-readable strings"""
 
@@ -435,6 +447,8 @@ def merge_links(a: Link, b: Link) -> Link:
         history=history,
     )
 
+
+@enforce_types
 def is_static_file(url: str) -> bool:
     """Certain URLs just point to a single static file, and
        don't need to be re-archived in many formats
@@ -443,6 +457,8 @@ def is_static_file(url: str) -> bool:
     """
     # TODO: the proper way is with MIME type detection, not using extension
     return extension(url) in STATICFILE_EXTENSIONS
 
+
+@enforce_types
 def derived_link_info(link: Link) -> dict:
     """extend link info with the archive urls and other derived data"""
 
@@ -518,6 +534,7 @@ class TimedProgress:
             sys.stdout.flush()
 
 
+@enforce_types
 def progress_bar(seconds: int, prefix: str='') -> None:
     """show timer in the form of progress bar, with percentage and seconds remaining"""
     chunk = '█' if sys.stdout.encoding == 'UTF-8' else '#'
@@ -557,6 +574,7 @@ def progress_bar(seconds: int, prefix: str='') -> None:
         pass
 
 
+@enforce_types
 def download_url(url: str, timeout: int=TIMEOUT) -> str:
     """Download the contents of a remote url and return the text"""
 
@@ -572,6 +590,8 @@ def download_url(url: str, timeout: int=TIMEOUT) -> str:
     encoding = resp.headers.get_content_charset() or 'utf-8'
     return resp.read().decode(encoding)
 
+
+@enforce_types
 def chmod_file(path: str, cwd: str='.', permissions: str=OUTPUT_PERMISSIONS, timeout: int=30) -> None:
     """chmod -R <permissions> <cwd>/<path>"""
 
@@ -584,6 +604,7 @@ def chmod_file(path: str, cwd: str='.', permissions: str=OUTPUT_PERMISSIONS, tim
         raise Exception('Failed to chmod {}/{}'.format(cwd, path))
 
 
+@enforce_types
 def chrome_args(**options) -> List[str]:
     """helper to build up a chrome shell command with arguments"""
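`chrome_args` in the final hunk is where `CHROME_HEADLESS`, `CHROME_USER_DATA_DIR`, and `CHROME_USER_AGENT` ultimately get consumed. Its body is not shown in this diff, so the sketch below only illustrates the general options-to-argv pattern under stated assumptions: the flags are real Chrome CLI flags, but the exact set and defaults the project emits may differ.

```python
from typing import List, Optional

def chrome_args_sketch(binary: str = 'chromium-browser',
                       headless: bool = True,
                       user_data_dir: Optional[str] = None,
                       user_agent: Optional[str] = None) -> List[str]:
    """Build a headless-chrome argv prefix from config-style options."""
    cmd = [binary]
    if headless:
        cmd.append('--headless')
    if user_data_dir:
        cmd.append('--user-data-dir={}'.format(user_data_dir))
    if user_agent:
        cmd.append('--user-agent={}'.format(user_agent))
    return cmd

# usage sketch: subprocess.run(chrome_args_sketch() + ['--screenshot', url])
```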