From 93216a3c3e59b7f2103faba53fb3f0de83e38cb5 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Wed, 27 Mar 2019 15:35:13 -0400 Subject: [PATCH] new version handling and absolute imports --- archivebox/archive.py | 21 ++++++++++++--------- archivebox/archive_methods.py | 15 ++++++++------- archivebox/config.py | 16 ++++------------ archivebox/index.py | 15 ++++++++------- archivebox/links.py | 4 ++-- archivebox/parse.py | 4 ++-- archivebox/schema.py | 32 ++++++++++++++++---------------- archivebox/templates/index.html | 2 +- archivebox/util.py | 10 +++++----- 9 files changed, 58 insertions(+), 61 deletions(-) diff --git a/archivebox/archive.py b/archivebox/archive.py index 7cf56197..b381d141 100755 --- a/archivebox/archive.py +++ b/archivebox/archive.py @@ -13,34 +13,37 @@ __package__ = 'archivebox' import os import sys + from typing import List, Optional -from schema import Link -from links import links_after_timestamp -from index import write_links_index, load_links_index -from archive_methods import archive_link -from config import ( +from .schema import Link +from .links import links_after_timestamp +from .index import write_links_index, load_links_index +from .archive_methods import archive_link +from .config import ( ONLY_NEW, OUTPUT_DIR, - GIT_SHA, + PYTHON_DIR, + VERSION, ) -from util import ( +from .util import ( enforce_types, save_remote_source, save_stdin_source, ) -from logs import ( +from .logs import ( log_archiving_started, log_archiving_paused, log_archiving_finished, ) __AUTHOR__ = 'Nick Sweeting ' -__VERSION__ = GIT_SHA[:9] +__VERSION__ = VERSION __DESCRIPTION__ = 'ArchiveBox: The self-hosted internet archive.' __DOCUMENTATION__ = 'https://github.com/pirate/ArchiveBox/wiki' + def print_help(): print('ArchiveBox: The self-hosted internet archive.\n') print("Documentation:") diff --git a/archivebox/archive_methods.py b/archivebox/archive_methods.py index fd726de2..2370c98b 100644 --- a/archivebox/archive_methods.py +++ b/archivebox/archive_methods.py @@ -4,13 +4,13 @@ from typing import Dict, List, Tuple from collections import defaultdict from datetime import datetime -from schema import Link, ArchiveResult, ArchiveError -from index import ( +from .schema import Link, ArchiveResult, ArchiveError +from .index import ( write_link_index, patch_links_index, load_json_link_index, ) -from config import ( +from .config import ( CURL_BINARY, GIT_BINARY, WGET_BINARY, @@ -31,7 +31,7 @@ from config import ( ANSI, OUTPUT_DIR, GIT_DOMAINS, - GIT_SHA, + VERSION, WGET_USER_AGENT, CHECK_SSL_VALIDITY, COOKIES_FILE, @@ -43,7 +43,7 @@ from config import ( ONLY_NEW, WGET_AUTO_COMPRESSION, ) -from util import ( +from .util import ( enforce_types, domain, extension, @@ -58,7 +58,7 @@ from util import ( run, PIPE, DEVNULL, Link, ) -from logs import ( +from .logs import ( log_link_archiving_started, log_link_archiving_finished, log_archive_method_started, @@ -123,6 +123,7 @@ def archive_link(link: Link, page=None) -> Link: if was_changed: patch_links_index(link) + log_link_archiving_finished(link.link_dir, link, is_new, stats) except KeyboardInterrupt: @@ -606,7 +607,7 @@ def archive_dot_org(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveR CURL_BINARY, '--location', '--head', - '--user-agent', 'ArchiveBox/{} (+https://github.com/pirate/ArchiveBox/)'.format(GIT_SHA), # be nice to the Archive.org people and show them where all this ArchiveBox traffic is coming from + '--user-agent', 'ArchiveBox/{} (+https://github.com/pirate/ArchiveBox/)'.format(VERSION), # be nice to the Archive.org people and show them where all this ArchiveBox traffic is coming from '--max-time', str(timeout), *(() if CHECK_SSL_VALIDITY else ('--insecure',)), submit_url, diff --git a/archivebox/config.py b/archivebox/config.py index e564942e..4573224f 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -40,7 +40,7 @@ SUBMIT_ARCHIVE_DOT_ORG = os.getenv('SUBMIT_ARCHIVE_DOT_ORG', 'True' CHECK_SSL_VALIDITY = os.getenv('CHECK_SSL_VALIDITY', 'True' ).lower() == 'true' RESOLUTION = os.getenv('RESOLUTION', '1440,2000' ) GIT_DOMAINS = os.getenv('GIT_DOMAINS', 'github.com,bitbucket.org,gitlab.com').split(',') -WGET_USER_AGENT = os.getenv('WGET_USER_AGENT', 'ArchiveBox/{GIT_SHA} (+https://github.com/pirate/ArchiveBox/) wget/{WGET_VERSION}') +WGET_USER_AGENT = os.getenv('WGET_USER_AGENT', 'ArchiveBox/{VERSION} (+https://github.com/pirate/ArchiveBox/) wget/{WGET_VERSION}') COOKIES_FILE = os.getenv('COOKIES_FILE', None) CHROME_USER_DATA_DIR = os.getenv('CHROME_USER_DATA_DIR', None) CHROME_HEADLESS = os.getenv('CHROME_HEADLESS', 'True' ).lower() == 'true' @@ -163,21 +163,13 @@ def find_chrome_data_dir() -> Optional[str]: return None -def get_git_version() -> str: - """get the git commit hash of the python code folder (aka code version)""" - try: - return run([GIT_BINARY, 'rev-list', '-1', 'HEAD', './'], stdout=PIPE, cwd=REPO_DIR).stdout.strip().decode() - except Exception: - print('[!] Warning: unable to determine git version, is git installed and in your $PATH?') - return 'unknown' - - # ****************************************************************************** # ************************ Environment & Dependencies ************************** # ****************************************************************************** try: - GIT_SHA = get_git_version() + VERSION = open(os.path.join(PYTHON_DIR, 'VERSION'), 'r').read().strip() + GIT_SHA = VERSION.split('+')[1] ### Terminal Configuration TERM_WIDTH = lambda: shutil.get_terminal_size((100, 10)).columns @@ -234,7 +226,7 @@ try: WGET_AUTO_COMPRESSION = not run([WGET_BINARY, "--compression=auto", "--help"], stdout=DEVNULL).returncode WGET_USER_AGENT = WGET_USER_AGENT.format( - GIT_SHA=GIT_SHA[:9], + VERSION=VERSION, WGET_VERSION=WGET_VERSION or '', ) diff --git a/archivebox/index.py b/archivebox/index.py index 58b752b1..66b234a2 100644 --- a/archivebox/index.py +++ b/archivebox/index.py @@ -6,15 +6,16 @@ from string import Template from typing import List, Tuple, Iterator, Optional from dataclasses import fields -from schema import Link, ArchiveIndex, ArchiveResult -from config import ( +from .schema import Link, ArchiveResult +from .config import ( OUTPUT_DIR, TEMPLATES_DIR, + VERSION, GIT_SHA, FOOTER_INFO, TIMEOUT, ) -from util import ( +from .util import ( merge_links, chmod_file, urlencode, @@ -25,9 +26,9 @@ from util import ( TimedProgress, copy_and_overwrite, ) -from parse import parse_links -from links import validate_links -from logs import ( +from .parse import parse_links +from .links import validate_links +from .logs import ( log_indexing_process_started, log_indexing_started, log_indexing_finished, @@ -178,8 +179,8 @@ def write_html_links_index(out_dir: str, links: List[Link], finished: bool=False 'date_updated': datetime.now().strftime('%Y-%m-%d'), 'time_updated': datetime.now().strftime('%Y-%m-%d %H:%M'), 'footer_info': FOOTER_INFO, + 'version': VERSION, 'git_sha': GIT_SHA, - 'short_git_sha': GIT_SHA[:8], 'rows': link_rows, 'status': 'finished' if finished else 'running', } diff --git a/archivebox/links.py b/archivebox/links.py index 4692943c..0d72472d 100644 --- a/archivebox/links.py +++ b/archivebox/links.py @@ -22,8 +22,8 @@ Link { from typing import Iterable from collections import OrderedDict -from schema import Link -from util import ( +from .schema import Link +from .util import ( scheme, fuzzy_url, merge_links, diff --git a/archivebox/parse.py b/archivebox/parse.py index 9430b305..6ecc0007 100644 --- a/archivebox/parse.py +++ b/archivebox/parse.py @@ -24,8 +24,8 @@ from typing import Tuple, List, IO, Iterable from datetime import datetime import xml.etree.ElementTree as etree -from config import TIMEOUT -from util import ( +from .config import TIMEOUT +from .util import ( htmldecode, str_between, URL_REGEX, diff --git a/archivebox/schema.py b/archivebox/schema.py index 619ffd7c..d1bb06ea 100644 --- a/archivebox/schema.py +++ b/archivebox/schema.py @@ -108,60 +108,60 @@ class Link: @property def link_dir(self) -> str: - from config import ARCHIVE_DIR + from .config import ARCHIVE_DIR return os.path.join(ARCHIVE_DIR, self.timestamp) @property def archive_path(self) -> str: - from config import ARCHIVE_DIR_NAME + from .config import ARCHIVE_DIR_NAME return '{}/{}'.format(ARCHIVE_DIR_NAME, self.timestamp) ### URL Helpers @property def urlhash(self): - from util import hashurl + from .util import hashurl return hashurl(self.url) @property def extension(self) -> str: - from util import extension + from .util import extension return extension(self.url) @property def domain(self) -> str: - from util import domain + from .util import domain return domain(self.url) @property def path(self) -> str: - from util import path + from .util import path return path(self.url) @property def basename(self) -> str: - from util import basename + from .util import basename return basename(self.url) @property def base_url(self) -> str: - from util import base_url + from .util import base_url return base_url(self.url) ### Pretty Printing Helpers @property def bookmarked_date(self) -> Optional[str]: - from util import ts_to_date + from .util import ts_to_date return ts_to_date(self.timestamp) if self.timestamp else None @property def updated_date(self) -> Optional[str]: - from util import ts_to_date + from .util import ts_to_date return ts_to_date(self.updated) if self.updated else None @property def oldest_archive_date(self) -> Optional[datetime]: - from util import ts_to_date + from .util import ts_to_date most_recent = min( (ts_to_date(result.start_ts) @@ -173,7 +173,7 @@ class Link: @property def newest_archive_date(self) -> Optional[datetime]: - from util import ts_to_date + from .util import ts_to_date most_recent = max( (ts_to_date(result.start_ts) @@ -197,13 +197,13 @@ class Link: @property def is_static(self) -> bool: - from util import is_static_file + from .util import is_static_file return is_static_file(self.url) @property def is_archived(self) -> bool: - from config import ARCHIVE_DIR - from util import domain + from .config import ARCHIVE_DIR + from .util import domain return os.path.exists(os.path.join( ARCHIVE_DIR, @@ -240,7 +240,7 @@ class Link: return latest def canonical_outputs(self) -> Dict[str, Optional[str]]: - from util import wget_output_path + from .util import wget_output_path canonical = { 'index_url': 'index.html', 'favicon_url': 'favicon.ico', diff --git a/archivebox/templates/index.html b/archivebox/templates/index.html index 8436a412..144f2ce7 100644 --- a/archivebox/templates/index.html +++ b/archivebox/templates/index.html @@ -209,7 +209,7 @@
Archive created using ArchiveBox - version $short_git_sha   |   + version $version   |   Download index as JSON

$footer_info diff --git a/archivebox/util.py b/archivebox/util.py index e6f93981..fe3c57cf 100644 --- a/archivebox/util.py +++ b/archivebox/util.py @@ -25,8 +25,8 @@ from subprocess import ( from base32_crockford import encode as base32_encode -from schema import Link -from config import ( +from .schema import Link +from .config import ( ANSI, TERM_WIDTH, SOURCES_DIR, @@ -37,9 +37,9 @@ from config import ( CHECK_SSL_VALIDITY, WGET_USER_AGENT, CHROME_OPTIONS, - PYTHON_PATH, + PYTHON_DIR, ) -from logs import pretty_path +from .logs import pretty_path ### Parsing Helpers @@ -334,7 +334,7 @@ def wget_output_path(link: Link) -> Optional[str]: @enforce_types def read_js_script(script_name: str) -> str: - script_path = os.path.join(PYTHON_PATH, 'scripts', script_name) + script_path = os.path.join(PYTHON_DIR, 'scripts', script_name) with open(script_path, 'r') as f: return f.read().split('// INFO BELOW HERE')[0].strip()