new version handling and explicit relative imports

This commit is contained in:
Nick Sweeting 2019-03-27 15:35:13 -04:00
parent bc1bc9fe02
commit 93216a3c3e
9 changed files with 58 additions and 61 deletions

View file

@@ -13,34 +13,37 @@ __package__ = 'archivebox'
import os import os
import sys import sys
from typing import List, Optional from typing import List, Optional
from schema import Link from .schema import Link
from links import links_after_timestamp from .links import links_after_timestamp
from index import write_links_index, load_links_index from .index import write_links_index, load_links_index
from archive_methods import archive_link from .archive_methods import archive_link
from config import ( from .config import (
ONLY_NEW, ONLY_NEW,
OUTPUT_DIR, OUTPUT_DIR,
GIT_SHA, PYTHON_DIR,
VERSION,
) )
from util import ( from .util import (
enforce_types, enforce_types,
save_remote_source, save_remote_source,
save_stdin_source, save_stdin_source,
) )
from logs import ( from .logs import (
log_archiving_started, log_archiving_started,
log_archiving_paused, log_archiving_paused,
log_archiving_finished, log_archiving_finished,
) )
__AUTHOR__ = 'Nick Sweeting <git@nicksweeting.com>' __AUTHOR__ = 'Nick Sweeting <git@nicksweeting.com>'
__VERSION__ = GIT_SHA[:9] __VERSION__ = VERSION
__DESCRIPTION__ = 'ArchiveBox: The self-hosted internet archive.' __DESCRIPTION__ = 'ArchiveBox: The self-hosted internet archive.'
__DOCUMENTATION__ = 'https://github.com/pirate/ArchiveBox/wiki' __DOCUMENTATION__ = 'https://github.com/pirate/ArchiveBox/wiki'
def print_help(): def print_help():
print('ArchiveBox: The self-hosted internet archive.\n') print('ArchiveBox: The self-hosted internet archive.\n')
print("Documentation:") print("Documentation:")

View file

@@ -4,13 +4,13 @@ from typing import Dict, List, Tuple
from collections import defaultdict from collections import defaultdict
from datetime import datetime from datetime import datetime
from schema import Link, ArchiveResult, ArchiveError from .schema import Link, ArchiveResult, ArchiveError
from index import ( from .index import (
write_link_index, write_link_index,
patch_links_index, patch_links_index,
load_json_link_index, load_json_link_index,
) )
from config import ( from .config import (
CURL_BINARY, CURL_BINARY,
GIT_BINARY, GIT_BINARY,
WGET_BINARY, WGET_BINARY,
@@ -31,7 +31,7 @@ from config import (
ANSI, ANSI,
OUTPUT_DIR, OUTPUT_DIR,
GIT_DOMAINS, GIT_DOMAINS,
GIT_SHA, VERSION,
WGET_USER_AGENT, WGET_USER_AGENT,
CHECK_SSL_VALIDITY, CHECK_SSL_VALIDITY,
COOKIES_FILE, COOKIES_FILE,
@@ -43,7 +43,7 @@ from config import (
ONLY_NEW, ONLY_NEW,
WGET_AUTO_COMPRESSION, WGET_AUTO_COMPRESSION,
) )
from util import ( from .util import (
enforce_types, enforce_types,
domain, domain,
extension, extension,
@@ -58,7 +58,7 @@ from util import (
run, PIPE, DEVNULL, run, PIPE, DEVNULL,
Link, Link,
) )
from logs import ( from .logs import (
log_link_archiving_started, log_link_archiving_started,
log_link_archiving_finished, log_link_archiving_finished,
log_archive_method_started, log_archive_method_started,
@@ -123,6 +123,7 @@ def archive_link(link: Link, page=None) -> Link:
if was_changed: if was_changed:
patch_links_index(link) patch_links_index(link)
log_link_archiving_finished(link.link_dir, link, is_new, stats) log_link_archiving_finished(link.link_dir, link, is_new, stats)
except KeyboardInterrupt: except KeyboardInterrupt:
@@ -606,7 +607,7 @@ def archive_dot_org(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveR
CURL_BINARY, CURL_BINARY,
'--location', '--location',
'--head', '--head',
'--user-agent', 'ArchiveBox/{} (+https://github.com/pirate/ArchiveBox/)'.format(GIT_SHA), # be nice to the Archive.org people and show them where all this ArchiveBox traffic is coming from '--user-agent', 'ArchiveBox/{} (+https://github.com/pirate/ArchiveBox/)'.format(VERSION), # be nice to the Archive.org people and show them where all this ArchiveBox traffic is coming from
'--max-time', str(timeout), '--max-time', str(timeout),
*(() if CHECK_SSL_VALIDITY else ('--insecure',)), *(() if CHECK_SSL_VALIDITY else ('--insecure',)),
submit_url, submit_url,

View file

@@ -40,7 +40,7 @@ SUBMIT_ARCHIVE_DOT_ORG = os.getenv('SUBMIT_ARCHIVE_DOT_ORG', 'True'
CHECK_SSL_VALIDITY = os.getenv('CHECK_SSL_VALIDITY', 'True' ).lower() == 'true' CHECK_SSL_VALIDITY = os.getenv('CHECK_SSL_VALIDITY', 'True' ).lower() == 'true'
RESOLUTION = os.getenv('RESOLUTION', '1440,2000' ) RESOLUTION = os.getenv('RESOLUTION', '1440,2000' )
GIT_DOMAINS = os.getenv('GIT_DOMAINS', 'github.com,bitbucket.org,gitlab.com').split(',') GIT_DOMAINS = os.getenv('GIT_DOMAINS', 'github.com,bitbucket.org,gitlab.com').split(',')
WGET_USER_AGENT = os.getenv('WGET_USER_AGENT', 'ArchiveBox/{GIT_SHA} (+https://github.com/pirate/ArchiveBox/) wget/{WGET_VERSION}') WGET_USER_AGENT = os.getenv('WGET_USER_AGENT', 'ArchiveBox/{VERSION} (+https://github.com/pirate/ArchiveBox/) wget/{WGET_VERSION}')
COOKIES_FILE = os.getenv('COOKIES_FILE', None) COOKIES_FILE = os.getenv('COOKIES_FILE', None)
CHROME_USER_DATA_DIR = os.getenv('CHROME_USER_DATA_DIR', None) CHROME_USER_DATA_DIR = os.getenv('CHROME_USER_DATA_DIR', None)
CHROME_HEADLESS = os.getenv('CHROME_HEADLESS', 'True' ).lower() == 'true' CHROME_HEADLESS = os.getenv('CHROME_HEADLESS', 'True' ).lower() == 'true'
@@ -163,21 +163,13 @@ def find_chrome_data_dir() -> Optional[str]:
return None return None
def get_git_version() -> str:
"""get the git commit hash of the python code folder (aka code version)"""
try:
return run([GIT_BINARY, 'rev-list', '-1', 'HEAD', './'], stdout=PIPE, cwd=REPO_DIR).stdout.strip().decode()
except Exception:
print('[!] Warning: unable to determine git version, is git installed and in your $PATH?')
return 'unknown'
# ****************************************************************************** # ******************************************************************************
# ************************ Environment & Dependencies ************************** # ************************ Environment & Dependencies **************************
# ****************************************************************************** # ******************************************************************************
try: try:
GIT_SHA = get_git_version() VERSION = open(os.path.join(PYTHON_DIR, 'VERSION'), 'r').read().strip()
GIT_SHA = VERSION.split('+')[1]
### Terminal Configuration ### Terminal Configuration
TERM_WIDTH = lambda: shutil.get_terminal_size((100, 10)).columns TERM_WIDTH = lambda: shutil.get_terminal_size((100, 10)).columns
@@ -234,7 +226,7 @@ try:
WGET_AUTO_COMPRESSION = not run([WGET_BINARY, "--compression=auto", "--help"], stdout=DEVNULL).returncode WGET_AUTO_COMPRESSION = not run([WGET_BINARY, "--compression=auto", "--help"], stdout=DEVNULL).returncode
WGET_USER_AGENT = WGET_USER_AGENT.format( WGET_USER_AGENT = WGET_USER_AGENT.format(
GIT_SHA=GIT_SHA[:9], VERSION=VERSION,
WGET_VERSION=WGET_VERSION or '', WGET_VERSION=WGET_VERSION or '',
) )

View file

@@ -6,15 +6,16 @@ from string import Template
from typing import List, Tuple, Iterator, Optional from typing import List, Tuple, Iterator, Optional
from dataclasses import fields from dataclasses import fields
from schema import Link, ArchiveIndex, ArchiveResult from .schema import Link, ArchiveResult
from config import ( from .config import (
OUTPUT_DIR, OUTPUT_DIR,
TEMPLATES_DIR, TEMPLATES_DIR,
VERSION,
GIT_SHA, GIT_SHA,
FOOTER_INFO, FOOTER_INFO,
TIMEOUT, TIMEOUT,
) )
from util import ( from .util import (
merge_links, merge_links,
chmod_file, chmod_file,
urlencode, urlencode,
@@ -25,9 +26,9 @@ from util import (
TimedProgress, TimedProgress,
copy_and_overwrite, copy_and_overwrite,
) )
from parse import parse_links from .parse import parse_links
from links import validate_links from .links import validate_links
from logs import ( from .logs import (
log_indexing_process_started, log_indexing_process_started,
log_indexing_started, log_indexing_started,
log_indexing_finished, log_indexing_finished,
@@ -178,8 +179,8 @@ def write_html_links_index(out_dir: str, links: List[Link], finished: bool=False
'date_updated': datetime.now().strftime('%Y-%m-%d'), 'date_updated': datetime.now().strftime('%Y-%m-%d'),
'time_updated': datetime.now().strftime('%Y-%m-%d %H:%M'), 'time_updated': datetime.now().strftime('%Y-%m-%d %H:%M'),
'footer_info': FOOTER_INFO, 'footer_info': FOOTER_INFO,
'version': VERSION,
'git_sha': GIT_SHA, 'git_sha': GIT_SHA,
'short_git_sha': GIT_SHA[:8],
'rows': link_rows, 'rows': link_rows,
'status': 'finished' if finished else 'running', 'status': 'finished' if finished else 'running',
} }

View file

@@ -22,8 +22,8 @@ Link {
from typing import Iterable from typing import Iterable
from collections import OrderedDict from collections import OrderedDict
from schema import Link from .schema import Link
from util import ( from .util import (
scheme, scheme,
fuzzy_url, fuzzy_url,
merge_links, merge_links,

View file

@@ -24,8 +24,8 @@ from typing import Tuple, List, IO, Iterable
from datetime import datetime from datetime import datetime
import xml.etree.ElementTree as etree import xml.etree.ElementTree as etree
from config import TIMEOUT from .config import TIMEOUT
from util import ( from .util import (
htmldecode, htmldecode,
str_between, str_between,
URL_REGEX, URL_REGEX,

View file

@@ -108,60 +108,60 @@ class Link:
@property @property
def link_dir(self) -> str: def link_dir(self) -> str:
from config import ARCHIVE_DIR from .config import ARCHIVE_DIR
return os.path.join(ARCHIVE_DIR, self.timestamp) return os.path.join(ARCHIVE_DIR, self.timestamp)
@property @property
def archive_path(self) -> str: def archive_path(self) -> str:
from config import ARCHIVE_DIR_NAME from .config import ARCHIVE_DIR_NAME
return '{}/{}'.format(ARCHIVE_DIR_NAME, self.timestamp) return '{}/{}'.format(ARCHIVE_DIR_NAME, self.timestamp)
### URL Helpers ### URL Helpers
@property @property
def urlhash(self): def urlhash(self):
from util import hashurl from .util import hashurl
return hashurl(self.url) return hashurl(self.url)
@property @property
def extension(self) -> str: def extension(self) -> str:
from util import extension from .util import extension
return extension(self.url) return extension(self.url)
@property @property
def domain(self) -> str: def domain(self) -> str:
from util import domain from .util import domain
return domain(self.url) return domain(self.url)
@property @property
def path(self) -> str: def path(self) -> str:
from util import path from .util import path
return path(self.url) return path(self.url)
@property @property
def basename(self) -> str: def basename(self) -> str:
from util import basename from .util import basename
return basename(self.url) return basename(self.url)
@property @property
def base_url(self) -> str: def base_url(self) -> str:
from util import base_url from .util import base_url
return base_url(self.url) return base_url(self.url)
### Pretty Printing Helpers ### Pretty Printing Helpers
@property @property
def bookmarked_date(self) -> Optional[str]: def bookmarked_date(self) -> Optional[str]:
from util import ts_to_date from .util import ts_to_date
return ts_to_date(self.timestamp) if self.timestamp else None return ts_to_date(self.timestamp) if self.timestamp else None
@property @property
def updated_date(self) -> Optional[str]: def updated_date(self) -> Optional[str]:
from util import ts_to_date from .util import ts_to_date
return ts_to_date(self.updated) if self.updated else None return ts_to_date(self.updated) if self.updated else None
@property @property
def oldest_archive_date(self) -> Optional[datetime]: def oldest_archive_date(self) -> Optional[datetime]:
from util import ts_to_date from .util import ts_to_date
most_recent = min( most_recent = min(
(ts_to_date(result.start_ts) (ts_to_date(result.start_ts)
@@ -173,7 +173,7 @@ class Link:
@property @property
def newest_archive_date(self) -> Optional[datetime]: def newest_archive_date(self) -> Optional[datetime]:
from util import ts_to_date from .util import ts_to_date
most_recent = max( most_recent = max(
(ts_to_date(result.start_ts) (ts_to_date(result.start_ts)
@@ -197,13 +197,13 @@ class Link:
@property @property
def is_static(self) -> bool: def is_static(self) -> bool:
from util import is_static_file from .util import is_static_file
return is_static_file(self.url) return is_static_file(self.url)
@property @property
def is_archived(self) -> bool: def is_archived(self) -> bool:
from config import ARCHIVE_DIR from .config import ARCHIVE_DIR
from util import domain from .util import domain
return os.path.exists(os.path.join( return os.path.exists(os.path.join(
ARCHIVE_DIR, ARCHIVE_DIR,
@@ -240,7 +240,7 @@ class Link:
return latest return latest
def canonical_outputs(self) -> Dict[str, Optional[str]]: def canonical_outputs(self) -> Dict[str, Optional[str]]:
from util import wget_output_path from .util import wget_output_path
canonical = { canonical = {
'index_url': 'index.html', 'index_url': 'index.html',
'favicon_url': 'favicon.ico', 'favicon_url': 'favicon.ico',

View file

@@ -209,7 +209,7 @@
<center> <center>
<small> <small>
Archive created using <a href="https://github.com/pirate/ArchiveBox" title="Github">ArchiveBox</a> Archive created using <a href="https://github.com/pirate/ArchiveBox" title="Github">ArchiveBox</a>
version <a href="https://github.com/pirate/ArchiveBox/commit/$git_sha" title="Git commit">$short_git_sha</a> &nbsp; | &nbsp; version <a href="https://github.com/pirate/ArchiveBox/commit/$git_sha" title="Git commit">$version</a> &nbsp; | &nbsp;
Download index as <a href="index.json" title="JSON summary of archived links.">JSON</a> Download index as <a href="index.json" title="JSON summary of archived links.">JSON</a>
<br/><br/> <br/><br/>
$footer_info $footer_info

View file

@@ -25,8 +25,8 @@ from subprocess import (
from base32_crockford import encode as base32_encode from base32_crockford import encode as base32_encode
from schema import Link from .schema import Link
from config import ( from .config import (
ANSI, ANSI,
TERM_WIDTH, TERM_WIDTH,
SOURCES_DIR, SOURCES_DIR,
@@ -37,9 +37,9 @@ from config import (
CHECK_SSL_VALIDITY, CHECK_SSL_VALIDITY,
WGET_USER_AGENT, WGET_USER_AGENT,
CHROME_OPTIONS, CHROME_OPTIONS,
PYTHON_PATH, PYTHON_DIR,
) )
from logs import pretty_path from .logs import pretty_path
### Parsing Helpers ### Parsing Helpers
@@ -334,7 +334,7 @@ def wget_output_path(link: Link) -> Optional[str]:
@enforce_types @enforce_types
def read_js_script(script_name: str) -> str: def read_js_script(script_name: str) -> str:
script_path = os.path.join(PYTHON_PATH, 'scripts', script_name) script_path = os.path.join(PYTHON_DIR, 'scripts', script_name)
with open(script_path, 'r') as f: with open(script_path, 'r') as f:
return f.read().split('// INFO BELOW HERE')[0].strip() return f.read().split('// INFO BELOW HERE')[0].strip()