mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-13 14:44:29 -04:00
new version handling and absolute imports
This commit is contained in:
parent
bc1bc9fe02
commit
93216a3c3e
9 changed files with 58 additions and 61 deletions
|
@ -13,34 +13,37 @@ __package__ = 'archivebox'
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
|
|
||||||
from typing import List, Optional
|
from typing import List, Optional
|
||||||
|
|
||||||
from schema import Link
|
from .schema import Link
|
||||||
from links import links_after_timestamp
|
from .links import links_after_timestamp
|
||||||
from index import write_links_index, load_links_index
|
from .index import write_links_index, load_links_index
|
||||||
from archive_methods import archive_link
|
from .archive_methods import archive_link
|
||||||
from config import (
|
from .config import (
|
||||||
ONLY_NEW,
|
ONLY_NEW,
|
||||||
OUTPUT_DIR,
|
OUTPUT_DIR,
|
||||||
GIT_SHA,
|
PYTHON_DIR,
|
||||||
|
VERSION,
|
||||||
)
|
)
|
||||||
from util import (
|
from .util import (
|
||||||
enforce_types,
|
enforce_types,
|
||||||
save_remote_source,
|
save_remote_source,
|
||||||
save_stdin_source,
|
save_stdin_source,
|
||||||
)
|
)
|
||||||
from logs import (
|
from .logs import (
|
||||||
log_archiving_started,
|
log_archiving_started,
|
||||||
log_archiving_paused,
|
log_archiving_paused,
|
||||||
log_archiving_finished,
|
log_archiving_finished,
|
||||||
)
|
)
|
||||||
|
|
||||||
__AUTHOR__ = 'Nick Sweeting <git@nicksweeting.com>'
|
__AUTHOR__ = 'Nick Sweeting <git@nicksweeting.com>'
|
||||||
__VERSION__ = GIT_SHA[:9]
|
__VERSION__ = VERSION
|
||||||
__DESCRIPTION__ = 'ArchiveBox: The self-hosted internet archive.'
|
__DESCRIPTION__ = 'ArchiveBox: The self-hosted internet archive.'
|
||||||
__DOCUMENTATION__ = 'https://github.com/pirate/ArchiveBox/wiki'
|
__DOCUMENTATION__ = 'https://github.com/pirate/ArchiveBox/wiki'
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def print_help():
|
def print_help():
|
||||||
print('ArchiveBox: The self-hosted internet archive.\n')
|
print('ArchiveBox: The self-hosted internet archive.\n')
|
||||||
print("Documentation:")
|
print("Documentation:")
|
||||||
|
|
|
@ -4,13 +4,13 @@ from typing import Dict, List, Tuple
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
|
||||||
from schema import Link, ArchiveResult, ArchiveError
|
from .schema import Link, ArchiveResult, ArchiveError
|
||||||
from index import (
|
from .index import (
|
||||||
write_link_index,
|
write_link_index,
|
||||||
patch_links_index,
|
patch_links_index,
|
||||||
load_json_link_index,
|
load_json_link_index,
|
||||||
)
|
)
|
||||||
from config import (
|
from .config import (
|
||||||
CURL_BINARY,
|
CURL_BINARY,
|
||||||
GIT_BINARY,
|
GIT_BINARY,
|
||||||
WGET_BINARY,
|
WGET_BINARY,
|
||||||
|
@ -31,7 +31,7 @@ from config import (
|
||||||
ANSI,
|
ANSI,
|
||||||
OUTPUT_DIR,
|
OUTPUT_DIR,
|
||||||
GIT_DOMAINS,
|
GIT_DOMAINS,
|
||||||
GIT_SHA,
|
VERSION,
|
||||||
WGET_USER_AGENT,
|
WGET_USER_AGENT,
|
||||||
CHECK_SSL_VALIDITY,
|
CHECK_SSL_VALIDITY,
|
||||||
COOKIES_FILE,
|
COOKIES_FILE,
|
||||||
|
@ -43,7 +43,7 @@ from config import (
|
||||||
ONLY_NEW,
|
ONLY_NEW,
|
||||||
WGET_AUTO_COMPRESSION,
|
WGET_AUTO_COMPRESSION,
|
||||||
)
|
)
|
||||||
from util import (
|
from .util import (
|
||||||
enforce_types,
|
enforce_types,
|
||||||
domain,
|
domain,
|
||||||
extension,
|
extension,
|
||||||
|
@ -58,7 +58,7 @@ from util import (
|
||||||
run, PIPE, DEVNULL,
|
run, PIPE, DEVNULL,
|
||||||
Link,
|
Link,
|
||||||
)
|
)
|
||||||
from logs import (
|
from .logs import (
|
||||||
log_link_archiving_started,
|
log_link_archiving_started,
|
||||||
log_link_archiving_finished,
|
log_link_archiving_finished,
|
||||||
log_archive_method_started,
|
log_archive_method_started,
|
||||||
|
@ -123,6 +123,7 @@ def archive_link(link: Link, page=None) -> Link:
|
||||||
if was_changed:
|
if was_changed:
|
||||||
patch_links_index(link)
|
patch_links_index(link)
|
||||||
|
|
||||||
|
|
||||||
log_link_archiving_finished(link.link_dir, link, is_new, stats)
|
log_link_archiving_finished(link.link_dir, link, is_new, stats)
|
||||||
|
|
||||||
except KeyboardInterrupt:
|
except KeyboardInterrupt:
|
||||||
|
@ -606,7 +607,7 @@ def archive_dot_org(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveR
|
||||||
CURL_BINARY,
|
CURL_BINARY,
|
||||||
'--location',
|
'--location',
|
||||||
'--head',
|
'--head',
|
||||||
'--user-agent', 'ArchiveBox/{} (+https://github.com/pirate/ArchiveBox/)'.format(GIT_SHA), # be nice to the Archive.org people and show them where all this ArchiveBox traffic is coming from
|
'--user-agent', 'ArchiveBox/{} (+https://github.com/pirate/ArchiveBox/)'.format(VERSION), # be nice to the Archive.org people and show them where all this ArchiveBox traffic is coming from
|
||||||
'--max-time', str(timeout),
|
'--max-time', str(timeout),
|
||||||
*(() if CHECK_SSL_VALIDITY else ('--insecure',)),
|
*(() if CHECK_SSL_VALIDITY else ('--insecure',)),
|
||||||
submit_url,
|
submit_url,
|
||||||
|
|
|
@ -40,7 +40,7 @@ SUBMIT_ARCHIVE_DOT_ORG = os.getenv('SUBMIT_ARCHIVE_DOT_ORG', 'True'
|
||||||
CHECK_SSL_VALIDITY = os.getenv('CHECK_SSL_VALIDITY', 'True' ).lower() == 'true'
|
CHECK_SSL_VALIDITY = os.getenv('CHECK_SSL_VALIDITY', 'True' ).lower() == 'true'
|
||||||
RESOLUTION = os.getenv('RESOLUTION', '1440,2000' )
|
RESOLUTION = os.getenv('RESOLUTION', '1440,2000' )
|
||||||
GIT_DOMAINS = os.getenv('GIT_DOMAINS', 'github.com,bitbucket.org,gitlab.com').split(',')
|
GIT_DOMAINS = os.getenv('GIT_DOMAINS', 'github.com,bitbucket.org,gitlab.com').split(',')
|
||||||
WGET_USER_AGENT = os.getenv('WGET_USER_AGENT', 'ArchiveBox/{GIT_SHA} (+https://github.com/pirate/ArchiveBox/) wget/{WGET_VERSION}')
|
WGET_USER_AGENT = os.getenv('WGET_USER_AGENT', 'ArchiveBox/{VERSION} (+https://github.com/pirate/ArchiveBox/) wget/{WGET_VERSION}')
|
||||||
COOKIES_FILE = os.getenv('COOKIES_FILE', None)
|
COOKIES_FILE = os.getenv('COOKIES_FILE', None)
|
||||||
CHROME_USER_DATA_DIR = os.getenv('CHROME_USER_DATA_DIR', None)
|
CHROME_USER_DATA_DIR = os.getenv('CHROME_USER_DATA_DIR', None)
|
||||||
CHROME_HEADLESS = os.getenv('CHROME_HEADLESS', 'True' ).lower() == 'true'
|
CHROME_HEADLESS = os.getenv('CHROME_HEADLESS', 'True' ).lower() == 'true'
|
||||||
|
@ -163,21 +163,13 @@ def find_chrome_data_dir() -> Optional[str]:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def get_git_version() -> str:
|
|
||||||
"""get the git commit hash of the python code folder (aka code version)"""
|
|
||||||
try:
|
|
||||||
return run([GIT_BINARY, 'rev-list', '-1', 'HEAD', './'], stdout=PIPE, cwd=REPO_DIR).stdout.strip().decode()
|
|
||||||
except Exception:
|
|
||||||
print('[!] Warning: unable to determine git version, is git installed and in your $PATH?')
|
|
||||||
return 'unknown'
|
|
||||||
|
|
||||||
|
|
||||||
# ******************************************************************************
|
# ******************************************************************************
|
||||||
# ************************ Environment & Dependencies **************************
|
# ************************ Environment & Dependencies **************************
|
||||||
# ******************************************************************************
|
# ******************************************************************************
|
||||||
|
|
||||||
try:
|
try:
|
||||||
GIT_SHA = get_git_version()
|
VERSION = open(os.path.join(PYTHON_DIR, 'VERSION'), 'r').read().strip()
|
||||||
|
GIT_SHA = VERSION.split('+')[1]
|
||||||
|
|
||||||
### Terminal Configuration
|
### Terminal Configuration
|
||||||
TERM_WIDTH = lambda: shutil.get_terminal_size((100, 10)).columns
|
TERM_WIDTH = lambda: shutil.get_terminal_size((100, 10)).columns
|
||||||
|
@ -234,7 +226,7 @@ try:
|
||||||
WGET_AUTO_COMPRESSION = not run([WGET_BINARY, "--compression=auto", "--help"], stdout=DEVNULL).returncode
|
WGET_AUTO_COMPRESSION = not run([WGET_BINARY, "--compression=auto", "--help"], stdout=DEVNULL).returncode
|
||||||
|
|
||||||
WGET_USER_AGENT = WGET_USER_AGENT.format(
|
WGET_USER_AGENT = WGET_USER_AGENT.format(
|
||||||
GIT_SHA=GIT_SHA[:9],
|
VERSION=VERSION,
|
||||||
WGET_VERSION=WGET_VERSION or '',
|
WGET_VERSION=WGET_VERSION or '',
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
|
@ -6,15 +6,16 @@ from string import Template
|
||||||
from typing import List, Tuple, Iterator, Optional
|
from typing import List, Tuple, Iterator, Optional
|
||||||
from dataclasses import fields
|
from dataclasses import fields
|
||||||
|
|
||||||
from schema import Link, ArchiveIndex, ArchiveResult
|
from .schema import Link, ArchiveResult
|
||||||
from config import (
|
from .config import (
|
||||||
OUTPUT_DIR,
|
OUTPUT_DIR,
|
||||||
TEMPLATES_DIR,
|
TEMPLATES_DIR,
|
||||||
|
VERSION,
|
||||||
GIT_SHA,
|
GIT_SHA,
|
||||||
FOOTER_INFO,
|
FOOTER_INFO,
|
||||||
TIMEOUT,
|
TIMEOUT,
|
||||||
)
|
)
|
||||||
from util import (
|
from .util import (
|
||||||
merge_links,
|
merge_links,
|
||||||
chmod_file,
|
chmod_file,
|
||||||
urlencode,
|
urlencode,
|
||||||
|
@ -25,9 +26,9 @@ from util import (
|
||||||
TimedProgress,
|
TimedProgress,
|
||||||
copy_and_overwrite,
|
copy_and_overwrite,
|
||||||
)
|
)
|
||||||
from parse import parse_links
|
from .parse import parse_links
|
||||||
from links import validate_links
|
from .links import validate_links
|
||||||
from logs import (
|
from .logs import (
|
||||||
log_indexing_process_started,
|
log_indexing_process_started,
|
||||||
log_indexing_started,
|
log_indexing_started,
|
||||||
log_indexing_finished,
|
log_indexing_finished,
|
||||||
|
@ -178,8 +179,8 @@ def write_html_links_index(out_dir: str, links: List[Link], finished: bool=False
|
||||||
'date_updated': datetime.now().strftime('%Y-%m-%d'),
|
'date_updated': datetime.now().strftime('%Y-%m-%d'),
|
||||||
'time_updated': datetime.now().strftime('%Y-%m-%d %H:%M'),
|
'time_updated': datetime.now().strftime('%Y-%m-%d %H:%M'),
|
||||||
'footer_info': FOOTER_INFO,
|
'footer_info': FOOTER_INFO,
|
||||||
|
'version': VERSION,
|
||||||
'git_sha': GIT_SHA,
|
'git_sha': GIT_SHA,
|
||||||
'short_git_sha': GIT_SHA[:8],
|
|
||||||
'rows': link_rows,
|
'rows': link_rows,
|
||||||
'status': 'finished' if finished else 'running',
|
'status': 'finished' if finished else 'running',
|
||||||
}
|
}
|
||||||
|
|
|
@ -22,8 +22,8 @@ Link {
|
||||||
from typing import Iterable
|
from typing import Iterable
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
|
|
||||||
from schema import Link
|
from .schema import Link
|
||||||
from util import (
|
from .util import (
|
||||||
scheme,
|
scheme,
|
||||||
fuzzy_url,
|
fuzzy_url,
|
||||||
merge_links,
|
merge_links,
|
||||||
|
|
|
@ -24,8 +24,8 @@ from typing import Tuple, List, IO, Iterable
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
import xml.etree.ElementTree as etree
|
import xml.etree.ElementTree as etree
|
||||||
|
|
||||||
from config import TIMEOUT
|
from .config import TIMEOUT
|
||||||
from util import (
|
from .util import (
|
||||||
htmldecode,
|
htmldecode,
|
||||||
str_between,
|
str_between,
|
||||||
URL_REGEX,
|
URL_REGEX,
|
||||||
|
|
|
@ -108,60 +108,60 @@ class Link:
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def link_dir(self) -> str:
|
def link_dir(self) -> str:
|
||||||
from config import ARCHIVE_DIR
|
from .config import ARCHIVE_DIR
|
||||||
return os.path.join(ARCHIVE_DIR, self.timestamp)
|
return os.path.join(ARCHIVE_DIR, self.timestamp)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def archive_path(self) -> str:
|
def archive_path(self) -> str:
|
||||||
from config import ARCHIVE_DIR_NAME
|
from .config import ARCHIVE_DIR_NAME
|
||||||
return '{}/{}'.format(ARCHIVE_DIR_NAME, self.timestamp)
|
return '{}/{}'.format(ARCHIVE_DIR_NAME, self.timestamp)
|
||||||
|
|
||||||
### URL Helpers
|
### URL Helpers
|
||||||
@property
|
@property
|
||||||
def urlhash(self):
|
def urlhash(self):
|
||||||
from util import hashurl
|
from .util import hashurl
|
||||||
|
|
||||||
return hashurl(self.url)
|
return hashurl(self.url)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def extension(self) -> str:
|
def extension(self) -> str:
|
||||||
from util import extension
|
from .util import extension
|
||||||
return extension(self.url)
|
return extension(self.url)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def domain(self) -> str:
|
def domain(self) -> str:
|
||||||
from util import domain
|
from .util import domain
|
||||||
return domain(self.url)
|
return domain(self.url)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def path(self) -> str:
|
def path(self) -> str:
|
||||||
from util import path
|
from .util import path
|
||||||
return path(self.url)
|
return path(self.url)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def basename(self) -> str:
|
def basename(self) -> str:
|
||||||
from util import basename
|
from .util import basename
|
||||||
return basename(self.url)
|
return basename(self.url)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def base_url(self) -> str:
|
def base_url(self) -> str:
|
||||||
from util import base_url
|
from .util import base_url
|
||||||
return base_url(self.url)
|
return base_url(self.url)
|
||||||
|
|
||||||
### Pretty Printing Helpers
|
### Pretty Printing Helpers
|
||||||
@property
|
@property
|
||||||
def bookmarked_date(self) -> Optional[str]:
|
def bookmarked_date(self) -> Optional[str]:
|
||||||
from util import ts_to_date
|
from .util import ts_to_date
|
||||||
return ts_to_date(self.timestamp) if self.timestamp else None
|
return ts_to_date(self.timestamp) if self.timestamp else None
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def updated_date(self) -> Optional[str]:
|
def updated_date(self) -> Optional[str]:
|
||||||
from util import ts_to_date
|
from .util import ts_to_date
|
||||||
return ts_to_date(self.updated) if self.updated else None
|
return ts_to_date(self.updated) if self.updated else None
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def oldest_archive_date(self) -> Optional[datetime]:
|
def oldest_archive_date(self) -> Optional[datetime]:
|
||||||
from util import ts_to_date
|
from .util import ts_to_date
|
||||||
|
|
||||||
most_recent = min(
|
most_recent = min(
|
||||||
(ts_to_date(result.start_ts)
|
(ts_to_date(result.start_ts)
|
||||||
|
@ -173,7 +173,7 @@ class Link:
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def newest_archive_date(self) -> Optional[datetime]:
|
def newest_archive_date(self) -> Optional[datetime]:
|
||||||
from util import ts_to_date
|
from .util import ts_to_date
|
||||||
|
|
||||||
most_recent = max(
|
most_recent = max(
|
||||||
(ts_to_date(result.start_ts)
|
(ts_to_date(result.start_ts)
|
||||||
|
@ -197,13 +197,13 @@ class Link:
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def is_static(self) -> bool:
|
def is_static(self) -> bool:
|
||||||
from util import is_static_file
|
from .util import is_static_file
|
||||||
return is_static_file(self.url)
|
return is_static_file(self.url)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def is_archived(self) -> bool:
|
def is_archived(self) -> bool:
|
||||||
from config import ARCHIVE_DIR
|
from .config import ARCHIVE_DIR
|
||||||
from util import domain
|
from .util import domain
|
||||||
|
|
||||||
return os.path.exists(os.path.join(
|
return os.path.exists(os.path.join(
|
||||||
ARCHIVE_DIR,
|
ARCHIVE_DIR,
|
||||||
|
@ -240,7 +240,7 @@ class Link:
|
||||||
return latest
|
return latest
|
||||||
|
|
||||||
def canonical_outputs(self) -> Dict[str, Optional[str]]:
|
def canonical_outputs(self) -> Dict[str, Optional[str]]:
|
||||||
from util import wget_output_path
|
from .util import wget_output_path
|
||||||
canonical = {
|
canonical = {
|
||||||
'index_url': 'index.html',
|
'index_url': 'index.html',
|
||||||
'favicon_url': 'favicon.ico',
|
'favicon_url': 'favicon.ico',
|
||||||
|
|
|
@ -209,7 +209,7 @@
|
||||||
<center>
|
<center>
|
||||||
<small>
|
<small>
|
||||||
Archive created using <a href="https://github.com/pirate/ArchiveBox" title="Github">ArchiveBox</a>
|
Archive created using <a href="https://github.com/pirate/ArchiveBox" title="Github">ArchiveBox</a>
|
||||||
version <a href="https://github.com/pirate/ArchiveBox/commit/$git_sha" title="Git commit">$short_git_sha</a> |
|
version <a href="https://github.com/pirate/ArchiveBox/commit/$git_sha" title="Git commit">$version</a> |
|
||||||
Download index as <a href="index.json" title="JSON summary of archived links.">JSON</a>
|
Download index as <a href="index.json" title="JSON summary of archived links.">JSON</a>
|
||||||
<br/><br/>
|
<br/><br/>
|
||||||
$footer_info
|
$footer_info
|
||||||
|
|
|
@ -25,8 +25,8 @@ from subprocess import (
|
||||||
|
|
||||||
from base32_crockford import encode as base32_encode
|
from base32_crockford import encode as base32_encode
|
||||||
|
|
||||||
from schema import Link
|
from .schema import Link
|
||||||
from config import (
|
from .config import (
|
||||||
ANSI,
|
ANSI,
|
||||||
TERM_WIDTH,
|
TERM_WIDTH,
|
||||||
SOURCES_DIR,
|
SOURCES_DIR,
|
||||||
|
@ -37,9 +37,9 @@ from config import (
|
||||||
CHECK_SSL_VALIDITY,
|
CHECK_SSL_VALIDITY,
|
||||||
WGET_USER_AGENT,
|
WGET_USER_AGENT,
|
||||||
CHROME_OPTIONS,
|
CHROME_OPTIONS,
|
||||||
PYTHON_PATH,
|
PYTHON_DIR,
|
||||||
)
|
)
|
||||||
from logs import pretty_path
|
from .logs import pretty_path
|
||||||
|
|
||||||
### Parsing Helpers
|
### Parsing Helpers
|
||||||
|
|
||||||
|
@ -334,7 +334,7 @@ def wget_output_path(link: Link) -> Optional[str]:
|
||||||
|
|
||||||
@enforce_types
|
@enforce_types
|
||||||
def read_js_script(script_name: str) -> str:
|
def read_js_script(script_name: str) -> str:
|
||||||
script_path = os.path.join(PYTHON_PATH, 'scripts', script_name)
|
script_path = os.path.join(PYTHON_DIR, 'scripts', script_name)
|
||||||
|
|
||||||
with open(script_path, 'r') as f:
|
with open(script_path, 'r') as f:
|
||||||
return f.read().split('// INFO BELOW HERE')[0].strip()
|
return f.read().split('// INFO BELOW HERE')[0].strip()
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue