mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-13 22:54:27 -04:00
add new config and dependency options
This commit is contained in:
parent
1834acfb5f
commit
d689264365
3 changed files with 83 additions and 41 deletions
|
@ -11,6 +11,10 @@ from peekable import Peekable
|
||||||
from index import wget_output_path, parse_json_link_index, write_link_index
|
from index import wget_output_path, parse_json_link_index, write_link_index
|
||||||
from links import links_after_timestamp
|
from links import links_after_timestamp
|
||||||
from config import (
|
from config import (
|
||||||
|
CURL_BINARY,
|
||||||
|
GIT_BINARY,
|
||||||
|
WGET_BINARY,
|
||||||
|
YOUTUBEDL_BINARY,
|
||||||
CHROME_BINARY,
|
CHROME_BINARY,
|
||||||
FETCH_FAVICON,
|
FETCH_FAVICON,
|
||||||
FETCH_TITLE,
|
FETCH_TITLE,
|
||||||
|
@ -37,6 +41,7 @@ from config import (
|
||||||
GIT_SHA,
|
GIT_SHA,
|
||||||
)
|
)
|
||||||
from util import (
|
from util import (
|
||||||
|
without_hash,
|
||||||
check_dependencies,
|
check_dependencies,
|
||||||
fetch_page_title,
|
fetch_page_title,
|
||||||
progress,
|
progress,
|
||||||
|
@ -214,7 +219,7 @@ def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, warc=FETCH_WARC
|
||||||
|
|
||||||
# WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
|
# WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
|
||||||
CMD = [
|
CMD = [
|
||||||
'wget',
|
WGET_BINARY,
|
||||||
# '--server-response', # print headers for better error parsing
|
# '--server-response', # print headers for better error parsing
|
||||||
'--no-verbose',
|
'--no-verbose',
|
||||||
'--adjust-extension',
|
'--adjust-extension',
|
||||||
|
@ -417,7 +422,7 @@ def archive_dot_org(link_dir, link, timeout=TIMEOUT):
|
||||||
|
|
||||||
success = False
|
success = False
|
||||||
CMD = [
|
CMD = [
|
||||||
'curl',
|
CURL_BINARY,
|
||||||
'--location',
|
'--location',
|
||||||
'--head',
|
'--head',
|
||||||
'--user-agent', 'ArchiveBox/{} (+https://github.com/pirate/ArchiveBox/)'.format(GIT_SHA),
|
'--user-agent', 'ArchiveBox/{} (+https://github.com/pirate/ArchiveBox/)'.format(GIT_SHA),
|
||||||
|
@ -481,8 +486,9 @@ def fetch_favicon(link_dir, link, timeout=TIMEOUT):
|
||||||
return {'output': 'favicon.ico', 'status': 'skipped'}
|
return {'output': 'favicon.ico', 'status': 'skipped'}
|
||||||
|
|
||||||
CMD = [
|
CMD = [
|
||||||
'curl',
|
CURL_BINARY,
|
||||||
'--max-time', str(timeout),
|
'--max-time', str(timeout),
|
||||||
|
*(() if CHECK_SSL_VALIDITY else ('--insecure',)),
|
||||||
'https://www.google.com/s2/favicons?domain={domain}'.format(**link),
|
'https://www.google.com/s2/favicons?domain={domain}'.format(**link),
|
||||||
]
|
]
|
||||||
fout = open('{}/favicon.ico'.format(link_dir), 'w')
|
fout = open('{}/favicon.ico'.format(link_dir), 'w')
|
||||||
|
@ -542,7 +548,7 @@ def fetch_media(link_dir, link, timeout=MEDIA_TIMEOUT, overwrite=False):
|
||||||
|
|
||||||
os.makedirs(output, exist_ok=True)
|
os.makedirs(output, exist_ok=True)
|
||||||
CMD = [
|
CMD = [
|
||||||
'youtube-dl',
|
YOUTUBEDL_BINARY,
|
||||||
'--write-description',
|
'--write-description',
|
||||||
'--write-info-json',
|
'--write-info-json',
|
||||||
'--write-annotations',
|
'--write-annotations',
|
||||||
|
@ -552,12 +558,15 @@ def fetch_media(link_dir, link, timeout=MEDIA_TIMEOUT, overwrite=False):
|
||||||
'--no-check-certificate',
|
'--no-check-certificate',
|
||||||
'--user-agent',
|
'--user-agent',
|
||||||
'--all-subs',
|
'--all-subs',
|
||||||
'-x',
|
'--extract-audio',
|
||||||
'-k',
|
'--keep-video',
|
||||||
|
'--ignore-errors',
|
||||||
|
'--geo-bypass',
|
||||||
'--audio-format', 'mp3',
|
'--audio-format', 'mp3',
|
||||||
'--audio-quality', '320K',
|
'--audio-quality', '320K',
|
||||||
'--embed-thumbnail',
|
'--embed-thumbnail',
|
||||||
'--add-metadata',
|
'--add-metadata',
|
||||||
|
*(() if CHECK_SSL_VALIDITY else ('--no-check-certificate',)),
|
||||||
link['url'],
|
link['url'],
|
||||||
]
|
]
|
||||||
|
|
||||||
|
@ -605,7 +614,14 @@ def fetch_git(link_dir, link, timeout=TIMEOUT):
|
||||||
if os.path.exists(os.path.join(link_dir, 'git')):
|
if os.path.exists(os.path.join(link_dir, 'git')):
|
||||||
return {'output': 'git', 'status': 'skipped'}
|
return {'output': 'git', 'status': 'skipped'}
|
||||||
|
|
||||||
CMD = ['git', 'clone', '--mirror', '--recursive', link['url'].split('#')[0], 'git']
|
CMD = [
|
||||||
|
GIT_BINARY,
|
||||||
|
*(() if CHECK_SSL_VALIDITY else ('-c', 'http.sslVerify=false')),
|
||||||
|
'clone',
|
||||||
|
'--mirror',
|
||||||
|
'--recursive',
|
||||||
|
without_hash(link['url']),
|
||||||
|
]
|
||||||
output = 'git'
|
output = 'git'
|
||||||
|
|
||||||
end = progress(timeout, prefix=' ')
|
end = progress(timeout, prefix=' ')
|
||||||
|
|
|
@ -11,12 +11,13 @@ from subprocess import run, PIPE
|
||||||
# ******************************************************************************
|
# ******************************************************************************
|
||||||
|
|
||||||
IS_TTY = sys.stdout.isatty()
|
IS_TTY = sys.stdout.isatty()
|
||||||
ONLY_NEW = os.getenv('ONLY_NEW', 'False' ).lower() == 'true'
|
|
||||||
USE_COLOR = os.getenv('USE_COLOR', str(IS_TTY) ).lower() == 'true'
|
USE_COLOR = os.getenv('USE_COLOR', str(IS_TTY) ).lower() == 'true'
|
||||||
SHOW_PROGRESS = os.getenv('SHOW_PROGRESS', str(IS_TTY) ).lower() == 'true'
|
SHOW_PROGRESS = os.getenv('SHOW_PROGRESS', str(IS_TTY) ).lower() == 'true'
|
||||||
OUTPUT_PERMISSIONS = os.getenv('OUTPUT_PERMISSIONS', '755' )
|
ONLY_NEW = os.getenv('ONLY_NEW', 'False' ).lower() == 'true'
|
||||||
MEDIA_TIMEOUT = int(os.getenv('MEDIA_TIMEOUT', '3600'))
|
MEDIA_TIMEOUT = int(os.getenv('MEDIA_TIMEOUT', '3600'))
|
||||||
TIMEOUT = int(os.getenv('TIMEOUT', '60'))
|
TIMEOUT = int(os.getenv('TIMEOUT', '60'))
|
||||||
|
OUTPUT_PERMISSIONS = os.getenv('OUTPUT_PERMISSIONS', '755' )
|
||||||
|
FOOTER_INFO = os.getenv('FOOTER_INFO', 'Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests.',)
|
||||||
|
|
||||||
FETCH_WGET = os.getenv('FETCH_WGET', 'True' ).lower() == 'true'
|
FETCH_WGET = os.getenv('FETCH_WGET', 'True' ).lower() == 'true'
|
||||||
FETCH_WGET_REQUISITES = os.getenv('FETCH_WGET_REQUISITES', 'True' ).lower() == 'true'
|
FETCH_WGET_REQUISITES = os.getenv('FETCH_WGET_REQUISITES', 'True' ).lower() == 'true'
|
||||||
|
@ -33,13 +34,15 @@ SUBMIT_ARCHIVE_DOT_ORG = os.getenv('SUBMIT_ARCHIVE_DOT_ORG', 'True'
|
||||||
CHECK_SSL_VALIDITY = os.getenv('CHECK_SSL_VALIDITY', 'True' ).lower() == 'true'
|
CHECK_SSL_VALIDITY = os.getenv('CHECK_SSL_VALIDITY', 'True' ).lower() == 'true'
|
||||||
RESOLUTION = os.getenv('RESOLUTION', '1440,2000' )
|
RESOLUTION = os.getenv('RESOLUTION', '1440,2000' )
|
||||||
GIT_DOMAINS = os.getenv('GIT_DOMAINS', 'github.com,bitbucket.org,gitlab.com').split(',')
|
GIT_DOMAINS = os.getenv('GIT_DOMAINS', 'github.com,bitbucket.org,gitlab.com').split(',')
|
||||||
COOKIES_FILE = os.getenv('COOKIES_FILE', None)
|
|
||||||
WGET_USER_AGENT = os.getenv('WGET_USER_AGENT', 'ArchiveBox/{GIT_SHA} (+https://github.com/pirate/ArchiveBox/) wget/{WGET_VERSION}')
|
WGET_USER_AGENT = os.getenv('WGET_USER_AGENT', 'ArchiveBox/{GIT_SHA} (+https://github.com/pirate/ArchiveBox/) wget/{WGET_VERSION}')
|
||||||
|
COOKIES_FILE = os.getenv('COOKIES_FILE', None)
|
||||||
CHROME_USER_DATA_DIR = os.getenv('CHROME_USER_DATA_DIR', None)
|
CHROME_USER_DATA_DIR = os.getenv('CHROME_USER_DATA_DIR', None)
|
||||||
|
|
||||||
CHROME_BINARY = os.getenv('CHROME_BINARY', None) # change to google-chrome browser if using google-chrome
|
CURL_BINARY = os.getenv('CURL_BINARY', 'curl')
|
||||||
WGET_BINARY = os.getenv('WGET_BINARY', 'wget' )
|
GIT_BINARY = os.getenv('GIT_BINARY', 'git')
|
||||||
FOOTER_INFO = os.getenv('FOOTER_INFO', 'Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests.',)
|
WGET_BINARY = os.getenv('WGET_BINARY', 'wget')
|
||||||
|
YOUTUBEDL_BINARY = os.getenv('YOUTUBEDL_BINARY', 'youtube-dl')
|
||||||
|
CHROME_BINARY = os.getenv('CHROME_BINARY', None)
|
||||||
|
|
||||||
### Paths
|
### Paths
|
||||||
REPO_DIR = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), '..'))
|
REPO_DIR = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), '..'))
|
||||||
|
@ -101,7 +104,7 @@ if not USE_COLOR:
|
||||||
### Confirm Environment Setup
|
### Confirm Environment Setup
|
||||||
GIT_SHA = 'unknown'
|
GIT_SHA = 'unknown'
|
||||||
try:
|
try:
|
||||||
GIT_SHA = run(["git", "rev-list", "-1", "HEAD", "./"], stdout=PIPE, cwd=REPO_DIR).stdout.strip().decode()
|
GIT_SHA = run([GIT_BINARY, 'rev-list', '-1', 'HEAD', './'], stdout=PIPE, cwd=REPO_DIR).stdout.strip().decode()
|
||||||
except Exception:
|
except Exception:
|
||||||
print('[!] Warning: unable to determine git version, is git installed and in your $PATH?')
|
print('[!] Warning: unable to determine git version, is git installed and in your $PATH?')
|
||||||
|
|
||||||
|
@ -115,7 +118,7 @@ except Exception:
|
||||||
|
|
||||||
WGET_VERSION = 'unknown'
|
WGET_VERSION = 'unknown'
|
||||||
try:
|
try:
|
||||||
wget_vers_str = run(["wget", "--version"], stdout=PIPE, cwd=REPO_DIR).stdout.strip().decode()
|
wget_vers_str = run([WGET_BINARY, "--version"], stdout=PIPE, cwd=REPO_DIR).stdout.strip().decode()
|
||||||
WGET_VERSION = wget_vers_str.split('\n')[0].split(' ')[2]
|
WGET_VERSION = wget_vers_str.split('\n')[0].split(' ')[2]
|
||||||
except Exception:
|
except Exception:
|
||||||
if USE_WGET:
|
if USE_WGET:
|
||||||
|
|
|
@ -14,23 +14,30 @@ from subprocess import TimeoutExpired, Popen, PIPE, DEVNULL, CompletedProcess, C
|
||||||
from multiprocessing import Process
|
from multiprocessing import Process
|
||||||
|
|
||||||
from config import (
|
from config import (
|
||||||
IS_TTY,
|
|
||||||
OUTPUT_PERMISSIONS,
|
|
||||||
REPO_DIR,
|
|
||||||
SOURCES_DIR,
|
|
||||||
OUTPUT_DIR,
|
|
||||||
ARCHIVE_DIR,
|
|
||||||
TIMEOUT,
|
|
||||||
TERM_WIDTH,
|
|
||||||
SHOW_PROGRESS,
|
|
||||||
ANSI,
|
ANSI,
|
||||||
|
IS_TTY,
|
||||||
|
TERM_WIDTH,
|
||||||
|
REPO_DIR,
|
||||||
|
OUTPUT_DIR,
|
||||||
|
SOURCES_DIR,
|
||||||
|
ARCHIVE_DIR,
|
||||||
|
OUTPUT_PERMISSIONS,
|
||||||
|
TIMEOUT,
|
||||||
|
SHOW_PROGRESS,
|
||||||
|
CHECK_SSL_VALIDITY,
|
||||||
|
CURL_BINARY,
|
||||||
|
WGET_BINARY,
|
||||||
CHROME_BINARY,
|
CHROME_BINARY,
|
||||||
|
GIT_BINARY,
|
||||||
|
YOUTUBEDL_BINARY,
|
||||||
|
FETCH_TITLE,
|
||||||
|
FETCH_FAVICON,
|
||||||
FETCH_WGET,
|
FETCH_WGET,
|
||||||
|
FETCH_WARC,
|
||||||
FETCH_PDF,
|
FETCH_PDF,
|
||||||
FETCH_SCREENSHOT,
|
FETCH_SCREENSHOT,
|
||||||
FETCH_DOM,
|
FETCH_DOM,
|
||||||
FETCH_FAVICON,
|
FETCH_GIT,
|
||||||
FETCH_TITLE,
|
|
||||||
FETCH_MEDIA,
|
FETCH_MEDIA,
|
||||||
SUBMIT_ARCHIVE_DOT_ORG,
|
SUBMIT_ARCHIVE_DOT_ORG,
|
||||||
)
|
)
|
||||||
|
@ -64,6 +71,20 @@ def check_dependencies():
|
||||||
print(' See https://github.com/pirate/ArchiveBox#troubleshooting for help upgrading your Python installation.')
|
print(' See https://github.com/pirate/ArchiveBox#troubleshooting for help upgrading your Python installation.')
|
||||||
raise SystemExit(1)
|
raise SystemExit(1)
|
||||||
|
|
||||||
|
if FETCH_FAVICON or SUBMIT_ARCHIVE_DOT_ORG:
|
||||||
|
if run(['which', CURL_BINARY], stdout=DEVNULL).returncode or run([CURL_BINARY, '--version'], stdout=DEVNULL).returncode:
|
||||||
|
print('{red}[X] Missing dependency: curl{reset}'.format(**ANSI))
|
||||||
|
print(' Run ./setup.sh, then confirm it was installed with: {} --version'.format(CURL_BINARY))
|
||||||
|
print(' See https://github.com/pirate/ArchiveBox for help.')
|
||||||
|
raise SystemExit(1)
|
||||||
|
|
||||||
|
if FETCH_WGET or FETCH_WARC:
|
||||||
|
if run(['which', WGET_BINARY], stdout=DEVNULL).returncode or run([WGET_BINARY, '--version'], stdout=DEVNULL).returncode:
|
||||||
|
print('{red}[X] Missing dependency: wget{reset}'.format(**ANSI))
|
||||||
|
print(' Run ./setup.sh, then confirm it was installed with: {} --version'.format(WGET_BINARY))
|
||||||
|
print(' See https://github.com/pirate/ArchiveBox for help.')
|
||||||
|
raise SystemExit(1)
|
||||||
|
|
||||||
if FETCH_PDF or FETCH_SCREENSHOT or FETCH_DOM:
|
if FETCH_PDF or FETCH_SCREENSHOT or FETCH_DOM:
|
||||||
if run(['which', CHROME_BINARY], stdout=DEVNULL).returncode:
|
if run(['which', CHROME_BINARY], stdout=DEVNULL).returncode:
|
||||||
print('{}[X] Missing dependency: {}{}'.format(ANSI['red'], CHROME_BINARY, ANSI['reset']))
|
print('{}[X] Missing dependency: {}{}'.format(ANSI['red'], CHROME_BINARY, ANSI['reset']))
|
||||||
|
@ -88,24 +109,17 @@ def check_dependencies():
|
||||||
print(' See https://github.com/pirate/ArchiveBox for help.')
|
print(' See https://github.com/pirate/ArchiveBox for help.')
|
||||||
raise SystemExit(1)
|
raise SystemExit(1)
|
||||||
|
|
||||||
if FETCH_WGET:
|
if FETCH_GIT:
|
||||||
if run(['which', 'wget'], stdout=DEVNULL).returncode or run(['wget', '--version'], stdout=DEVNULL).returncode:
|
if run(['which', GIT_BINARY], stdout=DEVNULL).returncode or run([GIT_BINARY, '--version'], stdout=DEVNULL).returncode:
|
||||||
print('{red}[X] Missing dependency: wget{reset}'.format(**ANSI))
|
print('{red}[X] Missing dependency: git{reset}'.format(**ANSI))
|
||||||
print(' Run ./setup.sh, then confirm it was installed with: {} --version'.format('wget'))
|
print(' Run ./setup.sh, then confirm it was installed with: {} --version'.format(GIT_BINARY))
|
||||||
print(' See https://github.com/pirate/ArchiveBox for help.')
|
|
||||||
raise SystemExit(1)
|
|
||||||
|
|
||||||
if FETCH_FAVICON or SUBMIT_ARCHIVE_DOT_ORG:
|
|
||||||
if run(['which', 'curl'], stdout=DEVNULL).returncode or run(['curl', '--version'], stdout=DEVNULL).returncode:
|
|
||||||
print('{red}[X] Missing dependency: curl{reset}'.format(**ANSI))
|
|
||||||
print(' Run ./setup.sh, then confirm it was installed with: {} --version'.format('curl'))
|
|
||||||
print(' See https://github.com/pirate/ArchiveBox for help.')
|
print(' See https://github.com/pirate/ArchiveBox for help.')
|
||||||
raise SystemExit(1)
|
raise SystemExit(1)
|
||||||
|
|
||||||
if FETCH_MEDIA:
|
if FETCH_MEDIA:
|
||||||
if run(['which', 'youtube-dl'], stdout=DEVNULL).returncode or run(['youtube-dl', '--version'], stdout=DEVNULL).returncode:
|
if run(['which', YOUTUBEDL_BINARY], stdout=DEVNULL).returncode or run([YOUTUBEDL_BINARY, '--version'], stdout=DEVNULL).returncode:
|
||||||
print('{red}[X] Missing dependency: youtube-dl{reset}'.format(**ANSI))
|
print('{red}[X] Missing dependency: youtube-dl{reset}'.format(**ANSI))
|
||||||
print(' Run ./setup.sh, then confirm it was installed with: {} --version'.format('youtube-dl'))
|
print(' Run ./setup.sh, then confirm it was installed with: {} --version'.format(YOUTUBEDL_BINARY))
|
||||||
print(' See https://github.com/pirate/ArchiveBox for help.')
|
print(' See https://github.com/pirate/ArchiveBox for help.')
|
||||||
raise SystemExit(1)
|
raise SystemExit(1)
|
||||||
|
|
||||||
|
@ -246,8 +260,17 @@ def fetch_page_title(url, timeout=10, progress=SHOW_PROGRESS):
|
||||||
sys.stdout.write('.')
|
sys.stdout.write('.')
|
||||||
sys.stdout.flush()
|
sys.stdout.flush()
|
||||||
|
|
||||||
html_content = urlopen(url, timeout=timeout).read().decode('utf-8')
|
if CHECK_SSL_VALIDITY:
|
||||||
match = re.search(HTML_TITLE_REGEX, html_content)
|
html_content = urlopen(url, timeout=timeout)
|
||||||
|
else:
|
||||||
|
try:
|
||||||
|
import ssl
|
||||||
|
insecure = ssl._create_unverified_context()
|
||||||
|
html_content = urlopen(url, timeout=timeout, context=insecure)
|
||||||
|
except ImportError:
|
||||||
|
html_content = urlopen(url, timeout=timeout)
|
||||||
|
|
||||||
|
match = re.search(HTML_TITLE_REGEX, html_content.read().decode('utf-8'))
|
||||||
return match.group(1).strip() if match else None
|
return match.group(1).strip() if match else None
|
||||||
except Exception:
|
except Exception:
|
||||||
return None
|
return None
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue