mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-28 05:34:14 -04:00
Merge branch 'master' into archive-result
This commit is contained in:
commit
efe3027797
43 changed files with 743 additions and 280 deletions
172
archivebox/base32_crockford.py
Normal file
172
archivebox/base32_crockford.py
Normal file
|
@ -0,0 +1,172 @@
|
|||
"""
|
||||
base32-crockford
|
||||
================
|
||||
|
||||
A Python module implementing the alternate base32 encoding as described
|
||||
by Douglas Crockford at: http://www.crockford.com/wrmg/base32.html.
|
||||
|
||||
He designed the encoding to:
|
||||
|
||||
* Be human and machine readable
|
||||
* Be compact
|
||||
* Be error resistant
|
||||
* Be pronounceable
|
||||
|
||||
It uses a symbol set of 10 digits and 22 letters, excluding I, L, O and
|
||||
U. Decoding is not case sensitive, and 'i' and 'l' are converted to '1'
|
||||
and 'o' is converted to '0'. Encoding uses only upper-case characters.
|
||||
|
||||
Hyphens may be present in symbol strings to improve readability, and
|
||||
are removed when decoding.
|
||||
|
||||
A check symbol can be appended to a symbol string to detect errors
|
||||
within the string.
|
||||
|
||||
"""
|
||||
|
||||
import re
|
||||
import sys
|
||||
|
||||
PY3 = sys.version_info[0] == 3
|
||||
|
||||
if not PY3:
|
||||
import string as str
|
||||
|
||||
|
||||
__all__ = ["encode", "decode", "normalize"]
|
||||
|
||||
|
||||
if PY3:
|
||||
string_types = (str,)
|
||||
else:
|
||||
string_types = (basestring,) # noqa
|
||||
|
||||
# The encoded symbol space does not include I, L, O or U
|
||||
symbols = '0123456789ABCDEFGHJKMNPQRSTVWXYZ'
|
||||
# These five symbols are exclusively for checksum values
|
||||
check_symbols = '*~$=U'
|
||||
|
||||
encode_symbols = dict((i, ch) for (i, ch) in enumerate(symbols + check_symbols))
|
||||
decode_symbols = dict((ch, i) for (i, ch) in enumerate(symbols + check_symbols))
|
||||
normalize_symbols = str.maketrans('IiLlOo', '111100')
|
||||
valid_symbols = re.compile('^[%s]+[%s]?$' % (symbols,
|
||||
re.escape(check_symbols)))
|
||||
|
||||
base = len(symbols)
|
||||
check_base = len(symbols + check_symbols)
|
||||
|
||||
|
||||
def encode(number, checksum=False, split=0):
    """Encode a non-negative integer into a symbol string.

    ``number`` and ``split`` are both coerced with ``int()`` first.

    A ValueError is raised if ``number`` or ``split`` is negative.

    If checksum is set to True, a check symbol (the number modulo
    ``check_base``) will be calculated and appended to the string.

    If split is non-zero, the string (including any check symbol)
    will be divided into clusters of that size separated by hyphens.
    Zero is handled like every other value, so it is split too.

    The encoded string is returned.
    """
    number = int(number)
    if number < 0:
        raise ValueError("number '%d' is not a positive integer" % number)

    split = int(split)
    if split < 0:
        raise ValueError("split '%d' is not a positive integer" % split)

    # Must be computed before the loop below consumes ``number``.
    check_symbol = encode_symbols[number % check_base] if checksum else ''

    # Collect digits least-significant first, then reverse once.
    digits = []
    while number > 0:
        number, remainder = divmod(number, base)
        digits.append(encode_symbols[remainder])

    # Zero yields no digits from the loop; it is written as a single '0'.
    symbol_string = (''.join(reversed(digits)) or '0') + check_symbol

    if split:
        symbol_string = '-'.join(
            symbol_string[pos:pos + split]
            for pos in range(0, len(symbol_string), split)
        )

    return symbol_string
|
||||
|
||||
|
||||
def decode(symbol_string, checksum=False, strict=False):
    """Decode an encoded symbol string into an integer.

    The string is first passed through normalize(), which raises
    TypeError / ValueError for input it cannot accept.

    If checksum is set to True, the string is assumed to have a
    trailing check symbol which will be validated. If the
    checksum validation fails, a ValueError is raised.

    If checksum is False, every symbol must be a data symbol; a
    check-only symbol in the string raises a ValueError instead of
    being folded into the result as an out-of-range digit.

    If strict is set to True, a ValueError is raised if the
    normalization step requires changes to the string.

    The decoded number is returned.
    """
    symbol_string = normalize(symbol_string, strict=strict)
    check_symbol = None
    if checksum:
        symbol_string, check_symbol = symbol_string[:-1], symbol_string[-1]

    number = 0
    for symbol in symbol_string:
        value = decode_symbols[symbol]
        # decode_symbols also contains the check-only symbols, whose values
        # start at ``base``; those are never valid digits of the number.
        if value >= base:
            raise ValueError("symbol '%s' in string '%s' is only valid as a "
                             "check symbol" % (symbol, symbol_string))
        number = number * base + value

    if checksum:
        check_value = decode_symbols[check_symbol]
        modulo = number % check_base
        if check_value != modulo:
            raise ValueError("invalid check symbol '%s' for string '%s'" %
                             (check_symbol, symbol_string))

    return number
|
||||
|
||||
|
||||
def normalize(symbol_string, strict=False):
    """Return the canonical form of an encoded symbol string.

    The input is cleaned up in this order so that it can be decoded:

    1. Hyphens are stripped out
    2. 'I', 'i', 'L' and 'l' become '1'; 'O' and 'o' become '0'
    3. The result is upper-cased

    A TypeError is raised when the argument is not one of
    ``string_types``.

    A ValueError is raised when the cleaned-up string does not match
    ``valid_symbols``, or (on Python 2) when the input cannot be
    encoded as ASCII.

    With strict set to True, a ValueError is also raised whenever the
    cleaned-up string differs from the input, i.e. when any of the
    steps above actually changed something.

    The normalized string is returned.
    """
    # Reject non-string input up front.
    if not isinstance(symbol_string, string_types):
        raise TypeError("string is of invalid type %s" %
                        symbol_string.__class__.__name__)

    if not PY3:
        # Python 2 only: coerce to an ASCII byte string before translating.
        try:
            symbol_string = symbol_string.encode('ascii')
        except UnicodeEncodeError:
            raise ValueError("string should only contain ASCII characters")

    without_hyphens = symbol_string.replace('-', '')
    corrected = without_hyphens.translate(normalize_symbols)
    norm_string = corrected.upper()

    if valid_symbols.match(norm_string) is None:
        raise ValueError("string '%s' contains invalid characters" % norm_string)

    if strict and norm_string != symbol_string:
        raise ValueError("string '%s' requires normalization" % symbol_string)

    return norm_string
|
|
@ -62,10 +62,17 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
|
|||
help="Re-archive URLs from scratch, overwriting any existing files"
|
||||
)
|
||||
parser.add_argument(
|
||||
'--init', #'-i',
|
||||
"--init", #'-i',
|
||||
action='store_true',
|
||||
help="Init/upgrade the curent data directory before adding",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--extract",
|
||||
type=str,
|
||||
help="Pass a list of the extractors to be used. If the method name is not correct, it will be ignored. \
|
||||
This does not take precedence over the configuration",
|
||||
default=""
|
||||
)
|
||||
command = parser.parse_args(args or ())
|
||||
urls = command.urls
|
||||
stdin_urls = accept_stdin(stdin)
|
||||
|
@ -83,6 +90,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
|
|||
overwrite=command.overwrite,
|
||||
init=command.init,
|
||||
out_dir=pwd or OUTPUT_DIR,
|
||||
extractors=command.extract,
|
||||
)
|
||||
|
||||
|
||||
|
|
|
@ -98,7 +98,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
|
|||
parser.add_argument(
|
||||
'--filter-type',
|
||||
type=str,
|
||||
choices=('exact', 'substring', 'domain', 'regex'),
|
||||
choices=('exact', 'substring', 'domain', 'regex','tag'),
|
||||
default='exact',
|
||||
help='Type of pattern matching to use when filtering URLs',
|
||||
)
|
||||
|
|
|
@ -50,7 +50,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
|
|||
parser.add_argument(
|
||||
'--filter-type',
|
||||
type=str,
|
||||
choices=('exact', 'substring', 'domain', 'regex'),
|
||||
choices=('exact', 'substring', 'domain', 'regex','tag'),
|
||||
default='exact',
|
||||
help='Type of pattern matching to use when filtering URLs',
|
||||
)
|
||||
|
|
|
@ -36,7 +36,7 @@ from .config_stubs import (
|
|||
#
|
||||
|
||||
# ******************************************************************************
|
||||
# Documentation: https://github.com/pirate/ArchiveBox/wiki/Configuration
|
||||
# Documentation: https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration
|
||||
# Use the 'env' command to pass config options to ArchiveBox. e.g.:
|
||||
# env USE_COLOR=True CHROME_BINARY=chromium archivebox add < example.html
|
||||
# ******************************************************************************
|
||||
|
@ -98,8 +98,8 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
|
|||
'GIT_DOMAINS': {'type': str, 'default': 'github.com,bitbucket.org,gitlab.com'},
|
||||
'CHECK_SSL_VALIDITY': {'type': bool, 'default': True},
|
||||
|
||||
'CURL_USER_AGENT': {'type': str, 'default': 'ArchiveBox/{VERSION} (+https://github.com/pirate/ArchiveBox/) curl/{CURL_VERSION}'},
|
||||
'WGET_USER_AGENT': {'type': str, 'default': 'ArchiveBox/{VERSION} (+https://github.com/pirate/ArchiveBox/) wget/{WGET_VERSION}'},
|
||||
'CURL_USER_AGENT': {'type': str, 'default': 'ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) curl/{CURL_VERSION}'},
|
||||
'WGET_USER_AGENT': {'type': str, 'default': 'ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) wget/{WGET_VERSION}'},
|
||||
'CHROME_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'},
|
||||
|
||||
'COOKIES_FILE': {'type': str, 'default': None},
|
||||
|
@ -157,6 +157,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
|
|||
'READABILITY_BINARY': {'type': str, 'default': 'readability-extractor'},
|
||||
'MERCURY_BINARY': {'type': str, 'default': 'mercury-parser'},
|
||||
'YOUTUBEDL_BINARY': {'type': str, 'default': 'youtube-dl'},
|
||||
'NODE_BINARY': {'type': str, 'default': 'node'},
|
||||
'CHROME_BINARY': {'type': str, 'default': None},
|
||||
},
|
||||
}
|
||||
|
@ -248,7 +249,7 @@ CONFIG_HEADER = (
|
|||
# archivebox init
|
||||
#
|
||||
# A list of all possible config with documentation and examples can be found here:
|
||||
# https://github.com/pirate/ArchiveBox/wiki/Configuration
|
||||
# https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration
|
||||
|
||||
""")
|
||||
|
||||
|
@ -296,6 +297,7 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = {
|
|||
'SAVE_WARC': {'default': lambda c: c['USE_WGET'] and c['SAVE_WARC']},
|
||||
'WGET_ARGS': {'default': lambda c: c['WGET_ARGS'] or []},
|
||||
|
||||
|
||||
'USE_SINGLEFILE': {'default': lambda c: c['USE_SINGLEFILE'] and c['SAVE_SINGLEFILE']},
|
||||
'SINGLEFILE_VERSION': {'default': lambda c: bin_version(c['SINGLEFILE_BINARY']) if c['USE_SINGLEFILE'] else None},
|
||||
|
||||
|
@ -318,6 +320,8 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = {
|
|||
'CHROME_BINARY': {'default': lambda c: c['CHROME_BINARY'] if c['CHROME_BINARY'] else find_chrome_binary()},
|
||||
'CHROME_VERSION': {'default': lambda c: bin_version(c['CHROME_BINARY']) if c['USE_CHROME'] else None},
|
||||
'USE_NODE': {'default': lambda c: c['USE_NODE'] and (c['SAVE_READABILITY'] or c['SAVE_SINGLEFILE'])},
|
||||
'NODE_VERSION': {'default': lambda c: bin_version(c['NODE_BINARY']) if c['USE_NODE'] else None},
|
||||
|
||||
'SAVE_PDF': {'default': lambda c: c['USE_CHROME'] and c['SAVE_PDF']},
|
||||
'SAVE_SCREENSHOT': {'default': lambda c: c['USE_CHROME'] and c['SAVE_SCREENSHOT']},
|
||||
'SAVE_DOM': {'default': lambda c: c['USE_CHROME'] and c['SAVE_DOM']},
|
||||
|
@ -505,7 +509,7 @@ def load_config(defaults: ConfigDefaultDict,
|
|||
stderr(' Check your config for mistakes and try again (your archive data is unaffected).')
|
||||
stderr()
|
||||
stderr(' For config documentation and examples see:')
|
||||
stderr(' https://github.com/pirate/ArchiveBox/wiki/Configuration')
|
||||
stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration')
|
||||
stderr()
|
||||
raise
|
||||
raise SystemExit(2)
|
||||
|
@ -565,7 +569,7 @@ def bin_version(binary: Optional[str]) -> Optional[str]:
|
|||
# stderr(f' {binary} --version')
|
||||
# stderr()
|
||||
# stderr(' If you don\'t want to install it, you can disable it via config. See here for more info:')
|
||||
# stderr(' https://github.com/pirate/ArchiveBox/wiki/Install')
|
||||
# stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Install')
|
||||
return None
|
||||
|
||||
def bin_path(binary: Optional[str]) -> Optional[str]:
|
||||
|
@ -643,12 +647,15 @@ def find_chrome_data_dir() -> Optional[str]:
|
|||
return None
|
||||
|
||||
def wget_supports_compression(config):
    """Report whether the configured wget accepts ``--compression=auto``.

    Runs ``<WGET_BINARY> --compression=auto --help`` with stdout and stderr
    discarded and returns True when the process exits with status 0.

    Returns False when the exit status is non-zero or when the binary cannot
    be launched at all (OSError, which includes FileNotFoundError).
    """
    cmd = [
        config['WGET_BINARY'],
        "--compression=auto",
        "--help",
    ]
    try:
        return not run(cmd, stdout=DEVNULL, stderr=DEVNULL).returncode
    except OSError:
        # A missing or non-executable binary means "not supported".
        return False
|
||||
|
||||
def get_code_locations(config: ConfigDict) -> SimpleConfigValueDict:
|
||||
return {
|
||||
|
@ -662,6 +669,11 @@ def get_code_locations(config: ConfigDict) -> SimpleConfigValueDict:
|
|||
'enabled': True,
|
||||
'is_valid': (config['TEMPLATES_DIR'] / 'static').exists(),
|
||||
},
|
||||
# 'NODE_MODULES_DIR': {
|
||||
# 'path': ,
|
||||
# 'enabled': ,
|
||||
# 'is_valid': (...).exists(),
|
||||
# },
|
||||
}
|
||||
|
||||
def get_external_locations(config: ConfigDict) -> ConfigValue:
|
||||
|
@ -715,6 +727,13 @@ def get_data_locations(config: ConfigDict) -> ConfigValue:
|
|||
|
||||
def get_dependency_info(config: ConfigDict) -> ConfigValue:
|
||||
return {
|
||||
'ARCHIVEBOX_BINARY': {
|
||||
'path': bin_path(config['ARCHIVEBOX_BINARY']),
|
||||
'version': config['VERSION'],
|
||||
'hash': bin_hash(config['ARCHIVEBOX_BINARY']),
|
||||
'enabled': True,
|
||||
'is_valid': True,
|
||||
},
|
||||
'PYTHON_BINARY': {
|
||||
'path': bin_path(config['PYTHON_BINARY']),
|
||||
'version': config['PYTHON_VERSION'],
|
||||
|
@ -743,6 +762,13 @@ def get_dependency_info(config: ConfigDict) -> ConfigValue:
|
|||
'enabled': config['USE_WGET'],
|
||||
'is_valid': bool(config['WGET_VERSION']),
|
||||
},
|
||||
'NODE_BINARY': {
|
||||
'path': bin_path(config['NODE_BINARY']),
|
||||
'version': config['NODE_VERSION'],
|
||||
'hash': bin_hash(config['NODE_BINARY']),
|
||||
'enabled': config['USE_NODE'],
|
||||
'is_valid': bool(config['SINGLEFILE_VERSION']),
|
||||
},
|
||||
'SINGLEFILE_BINARY': {
|
||||
'path': bin_path(config['SINGLEFILE_BINARY']),
|
||||
'version': config['SINGLEFILE_VERSION'],
|
||||
|
@ -828,13 +854,13 @@ def check_system_config(config: ConfigDict=CONFIG) -> None:
|
|||
if config['USER'] == 'root':
|
||||
stderr('[!] ArchiveBox should never be run as root!', color='red')
|
||||
stderr(' For more information, see the security overview documentation:')
|
||||
stderr(' https://github.com/pirate/ArchiveBox/wiki/Security-Overview#do-not-run-as-root')
|
||||
stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#do-not-run-as-root')
|
||||
raise SystemExit(2)
|
||||
|
||||
### Check Python environment
|
||||
if sys.version_info[:3] < (3, 6, 0):
|
||||
stderr(f'[X] Python version is not new enough: {config["PYTHON_VERSION"]} (>3.6 is required)', color='red')
|
||||
stderr(' See https://github.com/pirate/ArchiveBox/wiki/Troubleshooting#python for help upgrading your Python installation.')
|
||||
stderr(' See https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting#python for help upgrading your Python installation.')
|
||||
raise SystemExit(2)
|
||||
|
||||
if config['PYTHON_ENCODING'] not in ('UTF-8', 'UTF8'):
|
||||
|
@ -854,7 +880,7 @@ def check_system_config(config: ConfigDict=CONFIG) -> None:
|
|||
stderr(f' {config["CHROME_USER_DATA_DIR"]}')
|
||||
stderr(' Make sure you set it to a Chrome user data directory containing a Default profile folder.')
|
||||
stderr(' For more info see:')
|
||||
stderr(' https://github.com/pirate/ArchiveBox/wiki/Configuration#CHROME_USER_DATA_DIR')
|
||||
stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#CHROME_USER_DATA_DIR')
|
||||
if '/Default' in str(config['CHROME_USER_DATA_DIR']):
|
||||
stderr()
|
||||
stderr(' Try removing /Default from the end e.g.:')
|
||||
|
@ -878,7 +904,7 @@ def check_dependencies(config: ConfigDict=CONFIG, show_help: bool=True) -> None:
|
|||
)
|
||||
)
|
||||
if dependency in ('SINGLEFILE_BINARY', 'READABILITY_BINARY', 'MERCURY_BINARY'):
|
||||
hint(('npm install --prefix . "git+https://github.com/pirate/ArchiveBox.git"',
|
||||
hint(('npm install --prefix . "git+https://github.com/ArchiveBox/ArchiveBox.git"',
|
||||
f'or archivebox config --set SAVE_{dependency.rsplit("_", 1)[0]}=False to silence this warning',
|
||||
''), prefix=' ')
|
||||
stderr('')
|
||||
|
@ -889,7 +915,7 @@ def check_dependencies(config: ConfigDict=CONFIG, show_help: bool=True) -> None:
|
|||
stderr(' (Setting it to somewhere between 30 and 3000 seconds is recommended)')
|
||||
stderr()
|
||||
stderr(' If you want to make ArchiveBox run faster, disable specific archive methods instead:')
|
||||
stderr(' https://github.com/pirate/ArchiveBox/wiki/Configuration#archive-method-toggles')
|
||||
stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles')
|
||||
stderr()
|
||||
|
||||
elif config['USE_CHROME'] and config['TIMEOUT'] < 15:
|
||||
|
@ -898,7 +924,7 @@ def check_dependencies(config: ConfigDict=CONFIG, show_help: bool=True) -> None:
|
|||
stderr(' (Setting it to somewhere between 30 and 300 seconds is recommended)')
|
||||
stderr()
|
||||
stderr(' If you want to make ArchiveBox run faster, disable specific archive methods instead:')
|
||||
stderr(' https://github.com/pirate/ArchiveBox/wiki/Configuration#archive-method-toggles')
|
||||
stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles')
|
||||
stderr()
|
||||
|
||||
if config['USE_YOUTUBEDL'] and config['MEDIA_TIMEOUT'] < 20:
|
||||
|
@ -907,7 +933,7 @@ def check_dependencies(config: ConfigDict=CONFIG, show_help: bool=True) -> None:
|
|||
stderr(' (Setting it somewhere over 60 seconds is recommended)')
|
||||
stderr()
|
||||
stderr(' If you want to disable media archiving entirely, set SAVE_MEDIA=False instead:')
|
||||
stderr(' https://github.com/pirate/ArchiveBox/wiki/Configuration#save_media')
|
||||
stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#save_media')
|
||||
stderr()
|
||||
|
||||
def check_data_folder(out_dir: Optional[str]=None, config: ConfigDict=CONFIG) -> None:
|
||||
|
|
|
@ -86,7 +86,7 @@ class SnapshotAdmin(admin.ModelAdmin):
|
|||
list_display = ('added', 'title_str', 'url_str', 'files', 'size')
|
||||
sort_fields = ('title_str', 'url_str', 'added')
|
||||
readonly_fields = ('id', 'url', 'timestamp', 'num_outputs', 'is_archived', 'url_hash', 'added', 'updated')
|
||||
search_fields = ('url', 'timestamp', 'title', 'tags')
|
||||
search_fields = ['url', 'timestamp', 'title', 'tags__name']
|
||||
fields = (*readonly_fields, 'title', 'tags')
|
||||
list_filter = ('added', 'updated', 'tags')
|
||||
ordering = ['-added']
|
||||
|
|
|
@ -14,7 +14,7 @@ urlpatterns = [
|
|||
path('robots.txt', static.serve, {'document_root': settings.OUTPUT_DIR, 'path': 'robots.txt'}),
|
||||
path('favicon.ico', static.serve, {'document_root': settings.OUTPUT_DIR, 'path': 'favicon.ico'}),
|
||||
|
||||
path('docs/', RedirectView.as_view(url='https://github.com/pirate/ArchiveBox/wiki'), name='Docs'),
|
||||
path('docs/', RedirectView.as_view(url='https://github.com/ArchiveBox/ArchiveBox/wiki'), name='Docs'),
|
||||
|
||||
path('archive/', RedirectView.as_view(url='/')),
|
||||
path('archive/<path:path>', LinkDetails.as_view(), name='LinkAssets'),
|
||||
|
|
|
@ -361,6 +361,7 @@ LINK_FILTERS = {
|
|||
'substring': lambda pattern: Q(url__icontains=pattern),
|
||||
'regex': lambda pattern: Q(url__iregex=pattern),
|
||||
'domain': lambda pattern: Q(url__istartswith=f"http://{pattern}") | Q(url__istartswith=f"https://{pattern}") | Q(url__istartswith=f"ftp://{pattern}"),
|
||||
'tag': lambda pattern: Q(tags__name=pattern),
|
||||
}
|
||||
|
||||
@enforce_types
|
||||
|
|
|
@ -32,9 +32,9 @@ MAIN_INDEX_HEADER = {
|
|||
'version': VERSION,
|
||||
'git_sha': GIT_SHA,
|
||||
'website': 'https://ArchiveBox.io',
|
||||
'docs': 'https://github.com/pirate/ArchiveBox/wiki',
|
||||
'source': 'https://github.com/pirate/ArchiveBox',
|
||||
'issues': 'https://github.com/pirate/ArchiveBox/issues',
|
||||
'docs': 'https://github.com/ArchiveBox/ArchiveBox/wiki',
|
||||
'source': 'https://github.com/ArchiveBox/ArchiveBox',
|
||||
'issues': 'https://github.com/ArchiveBox/ArchiveBox/issues',
|
||||
'dependencies': DEPENDENCIES,
|
||||
},
|
||||
}
|
||||
|
|
|
@ -447,7 +447,7 @@ def log_shell_welcome_msg():
|
|||
print('{green}from archivebox import *\n {}{reset}'.format("\n ".join(list_subcommands().keys()), **ANSI))
|
||||
print()
|
||||
print('[i] Welcome to the ArchiveBox Shell!')
|
||||
print(' https://github.com/pirate/ArchiveBox/wiki/Usage#Shell-Usage')
|
||||
print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#Shell-Usage')
|
||||
print()
|
||||
print(' {lightred}Hint:{reset} Example use:'.format(**ANSI))
|
||||
print(' print(Snapshot.objects.filter(is_archived=True).count())')
|
||||
|
|
|
@ -3,6 +3,7 @@ __package__ = 'archivebox'
|
|||
import os
|
||||
import sys
|
||||
import shutil
|
||||
import platform
|
||||
from pathlib import Path
|
||||
from datetime import date
|
||||
|
||||
|
@ -111,6 +112,7 @@ from .logging_util import (
|
|||
|
||||
|
||||
ALLOWED_IN_OUTPUT_DIR = {
|
||||
'lost+found',
|
||||
'.DS_Store',
|
||||
'.venv',
|
||||
'venv',
|
||||
|
@ -178,7 +180,7 @@ def help(out_dir: Path=OUTPUT_DIR) -> None:
|
|||
archivebox update --resume=15109948213.123
|
||||
|
||||
{lightred}Documentation:{reset}
|
||||
https://github.com/pirate/ArchiveBox/wiki
|
||||
https://github.com/ArchiveBox/ArchiveBox/wiki
|
||||
'''.format(VERSION, out_dir, COMMANDS_HELP_TEXT, **ANSI))
|
||||
|
||||
else:
|
||||
|
@ -197,7 +199,7 @@ def help(out_dir: Path=OUTPUT_DIR) -> None:
|
|||
print(' 2. archivebox init')
|
||||
print()
|
||||
print('For more information, see the documentation here:')
|
||||
print(' https://github.com/pirate/ArchiveBox/wiki')
|
||||
print(' https://github.com/ArchiveBox/ArchiveBox/wiki')
|
||||
|
||||
|
||||
@enforce_types
|
||||
|
@ -209,6 +211,8 @@ def version(quiet: bool=False,
|
|||
print(VERSION)
|
||||
else:
|
||||
print('ArchiveBox v{}'.format(VERSION))
|
||||
p = platform.uname()
|
||||
print(p.system, platform.platform(), p.machine)
|
||||
print()
|
||||
|
||||
print('{white}[i] Dependency versions:{reset}'.format(**ANSI))
|
||||
|
@ -525,11 +529,14 @@ def add(urls: Union[str, List[str]],
|
|||
index_only: bool=False,
|
||||
overwrite: bool=False,
|
||||
init: bool=False,
|
||||
out_dir: Path=OUTPUT_DIR) -> List[Link]:
|
||||
out_dir: Path=OUTPUT_DIR,
|
||||
extractors: str="") -> List[Link]:
|
||||
"""Add a new URL or list of URLs to your archive"""
|
||||
|
||||
assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)'
|
||||
|
||||
extractors = extractors.split(",") if extractors else []
|
||||
|
||||
if init:
|
||||
run_subcommand('init', stdin=None, pwd=out_dir)
|
||||
|
||||
|
@ -567,12 +574,17 @@ def add(urls: Union[str, List[str]],
|
|||
return all_links
|
||||
|
||||
# Run the archive methods for each link
|
||||
archive_kwargs = {
|
||||
"out_dir": out_dir,
|
||||
}
|
||||
if extractors:
|
||||
archive_kwargs["methods"] = extractors
|
||||
if update_all:
|
||||
archive_links(all_links, overwrite=overwrite, out_dir=out_dir)
|
||||
archive_links(all_links, overwrite=overwrite, **archive_kwargs)
|
||||
elif overwrite:
|
||||
archive_links(imported_links, overwrite=True, out_dir=out_dir)
|
||||
archive_links(imported_links, overwrite=True, **archive_kwargs)
|
||||
elif new_links:
|
||||
archive_links(new_links, overwrite=False, out_dir=out_dir)
|
||||
archive_links(new_links, overwrite=False, **archive_kwargs)
|
||||
|
||||
return all_links
|
||||
|
||||
|
@ -857,7 +869,7 @@ def config(config_options_str: Optional[str]=None,
|
|||
stderr(f' {line}')
|
||||
raise SystemExit(2)
|
||||
|
||||
raw_key, val = line.split('=')
|
||||
raw_key, val = line.split('=', 1)
|
||||
raw_key = raw_key.upper().strip()
|
||||
key = get_real_name(raw_key)
|
||||
if key != raw_key:
|
||||
|
@ -930,7 +942,7 @@ def schedule(add: bool=False,
|
|||
|
||||
if every or add:
|
||||
every = every or 'day'
|
||||
quoted = lambda s: f'"{s}"' if s and ' ' in s else s
|
||||
quoted = lambda s: f'"{s}"' if s and ' ' in str(s) else str(s)
|
||||
cmd = [
|
||||
'cd',
|
||||
quoted(out_dir),
|
||||
|
|
|
@ -39,11 +39,17 @@ def atomic_write(path: Union[Path, str], contents: Union[dict, str, bytes], over
|
|||
mode = 'wb+' if isinstance(contents, bytes) else 'w'
|
||||
|
||||
# print('\n> Atomic Write:', mode, path, len(contents), f'overwrite={overwrite}')
|
||||
with lib_atomic_write(path, mode=mode, overwrite=overwrite) as f:
|
||||
if isinstance(contents, dict):
|
||||
dump(contents, f, indent=4, sort_keys=True, cls=ExtendedEncoder)
|
||||
elif isinstance(contents, (bytes, str)):
|
||||
f.write(contents)
|
||||
try:
|
||||
with lib_atomic_write(path, mode=mode, overwrite=overwrite) as f:
|
||||
if isinstance(contents, dict):
|
||||
dump(contents, f, indent=4, sort_keys=True, cls=ExtendedEncoder)
|
||||
elif isinstance(contents, (bytes, str)):
|
||||
f.write(contents)
|
||||
except OSError as e:
|
||||
print(f"[X] OSError: Failed to write {path} with fcntl.F_FULLFSYNC. ({e})")
|
||||
print(" For data integrity, ArchiveBox requires a filesystem that supports atomic writes.")
|
||||
print(" Filesystems and network drives that don't implement FSYNC are incompatible and require workarounds.")
|
||||
raise SystemExit(1)
|
||||
os.chmod(path, int(OUTPUT_PERMISSIONS, base=8))
|
||||
|
||||
@enforce_types
|
||||
|
|
|
@ -226,6 +226,7 @@
|
|||
|
||||
.exists-False {
|
||||
opacity: 0.1;
|
||||
filter: grayscale(100%);
|
||||
pointer-events: none;
|
||||
}
|
||||
</style>
|
||||
|
@ -265,7 +266,7 @@
|
|||
<div class="col-sm-10" style="text-align: right">
|
||||
<a href="/add/">Add Links</a> |
|
||||
<a href="/admin/core/snapshot/">Admin</a> |
|
||||
<a href="https://github.com/pirate/ArchiveBox/wiki">Docs</a>
|
||||
<a href="https://github.com/ArchiveBox/ArchiveBox/wiki">Docs</a>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
@ -277,7 +278,7 @@
|
|||
<br />
|
||||
<center>
|
||||
<small>
|
||||
Archive created using <a href="https://github.com/pirate/ArchiveBox" title="Github">ArchiveBox</a> |
|
||||
Archive created using <a href="https://github.com/ArchiveBox/ArchiveBox" title="Github">ArchiveBox</a> |
|
||||
|
||||
Download index as <a href="index.json" title="JSON summary of archived links.">JSON</a>
|
||||
<br /><br />
|
||||
|
|
|
@ -223,7 +223,7 @@
|
|||
<div class="col-sm-10" style="text-align: right">
|
||||
<a href="/add/">Add Links</a> |
|
||||
<a href="/admin/core/snapshot/">Admin</a> |
|
||||
<a href="https://github.com/pirate/ArchiveBox/wiki">Docs</a>
|
||||
<a href="https://github.com/ArchiveBox/ArchiveBox/wiki">Docs</a>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
@ -266,8 +266,8 @@
|
|||
<br/>
|
||||
<center>
|
||||
<small>
|
||||
Archive created using <a href="https://github.com/pirate/ArchiveBox" title="Github">ArchiveBox</a>
|
||||
version <a href="https://github.com/pirate/ArchiveBox/tree/v{{VERSION}}" title="Git commit">v{{VERSION}}</a> |
|
||||
Archive created using <a href="https://github.com/ArchiveBox/ArchiveBox" title="Github">ArchiveBox</a>
|
||||
version <a href="https://github.com/ArchiveBox/ArchiveBox/tree/v{{VERSION}}" title="Git commit">v{{VERSION}}</a> |
|
||||
Download index as <a href="index.json" title="JSON summary of archived links.">JSON</a>
|
||||
<br/><br/>
|
||||
{{FOOTER_INFO}}
|
||||
|
|
|
@ -187,8 +187,8 @@
|
|||
</a>
|
||||
</div>
|
||||
<div class="col-sm-10" style="text-align: right">
|
||||
<a href="https://github.com/pirate/ArchiveBox/wiki">Documentation</a> |
|
||||
<a href="https://github.com/pirate/ArchiveBox">Source</a> |
|
||||
<a href="https://github.com/ArchiveBox/ArchiveBox/wiki">Documentation</a> |
|
||||
<a href="https://github.com/ArchiveBox/ArchiveBox">Source</a> |
|
||||
<a href="https://archivebox.io">Website</a>
|
||||
</div>
|
||||
</div>
|
||||
|
@ -209,8 +209,8 @@
|
|||
<br/>
|
||||
<center>
|
||||
<small>
|
||||
Archive created using <a href="https://github.com/pirate/ArchiveBox" title="Github">ArchiveBox</a>
|
||||
version <a href="https://github.com/pirate/ArchiveBox/tree/v$version" title="Git commit">v$version</a> |
|
||||
Archive created using <a href="https://github.com/ArchiveBox/ArchiveBox" title="Github">ArchiveBox</a>
|
||||
version <a href="https://github.com/ArchiveBox/ArchiveBox/tree/v$version" title="Git commit">v$version</a> |
|
||||
Download index as <a href="index.json" title="JSON summary of archived links.">JSON</a>
|
||||
<br/><br/>
|
||||
$footer_info
|
||||
|
|
|
@ -16,7 +16,7 @@ from dateparser import parse as dateparser
|
|||
|
||||
import requests
|
||||
from requests.exceptions import RequestException, ReadTimeout
|
||||
from base32_crockford import encode as base32_encode # type: ignore
|
||||
from .base32_crockford import encode as base32_encode # type: ignore
|
||||
from w3lib.encoding import html_body_declared_encoding, http_content_type_encoding
|
||||
|
||||
try:
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue