Merge branch 'master' into django

This commit is contained in:
Nick Sweeting 2020-06-25 21:30:29 -04:00
commit cb67b09f9d
29 changed files with 418 additions and 911 deletions

View file

@@ -1 +1 @@
-0.4.2
+0.4.3

View file

@@ -3,4 +3,5 @@ __package__ = 'archivebox'
from . import core
from . import cli
# The main CLI source code is in 'archivebox/main.py'
from .main import *
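
Re-exporting main's public functions at the package root keeps the Python API usable directly from the archivebox package while the implementations live in archivebox/main.py. A quick sketch of what the star-import enables (assuming version is among the exported CLI functions, like the other command entrypoints):

    from archivebox.main import version
    from archivebox import version  # same function, re-exported by `from .main import *`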

View file

@@ -44,6 +44,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
'TIMEOUT': {'type': int, 'default': 60},
'MEDIA_TIMEOUT': {'type': int, 'default': 3600},
'OUTPUT_PERMISSIONS': {'type': str, 'default': '755'},
'RESTRICT_FILE_NAMES': {'type': str, 'default': 'windows'},
'URL_BLACKLIST': {'type': str, 'default': None},
},
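
Each entry above pairs a config key with a coercion type and a default. A minimal sketch of how a typed lookup could resolve the two new keys from the environment (load_config_val here is illustrative, not ArchiveBox's actual loader):

    import os
    from typing import Any

    def load_config_val(key: str, type: type=str, default: Any=None) -> Any:
        # Coerce the raw environment string using the schema's declared type.
        val = os.environ.get(key)
        if val is None:
            return default
        if type is bool:
            return val.lower() in ('true', 'yes', '1')
        return type(val)

    RESTRICT_FILE_NAMES = load_config_val('RESTRICT_FILE_NAMES', type=str, default='windows')
    URL_BLACKLIST = load_config_val('URL_BLACKLIST', type=str, default=None)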
@@ -77,6 +78,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
'GIT_DOMAINS': {'type': str, 'default': 'github.com,bitbucket.org,gitlab.com'},
'CHECK_SSL_VALIDITY': {'type': bool, 'default': True},
'CURL_USER_AGENT': {'type': str, 'default': 'ArchiveBox/{VERSION} (+https://github.com/pirate/ArchiveBox/) curl/{CURL_VERSION}'},
'WGET_USER_AGENT': {'type': str, 'default': 'ArchiveBox/{VERSION} (+https://github.com/pirate/ArchiveBox/) wget/{WGET_VERSION}'},
'CHROME_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'},
@@ -85,6 +87,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
'CHROME_HEADLESS': {'type': bool, 'default': True},
'CHROME_SANDBOX': {'type': bool, 'default': True},
},
'DEPENDENCY_CONFIG': {
@@ -130,7 +133,7 @@ DEFAULT_CLI_COLORS = {
ANSI = {k: '' for k in DEFAULT_CLI_COLORS.keys()}
STATICFILE_EXTENSIONS = {
-# 99.999% of the time, URLs ending in these extentions are static files
+# 99.999% of the time, URLs ending in these extensions are static files
# that can be downloaded as-is, not html pages that need to be rendered
'gif', 'jpeg', 'jpg', 'png', 'tif', 'tiff', 'wbmp', 'ico', 'jng', 'bmp',
'svg', 'svgz', 'webp', 'ps', 'eps', 'ai',
@@ -147,7 +150,7 @@ STATICFILE_EXTENSIONS = {
# pl pm, prc pdb, rar, rpm, sea, sit, tcl tk, der, pem, crt, xpi, xspf,
# ra, mng, asx, asf, 3gpp, 3gp, mid, midi, kar, jad, wml, htc, mml
-# Thse are always treated as pages, not as static files, never add them:
+# These are always treated as pages, not as static files, never add them:
# html, htm, shtml, xhtml, xml, aspx, php, cgi
}
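
This set backs a cheap pre-check: if a URL's final path extension is in STATICFILE_EXTENSIONS, the target can be downloaded as-is instead of rendered. A sketch of that check (ArchiveBox has a similar helper; the exact implementation may differ):

    from urllib.parse import urlparse

    STATICFILE_EXTENSIONS = {'gif', 'jpeg', 'jpg', 'png', 'svg', 'webp'}  # abridged

    def is_static_file(url: str) -> bool:
        # Compare the lowercased final extension of the URL path against the set.
        path = urlparse(url).path
        ext = path.rsplit('.', 1)[-1].lower() if '.' in path else ''
        return ext in STATICFILE_EXTENSIONS

    assert is_static_file('https://example.com/photo.JPG')
    assert not is_static_file('https://example.com/page.html')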
@@ -210,8 +213,9 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = {
'DJANGO_BINARY': {'default': lambda c: django.__file__.replace('__init__.py', 'bin/django-admin.py')},
'DJANGO_VERSION': {'default': lambda c: '{}.{}.{} {} ({})'.format(*django.VERSION)},
-'USE_CURL': {'default': lambda c: c['USE_CURL'] and (c['SAVE_FAVICON'] or c['SAVE_ARCHIVE_DOT_ORG'])},
+'USE_CURL': {'default': lambda c: c['USE_CURL'] and (c['SAVE_FAVICON'] or c['FETCH_TITLE'] or c['SAVE_ARCHIVE_DOT_ORG'])},
'CURL_VERSION': {'default': lambda c: bin_version(c['CURL_BINARY']) if c['USE_CURL'] else None},
'CURL_USER_AGENT': {'default': lambda c: c['CURL_USER_AGENT'].format(**c)},
'SAVE_FAVICON': {'default': lambda c: c['USE_CURL'] and c['SAVE_FAVICON']},
'SAVE_ARCHIVE_DOT_ORG': {'default': lambda c: c['USE_CURL'] and c['SAVE_ARCHIVE_DOT_ORG']},
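
Derived defaults are lambdas evaluated against the config assembled so far, so adding FETCH_TITLE to the expression is all it takes for curl to stay enabled when title fetching is the only curl-based extractor left on. Condensed:

    config = {
        'USE_CURL': True,
        'SAVE_FAVICON': False,
        'FETCH_TITLE': True,
        'SAVE_ARCHIVE_DOT_ORG': False,
    }
    use_curl = lambda c: c['USE_CURL'] and (c['SAVE_FAVICON'] or c['FETCH_TITLE'] or c['SAVE_ARCHIVE_DOT_ORG'])
    assert use_curl(config) is True  # curl kept just for title fetching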
@@ -480,6 +484,7 @@ def find_chrome_binary() -> Optional[str]:
'chromium-browser',
'chromium',
'/Applications/Chromium.app/Contents/MacOS/Chromium',
'chrome',
'google-chrome',
'/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
'google-chrome-stable',
@@ -506,6 +511,7 @@ def find_chrome_data_dir() -> Optional[str]:
'~/.config/chromium',
'~/Library/Application Support/Chromium',
'~/AppData/Local/Chromium/User Data',
'~/.config/chrome',
'~/.config/google-chrome',
'~/Library/Application Support/Google/Chrome',
'~/AppData/Local/Google/Chrome/User Data',
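
Both candidate lists feed a first-match search: binary names are resolved against $PATH, data dirs by tilde expansion and an existence check. A sketch of the loop (shutil.which-based; the exact ArchiveBox implementation may differ):

    import os
    import shutil
    from typing import Iterable, Optional

    def find_first(candidates: Iterable[str]) -> Optional[str]:
        # Return the first name that is an executable on $PATH
        # or an existing absolute/user-expanded path.
        for candidate in candidates:
            resolved = shutil.which(candidate) or os.path.expanduser(candidate)
            if resolved and os.path.exists(resolved):
                return resolved
        return None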

View file

@@ -13,6 +13,7 @@ from ..config import (
CURL_BINARY,
CURL_VERSION,
CHECK_SSL_VALIDITY,
CURL_USER_AGENT,
)
from ..cli.logging import TimedProgress
@@ -37,14 +38,16 @@ def save_favicon(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT)
'--max-time', str(timeout),
'--location',
'--output', str(output),
*(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
*([] if CHECK_SSL_VALIDITY else ['--insecure']),
'https://www.google.com/s2/favicons?domain={}'.format(domain(link.url)),
]
-status = 'succeeded'
+status = 'pending'
timer = TimedProgress(timeout, prefix=' ')
try:
run(cmd, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout)
chmod_file(output, cwd=out_dir)
status = 'succeeded'
except Exception as err:
status = 'failed'
output = err
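
Starting at 'pending' and setting 'succeeded' only after both run() and chmod_file() return means an exception in either step can no longer be reported as success. The guard pattern in isolation (stub functions stand in for the real calls):

    def do_download() -> None: ...      # stands in for run(cmd, ...)
    def finalize_output() -> None: ...  # stands in for chmod_file(output, ...)

    status = 'pending'
    try:
        do_download()
        finalize_output()
        status = 'succeeded'  # only reached if both steps completed
    except Exception:
        status = 'failed'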

View file

@@ -24,6 +24,7 @@ from ..config import (
SAVE_WARC,
WGET_BINARY,
WGET_VERSION,
RESTRICT_FILE_NAMES,
CHECK_SSL_VALIDITY,
SAVE_WGET_REQUISITES,
WGET_AUTO_COMPRESSION,
@@ -66,14 +67,14 @@ def save_wget(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) ->
'--span-hosts',
'--no-parent',
'-e', 'robots=off',
-'--restrict-file-names=windows',
'--timeout={}'.format(timeout),
-*([] if SAVE_WARC else ['--timestamping']),
+*(['--restrict-file-names={}'.format(RESTRICT_FILE_NAMES)] if RESTRICT_FILE_NAMES else []),
*(['--warc-file={}'.format(warc_path)] if SAVE_WARC else []),
*(['--page-requisites'] if SAVE_WGET_REQUISITES else []),
*(['--user-agent={}'.format(WGET_USER_AGENT)] if WGET_USER_AGENT else []),
*(['--load-cookies', COOKIES_FILE] if COOKIES_FILE else []),
*(['--compression=auto'] if WGET_AUTO_COMPRESSION else []),
+*([] if SAVE_WARC else ['--timestamping']),
*([] if CHECK_SSL_VALIDITY else ['--no-check-certificate', '--no-hsts']),
link.url,
]
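
With the default RESTRICT_FILE_NAMES='windows' the generated command is identical to before; an empty value now omits the flag entirely, and any other wget-supported value (e.g. 'unix,ascii') passes straight through. For example:

    RESTRICT_FILE_NAMES = 'windows'
    cmd = ['wget', *(['--restrict-file-names={}'.format(RESTRICT_FILE_NAMES)] if RESTRICT_FILE_NAMES else [])]
    assert cmd == ['wget', '--restrict-file-names=windows']

    RESTRICT_FILE_NAMES = ''
    cmd = ['wget', *(['--restrict-file-names={}'.format(RESTRICT_FILE_NAMES)] if RESTRICT_FILE_NAMES else [])]
    assert cmd == ['wget']  # flag dropped entirely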

View file

@@ -325,7 +325,8 @@ def patch_main_index(link: Link, out_dir: str=OUTPUT_DIR) -> None:
# Patch HTML main index
html_path = os.path.join(out_dir, 'index.html')
with open(html_path, 'r') as f:
-html = f.read().split('\n')
+html = f.read().splitlines()
for idx, line in enumerate(html):
if title and ('<span data-title-for="{}"'.format(link.url) in line):
html[idx] = '<span>{}</span>'.format(title)
@@ -333,7 +334,7 @@ def patch_main_index(link: Link, out_dir: str=OUTPUT_DIR) -> None:
html[idx] = '<span>{}</span>'.format(successful)
break
-atomic_write('\n'.join(html), html_path)
+atomic_write(html_path, '\n'.join(html))
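
Unlike split('\n'), splitlines() drops the empty trailing element produced by the file's final newline, so repeatedly patching and rejoining the index no longer accumulates blank lines. Compare:

    text = 'line1\nline2\n'
    assert text.split('\n') == ['line1', 'line2', '']  # trailing empty entry
    assert text.splitlines() == ['line1', 'line2']     # clean list of lines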
### Link Details Index

View file

@@ -41,7 +41,7 @@ TITLE_LOADING_MSG = 'Not yet archived...'
def parse_html_main_index(out_dir: str=OUTPUT_DIR) -> Iterator[str]:
"""parse an archive index html file and return the list of urls"""
-index_path = os.path.join(out_dir, HTML_INDEX_FILENAME)
+index_path = join(out_dir, HTML_INDEX_FILENAME)
if os.path.exists(index_path):
with open(index_path, 'r', encoding='utf-8') as f:
for line in f:
@@ -58,7 +58,7 @@ def write_html_main_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished:
copy_and_overwrite(join(TEMPLATES_DIR, STATIC_DIR_NAME), join(out_dir, STATIC_DIR_NAME))
rendered_html = main_index_template(links, finished=finished)
-atomic_write(rendered_html, join(out_dir, HTML_INDEX_FILENAME))
+atomic_write(join(out_dir, HTML_INDEX_FILENAME), rendered_html)
@enforce_types
@@ -116,7 +116,7 @@ def write_html_link_details(link: Link, out_dir: Optional[str]=None) -> None:
out_dir = out_dir or link.link_dir
rendered_html = link_details_template(link)
-atomic_write(rendered_html, join(out_dir, HTML_INDEX_FILENAME))
+atomic_write(join(out_dir, HTML_INDEX_FILENAME), rendered_html)
@enforce_types

View file

@@ -74,7 +74,7 @@ def write_json_main_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:
'last_run_cmd': sys.argv,
'links': links,
}
-atomic_write(main_index_json, os.path.join(out_dir, JSON_INDEX_FILENAME))
+atomic_write(os.path.join(out_dir, JSON_INDEX_FILENAME), main_index_json)
### Link Details Index
@@ -86,7 +86,7 @@ def write_json_link_details(link: Link, out_dir: Optional[str]=None) -> None:
out_dir = out_dir or link.link_dir
path = os.path.join(out_dir, JSON_INDEX_FILENAME)
-atomic_write(link._asdict(extended=True), path)
+atomic_write(path, link._asdict(extended=True))
@enforce_types

View file

@@ -13,7 +13,6 @@ import os
from typing import Tuple, List
from datetime import datetime
-from ..index.schema import Link
from ..system import atomic_write
from ..config import (
ANSI,
@@ -29,6 +28,7 @@ from ..util import (
enforce_types,
URL_REGEX,
)
+from ..index.schema import Link
from ..cli.logging import pretty_path, TimedProgress
from .pocket_html import parse_pocket_html_export
from .pinboard_rss import parse_pinboard_rss_export
@@ -93,8 +93,7 @@ def save_stdin_to_sources(raw_text: str, out_dir: str=OUTPUT_DIR) -> str:
ts = str(datetime.now().timestamp()).split('.', 1)[0]
source_path = os.path.join(sources_dir, '{}-{}.txt'.format('stdin', ts))
-atomic_write(raw_text, source_path)
+atomic_write(source_path, raw_text)
return source_path
@@ -112,6 +111,7 @@ def save_file_to_sources(path: str, timeout: int=TIMEOUT, out_dir: str=OUTPUT_DI
source_path = os.path.join(sources_dir, '{}-{}.txt'.format(basename(path), ts))
if any(path.startswith(s) for s in ('http://', 'https://', 'ftp://')):
# Source is a URL that needs to be downloaded
source_path = os.path.join(sources_dir, '{}-{}.txt'.format(domain(path), ts))
print('{}[*] [{}] Downloading {}{}'.format(
ANSI['green'],
@@ -134,10 +134,11 @@ def save_file_to_sources(path: str, timeout: int=TIMEOUT, out_dir: str=OUTPUT_DI
raise SystemExit(1)
else:
# Source is a path to a local file on the filesystem
with open(path, 'r') as f:
raw_source_text = f.read()
-atomic_write(raw_source_text, source_path)
+atomic_write(source_path, raw_source_text)
print(' > {}'.format(pretty_path(source_path)))
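
Every input ends up as a timestamped snapshot under sources/, labeled by origin: 'stdin' for piped text, the URL's domain for downloaded sources, the file's basename for local ones. The naming scheme in isolation:

    import os
    from datetime import datetime

    ts = str(datetime.now().timestamp()).split('.', 1)[0]  # whole-second timestamp
    # e.g. sources/stdin-1593134400.txt or sources/example.com-1593134400.txt
    source_path = os.path.join('sources', '{}-{}.txt'.format('stdin', ts))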

View file

@@ -8,6 +8,7 @@ import json as pyjson
from typing import Optional, Union, Set, Tuple
from crontab import CronTab
from atomicwrites import atomic_write as awrite
from subprocess import (
Popen,
@@ -22,10 +23,10 @@ from .util import enforce_types, ExtendedEncoder
from .config import OUTPUT_PERMISSIONS
def run(*popenargs, input=None, capture_output=False, timeout=None, check=False, **kwargs):
"""Patched version of subprocess.run to fix blocking IO making timeout ineffective"""
if input is not None:
if 'stdin' in kwargs:
raise ValueError('stdin and input arguments may not both be used.')
@@ -59,30 +60,14 @@ def run(*popenargs, input=None, capture_output=False, timeout=None, check=False,
return CompletedProcess(process.args, retcode, stdout, stderr)
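
The wrapper exists so a child process that fills its pipe buffers and hangs still honors timeout. Typical call, as the extractors use it (import path assumed to be archivebox.system):

    from subprocess import PIPE
    from archivebox.system import run  # assumed import path for the wrapper above

    result = run(['curl', '--silent', 'https://example.com'],
                 stdout=PIPE, stderr=PIPE, timeout=15)
    print(result.returncode, len(result.stdout))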
-def atomic_write(contents: Union[dict, str, bytes], path: str) -> None:
+def atomic_write(path: str, contents: Union[dict, str, bytes], overwrite: bool=True) -> None:
    """Safe atomic write to filesystem by writing to temp file + atomic rename"""
-    try:
-        tmp_file = '{}.tmp'.format(path)
-        if isinstance(contents, bytes):
-            args = {'mode': 'wb+'}
-        else:
-            args = {'mode': 'w+', 'encoding': 'utf-8'}
-        with open(tmp_file, **args) as f:
-            if isinstance(contents, dict):
-                pyjson.dump(contents, f, indent=4, sort_keys=True, cls=ExtendedEncoder)
-            else:
-                f.write(contents)
-            os.fsync(f.fileno())
-        os.rename(tmp_file, path)
-        chmod_file(path)
-    finally:
-        if os.path.exists(tmp_file):
-            os.remove(tmp_file)
+    with awrite(path, overwrite=overwrite) as f:
+        if isinstance(contents, dict):
+            pyjson.dump(contents, f, indent=4, sort_keys=True, cls=ExtendedEncoder)
+        else:
+            f.write(contents)
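
The hand-rolled tmp-file/fsync/rename/cleanup sequence is delegated to the atomicwrites package (imported above as awrite), and callers now pass the destination path first. Usage under the new signature (import path assumed):

    from archivebox.system import atomic_write

    atomic_write('index.json', {'version': '0.4.3'})  # dicts serialized via ExtendedEncoder
    atomic_write('index.html', '<html>...</html>')    # str contents written as-is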
@enforce_types
def chmod_file(path: str, cwd: str='.', permissions: str=OUTPUT_PERMISSIONS, timeout: int=30) -> None:
@@ -105,7 +90,8 @@ def copy_and_overwrite(from_path: str, to_path: str):
shutil.copytree(from_path, to_path)
else:
with open(from_path, 'rb') as src:
-atomic_write(src.read(), to_path)
+contents = src.read()
+atomic_write(to_path, contents)
@enforce_types

View file

@@ -6,6 +6,37 @@
<title>Archived Sites</title>
<meta charset="utf-8" name="viewport" content="width=device-width, initial-scale=1">
<style>
:root {
--bg-main: #efefef;
--accent-1: #aa1e55;
--accent-2: #ffebeb;
--accent-3: #efefef;
--text-1: #1c1c1c;
--text-2: #eaeaea;
--text-main: #1a1a1a;
--font-main: "Gill Sans", Helvetica, sans-serif;
}
/* Dark Mode (WIP) */
/*
@media (prefers-color-scheme: dark) {
:root {
--accent-2: hsl(160, 100%, 96%);
--text-1: #eaeaea;
--text-2: #1a1a1a;
--bg-main: #101010;
}
#table-bookmarks_wrapper,
#table-bookmarks_wrapper img,
tbody td:nth-child(3),
tbody td:nth-child(3) span,
footer {
filter: invert(100%);
}
}*/
html, body {
width: 100%;
height: 100%;
@@ -14,11 +45,12 @@
text-align: center;
margin: 0px;
padding: 0px;
-font-family: "Gill Sans", Helvetica, sans-serif;
+font-family: var(--font-main);
}
.header-top small {
font-weight: 200;
-color: #efefef;
+color: var(--accent-3);
}
.header-top {
@@ -31,8 +63,8 @@
font-size: calc(11px + 0.84vw);
font-weight: 200;
padding: 4px 4px;
-border-bottom: 3px solid #aa1e55;
-background-color: #aa1e55;
+border-bottom: 3px solid var(--accent-1);
+background-color: var(--accent-1);
}
input[type=search] {
width: 22vw;
@@ -86,7 +118,7 @@
height: 35px;
}
tbody tr:nth-child(odd) {
-background-color: #ffebeb !important;
+background-color: var(--accent-2) !important;
}
table tr td {
white-space: nowrap;
@@ -146,7 +178,7 @@
color:black;
}
tr td a.title small {
-background-color: #efefef;
+background-color: var(--accent-3);
border-radius: 4px;
float:right
}

View file

@@ -4,6 +4,34 @@
<title>Archived Sites</title>
<meta charset="utf-8" name="viewport" content="width=device-width, initial-scale=1">
<style>
:root {
--accent-1: #aa1e55;
--accent-2: #ffebeb;
--accent-3: #efefef;
--bg-main: #efefef;
--text-main: black;
--text-1: #1a1a1a;
--text-2: #eaeaea;
}
@media (prefers-color-scheme: dark) {
:root {
--accent-2: hsl(160, 100%, 96%);
--text-1: #eaeaea;
--text-2: #1a1a1a;
--bg-main: #101010;
}
#table-bookmarks_wrapper,
#table-bookmarks_wrapper img,
tbody td:nth-child(3),
tbody td:nth-child(3) span,
footer {
filter: invert(100%);
}
}
html, body {
width: 100%;
height: 100%;
@@ -13,7 +41,10 @@
margin: 0px;
padding: 0px;
font-family: "Gill Sans", Helvetica, sans-serif;
background: var(--bg-main);
color: var(--text-main);
}
<<<<<<< HEAD:archivebox/themes/legacy/main_index.html
.header-top small {
font-weight: 200;
color: #efefef;
@@ -24,6 +55,33 @@
height: auto;
min-height: 40px;
margin: 0px;
=======
header {
background-color: var(--accent-1);
color: var(--text-1);
padding: 10px;
padding-top: 0px;
padding-bottom: 15px;
/*height: 40px;*/
}
header h1 {
margin: 7px 0px;
font-size: 35px;
font-weight: 300;
color: var(--text-1);
}
header h1 img {
height: 44px;
vertical-align: bottom;
}
header a {
text-decoration: none !important;
color: var(--text-1);
}
.header-center {
margin: auto;
float: none;
>>>>>>> master:archivebox/templates/index.html
text-align: center;
color: white;
font-size: calc(11px + 0.84vw);
@@ -32,11 +90,17 @@
border-bottom: 3px solid #aa1e55;
background-color: #aa1e55;
}
<<<<<<< HEAD:archivebox/themes/legacy/main_index.html
input[type=search] {
width: 22vw;
border-radius: 4px;
border: 1px solid #aeaeae;
padding: 3px 5px;
=======
.header-center small {
color: var(--text-2);
opacity: 0.7;
>>>>>>> master:archivebox/templates/index.html
}
.nav > div {
min-height: 30px;
@@ -45,9 +109,14 @@
text-decoration: none;
color: rgba(0,0,0,0.6);
}
<<<<<<< HEAD:archivebox/themes/legacy/main_index.html
.header-top a:hover {
text-decoration: none;
color: rgba(0,0,0,0.9);
=======
header + div {
padding-top: 10px;
>>>>>>> master:archivebox/templates/index.html
}
.header-top .col-lg-4 {
text-align: center;
@@ -84,7 +153,7 @@
height: 35px;
}
tbody tr:nth-child(odd) {
-background-color: #ffebeb !important;
+background-color: var(--accent-2) !important;
}
table tr td {
white-space: nowrap;
@@ -144,7 +213,7 @@
color:black;
}
tr td a.title small {
-background-color: #efefef;
+background-color: var(--accent-3);
border-radius: 4px;
float:right
}

View file

@@ -1,5 +1,6 @@
import re
import ssl
import json as pyjson
from typing import List, Optional, Any
@@ -12,8 +13,7 @@ from html import escape, unescape
from datetime import datetime
from dateutil import parser as dateparser
-import json as pyjson
-from base32_crockford import encode as base32_encode # type: ignore
+from base32_crockford import encode as base32_encode # type: ignore
from .config import (
TIMEOUT,
@@ -23,6 +23,12 @@ from .config import (
CHROME_OPTIONS,
)
try:
import chardet
detect_encoding = lambda rawdata: chardet.detect(rawdata)["encoding"]
except ImportError:
detect_encoding = lambda rawdata: "utf-8"
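
The lambda pair gives download_url a best-effort charset sniffer when chardet is installed and a plain utf-8 fallback when it is not. A slightly defensive variant, guarding against chardet returning None for short or ambiguous input (which the version above would pass straight to decode()):

    try:
        import chardet
        detect_encoding = lambda rawdata: chardet.detect(rawdata)['encoding'] or 'utf-8'
    except ImportError:
        detect_encoding = lambda rawdata: 'utf-8'

    # A Shift-JIS page served without a charset header now decodes correctly
    # instead of raising UnicodeDecodeError under the old hardcoded utf-8.
    rawdata = ('日本語のテキスト。' * 30).encode('shift_jis')
    print(detect_encoding(rawdata))  # e.g. 'SHIFT_JIS'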
### Parsing Helpers
# All of these are (str) -> str
@@ -158,8 +164,9 @@ def download_url(url: str, timeout: int=TIMEOUT) -> str:
insecure = ssl._create_unverified_context()
resp = urlopen(req, timeout=timeout, context=insecure)
-encoding = resp.headers.get_content_charset() or 'utf-8' # type: ignore
-return resp.read().decode(encoding)
+rawdata = resp.read()
+encoding = resp.headers.get_content_charset() or detect_encoding(rawdata)
+return rawdata.decode(encoding)
@enforce_types