Merge branch 'dev' into method_allow_deny

This commit is contained in:
Nick Sweeting 2023-10-20 04:25:44 -07:00 committed by GitHub
commit 63ad43f46c
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
33 changed files with 4485 additions and 1748 deletions

View file

@ -57,9 +57,17 @@ SYSTEM_USER = getpass.getuser() or os.getlogin()
try:
import pwd
SYSTEM_USER = pwd.getpwuid(os.geteuid()).pw_name or SYSTEM_USER
except KeyError:
# Process' UID might not map to a user in cases such as running the Docker image
# (where `archivebox` is 999) as a different UID.
pass
except ModuleNotFoundError:
# pwd is only needed for some linux systems, doesn't exist on windows
pass
except Exception:
# this should never happen, uncomment to debug
# raise
pass
############################### Config Schema ##################################
@ -82,8 +90,13 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
'MEDIA_TIMEOUT': {'type': int, 'default': 3600},
'OUTPUT_PERMISSIONS': {'type': str, 'default': '644'},
'RESTRICT_FILE_NAMES': {'type': str, 'default': 'windows'},
'URL_DENYLIST': {'type': str, 'default': r'\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$', 'aliases': ('URL_BLACKLIST',)}, # to avoid downloading code assets as their own pages
'URL_ALLOWLIST': {'type': str, 'default': None, 'aliases': ('URL_WHITELIST',)},
'ADMIN_USERNAME': {'type': str, 'default': None},
'ADMIN_PASSWORD': {'type': str, 'default': None},
'ENFORCE_ATOMIC_WRITES': {'type': bool, 'default': True},
'TAG_SEPARATOR_PATTERN': {'type': str, 'default': r'[,]'},
},
@ -100,12 +113,22 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
'SNAPSHOTS_PER_PAGE': {'type': int, 'default': 40},
'CUSTOM_TEMPLATES_DIR': {'type': str, 'default': None},
'TIME_ZONE': {'type': str, 'default': 'UTC'},
'TIMEZONE': {'type': str, 'default': 'UTC'},
'TIMEZONE': {'type': str, 'default': 'UTC'},
'REVERSE_PROXY_USER_HEADER': {'type': str, 'default': 'Remote-User'},
'REVERSE_PROXY_WHITELIST': {'type': str, 'default': ''},
'LOGOUT_REDIRECT_URL': {'type': str, 'default': '/'},
'PREVIEW_ORIGINALS': {'type': bool, 'default': True},
'LOGOUT_REDIRECT_URL': {'type': str, 'default': '/'},
'PREVIEW_ORIGINALS': {'type': bool, 'default': True},
'LDAP': {'type': bool, 'default': False},
'LDAP_SERVER_URI': {'type': str, 'default': None},
'LDAP_BIND_DN': {'type': str, 'default': None},
'LDAP_BIND_PASSWORD': {'type': str, 'default': None},
'LDAP_USER_BASE': {'type': str, 'default': None},
'LDAP_USER_FILTER': {'type': str, 'default': None},
'LDAP_USERNAME_ATTR': {'type': str, 'default': None},
'LDAP_FIRSTNAME_ATTR': {'type': str, 'default': None},
'LDAP_LASTNAME_ATTR': {'type': str, 'default': None},
'LDAP_EMAIL_ATTR': {'type': str, 'default': None},
},
'ARCHIVE_METHOD_TOGGLES': {
@ -151,10 +174,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
'--write-thumbnail',
'--no-call-home',
'--write-sub',
'--all-subs',
# There are too many of these and youtube
# throttles you with HTTP error 429
#'--write-auto-subs',
'--write-auto-subs',
'--convert-subs=srt',
'--yes-playlist',
'--continue',
@ -167,7 +187,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
'--ignore-errors',
'--geo-bypass',
'--add-metadata',
'--max-filesize={}'.format(c['MEDIA_MAX_SIZE']),
'--format=(bv*+ba/b)[filesize<={}][filesize_approx<=?{}]/(bv*+ba/b)'.format(c['MEDIA_MAX_SIZE'], c['MEDIA_MAX_SIZE']),
]},
@ -216,18 +236,19 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
'CURL_BINARY': {'type': str, 'default': 'curl'},
'GIT_BINARY': {'type': str, 'default': 'git'},
'WGET_BINARY': {'type': str, 'default': 'wget'},
'WGET_BINARY': {'type': str, 'default': 'wget'}, # also can accept wget2
'SINGLEFILE_BINARY': {'type': str, 'default': lambda c: bin_path('single-file')},
'READABILITY_BINARY': {'type': str, 'default': lambda c: bin_path('readability-extractor')},
'MERCURY_BINARY': {'type': str, 'default': lambda c: bin_path('mercury-parser')},
#'YOUTUBEDL_BINARY': {'type': str, 'default': 'youtube-dl'},
'YOUTUBEDL_BINARY': {'type': str, 'default': 'yt-dlp'},
'MERCURY_BINARY': {'type': str, 'default': lambda c: bin_path('postlight-parser')},
'YOUTUBEDL_BINARY': {'type': str, 'default': 'yt-dlp'}, # also can accept youtube-dl
'NODE_BINARY': {'type': str, 'default': 'node'},
'RIPGREP_BINARY': {'type': str, 'default': 'rg'},
'CHROME_BINARY': {'type': str, 'default': None},
'POCKET_CONSUMER_KEY': {'type': str, 'default': None},
'POCKET_ACCESS_TOKENS': {'type': dict, 'default': {}},
'READWISE_READER_TOKENS': {'type': dict, 'default': {}},
},
}
@ -420,7 +441,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
'READABILITY_VERSION': {'default': lambda c: bin_version(c['READABILITY_BINARY']) if c['USE_READABILITY'] else None},
'USE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['SAVE_MERCURY']},
'MERCURY_VERSION': {'default': lambda c: '1.0.0' if shutil.which(str(bin_path(c['MERCURY_BINARY']))) else None}, # mercury is unversioned
'MERCURY_VERSION': {'default': lambda c: '1.0.0' if shutil.which(str(bin_path(c['MERCURY_BINARY']))) else None}, # mercury doesnt expose version info until this is merged https://github.com/postlight/parser/pull/750
'USE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
'GIT_VERSION': {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None},

View file

@ -20,6 +20,17 @@ from ..config import (
OUTPUT_DIR,
LOGS_DIR,
TIMEZONE,
LDAP,
LDAP_SERVER_URI,
LDAP_BIND_DN,
LDAP_BIND_PASSWORD,
LDAP_USER_BASE,
LDAP_USER_FILTER,
LDAP_USERNAME_ATTR,
LDAP_FIRSTNAME_ATTR,
LDAP_LASTNAME_ATTR,
LDAP_EMAIL_ATTR,
)
IS_MIGRATING = 'makemigrations' in sys.argv[:3] or 'migrate' in sys.argv[:3]
@ -55,6 +66,12 @@ INSTALLED_APPS = [
]
# For usage with https://www.jetadmin.io/integrations/django
# INSTALLED_APPS += ['jet_django']
# JET_PROJECT = 'archivebox'
# JET_TOKEN = 'some-api-token-here'
MIDDLEWARE = [
'core.middleware.TimezoneMiddleware',
'django.middleware.security.SecurityMiddleware',
@ -67,11 +84,58 @@ MIDDLEWARE = [
'core.middleware.CacheControlMiddleware',
]
################################################################################
### Authentication Settings
################################################################################
AUTHENTICATION_BACKENDS = [
'django.contrib.auth.backends.RemoteUserBackend',
'django.contrib.auth.backends.ModelBackend',
]
if LDAP:
try:
import ldap
from django_auth_ldap.config import LDAPSearch
global AUTH_LDAP_SERVER_URI
global AUTH_LDAP_BIND_DN
global AUTH_LDAP_BIND_PASSWORD
global AUTH_LDAP_USER_SEARCH
global AUTH_LDAP_USER_ATTR_MAP
AUTH_LDAP_SERVER_URI = LDAP_SERVER_URI
AUTH_LDAP_BIND_DN = LDAP_BIND_DN
AUTH_LDAP_BIND_PASSWORD = LDAP_BIND_PASSWORD
assert AUTH_LDAP_SERVER_URI and LDAP_USERNAME_ATTR and LDAP_USER_FILTER, 'LDAP_* config options must all be set if LDAP=True'
AUTH_LDAP_USER_SEARCH = LDAPSearch(
LDAP_USER_BASE,
ldap.SCOPE_SUBTREE,
'(&(' + LDAP_USERNAME_ATTR + '=%(user)s)' + LDAP_USER_FILTER + ')',
)
AUTH_LDAP_USER_ATTR_MAP = {
'username': LDAP_USERNAME_ATTR,
'first_name': LDAP_FIRSTNAME_ATTR,
'last_name': LDAP_LASTNAME_ATTR,
'email': LDAP_EMAIL_ATTR,
}
AUTHENTICATION_BACKENDS = [
'django_auth_ldap.backend.LDAPBackend',
]
except ModuleNotFoundError:
sys.stderr.write('[X] Error: Found LDAP=True config but LDAP packages not installed. You may need to run: pip install archivebox[ldap]\n\n')
# dont hard exit here. in case the user is just running "archivebox version" or "archivebox help", we still want those to work despite broken ldap
# sys.exit(1)
################################################################################
### Debug Settings
################################################################################
# only enable debug toolbar when in DEBUG mode with --nothreading (it doesnt work in multithreaded mode)
DEBUG_TOOLBAR = DEBUG and ('--nothreading' in sys.argv) and ('--reload' not in sys.argv)
if DEBUG_TOOLBAR:
@ -267,8 +331,8 @@ class NoisyRequestsFilter(logging.Filter):
if LOGS_DIR.exists():
ERROR_LOG = (LOGS_DIR / 'errors.log')
else:
# meh too many edge cases here around creating log dir w/ correct permissions
# cant be bothered, just trash the log and let them figure it out via stdout/stderr
# historically too many edge cases here around creating log dir w/ correct permissions early on
# if there's an issue on startup, we trash the log and let user figure it out via stdout/stderr
ERROR_LOG = tempfile.NamedTemporaryFile().name
LOGGING = {

View file

@ -33,6 +33,9 @@ urlpatterns = [
path('admin/', admin.site.urls),
path('health/', HealthCheckView.as_view(), name='healthcheck'),
path('error/', lambda _: 1/0),
# path('jet_api/', include('jet_django.urls')), Enable to use https://www.jetadmin.io/integrations/django
path('index.html', RedirectView.as_view(url='/')),
path('index.json', static.serve, {'document_root': settings.OUTPUT_DIR, 'path': 'index.json'}),

View file

@ -9,6 +9,7 @@ from ..util import (
enforce_types,
is_static_file,
chrome_args,
chrome_cleanup,
)
from ..config import (
TIMEOUT,
@ -57,6 +58,7 @@ def save_dom(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
except Exception as err:
status = 'failed'
output = err
chrome_cleanup()
finally:
timer.end()

View file

@ -9,6 +9,7 @@ from ..util import (
enforce_types,
is_static_file,
chrome_args,
chrome_cleanup,
)
from ..config import (
TIMEOUT,
@ -54,6 +55,7 @@ def save_pdf(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
except Exception as err:
status = 'failed'
output = err
chrome_cleanup()
finally:
timer.end()

View file

@ -71,7 +71,7 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
result = run(cmd, cwd=out_dir, timeout=timeout)
try:
result_json = json.loads(result.stdout)
assert result_json and 'content' in result_json
assert result_json and 'content' in result_json, 'Readability output is not valid JSON'
except json.JSONDecodeError:
raise ArchiveError('Readability was not able to archive the page', result.stdout + result.stderr)
@ -85,7 +85,7 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
# "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
output_tail = [
line.strip()
for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:]
for line in (result.stdout + result.stderr).decode().rsplit('\n', 5)[-5:]
if line.strip()
]
hints = (

View file

@ -9,6 +9,7 @@ from ..util import (
enforce_types,
is_static_file,
chrome_args,
chrome_cleanup,
)
from ..config import (
TIMEOUT,
@ -54,6 +55,7 @@ def save_screenshot(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
except Exception as err:
status = 'failed'
output = err
chrome_cleanup()
finally:
timer.end()

View file

@ -26,7 +26,7 @@ from ..logging_util import TimedProgress
HTML_TITLE_REGEX = re.compile(
r'<title.*?>' # start matching text after <title> tag
r'(.[^<>]+)', # get everything up to these symbols
r'([^<>]+)', # get everything up to these symbols
re.IGNORECASE | re.MULTILINE | re.DOTALL | re.UNICODE,
)

View file

@ -177,7 +177,7 @@ def snapshot_icons(snapshot) -> str:
# The check for archive_org is different, so it has to be handled separately
# get from db (faster)
exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
exists = extractor in extractor_outputs and extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
# get from filesystem (slower)
# target_path = Path(path) / "archive.org.txt"
# exists = target_path.exists()

View file

@ -441,7 +441,7 @@ def log_archive_method_finished(result: "ArchiveResult"):
hints = (
' {}{}{}'.format(ANSI['lightyellow'], line.strip(), ANSI['reset'])
for line in hints[:5] if line.strip()
for line in list(hints)[:5] if line.strip()
)
@ -533,11 +533,27 @@ def log_shell_welcome_msg():
### Helpers
@enforce_types
def pretty_path(path: Union[Path, str]) -> str:
def pretty_path(path: Union[Path, str], pwd: Union[Path, str]=OUTPUT_DIR) -> str:
"""convert paths like .../ArchiveBox/archivebox/../output/abc into output/abc"""
pwd = Path('.').resolve()
# parent = os.path.abspath(os.path.join(pwd, os.path.pardir))
return str(path).replace(str(pwd) + '/', './')
pwd = str(Path(pwd)) # .resolve()
path = str(path)
if not path:
return path
# replace long absolute paths with ./ relative ones to save on terminal output width
if path.startswith(pwd) and (pwd != '/'):
path = path.replace(pwd, '.', 1)
# quote paths containing spaces
if ' ' in path:
path = f'"{path}"'
# if path is just a plain dot, replace it back with the absolute path for clarity
if path == '.':
path = pwd
return path
@enforce_types
@ -578,6 +594,7 @@ def printable_folder_status(name: str, folder: Dict) -> str:
else:
color, symbol, note, num_files = 'lightyellow', '-', 'disabled', '-'
if folder['path']:
if Path(folder['path']).exists():
num_files = (
@ -592,13 +609,7 @@ def printable_folder_status(name: str, folder: Dict) -> str:
# add symbol @ next to filecount if path is a remote filesystem mount
num_files = f'{num_files} @' if num_files else '@'
path = str(folder['path']).replace(str(OUTPUT_DIR), '.') if folder['path'] else ''
if path and ' ' in path:
path = f'"{path}"'
# if path is just a plain dot, replace it back with the full path for clarity
if path == '.':
path = str(OUTPUT_DIR)
path = pretty_path(folder['path'])
return ' '.join((
ANSI[color],
@ -629,9 +640,7 @@ def printable_dependency_version(name: str, dependency: Dict) -> str:
else:
color, symbol, note, version = 'lightyellow', '-', 'disabled', '-'
path = str(dependency["path"]).replace(str(OUTPUT_DIR), '.') if dependency["path"] else ''
if path and ' ' in path:
path = f'"{path}"'
path = pretty_path(dependency['path'])
return ' '.join((
ANSI[color],

View file

@ -112,6 +112,8 @@ from .config import (
load_all_config,
CONFIG,
USER_CONFIG,
ADMIN_USERNAME,
ADMIN_PASSWORD,
get_real_name,
setup_django,
)
@ -216,7 +218,7 @@ def version(quiet: bool=False,
if not quiet:
# 0.6.3
# ArchiveBox v0.6.3 Cpython Linux Linux-4.19.121-linuxkit-x86_64-with-glibc2.28 x86_64 (in Docker) (in TTY)
# DEBUG=False IN_DOCKER=True IS_TTY=True TZ=UTC FS_ATOMIC=True FS_REMOTE=False FS_PERMS=644 501:20 SEARCH_BACKEND=ripgrep
# DEBUG=False IN_DOCKER=True IS_TTY=True TZ=UTC FS_ATOMIC=True FS_REMOTE=False FS_PERMS=644 FS_USER=501:20 SEARCH_BACKEND=ripgrep
p = platform.uname()
print(
@ -236,7 +238,8 @@ def version(quiet: bool=False,
#f'DB=django.db.backends.sqlite3 (({CONFIG["SQLITE_JOURNAL_MODE"]})', # add this if we have more useful info to show eventually
f'FS_ATOMIC={ENFORCE_ATOMIC_WRITES}',
f'FS_REMOTE={OUTPUT_IS_REMOTE_FS}',
f'FS_PERMS={OUTPUT_PERMISSIONS} {PUID}:{PGID}',
f'FS_USER={PUID}:{PGID}',
f'FS_PERMS={OUTPUT_PERMISSIONS}',
f'SEARCH_BACKEND={SEARCH_BACKEND_ENGINE}',
)
print()
@ -251,19 +254,19 @@ def version(quiet: bool=False,
print()
print('{white}[i] Source-code locations:{reset}'.format(**ANSI))
for name, folder in CODE_LOCATIONS.items():
print(printable_folder_status(name, folder))
for name, path in CODE_LOCATIONS.items():
print(printable_folder_status(name, path))
print()
print('{white}[i] Secrets locations:{reset}'.format(**ANSI))
for name, folder in EXTERNAL_LOCATIONS.items():
print(printable_folder_status(name, folder))
for name, path in EXTERNAL_LOCATIONS.items():
print(printable_folder_status(name, path))
print()
if DATA_LOCATIONS['OUTPUT_DIR']['is_valid']:
print('{white}[i] Data locations:{reset}'.format(**ANSI))
for name, folder in DATA_LOCATIONS.items():
print(printable_folder_status(name, folder))
for name, path in DATA_LOCATIONS.items():
print(printable_folder_status(name, path))
else:
print()
print('{white}[i] Data locations:{reset}'.format(**ANSI))
@ -419,14 +422,16 @@ def init(force: bool=False, quick: bool=False, setup: bool=False, out_dir: Path=
write_main_index(list(pending_links.values()), out_dir=out_dir)
print('\n{green}----------------------------------------------------------------------{reset}'.format(**ANSI))
from django.contrib.auth.models import User
if (ADMIN_USERNAME and ADMIN_PASSWORD) and not User.objects.filter(username=ADMIN_USERNAME).exists():
print('{green}[+] Found ADMIN_USERNAME and ADMIN_PASSWORD configuration options, creating new admin user.{reset}'.format(**ANSI))
User.objects.create_superuser(username=ADMIN_USERNAME, password=ADMIN_PASSWORD)
if existing_index:
print('{green}[√] Done. Verified and updated the existing ArchiveBox collection.{reset}'.format(**ANSI))
else:
# TODO: allow creating new supersuer via env vars on first init
# if config.HTTP_USER and config.HTTP_PASS:
# from django.contrib.auth.models import User
# User.objects.create_superuser(HTTP_USER, '', HTTP_PASS)
print('{green}[√] Done. A new ArchiveBox collection was initialized ({} links).{reset}'.format(len(all_links) + len(pending_links), **ANSI))
json_index = out_dir / JSON_INDEX_FILENAME

View file

@ -34,6 +34,7 @@ from ..index.schema import Link
from ..logging_util import TimedProgress, log_source_saved
from . import pocket_api
from . import readwise_reader_api
from . import wallabag_atom
from . import pocket_html
from . import pinboard_rss
@ -51,6 +52,7 @@ from . import url_list
PARSERS = {
# Specialized parsers
pocket_api.KEY: (pocket_api.NAME, pocket_api.PARSER),
readwise_reader_api.KEY: (readwise_reader_api.NAME, readwise_reader_api.PARSER),
wallabag_atom.KEY: (wallabag_atom.NAME, wallabag_atom.PARSER),
pocket_html.KEY: (pocket_html.NAME, pocket_html.PARSER),
pinboard_rss.KEY: (pinboard_rss.NAME, pinboard_rss.PARSER),
@ -233,6 +235,10 @@ _test_url_strs = {
'https://example.com/?what=1#how-about-this=1&2%20baf': 1,
'https://example.com?what=1#how-about-this=1&2%20baf': 1,
'<test>http://example7.com</test>': 1,
'https://<test>': 0,
'https://[test]': 0,
'http://"test"': 0,
'http://\'test\'': 0,
'[https://example8.com/what/is/this.php?what=1]': 1,
'[and http://example9.com?what=1&other=3#and-thing=2]': 1,
'<what>https://example10.com#and-thing=2 "</about>': 1,

View file

@ -17,7 +17,10 @@ def parse_generic_json_export(json_file: IO[str], **_kwargs) -> Iterable[Link]:
"""Parse JSON-format bookmarks export files (produced by pinboard.in/export/, or wallabag)"""
json_file.seek(0)
links = json.load(json_file)
# sometimes the first line is a comment or filepath, so we get everything after the first {
json_file_json_str = '{' + json_file.read().split('{', 1)[-1]
links = json.loads(json_file_json_str)
json_date = lambda s: datetime.strptime(s, '%Y-%m-%dT%H:%M:%S%z')
for link in links:

View file

@ -0,0 +1,123 @@
__package__ = "archivebox.parsers"
import re
import requests
from datetime import datetime
from typing import IO, Iterable, Optional
from configparser import ConfigParser
from pathlib import Path
from ..index.schema import Link
from ..util import enforce_types
from ..system import atomic_write
from ..config import (
SOURCES_DIR,
READWISE_READER_TOKENS,
)
API_DB_PATH = Path(SOURCES_DIR) / "readwise_reader_api.db"
class ReadwiseReaderAPI:
cursor: Optional[str]
def __init__(self, api_token, cursor=None) -> None:
self.api_token = api_token
self.cursor = cursor
def get_archive(self):
response = requests.get(
url="https://readwise.io/api/v3/list/",
headers={"Authorization": "Token s71gNtiNDWquEvlJFFUyDU10ao8fn99lGyNryvyllQcDSnrd7X"},
params={
"location": "archive",
"pageCursor": self.cursor,
}
)
response.raise_for_status()
return response
def get_readwise_reader_articles(api: ReadwiseReaderAPI):
response = api.get_archive()
body = response.json()
articles = body["results"]
yield from articles
if body['nextPageCursor']:
api.cursor = body["nextPageCursor"]
yield from get_readwise_reader_articles(api)
def link_from_article(article: dict, sources: list):
url: str = article['source_url']
title = article["title"] or url
timestamp = datetime.fromisoformat(article['updated_at']).timestamp()
return Link(
url=url,
timestamp=str(timestamp),
title=title,
tags="",
sources=sources,
)
def write_cursor(username: str, since: str):
if not API_DB_PATH.exists():
atomic_write(API_DB_PATH, "")
since_file = ConfigParser()
since_file.optionxform = str
since_file.read(API_DB_PATH)
since_file[username] = {"since": since}
with open(API_DB_PATH, "w+") as new:
since_file.write(new)
def read_cursor(username: str) -> Optional[str]:
if not API_DB_PATH.exists():
atomic_write(API_DB_PATH, "")
config_file = ConfigParser()
config_file.optionxform = str
config_file.read(API_DB_PATH)
return config_file.get(username, "since", fallback=None)
@enforce_types
def should_parse_as_readwise_reader_api(text: str) -> bool:
return text.startswith("readwise-reader://")
@enforce_types
def parse_readwise_reader_api_export(input_buffer: IO[str], **_kwargs) -> Iterable[Link]:
"""Parse bookmarks from the Readwise Reader API"""
input_buffer.seek(0)
pattern = re.compile(r"^readwise-reader:\/\/(\w+)")
for line in input_buffer:
if should_parse_as_readwise_reader_api(line):
username = pattern.search(line).group(1)
api = ReadwiseReaderAPI(READWISE_READER_TOKENS[username], cursor=read_cursor(username))
for article in get_readwise_reader_articles(api):
yield link_from_article(article, sources=[line])
if api.cursor:
write_cursor(username, api.cursor)
KEY = "readwise_reader_api"
NAME = "Readwise Reader API"
PARSER = parse_readwise_reader_api_export

View file

@ -1,62 +1,3 @@
{% extends "base.html" %}
{% load static %}
{% block body %}
<div id="toolbar">
<form id="changelist-search" action="{% url 'public-index' %}" method="get">
<div>
<label for="searchbar"><img src="/static/admin/img/search.svg" alt="Search"></label>
<input type="text" size="40" name="q" value="" id="searchbar" autofocus placeholder="Title, URL, tags, timestamp, or content...".>
<input type="submit" value="Search" style="height: 36px; padding-top: 6px; margin: 8px"/>
<input type="button"
value="♺"
title="Refresh..."
onclick="location.href='{% url 'public-index' %}'"
style="background-color: rgba(121, 174, 200, 0.8); height: 30px; font-size: 0.8em; margin-top: 12px; padding-top: 6px; float:right">
</input>
</div>
</form>
</div>
<table id="table-bookmarks">
<thead>
<tr>
<th style="width: 100px;">Bookmarked</th>
<th style="width: 26vw;">Snapshot ({{object_list|length}})</th>
<th style="width: 140px">Files</th>
<th style="width: 16vw;whitespace:nowrap;overflow-x:hidden;">Original URL</th>
</tr>
</thead>
<tbody>
{% for link in object_list %}
{% include 'main_index_row.html' with link=link %}
{% endfor %}
</tbody>
</table>
<center>
<span class="step-links">
{% if page_obj.has_previous %}
<a href="{% url 'public-index' %}?page=1">&laquo; first</a>
<a href="{% url 'public-index' %}?page={{ page_obj.previous_page_number }}">previous</a>
{% endif %}
<span class="current">
Page {{ page_obj.number }} of {{ page_obj.paginator.num_pages }}.
</span>
{% if page_obj.has_next %}
<a href="{% url 'public-index' %}?page={{ page_obj.next_page_number }}">next </a>
<a href="{% url 'public-index' %}?page={{ page_obj.paginator.num_pages }}">last &raquo;</a>
{% endif %}
</span>
{% if page_obj.has_next %}
<a href="{% url 'public-index' %}?page={{ page_obj.next_page_number }}">next </a>
<a href="{% url 'public-index' %}?page={{ page_obj.paginator.num_pages }}">last &raquo;</a>
{% endif %}
</span>
<br>
</center>
{% endblock %}
{% extends "admin/base_site.html" %}
{% load i18n admin_urls static admin_list %}
{% load core_tags %}

View file

@ -33,7 +33,7 @@
<br/>
<div class="loader"></div>
<br/>
Check the server log or the <a href="/admin/core/archiveresult/?o=-1">Log</a> page for progress...
Check the server log or the <a href="/admin/core/archiveresult/?o=-1">Log</a> page for detailed progress...
</center>
</div>
<form id="add-form" method="POST" class="p-form">{% csrf_token %}
@ -46,19 +46,22 @@
</form>
<br/><br/><br/>
<center id="delay-warning" style="display: none">
<small>(it's safe to leave this page, adding will continue in the background)</small>
<small>(you will be redirected to your <a href="/">Snapshot list</a> momentarily, its safe to close this page at any time)</small>
</center>
{% if absolute_add_path %}
<center id="bookmarklet">
<!-- <center id="bookmarklet">
<p>Bookmark this link to quickly add to your archive:
<a href="javascript:void(window.open('{{ absolute_add_path }}?url='+encodeURIComponent(document.location.href)));">Add to ArchiveBox</a></p>
</center>
</center> -->
{% endif %}
<script>
document.getElementById('add-form').addEventListener('submit', function(event) {
document.getElementById('in-progress').style.display = 'block'
document.getElementById('add-form').style.display = 'none'
document.getElementById('delay-warning').style.display = 'block'
setTimeout(function() {
window.location = '/'
}, 2000)
return true
})
</script>

View file

@ -17,6 +17,8 @@ from requests.exceptions import RequestException, ReadTimeout
from .vendor.base32_crockford import encode as base32_encode # type: ignore
from w3lib.encoding import html_body_declared_encoding, http_content_type_encoding
from os.path import lexists
from os import remove as remove_file
try:
import chardet
@ -59,7 +61,7 @@ URL_REGEX = re.compile(
r'(?=('
r'http[s]?://' # start matching from allowed schemes
r'(?:[a-zA-Z]|[0-9]' # followed by allowed alphanum characters
r'|[$-_@.&+]|[!*\(\),]' # or allowed symbols
r'|[-_$@.&+!*\(\),]' # or allowed symbols (keep hyphen first to match literal hyphen)
r'|(?:%[0-9a-fA-F][0-9a-fA-F]))' # or allowed unicode bytes
r'[^\]\[\(\)<>"\'\s]+' # stop parsing at these symbols
r'))',
@ -272,6 +274,16 @@ def chrome_args(**options) -> List[str]:
return cmd_args
def chrome_cleanup():
"""
Cleans up any state or runtime files that chrome leaves behind when killed by
a timeout or other error
"""
from .config import IN_DOCKER
if IN_DOCKER and lexists("/home/archivebox/.config/chromium/SingletonLock"):
remove_file("/home/archivebox/.config/chromium/SingletonLock")
def ansi_to_html(text):
"""