Mirror of https://github.com/ArchiveBox/ArchiveBox.git (synced 2025-05-29 05:55:28 -04:00)

Commit ef856e8051: Merge branch 'dev' into issue1316

50 changed files with 1469 additions and 1694 deletions
@@ -1 +1,7 @@
 __package__ = 'archivebox'
+
+# monkey patch django timezone to add back utc (it was removed in Django 5.0)
+import datetime
+from django.utils import timezone
+timezone.utc = datetime.timezone.utc
+
@@ -0,0 +1 @@
+__package__ = 'archivebox.api'
@@ -1,3 +1,5 @@
+__package__ = 'archivebox.api'
+
 from django.apps import AppConfig
 
 
@@ -1,184 +0,0 @@
-# archivebox_api.py
-from typing import List, Optional
-from enum import Enum
-from pydantic import BaseModel
-from ninja import Router
-from main import (
-    add,
-    remove,
-    update,
-    list_all,
-    ONLY_NEW,
-)  # Assuming these functions are defined in main.py
-
-
-# Schemas
-
-class StatusChoices(str, Enum):
-    indexed = 'indexed'
-    archived = 'archived'
-    unarchived = 'unarchived'
-    present = 'present'
-    valid = 'valid'
-    invalid = 'invalid'
-    duplicate = 'duplicate'
-    orphaned = 'orphaned'
-    corrupted = 'corrupted'
-    unrecognized = 'unrecognized'
-
-
-class AddURLSchema(BaseModel):
-    urls: List[str]
-    tag: str = ""
-    depth: int = 0
-    update: bool = not ONLY_NEW  # Default to the opposite of ONLY_NEW
-    update_all: bool = False
-    index_only: bool = False
-    overwrite: bool = False
-    init: bool = False
-    extractors: str = ""
-    parser: str = "auto"
-
-
-class RemoveURLSchema(BaseModel):
-    yes: bool = False
-    delete: bool = False
-    before: Optional[float] = None
-    after: Optional[float] = None
-    filter_type: str = "exact"
-    filter_patterns: Optional[List[str]] = None
-
-
-class UpdateSchema(BaseModel):
-    resume: Optional[float] = None
-    only_new: Optional[bool] = None
-    index_only: Optional[bool] = False
-    overwrite: Optional[bool] = False
-    before: Optional[float] = None
-    after: Optional[float] = None
-    status: Optional[StatusChoices] = None
-    filter_type: Optional[str] = 'exact'
-    filter_patterns: Optional[List[str]] = None
-    extractors: Optional[str] = ""
-
-
-class ListAllSchema(BaseModel):
-    filter_patterns: Optional[List[str]] = None
-    filter_type: str = 'exact'
-    status: Optional[StatusChoices] = None
-    after: Optional[float] = None
-    before: Optional[float] = None
-    sort: Optional[str] = None
-    csv: Optional[str] = None
-    json: bool = False
-    html: bool = False
-    with_headers: bool = False
-
-
-# API Router
-router = Router()
-
-
-@router.post("/add", response={200: dict})
-def api_add(request, payload: AddURLSchema):
-    try:
-        result = add(
-            urls=payload.urls,
-            tag=payload.tag,
-            depth=payload.depth,
-            update=payload.update,
-            update_all=payload.update_all,
-            index_only=payload.index_only,
-            overwrite=payload.overwrite,
-            init=payload.init,
-            extractors=payload.extractors,
-            parser=payload.parser,
-        )
-        # Currently the add function returns a list of ALL items in the DB, ideally only return new items
-        return {
-            "status": "success",
-            "message": "URLs added successfully.",
-            "result": str(result),
-        }
-    except Exception as e:
-        # Handle exceptions raised by the add function or during processing
-        return {"status": "error", "message": str(e)}
-
-
-@router.post("/remove", response={200: dict})
-def api_remove(request, payload: RemoveURLSchema):
-    try:
-        result = remove(
-            yes=payload.yes,
-            delete=payload.delete,
-            before=payload.before,
-            after=payload.after,
-            filter_type=payload.filter_type,
-            filter_patterns=payload.filter_patterns,
-        )
-        return {
-            "status": "success",
-            "message": "URLs removed successfully.",
-            "result": result,
-        }
-    except Exception as e:
-        # Handle exceptions raised by the remove function or during processing
-        return {"status": "error", "message": str(e)}
-
-
-@router.post("/update", response={200: dict})
-def api_update(request, payload: UpdateSchema):
-    try:
-        result = update(
-            resume=payload.resume,
-            only_new=payload.only_new,
-            index_only=payload.index_only,
-            overwrite=payload.overwrite,
-            before=payload.before,
-            after=payload.after,
-            status=payload.status,
-            filter_type=payload.filter_type,
-            filter_patterns=payload.filter_patterns,
-            extractors=payload.extractors,
-        )
-        return {
-            "status": "success",
-            "message": "Archive updated successfully.",
-            "result": result,
-        }
-    except Exception as e:
-        # Handle exceptions raised by the update function or during processing
-        return {"status": "error", "message": str(e)}
-
-
-@router.post("/list_all", response={200: dict})
-def api_list_all(request, payload: ListAllSchema):
-    try:
-        result = list_all(
-            filter_patterns=payload.filter_patterns,
-            filter_type=payload.filter_type,
-            status=payload.status,
-            after=payload.after,
-            before=payload.before,
-            sort=payload.sort,
-            csv=payload.csv,
-            json=payload.json,
-            html=payload.html,
-            with_headers=payload.with_headers,
-        )
-        # TODO: This is kind of bad, make the format a choice field
-        if payload.json:
-            return {"status": "success", "format": "json", "data": result}
-        elif payload.html:
-            return {"status": "success", "format": "html", "data": result}
-        elif payload.csv:
-            return {"status": "success", "format": "csv", "data": result}
-        else:
-            return {
-                "status": "success",
-                "message": "List generated successfully.",
-                "data": result,
-            }
-    except Exception as e:
-        # Handle exceptions raised by the list_all function or during processing
-        return {"status": "error", "message": str(e)}
@@ -1,48 +1,107 @@
 __package__ = 'archivebox.api'
 
+from typing import Optional
+
+from django.http import HttpRequest
+from django.contrib.auth import login
 from django.contrib.auth import authenticate
-from ninja import Form, Router, Schema
-from ninja.security import HttpBearer
+from django.contrib.auth.models import AbstractBaseUser
 
-from api.models import Token
-
-router = Router()
+from ninja.security import HttpBearer, APIKeyQuery, APIKeyHeader, HttpBasicAuth, django_auth_superuser
 
 
-class GlobalAuth(HttpBearer):
-    def authenticate(self, request, token):
+def auth_using_token(token, request: Optional[HttpRequest]=None) -> Optional[AbstractBaseUser]:
+    """Given an API token string, check if a corresponding non-expired APIToken exists, and return its user"""
+    from api.models import APIToken    # lazy import model to avoid loading it at urls.py import time
+
+    user = None
+
+    submitted_empty_form = token in ('string', '', None)
+    if submitted_empty_form:
+        user = request.user            # see if user is authed via django session and use that as the default
+    else:
         try:
-            return Token.objects.get(token=token).user
-        except Token.DoesNotExist:
+            token = APIToken.objects.get(token=token)
+            if token.is_valid():
+                user = token.user
+        except APIToken.DoesNotExist:
             pass
 
+    if not user:
+        print('[❌] Failed to authenticate API user using API Key:', request)
 
-class AuthSchema(Schema):
-    email: str
-    password: str
+    return None
 
 
-@router.post("/authenticate", auth=None)  # overriding global auth
-def get_token(request, auth_data: AuthSchema):
-    user = authenticate(username=auth_data.email, password=auth_data.password)
-    if user:
-        # Assuming a user can have multiple tokens and you want to create a new one every time
-        new_token = Token.objects.create(user=user)
-        return {"token": new_token.token, "expires": new_token.expiry_as_iso8601}
+def auth_using_password(username, password, request: Optional[HttpRequest]=None) -> Optional[AbstractBaseUser]:
+    """Given a username and password, check if they are valid and return the corresponding user"""
+    user = None
+
+    submitted_empty_form = (username, password) in (('string', 'string'), ('', ''), (None, None))
+    if submitted_empty_form:
+        user = request.user            # see if user is authed via django session and use that as the default
     else:
-        return {"error": "Invalid credentials"}
+        user = authenticate(
+            username=username,
+            password=password,
+        )
 
+    if not user:
+        print('[❌] Failed to authenticate API user using API Key:', request)
 
+    return user
 
-class TokenValidationSchema(Schema):
-    token: str
+
+### Base Auth Types
 
+class APITokenAuthCheck:
+    """The base class for authentication methods that use an api.models.APIToken"""
+    def authenticate(self, request: HttpRequest, key: Optional[str]=None) -> Optional[AbstractBaseUser]:
+        user = auth_using_token(
+            token=key,
+            request=request,
+        )
+        if user is not None:
+            login(request, user, backend='django.contrib.auth.backends.ModelBackend')
+        return user
+
+class UserPassAuthCheck:
+    """The base class for authentication methods that use a username & password"""
+    def authenticate(self, request: HttpRequest, username: Optional[str]=None, password: Optional[str]=None) -> Optional[AbstractBaseUser]:
+        user = auth_using_password(
+            username=username,
+            password=password,
+            request=request,
+        )
+        if user is not None:
+            login(request, user, backend='django.contrib.auth.backends.ModelBackend')
+        return user
 
-@router.post("/validate_token", auth=None)  # No authentication required for this endpoint
-def validate_token(request, token_data: TokenValidationSchema):
-    try:
-        # Attempt to authenticate using the provided token
-        user = GlobalAuth().authenticate(request, token_data.token)
-        if user:
-            return {"status": "valid"}
-        else:
-            return {"status": "invalid"}
-    except Token.DoesNotExist:
-        return {"status": "invalid"}
+
+### Django-Ninja-Provided Auth Methods
 
+class UsernameAndPasswordAuth(UserPassAuthCheck, HttpBasicAuth):
+    """Allow authenticating by passing username & password via HTTP Basic Authentication (not recommended)"""
+    pass
+
+class QueryParamTokenAuth(APITokenAuthCheck, APIKeyQuery):
+    """Allow authenticating by passing api_key=xyz as a GET/POST query parameter"""
+    param_name = "api_key"
+
+class HeaderTokenAuth(APITokenAuthCheck, APIKeyHeader):
+    """Allow authenticating by passing X-API-Key=xyz as a request header"""
+    param_name = "X-API-Key"
+
+class BearerTokenAuth(APITokenAuthCheck, HttpBearer):
+    """Allow authenticating by passing Bearer=xyz as a request header"""
+    pass
+
+
+### Enabled Auth Methods
+
+API_AUTH_METHODS = [
+    QueryParamTokenAuth(),
+    HeaderTokenAuth(),
+    BearerTokenAuth(),
+    django_auth_superuser,
+    UsernameAndPasswordAuth(),
+]
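
Usage sketch (not part of the diff): the stacked auth classes above mean the same request can authenticate several different ways. A rough illustration with Python's requests library — the base URL, token value, and the /api/v1/core/snapshots endpoint path are assumptions based on the routers added elsewhere in this commit:

    import requests

    BASE = 'http://localhost:8000'        # assumed dev server address
    TOKEN = '<your 32-char api token>'    # e.g. from /admin/api/ or /api/v1/auth/get_api_token

    # 1. QueryParamTokenAuth: api_key=... as a query parameter
    r1 = requests.get(f'{BASE}/api/v1/core/snapshots', params={'api_key': TOKEN})

    # 2. HeaderTokenAuth: X-API-Key request header
    r2 = requests.get(f'{BASE}/api/v1/core/snapshots', headers={'X-API-Key': TOKEN})

    # 3. BearerTokenAuth: standard Authorization: Bearer header
    r3 = requests.get(f'{BASE}/api/v1/core/snapshots', headers={'Authorization': f'Bearer {TOKEN}'})

    # 4. UsernameAndPasswordAuth: HTTP Basic (works, but "not recommended" per the docstring)
    r4 = requests.get(f'{BASE}/api/v1/core/snapshots', auth=('admin', 'password'))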
@@ -1,9 +1,10 @@
-# Generated by Django 3.1.14 on 2024-04-09 18:52
+# Generated by Django 4.2.11 on 2024-04-25 04:19
 
+import api.models
 from django.conf import settings
 from django.db import migrations, models
 import django.db.models.deletion
 import uuid
 
 
 class Migration(migrations.Migration):
@@ -16,13 +17,13 @@ class Migration(migrations.Migration):
 
     operations = [
         migrations.CreateModel(
-            name='Token',
+            name='APIToken',
             fields=[
-                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
-                ('token', models.CharField(default=auth.models.hex_uuid, max_length=32, unique=True)),
+                ('id', models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False)),
+                ('token', models.CharField(default=api.models.generate_secret_token, max_length=32, unique=True)),
                 ('created', models.DateTimeField(auto_now_add=True)),
-                ('expiry', models.DateTimeField(blank=True, null=True)),
-                ('user', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='tokens', to=settings.AUTH_USER_MODEL)),
+                ('expires', models.DateTimeField(blank=True, null=True)),
+                ('user', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)),
             ],
         ),
     ]
archivebox/api/migrations/0002_alter_apitoken_options.py (new file, 17 lines)
@@ -0,0 +1,17 @@
+# Generated by Django 5.0.4 on 2024-04-26 05:28
+
+from django.db import migrations
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('api', '0001_initial'),
+    ]
+
+    operations = [
+        migrations.AlterModelOptions(
+            name='apitoken',
+            options={'verbose_name': 'API Key', 'verbose_name_plural': 'API Keys'},
+        ),
+    ]
@@ -1,30 +1,63 @@
 __package__ = 'archivebox.api'
 
 import uuid
+import secrets
 from datetime import timedelta
 
 from django.conf import settings
 from django.db import models
 from django.utils import timezone
+from django.utils.translation import gettext_lazy as _
 
-def hex_uuid():
-    return uuid.uuid4().hex
+from django_stubs_ext.db.models import TypedModelMeta
 
 
-class Token(models.Model):
-    user = models.ForeignKey(
-        settings.AUTH_USER_MODEL, on_delete=models.CASCADE, related_name="tokens"
-    )
-    token = models.CharField(max_length=32, default=hex_uuid, unique=True)
+def generate_secret_token() -> str:
+    # returns cryptographically secure string with len() == 32
+    return secrets.token_hex(16)
+
+
+class APIToken(models.Model):
+    id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)
+
+    user = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE)
+    token = models.CharField(max_length=32, default=generate_secret_token, unique=True)
+
     created = models.DateTimeField(auto_now_add=True)
-    expiry = models.DateTimeField(null=True, blank=True)
+    expires = models.DateTimeField(null=True, blank=True)
+
+    class Meta(TypedModelMeta):
+        verbose_name = "API Key"
+        verbose_name_plural = "API Keys"
+
+    def __str__(self) -> str:
+        return self.token
+
+    def __repr__(self) -> str:
+        return f'<APIToken user={self.user.username} token=************{self.token[-4:]}>'
+
+    def __json__(self) -> dict:
+        return {
+            "TYPE": "APIToken",
+            "id": str(self.id),
+            "user_id": str(self.user.id),
+            "user_username": self.user.username,
+            "token": self.token,
+            "created": self.created.isoformat(),
+            "expires": self.expires_as_iso8601,
+        }
 
     @property
-    def expiry_as_iso8601(self):
+    def expires_as_iso8601(self):
         """Returns the expiry date of the token in ISO 8601 format or a date 100 years in the future if none."""
-        expiry_date = (
-            self.expiry if self.expiry else timezone.now() + timedelta(days=365 * 100)
-        )
+        expiry_date = self.expires or (timezone.now() + timedelta(days=365 * 100))
+
         return expiry_date.isoformat()
 
-    def __str__(self):
-        return self.token
+    def is_valid(self, for_date=None):
+        for_date = for_date or timezone.now()
+
+        if self.expires and self.expires < for_date:
+            return False
+
+        return True
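
Usage sketch (not part of the diff): how the new model behaves in a Django shell, assuming a user named 'admin' already exists in the database:

    from datetime import timedelta
    from django.utils import timezone
    from django.contrib.auth import get_user_model
    from api.models import APIToken

    user = get_user_model().objects.get(username='admin')   # assumed existing user

    token = APIToken.objects.create(user=user)   # token string auto-filled by generate_secret_token()
    assert len(token.token) == 32 and token.is_valid()

    # a token whose expiry date is in the past fails the is_valid() check
    token.expires = timezone.now() - timedelta(days=1)
    assert not token.is_valid()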
@@ -1,27 +1,30 @@
 __package__ = 'archivebox.api'
 
 from django.test import TestCase
 from ninja.testing import TestClient
-from archivebox.api.archive import router as archive_router
 
-class ArchiveBoxAPITestCase(TestCase):
+from .routes_cli import router
+
+
+class ArchiveBoxCLIAPITestCase(TestCase):
     def setUp(self):
-        self.client = TestClient(archive_router)
+        self.client = TestClient(router)
 
     def test_add_endpoint(self):
-        response = self.client.post("/add", json={"urls": ["http://example.com"], "tag": "test"})
+        response = self.client.post("/add", json={"urls": ["http://example.com"], "tag": "testTag1,testTag2"})
         self.assertEqual(response.status_code, 200)
-        self.assertEqual(response.json()["status"], "success")
+        self.assertTrue(response.json()["success"])
 
     def test_remove_endpoint(self):
         response = self.client.post("/remove", json={"filter_patterns": ["http://example.com"]})
         self.assertEqual(response.status_code, 200)
-        self.assertEqual(response.json()["status"], "success")
+        self.assertTrue(response.json()["success"])
 
     def test_update_endpoint(self):
         response = self.client.post("/update", json={})
         self.assertEqual(response.status_code, 200)
-        self.assertEqual(response.json()["status"], "success")
+        self.assertTrue(response.json()["success"])
 
     def test_list_all_endpoint(self):
         response = self.client.post("/list_all", json={})
         self.assertEqual(response.status_code, 200)
-        self.assertTrue("success" in response.json()["status"])
+        self.assertTrue(response.json()["success"])
archivebox/api/urls.py (new file, 17 lines)
@@ -0,0 +1,17 @@
+__package__ = 'archivebox.api'
+
+from django.urls import path
+from django.views.generic.base import RedirectView
+
+from .v1_api import urls as v1_api_urls
+
+urlpatterns = [
+    path("", RedirectView.as_view(url='/api/v1')),
+
+    path("v1/", v1_api_urls),
+    path("v1", RedirectView.as_view(url='/api/v1/docs')),
+
+    # ... v2 can be added here ...
+    # path("v2/", v2_api_urls),
+    # path("v2", RedirectView.as_view(url='/api/v2/docs')),
+]
archivebox/api/v1_api.py (new file, 111 lines)
@@ -0,0 +1,111 @@
+__package__ = 'archivebox.api'
+
+
+from io import StringIO
+from traceback import format_exception
+from contextlib import redirect_stdout, redirect_stderr
+
+from django.http import HttpRequest, HttpResponse
+from django.core.exceptions import ObjectDoesNotExist, EmptyResultSet, PermissionDenied
+
+from ninja import NinjaAPI, Swagger
+
+# TODO: explore adding https://eadwincode.github.io/django-ninja-extra/
+
+from api.auth import API_AUTH_METHODS
+from ..config import VERSION, COMMIT_HASH
+
+
+COMMIT_HASH = COMMIT_HASH or 'unknown'
+
+html_description=f'''
+<h3>Welcome to your ArchiveBox server's REST API <code>[v1 ALPHA]</code> homepage!</h3>
+<br/>
+<i><b>WARNING: This API is still in an early development stage and may change!</b></i>
+<br/>
+<ul>
+<li>⬅️ Manage your server: <a href="/admin/api/"><b>Setup API Keys</b></a>, <a href="/admin/">Go to your Server Admin UI</a>, <a href="/">Go to your Snapshots list</a>
+<li>💬 Ask questions and get help here: <a href="https://zulip.archivebox.io">ArchiveBox Chat Forum</a></li>
+<li>🐞 Report API bugs here: <a href="https://github.com/ArchiveBox/ArchiveBox/issues">Github Issues</a></li>
+<li>📚 ArchiveBox Documentation: <a href="https://github.com/ArchiveBox/ArchiveBox/wiki">Github Wiki</a></li>
+<li>📜 See the API source code: <a href="https://github.com/ArchiveBox/ArchiveBox/blob/dev/archivebox/api"><code>archivebox/api/</code></a></li>
+</ul>
+<small>Served by ArchiveBox v{VERSION} (<a href="https://github.com/ArchiveBox/ArchiveBox/commit/{COMMIT_HASH}"><code>{COMMIT_HASH[:8]}</code></a>), API powered by <a href="https://django-ninja.dev/"><code>django-ninja</code></a>.</small>
+'''
+
+
+def register_urls(api: NinjaAPI) -> NinjaAPI:
+    api.add_router('/auth/', 'api.v1_auth.router')
+    api.add_router('/core/', 'api.v1_core.router')
+    api.add_router('/cli/',  'api.v1_cli.router')
+    return api
+
+
+class NinjaAPIWithIOCapture(NinjaAPI):
+    def create_temporal_response(self, request: HttpRequest) -> HttpResponse:
+        stdout, stderr = StringIO(), StringIO()
+
+        with redirect_stderr(stderr):
+            with redirect_stdout(stdout):
+                request.stdout = stdout
+                request.stderr = stderr
+
+                response = super().create_temporal_response(request)
+
+        print('RESPONDING NOW', response)
+
+        return response
+
+
+api = NinjaAPIWithIOCapture(
+    title='ArchiveBox API',
+    description=html_description,
+    version='1.0.0',
+    csrf=False,
+    auth=API_AUTH_METHODS,
+    urls_namespace="api",
+    docs=Swagger(settings={"persistAuthorization": True}),
+    # docs_decorator=login_required,
+    # renderer=ORJSONRenderer(),
+)
+api = register_urls(api)
+urls = api.urls
+
+
+@api.exception_handler(Exception)
+def generic_exception_handler(request, err):
+    status = 503
+    if isinstance(err, (ObjectDoesNotExist, EmptyResultSet, PermissionDenied)):
+        status = 404
+
+    print(''.join(format_exception(err)))
+
+    return api.create_response(
+        request,
+        {
+            "succeeded": False,
+            "message": f'{err.__class__.__name__}: {err}',
+            "errors": [
+                ''.join(format_exception(err)),
+                # or send simpler parent-only traceback:
+                # *([str(err.__context__)] if getattr(err, '__context__', None) else []),
+            ],
+        },
+        status=status,
+    )
+
+
+
+# import orjson
+# from ninja.renderers import BaseRenderer
+# class ORJSONRenderer(BaseRenderer):
+#     media_type = "application/json"
+#     def render(self, request, data, *, response_status):
+#         return {
+#             "success": True,
+#             "errors": [],
+#             "result": data,
+#             "stdout": ansi_to_html(stdout.getvalue().strip()),
+#             "stderr": ansi_to_html(stderr.getvalue().strip()),
+#         }
+#         return orjson.dumps(data)
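
Side note (not part of the diff): the create_temporal_response override above leans on contextlib's output redirection to buffer anything the CLI functions print during a request. A standalone sketch of that same capture pattern:

    from io import StringIO
    from contextlib import redirect_stdout, redirect_stderr

    stdout, stderr = StringIO(), StringIO()
    with redirect_stderr(stderr):
        with redirect_stdout(stdout):
            print('this goes into the buffer, not the terminal')

    assert 'buffer' in stdout.getvalue()   # the captured text can then be attached to the response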
archivebox/api/v1_auth.py (new file, 52 lines)
@@ -0,0 +1,52 @@
+__package__ = 'archivebox.api'
+
+from typing import Optional
+
+from ninja import Router, Schema
+
+from api.models import APIToken
+from api.auth import auth_using_token, auth_using_password
+
+
+router = Router(tags=['Authentication'])
+
+
+class PasswordAuthSchema(Schema):
+    """Schema for a /get_api_token request"""
+    username: Optional[str] = None
+    password: Optional[str] = None
+
+
+@router.post("/get_api_token", auth=None, summary='Generate an API token for a given username & password (or currently logged-in user)')  # auth=None because they are not authed yet
+def get_api_token(request, auth_data: PasswordAuthSchema):
+    user = auth_using_password(
+        username=auth_data.username,
+        password=auth_data.password,
+        request=request,
+    )
+
+    if user:
+        # TODO: support multiple tokens in the future, for now we just have one per user
+        api_token, created = APIToken.objects.get_or_create(user=user)
+
+        return api_token.__json__()
+
+    return {"success": False, "errors": ["Invalid credentials"]}
+
+
+
+class TokenAuthSchema(Schema):
+    """Schema for a /check_api_token request"""
+    token: str
+
+
+@router.post("/check_api_token", auth=None, summary='Validate an API token to make sure its valid and non-expired')  # auth=None because they are not authed yet
+def check_api_token(request, token_data: TokenAuthSchema):
+    user = auth_using_token(
+        token=token_data.token,
+        request=request,
+    )
+    if user:
+        return {"success": True, "user_id": str(user.id)}
+
+    return {"success": False, "user_id": None}
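
Usage sketch (not part of the diff): the resulting token lifecycle over HTTP. The base URL and credentials are placeholders; the /api/v1/ prefix comes from api/urls.py plus the /auth/ router mount in v1_api.py:

    import requests

    BASE = 'http://localhost:8000/api/v1'   # assumed dev server address

    # exchange a username & password for an API token (one token per user for now)
    resp = requests.post(f'{BASE}/auth/get_api_token',
                         json={'username': 'admin', 'password': 'password'})
    token = resp.json()['token']

    # verify the token is valid and non-expired
    check = requests.post(f'{BASE}/auth/check_api_token', json={'token': token})
    assert check.json()['success'] is True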
archivebox/api/v1_cli.py (new file, 234 lines)
@@ -0,0 +1,234 @@
+__package__ = 'archivebox.api'
+
+from typing import List, Dict, Any, Optional
+from enum import Enum
+
+from ninja import Router, Schema
+
+from ..main import (
+    add,
+    remove,
+    update,
+    list_all,
+    schedule,
+)
+from ..util import ansi_to_html
+from ..config import ONLY_NEW
+
+
+# router for API that exposes archivebox cli subcommands as REST endpoints
+router = Router(tags=['ArchiveBox CLI Sub-Commands'])
+
+
+# Schemas
+
+JSONType = List[Any] | Dict[str, Any] | bool | int | str | None
+
+class CLICommandResponseSchema(Schema):
+    success: bool
+    errors: List[str]
+    result: JSONType
+    stdout: str
+    stderr: str
+
+class FilterTypeChoices(str, Enum):
+    exact = 'exact'
+    substring = 'substring'
+    regex = 'regex'
+    domain = 'domain'
+    tag = 'tag'
+    timestamp = 'timestamp'
+
+class StatusChoices(str, Enum):
+    indexed = 'indexed'
+    archived = 'archived'
+    unarchived = 'unarchived'
+    present = 'present'
+    valid = 'valid'
+    invalid = 'invalid'
+    duplicate = 'duplicate'
+    orphaned = 'orphaned'
+    corrupted = 'corrupted'
+    unrecognized = 'unrecognized'
+
+
+class AddCommandSchema(Schema):
+    urls: List[str]
+    tag: str = ""
+    depth: int = 0
+    update: bool = not ONLY_NEW  # Default to the opposite of ONLY_NEW
+    update_all: bool = False
+    index_only: bool = False
+    overwrite: bool = False
+    init: bool = False
+    extractors: str = ""
+    parser: str = "auto"
+
+class UpdateCommandSchema(Schema):
+    resume: Optional[float] = 0
+    only_new: bool = ONLY_NEW
+    index_only: bool = False
+    overwrite: bool = False
+    after: Optional[float] = 0
+    before: Optional[float] = 999999999999999
+    status: Optional[StatusChoices] = StatusChoices.unarchived
+    filter_type: Optional[str] = FilterTypeChoices.substring
+    filter_patterns: Optional[List[str]] = ['https://example.com']
+    extractors: Optional[str] = ""
+
+class ScheduleCommandSchema(Schema):
+    import_path: Optional[str] = None
+    add: bool = False
+    every: Optional[str] = None
+    tag: str = ''
+    depth: int = 0
+    overwrite: bool = False
+    update: bool = not ONLY_NEW
+    clear: bool = False
+
+class ListCommandSchema(Schema):
+    filter_patterns: Optional[List[str]] = ['https://example.com']
+    filter_type: str = FilterTypeChoices.substring
+    status: Optional[StatusChoices] = StatusChoices.indexed
+    after: Optional[float] = 0
+    before: Optional[float] = 999999999999999
+    sort: str = 'added'
+    as_json: bool = True
+    as_html: bool = False
+    as_csv: str | bool = 'timestamp,url'
+    with_headers: bool = False
+
+class RemoveCommandSchema(Schema):
+    delete: bool = True
+    after: Optional[float] = 0
+    before: Optional[float] = 999999999999999
+    filter_type: str = FilterTypeChoices.exact
+    filter_patterns: Optional[List[str]] = ['https://example.com']
+
+
+@router.post("/add", response=CLICommandResponseSchema, summary='archivebox add [args] [urls]')
+def cli_add(request, args: AddCommandSchema):
+    result = add(
+        urls=args.urls,
+        tag=args.tag,
+        depth=args.depth,
+        update=args.update,
+        update_all=args.update_all,
+        index_only=args.index_only,
+        overwrite=args.overwrite,
+        init=args.init,
+        extractors=args.extractors,
+        parser=args.parser,
+    )
+
+    return {
+        "success": True,
+        "errors": [],
+        "result": result,
+        "stdout": ansi_to_html(request.stdout.getvalue().strip()),
+        "stderr": ansi_to_html(request.stderr.getvalue().strip()),
+    }
+
+
+@router.post("/update", response=CLICommandResponseSchema, summary='archivebox update [args] [filter_patterns]')
+def cli_update(request, args: UpdateCommandSchema):
+    result = update(
+        resume=args.resume,
+        only_new=args.only_new,
+        index_only=args.index_only,
+        overwrite=args.overwrite,
+        before=args.before,
+        after=args.after,
+        status=args.status,
+        filter_type=args.filter_type,
+        filter_patterns=args.filter_patterns,
+        extractors=args.extractors,
+    )
+    return {
+        "success": True,
+        "errors": [],
+        "result": result,
+        "stdout": ansi_to_html(request.stdout.getvalue().strip()),
+        "stderr": ansi_to_html(request.stderr.getvalue().strip()),
+    }
+
+
+@router.post("/schedule", response=CLICommandResponseSchema, summary='archivebox schedule [args] [import_path]')
+def cli_schedule(request, args: ScheduleCommandSchema):
+    result = schedule(
+        import_path=args.import_path,
+        add=args.add,
+        show=args.show,
+        clear=args.clear,
+        every=args.every,
+        tag=args.tag,
+        depth=args.depth,
+        overwrite=args.overwrite,
+        update=args.update,
+    )
+
+    return {
+        "success": True,
+        "errors": [],
+        "result": result,
+        "stdout": ansi_to_html(request.stdout.getvalue().strip()),
+        "stderr": ansi_to_html(request.stderr.getvalue().strip()),
+    }
+
+
+@router.post("/list", response=CLICommandResponseSchema, summary='archivebox list [args] [filter_patterns]')
+def cli_list(request, args: ListCommandSchema):
+    result = list_all(
+        filter_patterns=args.filter_patterns,
+        filter_type=args.filter_type,
+        status=args.status,
+        after=args.after,
+        before=args.before,
+        sort=args.sort,
+        csv=args.as_csv,
+        json=args.as_json,
+        html=args.as_html,
+        with_headers=args.with_headers,
+    )
+
+    result_format = 'txt'
+    if args.as_json:
+        result_format = "json"
+    elif args.as_html:
+        result_format = "html"
+    elif args.as_csv:
+        result_format = "csv"
+
+    return {
+        "success": True,
+        "errors": [],
+        "result": result,
+        "result_format": result_format,
+        "stdout": ansi_to_html(request.stdout.getvalue().strip()),
+        "stderr": ansi_to_html(request.stderr.getvalue().strip()),
+    }
+
+
+@router.post("/remove", response=CLICommandResponseSchema, summary='archivebox remove [args] [filter_patterns]')
+def cli_remove(request, args: RemoveCommandSchema):
+    result = remove(
+        yes=True,  # no way to interactively ask for confirmation via API, so we force yes
+        delete=args.delete,
+        before=args.before,
+        after=args.after,
+        filter_type=args.filter_type,
+        filter_patterns=args.filter_patterns,
+    )
+    return {
+        "success": True,
+        "errors": [],
+        "result": result,
+        "stdout": ansi_to_html(request.stdout.getvalue().strip()),
+        "stderr": ansi_to_html(request.stderr.getvalue().strip()),
+    }
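
Usage sketch (not part of the diff): calling the CLI-mirroring endpoints. The base URL and token are placeholders; field names follow AddCommandSchema above:

    import requests

    BASE = 'http://localhost:8000/api/v1'   # assumed dev server address
    TOKEN = '<api token>'                   # placeholder

    resp = requests.post(
        f'{BASE}/cli/add',
        headers={'X-API-Key': TOKEN},
        json={
            'urls': ['https://example.com'],
            'tag': 'testTag1,testTag2',
            'depth': 0,
            'index_only': False,
        },
    )
    print(resp.json()['stdout'])   # CLI output, captured per-request and converted via ansi_to_html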
archivebox/api/v1_core.py (new file, 210 lines)
@@ -0,0 +1,210 @@
+__package__ = 'archivebox.api'
+
+from uuid import UUID
+from typing import List, Optional
+from datetime import datetime
+
+from django.shortcuts import get_object_or_404
+
+from ninja import Router, Schema, FilterSchema, Field, Query
+from ninja.pagination import paginate
+
+from core.models import Snapshot, ArchiveResult, Tag
+
+
+router = Router(tags=['Core Models'])
+
+
+### ArchiveResult #########################################################################
+
+class ArchiveResultSchema(Schema):
+    id: UUID
+
+    snapshot_id: UUID
+    snapshot_url: str
+    snapshot_tags: str
+
+    extractor: str
+    cmd: List[str]
+    pwd: str
+    cmd_version: str
+    output: str
+    status: str
+
+    created: datetime
+
+    @staticmethod
+    def resolve_id(obj):
+        return obj.uuid
+
+    @staticmethod
+    def resolve_created(obj):
+        return obj.start_ts
+
+    @staticmethod
+    def resolve_snapshot_url(obj):
+        return obj.snapshot.url
+
+    @staticmethod
+    def resolve_snapshot_tags(obj):
+        return obj.snapshot.tags_str()
+
+
+class ArchiveResultFilterSchema(FilterSchema):
+    id: Optional[UUID] = Field(None, q='uuid')
+
+    search: Optional[str] = Field(None, q=['snapshot__url__icontains', 'snapshot__title__icontains', 'snapshot__tags__name__icontains', 'extractor', 'output__icontains'])
+    snapshot_id: Optional[UUID] = Field(None, q='snapshot_id')
+    snapshot_url: Optional[str] = Field(None, q='snapshot__url')
+    snapshot_tag: Optional[str] = Field(None, q='snapshot__tags__name')
+
+    status: Optional[str] = Field(None, q='status')
+    output: Optional[str] = Field(None, q='output__icontains')
+    extractor: Optional[str] = Field(None, q='extractor__icontains')
+    cmd: Optional[str] = Field(None, q='cmd__0__icontains')
+    pwd: Optional[str] = Field(None, q='pwd__icontains')
+    cmd_version: Optional[str] = Field(None, q='cmd_version')
+
+    created: Optional[datetime] = Field(None, q='updated')
+    created__gte: Optional[datetime] = Field(None, q='updated__gte')
+    created__lt: Optional[datetime] = Field(None, q='updated__lt')
+
+
+@router.get("/archiveresults", response=List[ArchiveResultSchema])
+@paginate
+def list_archiveresults(request, filters: ArchiveResultFilterSchema = Query(...)):
+    qs = ArchiveResult.objects.all()
+    results = filters.filter(qs)
+    return results
+
+
+@router.get("/archiveresult/{archiveresult_id}", response=ArchiveResultSchema)
+def get_archiveresult(request, archiveresult_id: str):
+    archiveresult = get_object_or_404(ArchiveResult, id=archiveresult_id)
+    return archiveresult
+
+
+# @router.post("/archiveresult", response=ArchiveResultSchema)
+# def create_archiveresult(request, payload: ArchiveResultSchema):
+#     archiveresult = ArchiveResult.objects.create(**payload.dict())
+#     return archiveresult
+#
+# @router.put("/archiveresult/{archiveresult_id}", response=ArchiveResultSchema)
+# def update_archiveresult(request, archiveresult_id: str, payload: ArchiveResultSchema):
+#     archiveresult = get_object_or_404(ArchiveResult, id=archiveresult_id)
+#
+#     for attr, value in payload.dict().items():
+#         setattr(archiveresult, attr, value)
+#     archiveresult.save()
+#
+#     return archiveresult
+#
+# @router.delete("/archiveresult/{archiveresult_id}")
+# def delete_archiveresult(request, archiveresult_id: str):
+#     archiveresult = get_object_or_404(ArchiveResult, id=archiveresult_id)
+#     archiveresult.delete()
+#     return {"success": True}
+
+
+### Snapshot #########################################################################
+
+class SnapshotSchema(Schema):
+    id: UUID
+
+    url: str
+    tags: str
+    title: Optional[str]
+    timestamp: str
+    bookmarked: datetime
+    added: datetime
+    updated: datetime
+    archive_path: str
+
+    archiveresults: List[ArchiveResultSchema]
+
+    # @staticmethod
+    # def resolve_id(obj):
+    #     return str(obj.id)
+
+    @staticmethod
+    def resolve_tags(obj):
+        return obj.tags_str()
+
+    @staticmethod
+    def resolve_archiveresults(obj, context):
+        if context['request'].with_archiveresults:
+            return obj.archiveresult_set.all().distinct()
+        return ArchiveResult.objects.none()
+
+
+class SnapshotFilterSchema(FilterSchema):
+    id: Optional[UUID] = Field(None, q='id')
+
+    search: Optional[str] = Field(None, q=['url__icontains', 'title__icontains', 'tags__name__icontains'])
+    url: Optional[str] = Field(None, q='url')
+    tag: Optional[str] = Field(None, q='tags__name')
+    title: Optional[str] = Field(None, q='title__icontains')
+
+    timestamp: Optional[str] = Field(None, q='timestamp__startswith')
+
+    added: Optional[datetime] = Field(None, q='added')
+    added__gte: Optional[datetime] = Field(None, q='added__gte')
+    added__lt: Optional[datetime] = Field(None, q='added__lt')
+
+
+@router.get("/snapshots", response=List[SnapshotSchema])
+@paginate
+def list_snapshots(request, filters: SnapshotFilterSchema = Query(...), with_archiveresults: bool=True):
+    request.with_archiveresults = with_archiveresults
+
+    qs = Snapshot.objects.all()
+    results = filters.filter(qs)
+    return results
+
+
+@router.get("/snapshot/{snapshot_id}", response=SnapshotSchema)
+def get_snapshot(request, snapshot_id: str, with_archiveresults: bool=True):
+    request.with_archiveresults = with_archiveresults
+    snapshot = get_object_or_404(Snapshot, id=snapshot_id)
+    return snapshot
+
+
+# @router.post("/snapshot", response=SnapshotSchema)
+# def create_snapshot(request, payload: SnapshotSchema):
+#     snapshot = Snapshot.objects.create(**payload.dict())
+#     return snapshot
+#
+# @router.put("/snapshot/{snapshot_id}", response=SnapshotSchema)
+# def update_snapshot(request, snapshot_id: str, payload: SnapshotSchema):
+#     snapshot = get_object_or_404(Snapshot, id=snapshot_id)
+#
+#     for attr, value in payload.dict().items():
+#         setattr(snapshot, attr, value)
+#     snapshot.save()
+#
+#     return snapshot
+#
+# @router.delete("/snapshot/{snapshot_id}")
+# def delete_snapshot(request, snapshot_id: str):
+#     snapshot = get_object_or_404(Snapshot, id=snapshot_id)
+#     snapshot.delete()
+#     return {"success": True}
+
+
+### Tag #########################################################################
+
+class TagSchema(Schema):
+    name: str
+    slug: str
+
+
+@router.get("/tags", response=List[TagSchema])
+def list_tags(request):
+    return Tag.objects.all()
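
Usage sketch (not part of the diff): querying the new read-only model endpoints. The base URL and token are placeholders; filter names follow SnapshotFilterSchema above, and the items/count envelope comes from django-ninja's default @paginate wrapper:

    import requests

    BASE = 'http://localhost:8000/api/v1'   # assumed dev server address
    TOKEN = '<api token>'                   # placeholder

    # list snapshots matching a search term, skipping the nested archiveresults
    resp = requests.get(
        f'{BASE}/core/snapshots',
        headers={'X-API-Key': TOKEN},
        params={'search': 'example.com', 'with_archiveresults': 'false'},
    )
    for snapshot in resp.json()['items']:
        print(snapshot['id'], snapshot['url'])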
@@ -112,7 +112,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
         'LDAP_FIRSTNAME_ATTR': {'type': str, 'default': None},
         'LDAP_LASTNAME_ATTR': {'type': str, 'default': None},
         'LDAP_EMAIL_ATTR': {'type': str, 'default': None},
-        'LDAP_CREATE_SUPERUSER': {'type': bool, 'default': False},
+        'LDAP_CREATE_SUPERUSER': {'type': bool, 'default': False},
     },
 
     'ARCHIVE_METHOD_TOGGLES': {
@@ -265,7 +265,7 @@ CONFIG_ALIASES = {
     for key, default in section.items()
     for alias in default.get('aliases', ())
 }
-USER_CONFIG = {key for section in CONFIG_SCHEMA.values() for key in section.keys()}
+USER_CONFIG = {key: section[key] for section in CONFIG_SCHEMA.values() for key in section.keys()}
 
 def get_real_name(key: str) -> str:
     """get the current canonical name for a given deprecated config key"""
@@ -282,6 +282,7 @@ ARCHIVE_DIR_NAME = 'archive'
 SOURCES_DIR_NAME = 'sources'
 LOGS_DIR_NAME = 'logs'
 PERSONAS_DIR_NAME = 'personas'
+CRONTABS_DIR_NAME = 'crontabs'
 SQL_INDEX_FILENAME = 'index.sqlite3'
 JSON_INDEX_FILENAME = 'index.json'
 HTML_INDEX_FILENAME = 'index.html'
@@ -355,7 +356,7 @@ ALLOWED_IN_OUTPUT_DIR = {
     'static',
     'sonic',
     'search.sqlite3',
-    'crontabs',
+    CRONTABS_DIR_NAME,
     ARCHIVE_DIR_NAME,
     SOURCES_DIR_NAME,
     LOGS_DIR_NAME,
@@ -598,7 +599,6 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
 
     'DEPENDENCIES': {'default': lambda c: get_dependency_info(c)},
     'CODE_LOCATIONS': {'default': lambda c: get_code_locations(c)},
-    'EXTERNAL_LOCATIONS': {'default': lambda c: get_external_locations(c)},
     'DATA_LOCATIONS': {'default': lambda c: get_data_locations(c)},
     'CHROME_OPTIONS': {'default': lambda c: get_chrome_info(c)},
    'CHROME_EXTRA_ARGS': {'default': lambda c: c['CHROME_EXTRA_ARGS'] or []},
@@ -985,11 +985,6 @@ def get_code_locations(config: ConfigDict) -> SimpleConfigValueDict:
         'enabled': True,
         'is_valid': (config['TEMPLATES_DIR'] / 'static').exists(),
     },
-    'CUSTOM_TEMPLATES_DIR': {
-        'path': config['CUSTOM_TEMPLATES_DIR'] and Path(config['CUSTOM_TEMPLATES_DIR']).resolve(),
-        'enabled': bool(config['CUSTOM_TEMPLATES_DIR']),
-        'is_valid': config['CUSTOM_TEMPLATES_DIR'] and Path(config['CUSTOM_TEMPLATES_DIR']).exists(),
-    },
     # 'NODE_MODULES_DIR': {
     #     'path': ,
     #     'enabled': ,
@@ -997,50 +992,25 @@ def get_code_locations(config: ConfigDict) -> SimpleConfigValueDict:
     # },
 }
 
-def get_external_locations(config: ConfigDict) -> ConfigValue:
-    abspath = lambda path: None if path is None else Path(path).resolve()
-    return {
-        'CHROME_USER_DATA_DIR': {
-            'path': abspath(config['CHROME_USER_DATA_DIR']),
-            'enabled': config['USE_CHROME'] and config['CHROME_USER_DATA_DIR'],
-            'is_valid': False if config['CHROME_USER_DATA_DIR'] is None else (Path(config['CHROME_USER_DATA_DIR']) / 'Default').exists(),
-        },
-        'COOKIES_FILE': {
-            'path': abspath(config['COOKIES_FILE']),
-            'enabled': config['USE_WGET'] and config['COOKIES_FILE'],
-            'is_valid': False if config['COOKIES_FILE'] is None else Path(config['COOKIES_FILE']).exists(),
-        },
-    }
-
 def get_data_locations(config: ConfigDict) -> ConfigValue:
     return {
+        # OLD: migrating to personas
+        # 'CHROME_USER_DATA_DIR': {
+        #     'path': os.path.abspath(config['CHROME_USER_DATA_DIR']),
+        #     'enabled': config['USE_CHROME'] and config['CHROME_USER_DATA_DIR'],
+        #     'is_valid': False if config['CHROME_USER_DATA_DIR'] is None else (Path(config['CHROME_USER_DATA_DIR']) / 'Default').exists(),
+        # },
+        # 'COOKIES_FILE': {
+        #     'path': os.path.abspath(config['COOKIES_FILE']),
+        #     'enabled': config['USE_WGET'] and config['COOKIES_FILE'],
+        #     'is_valid': False if config['COOKIES_FILE'] is None else Path(config['COOKIES_FILE']).exists(),
+        # },
         'OUTPUT_DIR': {
             'path': config['OUTPUT_DIR'].resolve(),
             'enabled': True,
             'is_valid': (config['OUTPUT_DIR'] / SQL_INDEX_FILENAME).exists(),
+            'is_mount': os.path.ismount(config['OUTPUT_DIR'].resolve()),
         },
-        'SOURCES_DIR': {
-            'path': config['SOURCES_DIR'].resolve(),
-            'enabled': True,
-            'is_valid': config['SOURCES_DIR'].exists(),
-        },
-        'LOGS_DIR': {
-            'path': config['LOGS_DIR'].resolve(),
-            'enabled': True,
-            'is_valid': config['LOGS_DIR'].exists(),
-        },
-        'PERSONAS_DIR': {
-            'path': config['PERSONAS_DIR'].resolve(),
-            'enabled': True,
-            'is_valid': config['PERSONAS_DIR'].exists(),
-        },
-        'ARCHIVE_DIR': {
-            'path': config['ARCHIVE_DIR'].resolve(),
-            'enabled': True,
-            'is_valid': config['ARCHIVE_DIR'].exists(),
-            'is_mount': os.path.ismount(config['ARCHIVE_DIR'].resolve()),
-        },
         'CONFIG_FILE': {
             'path': config['CONFIG_FILE'].resolve(),
             'enabled': True,
@@ -1052,6 +1022,38 @@ def get_data_locations(config: ConfigDict) -> ConfigValue:
             'is_valid': (config['OUTPUT_DIR'] / SQL_INDEX_FILENAME).exists(),
             'is_mount': os.path.ismount((config['OUTPUT_DIR'] / SQL_INDEX_FILENAME).resolve()),
         },
+        'ARCHIVE_DIR': {
+            'path': config['ARCHIVE_DIR'].resolve(),
+            'enabled': True,
+            'is_valid': config['ARCHIVE_DIR'].exists(),
+            'is_mount': os.path.ismount(config['ARCHIVE_DIR'].resolve()),
+        },
+        'SOURCES_DIR': {
+            'path': config['SOURCES_DIR'].resolve(),
+            'enabled': True,
+            'is_valid': config['SOURCES_DIR'].exists(),
+        },
+        'LOGS_DIR': {
+            'path': config['LOGS_DIR'].resolve(),
+            'enabled': True,
+            'is_valid': config['LOGS_DIR'].exists(),
+        },
+        'CUSTOM_TEMPLATES_DIR': {
+            'path': config['CUSTOM_TEMPLATES_DIR'] and Path(config['CUSTOM_TEMPLATES_DIR']).resolve(),
+            'enabled': bool(config['CUSTOM_TEMPLATES_DIR']),
+            'is_valid': config['CUSTOM_TEMPLATES_DIR'] and Path(config['CUSTOM_TEMPLATES_DIR']).exists(),
+        },
+        'PERSONAS_DIR': {
+            'path': config['PERSONAS_DIR'].resolve(),
+            'enabled': True,
+            'is_valid': config['PERSONAS_DIR'].exists(),
+        },
+        # managed by bin/docker_entrypoint.sh and python-crontab:
+        # 'CRONTABS_DIR': {
+        #     'path': config['CRONTABS_DIR'].resolve(),
+        #     'enabled': True,
+        #     'is_valid': config['CRONTABS_DIR'].exists(),
+        # },
     }
 
 def get_dependency_info(config: ConfigDict) -> ConfigValue:
@@ -1366,6 +1368,7 @@ def check_data_folder(out_dir: Union[str, Path, None]=None, config: ConfigDict=C
         stderr('        archivebox init')
         raise SystemExit(2)
 
+
 def check_migrations(out_dir: Union[str, Path, None]=None, config: ConfigDict=CONFIG):
     output_dir = out_dir or config['OUTPUT_DIR']
     from .index.sql import list_migrations
@@ -14,12 +14,17 @@ from django.shortcuts import render, redirect
 from django.contrib.auth import get_user_model
 from django import forms
 
+
+from signal_webhooks.apps import DjangoSignalWebhooksConfig
+from signal_webhooks.admin import WebhookAdmin, WebhookModel
+
 from ..util import htmldecode, urldecode, ansi_to_html
+
 from core.models import Snapshot, ArchiveResult, Tag
 from core.forms import AddLinkForm
 
 from core.mixins import SearchResultsAdminMixin
+from api.models import APIToken
 
 from index.html import snapshot_icons
 from logging_util import printable_filesize
@@ -98,10 +103,32 @@ class ArchiveBoxAdmin(admin.AdminSite):
 
         return render(template_name='add.html', request=request, context=context)
 
+
+# monkey patch django-signals-webhooks to change how it shows up in Admin UI
+DjangoSignalWebhooksConfig.verbose_name = 'API'
+WebhookModel._meta.get_field('name').help_text = 'Give your webhook a descriptive name (e.g. Notify ACME Slack channel of any new ArchiveResults).'
+WebhookModel._meta.get_field('signal').help_text = 'The type of event the webhook should fire for (e.g. Create, Update, Delete).'
+WebhookModel._meta.get_field('ref').help_text = 'Dot import notation of the model the webhook should fire for (e.g. core.models.Snapshot or core.models.ArchiveResult).'
+WebhookModel._meta.get_field('endpoint').help_text = 'External URL to POST the webhook notification to (e.g. https://someapp.example.com/webhook/some-webhook-receiver).'
+WebhookModel._meta.app_label = 'api'
+
+
 archivebox_admin = ArchiveBoxAdmin()
 archivebox_admin.register(get_user_model())
+archivebox_admin.register(APIToken)
+archivebox_admin.register(WebhookModel, WebhookAdmin)
 archivebox_admin.disable_action('delete_selected')
 
+
+# patch admin with methods to add data views
+from admin_data_views.admin import get_app_list, admin_data_index_view, get_admin_data_urls, get_urls
+
+archivebox_admin.get_app_list = get_app_list.__get__(archivebox_admin, ArchiveBoxAdmin)
+archivebox_admin.admin_data_index_view = admin_data_index_view.__get__(archivebox_admin, ArchiveBoxAdmin)
+archivebox_admin.get_admin_data_urls = get_admin_data_urls.__get__(archivebox_admin, ArchiveBoxAdmin)
+archivebox_admin.get_urls = get_urls(archivebox_admin.get_urls).__get__(archivebox_admin, ArchiveBoxAdmin)
+
+
 class ArchiveResultInline(admin.TabularInline):
     model = ArchiveResult
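
Side note (not part of the diff): the get_app_list.__get__(...) calls above use Python's descriptor protocol to graft plain functions onto an existing instance as bound methods. A minimal standalone sketch of the same trick:

    class Greeter:
        pass

    def greet(self):
        return f'hello from {self.__class__.__name__}'

    g = Greeter()
    g.greet = greet.__get__(g, Greeter)   # bind the function so `self` is supplied automatically
    assert g.greet() == 'hello from Greeter'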
@@ -1,3 +1,5 @@
+__package__ = 'archivebox.core'
+
 from django.apps import AppConfig
 
 
@@ -5,6 +7,22 @@ class CoreConfig(AppConfig):
     name = 'core'
 
     def ready(self):
+        # register our custom admin as the primary django admin
+        from django.contrib import admin
+        from django.contrib.admin import sites
+        from core.admin import archivebox_admin
+
+        admin.site = archivebox_admin
+        sites.site = archivebox_admin
+
+
+        # register signal handlers
         from .auth import register_signals
 
         register_signals()
+
+
+
+# from django.contrib.admin.apps import AdminConfig
+# class CoreAdminConfig(AdminConfig):
+#     default_site = "core.admin.get_admin_site"
@@ -1,5 +1,6 @@
-import os
-from django.conf import settings
+__package__ = 'archivebox.core'
+
+
 from ..config import (
     LDAP
 )
@@ -1,10 +1,8 @@
-from django.conf import settings
 from ..config import (
     LDAP_CREATE_SUPERUSER
 )
-
 def create_user(sender, user=None, ldap_user=None, **kwargs):
 
     if not user.id and LDAP_CREATE_SUPERUSER:
         user.is_superuser = True
 
@@ -18,6 +18,7 @@ from ..config import (
     CUSTOM_TEMPLATES_DIR,
     SQL_INDEX_FILENAME,
     OUTPUT_DIR,
+    ARCHIVE_DIR,
     LOGS_DIR,
     TIMEZONE,
 
@@ -63,6 +64,9 @@ INSTALLED_APPS = [
     'core',
+    'api',
 
     'admin_data_views',
+
+    'signal_webhooks',
     'django_extensions',
 ]
 
@@ -173,6 +177,17 @@ if DEBUG_TOOLBAR:
     ]
     MIDDLEWARE = [*MIDDLEWARE, 'debug_toolbar.middleware.DebugToolbarMiddleware']
 
+
+# https://github.com/bensi94/Django-Requests-Tracker (improved version of django-debug-toolbar)
+# Must delete archivebox/templates/admin to use because it relies on some things we override
+# visit /__requests_tracker__/ to access
+DEBUG_REQUESTS_TRACKER = False
+if DEBUG_REQUESTS_TRACKER:
+    INSTALLED_APPS += ["requests_tracker"]
+    MIDDLEWARE += ["requests_tracker.middleware.requests_tracker_middleware"]
+    INTERNAL_IPS = ["127.0.0.1", "10.0.2.2", "0.0.0.0", "*"]
+
+
 ################################################################################
 ### Staticfile and Template Settings
 ################################################################################
@@ -242,6 +257,29 @@ CACHES = {
 EMAIL_BACKEND = 'django.core.mail.backends.console.EmailBackend'
 
 
+STORAGES = {
+    "default": {
+        "BACKEND": "django.core.files.storage.FileSystemStorage",
+    },
+    "staticfiles": {
+        "BACKEND": "django.contrib.staticfiles.storage.StaticFilesStorage",
+    },
+    "archive": {
+        "BACKEND": "django.core.files.storage.FileSystemStorage",
+        "OPTIONS": {
+            "base_url": "/archive/",
+            "location": ARCHIVE_DIR,
+        },
+    },
+    # "personas": {
+    #     "BACKEND": "django.core.files.storage.FileSystemStorage",
+    #     "OPTIONS": {
+    #         "base_url": "/personas/",
+    #         "location": PERSONAS_DIR,
+    #     },
+    # },
+}
+
 ################################################################################
 ### Security Settings
 ################################################################################
@@ -368,3 +406,32 @@ LOGGING = {
         }
     },
 }
+
+
+# Add default webhook configuration to the User model
+SIGNAL_WEBHOOKS = {
+    "HOOKS": {
+        "django.contrib.auth.models.User": ...,
+        "core.models.Snapshot": ...,
+        "core.models.ArchiveResult": ...,
+        "core.models.Tag": ...,
+        "api.models.APIToken": ...,
+    },
+}
+
+
+ADMIN_DATA_VIEWS = {
+    "NAME": "configuration",
+    "URLS": [
+        {
+            "route": "live/",
+            "view": "core.views.live_config_list_view",
+            "name": "live",
+            "items": {
+                "route": "<str:key>/",
+                "view": "core.views.live_config_value_view",
+                "name": "live_config_value",
+            },
+        },
+    ],
+}
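
Side note (not part of the diff): with the STORAGES dict above, Django 4.2+'s storages registry exposes the "archive" backend by name. A sketch of reading a file through it — the snapshot path here is a placeholder:

    from django.core.files.storage import storages

    archive_storage = storages['archive']   # FileSystemStorage rooted at ARCHIVE_DIR

    if archive_storage.exists('1713999999.0/index.json'):   # placeholder snapshot path
        with archive_storage.open('1713999999.0/index.json') as f:
            print(f.read())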
@@ -1,4 +1,4 @@
-from .admin import archivebox_admin
+__package__ = 'archivebox.core'
 
 from django.urls import path, include
 from django.views import static
@@ -6,14 +6,9 @@ from django.contrib.staticfiles.urls import staticfiles_urlpatterns
 from django.conf import settings
 from django.views.generic.base import RedirectView
 
-from core.views import HomepageView, SnapshotView, PublicIndexView, AddView, HealthCheckView
+from .admin import archivebox_admin
+from .views import HomepageView, SnapshotView, PublicIndexView, AddView, HealthCheckView
 
-from ninja import NinjaAPI
-from api.auth import GlobalAuth
-
-api = NinjaAPI(auth=GlobalAuth())
-api.add_router("/auth/", "api.auth.router")
-api.add_router("/archive/", "api.archive.router")
 
 # GLOBAL_CONTEXT doesn't work as-is, disabled for now: https://github.com/ArchiveBox/ArchiveBox/discussions/1306
 # from config import VERSION, VERSIONS_AVAILABLE, CAN_UPGRADE
@@ -43,10 +38,10 @@ urlpatterns = [
     path('accounts/', include('django.contrib.auth.urls')),
     path('admin/', archivebox_admin.urls),
 
-    path("api/", api.urls),
+    path("api/", include('api.urls')),
 
     path('health/', HealthCheckView.as_view(), name='healthcheck'),
-    path('error/', lambda _: 1/0),
+    path('error/', lambda *_: 1/0),
 
     # path('jet_api/', include('jet_django.urls')), Enable to use https://www.jetadmin.io/integrations/django
 
@@ -57,10 +52,10 @@ urlpatterns = [
 urlpatterns += staticfiles_urlpatterns()
 
 if settings.DEBUG_TOOLBAR:
-    import debug_toolbar
-    urlpatterns += [
-        path('__debug__/', include(debug_toolbar.urls)),
-    ]
+    urlpatterns += [path('__debug__/', include("debug_toolbar.urls"))]
+
+if settings.DEBUG_REQUESTS_TRACKER:
+    urlpatterns += [path("__requests_tracker__/", include("requests_tracker.urls"))]
 
 
 # # Proposed FUTURE URLs spec
@@ -1,10 +1,12 @@
 __package__ = 'archivebox.core'

+from typing import Callable
+
 from io import StringIO
 from contextlib import redirect_stdout

 from django.shortcuts import render, redirect
-from django.http import HttpResponse, Http404
+from django.http import HttpRequest, HttpResponse, Http404
 from django.utils.html import format_html, mark_safe
 from django.views import View, static
 from django.views.generic.list import ListView

@@ -14,6 +16,10 @@ from django.contrib.auth.mixins import UserPassesTestMixin
 from django.views.decorators.csrf import csrf_exempt
 from django.utils.decorators import method_decorator

+from admin_data_views.typing import TableContext, ItemContext
+from admin_data_views.utils import render_with_table_view, render_with_item_view, ItemLink
+
+
 from core.models import Snapshot
 from core.forms import AddLinkForm

@@ -26,6 +32,10 @@ from ..config import (
     COMMIT_HASH,
     FOOTER_INFO,
     SNAPSHOTS_PER_PAGE,
+    CONFIG,
+    CONFIG_SCHEMA,
+    DYNAMIC_CONFIG_SCHEMA,
+    USER_CONFIG,
 )
 from ..main import add
 from ..util import base_url, ansi_to_html

@@ -124,9 +134,9 @@ class SnapshotView(View):
                 '<center><br/><br/><br/>'
                 f'Snapshot <a href="/archive/{snapshot.timestamp}/index.html" target="_top"><b><code>[{snapshot.timestamp}]</code></b></a> exists in DB, but resource <b><code>{snapshot.timestamp}/'
                 '{}'
-                f'</code></b> does not exist in <a href="/archive/{snapshot.timestamp}/" target="_top">snapshot dir</a> yet.<br/><br/>'
-                'Maybe this resource type is not availabe for this Snapshot,<br/>or the archiving process has not completed yet?<br/>'
-                f'<pre><code># run this cmd to finish archiving this Snapshot<br/>archivebox update -t timestamp {snapshot.timestamp}</code></pre><br/><br/>'
+                f'</code></b> does not exist in the <a href="/archive/{snapshot.timestamp}/" target="_top">snapshot dir</a> yet.<br/><br/>'
+                'It\'s possible that this resource type is not available for the Snapshot,<br/>or that the archiving process has not completed yet.<br/>'
+                f'<pre><code># if interrupted, run this cmd to finish archiving this Snapshot<br/>archivebox update -t timestamp {snapshot.timestamp}</code></pre><br/><br/>'
                 '<div class="text-align: left; width: 100%; max-width: 400px">'
                 '<i><b>Next steps:</i></b><br/>'
                 f'- list all the <a href="/archive/{snapshot.timestamp}/" target="_top">Snapshot files <code>.*</code></a><br/>'

@@ -312,3 +322,124 @@ class HealthCheckView(View):
             content_type='text/plain',
             status=200
         )
+
+
+def find_config_section(key: str) -> str:
+    matching_sections = [
+        name for name, opts in CONFIG_SCHEMA.items() if key in opts
+    ]
+    section = matching_sections[0] if matching_sections else 'DYNAMIC'
+    return section
+
+def find_config_default(key: str) -> str:
+    default_val = USER_CONFIG.get(key, {}).get('default', lambda: None)
+    if isinstance(default_val, Callable):
+        return None
+    else:
+        default_val = repr(default_val)
+    return default_val
+
+def find_config_type(key: str) -> str:
+    if key in USER_CONFIG:
+        return USER_CONFIG[key]['type'].__name__
+    elif key in DYNAMIC_CONFIG_SCHEMA:
+        return type(CONFIG[key]).__name__
+    return 'str'
+
+def key_is_safe(key: str) -> bool:
+    for term in ('key', 'password', 'secret', 'token'):
+        if term in key.lower():
+            return False
+    return True
+
+@render_with_table_view
+def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext:
+
+    assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
+
+    rows = {
+        "Section": [],
+        "Key": [],
+        "Type": [],
+        "Value": [],
+        "Default": [],
+        # "Documentation": [],
+        "Aliases": [],
+    }
+
+    for section in CONFIG_SCHEMA.keys():
+        for key in CONFIG_SCHEMA[section].keys():
+            rows['Section'].append(section.replace('_', ' ').title().replace(' Config', ''))
+            rows['Key'].append(ItemLink(key, key=key))
+            rows['Type'].append(mark_safe(f'<code>{find_config_type(key)}</code>'))
+            rows['Value'].append(mark_safe(f'<code>{CONFIG[key]}</code>') if key_is_safe(key) else '******** (redacted)')
+            rows['Default'].append(mark_safe(f'<a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig.py+%27{key}%27&type=code"><code style="text-decoration: underline">{find_config_default(key) or 'See here...'}</code></a>'))
+            # rows['Documentation'].append(mark_safe(f'Wiki: <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#{key.lower()}">{key}</a>'))
+            rows['Aliases'].append(', '.join(CONFIG_SCHEMA[section][key].get('aliases', [])))
+
+    section = 'DYNAMIC'
+    for key in DYNAMIC_CONFIG_SCHEMA.keys():
+        rows['Section'].append(section.replace('_', ' ').title().replace(' Config', ''))
+        rows['Key'].append(ItemLink(key, key=key))
+        rows['Type'].append(mark_safe(f'<code>{find_config_type(key)}</code>'))
+        rows['Value'].append(mark_safe(f'<code>{CONFIG[key]}</code>') if key_is_safe(key) else '******** (redacted)')
+        rows['Default'].append(mark_safe(f'<a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig.py+%27{key}%27&type=code"><code style="text-decoration: underline">{find_config_default(key) or 'See here...'}</code></a>'))
+        # rows['Documentation'].append(mark_safe(f'Wiki: <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#{key.lower()}">{key}</a>'))
+        rows['Aliases'].append(ItemLink(key, key=key) if key in USER_CONFIG else '')
+
+    return TableContext(
+        title="Computed Configuration Values",
+        table=rows,
+    )
+
+@render_with_item_view
+def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
+
+    assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
+
+    aliases = USER_CONFIG.get(key, {}).get("aliases", [])
+
+    return ItemContext(
+        slug=key,
+        title=key,
+        data=[
+            {
+                "name": mark_safe(f'data / ArchiveBox.conf [{find_config_section(key)}] <b><code style="color: lightgray">{key}</code></b>' if key in USER_CONFIG else f'[DYNAMIC CONFIG] <b><code style="color: lightgray">{key}</code></b> <small>(calculated at runtime)</small>'),
+                "description": None,
+                "fields": {
+                    'Key': key,
+                    'Type': find_config_type(key),
+                    'Value': CONFIG[key] if key_is_safe(key) else '********',
+                },
+                "help_texts": {
+                    'Key': mark_safe(f'''
+                        <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#{key.lower()}">Documentation</a>
+                        <span style="display: {"inline" if aliases else "none"}">
+                            Aliases: {", ".join(aliases)}
+                        </span>
+                    '''),
+                    'Type': mark_safe(f'''
+                        <a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig.py+%27{key}%27&type=code">
+                            See full definition in <code>archivebox/config.py</code>...
+                        </a>
+                    '''),
+                    'Value': mark_safe(f'''
+                        {'<b style="color: red">Value is redacted for your security. (Passwords, secrets, API tokens, etc. cannot be viewed in the Web UI)</b><br/><br/>' if not key_is_safe(key) else ''}
+                        Default: <a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig.py+%27{key}%27&type=code">
+                            <code>{find_config_default(key) or 'See here...'}</code>
+                        </a>
+                        <br/><br/>
+                        <p style="display: {"block" if key in USER_CONFIG else "none"}">
+                            <i>To change this value, edit <code>data/ArchiveBox.conf</code> or run:</i>
+                            <br/><br/>
+                            <code>archivebox config --set {key}="{
+                                val.strip("'")
+                                if (val := find_config_default(key)) else
+                                (repr(CONFIG[key] if key_is_safe(key) else '********')).strip("'")
+                            }"</code>
+                        </p>
+                    '''),
+                },
+            },
+        ],
+    )

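key_is_safe() above is a pure predicate, so the redaction rule is easy to check in isolation (the key names below are illustrative, not a statement about which keys exist):

    assert key_is_safe('SNAPSHOTS_PER_PAGE')    # rendered normally in the table
    assert not key_is_safe('SECRET_KEY')        # contains 'secret' and 'key' -> shown as ********
    assert not key_is_safe('API_TOKEN')         # contains 'token' -> shown as ********
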
@@ -4,6 +4,7 @@ WARNING: THIS FILE IS ALL LEGACY CODE TO BE REMOVED.

 DO NOT ADD ANY NEW FEATURES TO THIS FILE, NEW CODE GOES HERE: core/models.py

+These are the old types we used to use before ArchiveBox v0.4 (before we switched to Django).
 """

 __package__ = 'archivebox.index'

@@ -494,12 +494,12 @@ def log_removal_started(links: List["Link"], yes: bool, delete: bool):
     if delete:
         file_counts = [link.num_outputs for link in links if Path(link.link_dir).exists()]
         print(
-            f' {len(links)} Links will be de-listed from the main index, and their archived content folders will be deleted from disk.\n' +
+            f' {len(links)} Links will be de-listed from the main index, and their archived content folders will be deleted from disk.\n'
             f' ({len(file_counts)} data folders with {sum(file_counts)} archived files will be deleted!)'
         )
     else:
         print(
-            ' Matching links will be de-listed from the main index, but their archived content folders will remain in place on disk.\n' +
+            ' Matching links will be de-listed from the main index, but their archived content folders will remain in place on disk.\n'
             ' (Pass --delete if you also want to permanently delete the data folders)'
         )

@@ -104,7 +104,6 @@ from .config import (
     COMMIT_HASH,
     BUILD_TIME,
     CODE_LOCATIONS,
-    EXTERNAL_LOCATIONS,
     DATA_LOCATIONS,
     DEPENDENCIES,
     CHROME_BINARY,

@@ -231,7 +230,7 @@ def version(quiet: bool=False,
     p = platform.uname()
     print(
         'ArchiveBox v{}'.format(get_version(CONFIG)),
-        *((f'COMMIT_HASH={COMMIT_HASH[:7]}',) if COMMIT_HASH else ()),
+        f'COMMIT_HASH={COMMIT_HASH[:7] if COMMIT_HASH else "unknown"}',
+        f'BUILD_TIME={BUILD_TIME}',
     )
     print(

@@ -272,11 +271,6 @@ def version(quiet: bool=False,
         for name, path in CODE_LOCATIONS.items():
             print(printable_folder_status(name, path))

-        print()
-        print('{white}[i] Secrets locations:{reset}'.format(**ANSI))
-        for name, path in EXTERNAL_LOCATIONS.items():
-            print(printable_folder_status(name, path))
-
         print()
         if DATA_LOCATIONS['OUTPUT_DIR']['is_valid']:
             print('{white}[i] Data locations:{reset}'.format(**ANSI))

@@ -695,7 +689,7 @@ def add(urls: Union[str, List[str]],
     if CAN_UPGRADE:
         hint(f"There's a new version of ArchiveBox available! Your current version is {VERSION}. You can upgrade to {VERSIONS_AVAILABLE['recommended_version']['tag_name']} ({VERSIONS_AVAILABLE['recommended_version']['html_url']}). For more on how to upgrade: https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives\n")

-    return all_links
+    return new_links

 @enforce_types
 def remove(filter_str: Optional[str]=None,

@@ -1362,7 +1356,7 @@ def manage(args: Optional[List[str]]=None, out_dir: Path=OUTPUT_DIR) -> None:
     if (args and "createsuperuser" in args) and (IN_DOCKER and not IS_TTY):
         stderr('[!] Warning: you need to pass -it to use interactive commands in docker', color='lightyellow')
         stderr(' docker run -it archivebox manage {}'.format(' '.join(args or ['...'])), color='lightyellow')
-        stderr()
+        stderr('')

     execute_from_command_line([f'{ARCHIVEBOX_BINARY} manage', *(args or ['help'])])

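The COMMIT_HASH change in version() above swaps a conditionally present field for one that always prints, falling back to "unknown". A small self-contained illustration of the difference (the version string is made up, and ' '.join stands in for print's space-separated output):

    COMMIT_HASH = None
    # old style: the field is omitted entirely when COMMIT_HASH is falsy
    old = ' '.join(('ArchiveBox v0.8.0', *((f'COMMIT_HASH={COMMIT_HASH[:7]}',) if COMMIT_HASH else ())))
    # new style: the field is always present
    new = ' '.join(('ArchiveBox v0.8.0', f'COMMIT_HASH={COMMIT_HASH[:7] if COMMIT_HASH else "unknown"}'))
    assert old == 'ArchiveBox v0.8.0'
    assert new == 'ArchiveBox v0.8.0 COMMIT_HASH=unknown'
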
@@ -7,7 +7,7 @@ if __name__ == '__main__':
     # versions of ./manage.py commands whenever possible. When that's not possible
     # (e.g. makemigrations), you can comment out this check temporarily

-    if not ('makemigrations' in sys.argv or 'migrate' in sys.argv):
+    if not ('makemigrations' in sys.argv or 'migrate' in sys.argv or 'startapp' in sys.argv):
         print("[X] Don't run ./manage.py directly (unless you are a developer running makemigrations):")
         print()
         print(' Hint: Use these archivebox CLI commands instead of the ./manage.py equivalents:')

@@ -7,7 +7,6 @@ For examples of supported import formats see tests/.

 __package__ = 'archivebox.parsers'

-import re
 from io import StringIO

 from typing import IO, Tuple, List, Optional

@@ -28,7 +27,6 @@ from ..util import (
     htmldecode,
     download_url,
     enforce_types,
-    URL_REGEX,
 )
 from ..index.schema import Link
 from ..logging_util import TimedProgress, log_source_saved

@@ -202,54 +200,3 @@ def save_file_as_source(path: str, timeout: int=TIMEOUT, filename: str='{ts}-{ba
     log_source_saved(source_file=source_path)

     return source_path
-
-
-# Check that plain text regex URL parsing works as expected
-# this is last-line-of-defense to make sure the URL_REGEX isn't
-# misbehaving due to some OS-level or environment level quirks (e.g. bad regex lib)
-# the consequences of bad URL parsing could be disastrous and lead to many
-# incorrect/badly parsed links being added to the archive, so this is worth the cost of checking
-_test_url_strs = {
-    'example.com': 0,
-    '/example.com': 0,
-    '//example.com': 0,
-    ':/example.com': 0,
-    '://example.com': 0,
-    'htt://example8.com': 0,
-    '/htt://example.com': 0,
-    'https://example': 1,
-    'https://localhost/2345': 1,
-    'https://localhost:1234/123': 1,
-    '://': 0,
-    'https://': 0,
-    'http://': 0,
-    'ftp://': 0,
-    'ftp://example.com': 0,
-    'https://example.com': 1,
-    'https://example.com/': 1,
-    'https://a.example.com': 1,
-    'https://a.example.com/': 1,
-    'https://a.example.com/what/is/happening.html': 1,
-    'https://a.example.com/what/ís/happening.html': 1,
-    'https://a.example.com/what/is/happening.html?what=1&2%20b#höw-about-this=1a': 1,
-    'https://a.example.com/what/is/happéning/?what=1&2%20b#how-aboüt-this=1a': 1,
-    'HTtpS://a.example.com/what/is/happening/?what=1&2%20b#how-about-this=1af&2f%20b': 1,
-    'https://example.com/?what=1#how-about-this=1&2%20baf': 1,
-    'https://example.com?what=1#how-about-this=1&2%20baf': 1,
-    '<test>http://example7.com</test>': 1,
-    'https://<test>': 0,
-    'https://[test]': 0,
-    'http://"test"': 0,
-    'http://\'test\'': 0,
-    '[https://example8.com/what/is/this.php?what=1]': 1,
-    '[and http://example9.com?what=1&other=3#and-thing=2]': 1,
-    '<what>https://example10.com#and-thing=2 "</about>': 1,
-    'abc<this["https://example11.com/what/is#and-thing=2?whoami=23&where=1"]that>def': 1,
-    'sdflkf[what](https://example12.com/who/what.php?whoami=1#whatami=2)?am=hi': 1,
-    '<or>http://examplehttp://15.badc</that>': 2,
-    'https://a.example.com/one.html?url=http://example.com/inside/of/another?=http://': 2,
-    '[https://a.example.com/one.html?url=http://example.com/inside/of/another?=](http://a.example.com)': 3,
-}
-for url_str, num_urls in _test_url_strs.items():
-    assert len(re.findall(URL_REGEX, url_str)) == num_urls, (
-        f'{url_str} does not contain {num_urls} urls')

@@ -10,7 +10,7 @@ from ..index.schema import Link
 from ..util import (
     htmldecode,
     enforce_types,
-    URL_REGEX,
+    find_all_urls,
 )
 from html.parser import HTMLParser
 from urllib.parse import urljoin

@@ -40,10 +40,22 @@ def parse_generic_html_export(html_file: IO[str], root_url: Optional[str]=None,
         parser.feed(line)
         for url in parser.urls:
-            if root_url:
-                # resolve relative urls /home.html -> https://example.com/home.html
-                url = urljoin(root_url, url)
-
-            for archivable_url in re.findall(URL_REGEX, url):
+            url_is_absolute = (url.lower().startswith('http://') or url.lower().startswith('https://'))
+            # url = https://abc.com => True
+            # url = /page.php?next=https://example.com => False
+
+            if not url_is_absolute:  # resolve it by joining it with root_url
+                relative_path = url
+
+                url = urljoin(root_url, relative_path)  # https://example.com/somepage.html + /home.html
+                # => https://example.com/home.html
+
+                # special case to handle bug around // handling, crucial for urls that contain sub-urls
+                # e.g. https://web.archive.org/web/https://example.com
+                if did_urljoin_misbehave(root_url, relative_path, url):
+                    url = fix_urljoin_bug(url)
+
+            for archivable_url in find_all_urls(url):
                 yield Link(
                     url=htmldecode(archivable_url),
                     timestamp=str(datetime.now(timezone.utc).timestamp()),

@@ -56,3 +68,74 @@ def parse_generic_html_export(html_file: IO[str], root_url: Optional[str]=None,
 KEY = 'html'
 NAME = 'Generic HTML'
 PARSER = parse_generic_html_export
+
+
+#### WORKAROUND CODE FOR https://github.com/python/cpython/issues/96015 ####
+
+def did_urljoin_misbehave(root_url: str, relative_path: str, final_url: str) -> bool:
+    """
+    Handle urljoin edge case bug where multiple slashes get turned into a single slash:
+    - https://github.com/python/cpython/issues/96015
+    - https://github.com/ArchiveBox/ArchiveBox/issues/1411
+
+    This workaround only fixes the most common case of a sub-URL inside an outer URL, e.g.:
+    https://web.archive.org/web/https://example.com/some/inner/url
+
+    But there are other valid URLs containing // that are not fixed by this workaround, e.g.:
+    https://example.com/drives/C//some/file
+    """
+
+    # if relative path is actually an absolute url, cut off its own scheme so we check the path component only
+    relative_path = relative_path.lower()
+    if relative_path.startswith('http://') or relative_path.startswith('https://'):
+        relative_path = relative_path.split('://', 1)[-1]
+
+    # TODO: properly fix all double // getting stripped by urljoin, not just ://
+    original_path_had_suburl = '://' in relative_path
+    original_root_had_suburl = '://' in root_url[8:]   # ignore first 8 chars because root always starts with https://
+    final_joined_has_suburl = '://' in final_url[8:]   # ignore first 8 chars because final always starts with https://
+
+    urljoin_broke_suburls = (
+        (original_root_had_suburl or original_path_had_suburl)
+        and not final_joined_has_suburl
+    )
+    return urljoin_broke_suburls
+
+
+def fix_urljoin_bug(url: str, nesting_limit=5):
+    """
+    recursively replace broken suburls .../http:/... with .../http://...
+
+    basically equivalent to this for 99.9% of cases:
+        url = url.replace('/http:/', '/http://')
+        url = url.replace('/https:/', '/https://')
+    except this handles:
+        other schemes besides http/https (e.g. https://example.com/link/git+ssh://github.com/example)
+        other preceding separators besides / (e.g. https://example.com/login/?next=https://example.com/home)
+        fixing multiple suburls recursively
+    """
+    input_url = url
+    for _ in range(nesting_limit):
+        url = re.sub(
+            r'(?P<root>.+?)'                             # https://web.archive.org/web
+            + r'(?P<separator>[-=/_&+%$#@!*\(\\])'       # /
+            + r'(?P<subscheme>[a-zA-Z0-9+_-]{1,32}?):/'  # http:/
+            + r'(?P<suburl>[^/\\]+)',                    # example.com
+            r"\1\2\3://\4",
+            input_url,
+            re.IGNORECASE | re.UNICODE,
+        )
+        if url == input_url:
+            break  # nothing left to replace, all suburls are fixed
+        input_url = url
+
+    return url
+
+
+# sanity check to make sure workaround code works as expected and doesnt introduce *more* bugs
+assert did_urljoin_misbehave('https://web.archive.org/web/https://example.com', 'abc.html', 'https://web.archive.org/web/https:/example.com/abc.html') == True
+assert did_urljoin_misbehave('http://example.com', 'https://web.archive.org/web/http://example.com/abc.html', 'https://web.archive.org/web/http:/example.com/abc.html') == True
+assert fix_urljoin_bug('https:/example.com') == 'https:/example.com'  # should not modify original url's scheme, only sub-urls
+assert fix_urljoin_bug('https://web.archive.org/web/https:/example.com/abc.html') == 'https://web.archive.org/web/https://example.com/abc.html'
+assert fix_urljoin_bug('http://example.com/link/git+ssh:/github.com/example?next=ftp:/example.com') == 'http://example.com/link/git+ssh://github.com/example?next=ftp://example.com'

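To see the workaround end to end, this is how the parser hunk above composes the two helpers (inputs adapted from the sanity-check asserts; the collapsed intermediate URL is what cpython versions affected by issue 96015 produce):

    from urllib.parse import urljoin

    root_url = 'https://web.archive.org/web/https://example.com/'
    relative_path = 'abc.html'

    url = urljoin(root_url, relative_path)
    # on affected interpreters the inner '//' collapses:
    # 'https://web.archive.org/web/https:/example.com/abc.html'

    if did_urljoin_misbehave(root_url, relative_path, url):
        url = fix_urljoin_bug(url)

    assert url == 'https://web.archive.org/web/https://example.com/abc.html'
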
@@ -72,21 +72,13 @@ def parse_generic_json_export(json_file: IO[str], **_kwargs) -> Iterable[Link]:

     json_file.seek(0)

-    try:
-        links = json.load(json_file)
-        if type(links) != list:
-            raise Exception('JSON parser expects list of objects, maybe this is JSONL?')
-    except json.decoder.JSONDecodeError:
-        # sometimes the first line is a comment or other junk, so try without
-        json_file.seek(0)
-        first_line = json_file.readline()
-        #print(' > Trying JSON parser without first line: "', first_line.strip(), '"', sep= '')
-        links = json.load(json_file)
-        # we may fail again, which means we really don't know what to do
-
+    links = json.load(json_file)
+    if type(links) != list:
+        raise Exception('JSON parser expects list of objects, maybe this is JSONL?')

     for link in links:
         if link:
-            yield jsonObjectToLink(link,json_file.name)
+            yield jsonObjectToLink(link, json_file.name)

 KEY = 'json'
 NAME = 'Generic JSON'

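The simplified loader above no longer retries after a JSONDecodeError, so malformed input fails fast. In particular, JSONL input (one JSON object per line) is not one valid JSON document, which is what the 'maybe this is JSONL?' hint is about. A minimal illustration of that failure mode:

    import io
    import json

    jsonl = io.StringIO('{"url": "https://example.com"}\n{"url": "https://example.org"}\n')
    try:
        links = json.load(jsonl)
    except json.decoder.JSONDecodeError as err:
        print('Not a plain JSON list, maybe JSONL?', err)  # Extra data: line 2 column 1
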
@@ -3,11 +3,9 @@ __package__ = 'archivebox.parsers'
import json

from typing import IO, Iterable
from datetime import datetime, timezone

from ..index.schema import Link
from ..util import (
    htmldecode,
    enforce_types,
)

@@ -1,8 +1,6 @@
 __package__ = 'archivebox.parsers'
 __description__ = 'Plain Text'

-import re
-
 from typing import IO, Iterable
 from datetime import datetime, timezone
 from pathlib import Path

@@ -11,7 +9,7 @@ from ..index.schema import Link
 from ..util import (
     htmldecode,
     enforce_types,
-    URL_REGEX
+    find_all_urls,
 )

@@ -39,7 +37,7 @@ def parse_generic_txt_export(text_file: IO[str], **_kwargs) -> Iterable[Link]:
             pass

         # otherwise look for anything that looks like a URL in the line
-        for url in re.findall(URL_REGEX, line):
+        for url in find_all_urls(line):
             yield Link(
                 url=htmldecode(url),
                 timestamp=str(datetime.now(timezone.utc).timestamp()),

@@ -48,17 +46,6 @@ def parse_generic_txt_export(text_file: IO[str], **_kwargs) -> Iterable[Link]:
                 sources=[text_file.name],
             )

-        # look inside the URL for any sub-urls, e.g. for archive.org links
-        # https://web.archive.org/web/20200531203453/https://www.reddit.com/r/socialism/comments/gu24ke/nypd_officers_claim_they_are_protecting_the_rule/fsfq0sw/
-        # -> https://www.reddit.com/r/socialism/comments/gu24ke/nypd_officers_claim_they_are_protecting_the_rule/fsfq0sw/
-        for sub_url in re.findall(URL_REGEX, line[1:]):
-            yield Link(
-                url=htmldecode(sub_url),
-                timestamp=str(datetime.now(timezone.utc).timestamp()),
-                title=None,
-                tags=None,
-                sources=[text_file.name],
-            )

 KEY = 'txt'
 NAME = 'Generic TXT'

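The deleted second pass is redundant because find_all_urls() already yields nested sub-URLs (the lookahead-based URL_REGEX starts a new match at every http(s):// it sees, per the test cases later in this diff). Using the archive.org example from the deleted comment, the expected behavior is:

    line = 'https://web.archive.org/web/20200531203453/https://www.reddit.com/r/socialism/'
    urls = list(find_all_urls(line))
    # expected, per the URL_REGEX test expectations:
    #   ['https://web.archive.org/web/20200531203453/https://www.reddit.com/r/socialism/',
    #    'https://www.reddit.com/r/socialism/']
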
@@ -6,6 +6,7 @@
     <a href="/admin/core/tag/">Tags</a> |
     <a href="/admin/core/archiveresult/?o=-1">Log</a>
     <a href="{% url 'Docs' %}" target="_blank" rel="noopener noreferrer">Docs</a> |
+    <a href="/api">API</a> |
     <a href="{% url 'public-index' %}">Public</a> |
     <a href="/admin/">Admin</a>

@@ -16,7 +17,7 @@
 {% endblock %}
 {% block userlinks %}
     {% if user.has_usable_password %}
-        <a href="{% url 'admin:password_change' %}">Account</a> /
+        <a href="{% url 'admin:password_change' %}" title="Change your account password">Account</a> /
     {% endif %}
     <a href="{% url 'admin:logout' %}">{% trans 'Log out' %}</a>
 {% endblock %}

@@ -62,12 +62,12 @@ COLOR_REGEX = re.compile(r'\[(?P<arg_1>\d+)(;(?P<arg_2>\d+)(;(?P<arg_3>\d+))?)?m

 # https://mathiasbynens.be/demo/url-regex
 URL_REGEX = re.compile(
-    r'(?=(' +
-    r'http[s]?://' +          # start matching from allowed schemes
-    r'(?:[a-zA-Z]|[0-9]' +    # followed by allowed alphanum characters
-    r'|[-_$@.&+!*\(\),]' +    # or allowed symbols (keep hyphen first to match literal hyphen)
-    r'|[^\u0000-\u007F])+' +  # or allowed unicode bytes
-    r'[^\]\[<>"\'\s]+' +      # stop parsing at these symbols
+    r'(?=('
+    r'http[s]?://'            # start matching from allowed schemes
+    r'(?:[a-zA-Z]|[0-9]'      # followed by allowed alphanum characters
+    r'|[-_$@.&+!*\(\),]'      # or allowed symbols (keep hyphen first to match literal hyphen)
+    r'|[^\u0000-\u007F])+'    # or allowed unicode bytes
+    r'[^\]\[<>"\'\s]+'        # stop parsing at these symbols
     r'))',
     re.IGNORECASE | re.UNICODE,
 )

@@ -90,6 +90,11 @@ def fix_url_from_markdown(url_str: str) -> str:
     helpful to fix URLs parsed from markdown e.g.
     input: https://wikipedia.org/en/some_article_(Disambiguation).html?abc=def).somemoretext
     result: https://wikipedia.org/en/some_article_(Disambiguation).html?abc=def
+
+    IMPORTANT ASSUMPTION: valid urls wont have unbalanced or incorrectly nested parentheses
+    e.g. this will fail the user actually wants to ingest a url like 'https://example.com/some_wei)(rd_url'
+    in that case it will return https://example.com/some_wei (truncated up to the first unbalanced paren)
+    This assumption is true 99.9999% of the time, and for the rare edge case the user can use url_list parser.
     """
     trimmed_url = url_str

@@ -353,7 +358,8 @@ def chrome_cleanup():
     if IN_DOCKER and lexists("/home/archivebox/.config/chromium/SingletonLock"):
         remove_file("/home/archivebox/.config/chromium/SingletonLock")

-def ansi_to_html(text):
+@enforce_types
+def ansi_to_html(text: str) -> str:
     """
     Based on: https://stackoverflow.com/questions/19212665/python-converting-ansi-color-codes-to-html
     """

@@ -439,11 +445,14 @@ class ExtendedEncoder(pyjson.JSONEncoder):


 ### URL PARSING TESTS / ASSERTIONS
 # they run at runtime because I like having them inline in this file,
 # I like the peace of mind knowing it's enforced at runtime across all OS's (in case the regex engine ever has any weird locale-specific quirks),
 # and these assertions are basically instant, so not a big performance cost to do it on startup

-assert fix_url_from_markdown('/a(b)c).x(y)z') == '/a(b)c'
 # Check that plain text regex URL parsing works as expected
 # this is last-line-of-defense to make sure the URL_REGEX isn't
 # misbehaving due to some OS-level or environment level quirks (e.g. regex engine / cpython / locale differences)
 # the consequences of bad URL parsing could be disastrous and lead to many
 # incorrect/badly parsed links being added to the archive, so this is worth the cost of checking

+assert fix_url_from_markdown('http://example.com/a(b)c).x(y)z') == 'http://example.com/a(b)c'
+assert fix_url_from_markdown('https://wikipedia.org/en/some_article_(Disambiguation).html?abc=def).link(with)_trailingtext') == 'https://wikipedia.org/en/some_article_(Disambiguation).html?abc=def'

 URL_REGEX_TESTS = [

@@ -482,3 +491,50 @@ URL_REGEX_TESTS = [
 for urls_str, expected_url_matches in URL_REGEX_TESTS:
     url_matches = list(find_all_urls(urls_str))
     assert url_matches == expected_url_matches, 'FAILED URL_REGEX CHECK!'
+
+
+# More test cases
+_test_url_strs = {
+    'example.com': 0,
+    '/example.com': 0,
+    '//example.com': 0,
+    ':/example.com': 0,
+    '://example.com': 0,
+    'htt://example8.com': 0,
+    '/htt://example.com': 0,
+    'https://example': 1,
+    'https://localhost/2345': 1,
+    'https://localhost:1234/123': 1,
+    '://': 0,
+    'https://': 0,
+    'http://': 0,
+    'ftp://': 0,
+    'ftp://example.com': 0,
+    'https://example.com': 1,
+    'https://example.com/': 1,
+    'https://a.example.com': 1,
+    'https://a.example.com/': 1,
+    'https://a.example.com/what/is/happening.html': 1,
+    'https://a.example.com/what/ís/happening.html': 1,
+    'https://a.example.com/what/is/happening.html?what=1&2%20b#höw-about-this=1a': 1,
+    'https://a.example.com/what/is/happéning/?what=1&2%20b#how-aboüt-this=1a': 1,
+    'HTtpS://a.example.com/what/is/happening/?what=1&2%20b#how-about-this=1af&2f%20b': 1,
+    'https://example.com/?what=1#how-about-this=1&2%20baf': 1,
+    'https://example.com?what=1#how-about-this=1&2%20baf': 1,
+    '<test>http://example7.com</test>': 1,
+    'https://<test>': 0,
+    'https://[test]': 0,
+    'http://"test"': 0,
+    'http://\'test\'': 0,
+    '[https://example8.com/what/is/this.php?what=1]': 1,
+    '[and http://example9.com?what=1&other=3#and-thing=2]': 1,
+    '<what>https://example10.com#and-thing=2 "</about>': 1,
+    'abc<this["https://example11.com/what/is#and-thing=2?whoami=23&where=1"]that>def': 1,
+    'sdflkf[what](https://example12.com/who/what.php?whoami=1#whatami=2)?am=hi': 1,
+    '<or>http://examplehttp://15.badc</that>': 2,
+    'https://a.example.com/one.html?url=http://example.com/inside/of/another?=http://': 2,
+    '[https://a.example.com/one.html?url=http://example.com/inside/of/another?=](http://a.example.com)': 3,
+}
+for url_str, num_urls in _test_url_strs.items():
+    assert len(list(find_all_urls(url_str))) == num_urls, (
+        f'{url_str} does not contain {num_urls} urls')

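The URL_REGEX hunk above changes only the source formatting: adjacent Python string literals are concatenated implicitly, so dropping the + operators yields an identical pattern string. A quick demonstration:

    pattern_a = r'http[s]?://' + r'[^\s]+'
    pattern_b = (
        r'http[s]?://'
        r'[^\s]+'
    )
    assert pattern_a == pattern_b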