Merge branch 'dev' into plugins-browsertrix

This commit is contained in:
Nick Sweeting 2024-04-24 16:29:36 -07:00 committed by GitHub
commit 33e82736f9
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
33 changed files with 3943 additions and 595 deletions

View file

5
archivebox/api/apps.py Normal file
View file

@ -0,0 +1,5 @@
from django.apps import AppConfig
# Django application config for the REST API app (registered under the
# 'api' label in INSTALLED_APPS).
class APIConfig(AppConfig):
    name = 'api'

184
archivebox/api/archive.py Normal file
View file

@ -0,0 +1,184 @@
# archivebox/api/archive.py
from typing import List, Optional
from enum import Enum
from pydantic import BaseModel
from ninja import Router
from main import (
add,
remove,
update,
list_all,
ONLY_NEW,
) # Assuming these functions are defined in main.py
# Schemas
# Closed set of status filter values accepted by the update / list_all
# endpoints. Members are str subclasses, so each one compares equal to its
# plain string value.
class StatusChoices(str, Enum):
    # index-level states
    indexed = 'indexed'
    archived = 'archived'
    unarchived = 'unarchived'

    # data-folder states
    present = 'present'
    valid = 'valid'
    invalid = 'invalid'

    # problem states
    duplicate = 'duplicate'
    orphaned = 'orphaned'
    corrupted = 'corrupted'
    unrecognized = 'unrecognized'
# Request body for POST /add. Every field is forwarded by name as a keyword
# argument to add() in api_add.
class AddURLSchema(BaseModel):
    urls: List[str]                # required; the only field without a default
    tag: str = ""
    depth: int = 0
    update: bool = not ONLY_NEW    # Default to the opposite of ONLY_NEW
    update_all: bool = False
    index_only: bool = False
    overwrite: bool = False
    init: bool = False
    extractors: str = ""
    parser: str = "auto"
# Request body for POST /remove. Every field is forwarded by name as a
# keyword argument to remove() in api_remove; all fields are optional.
class RemoveURLSchema(BaseModel):
    yes: bool = False
    delete: bool = False
    before: Optional[float] = None
    after: Optional[float] = None
    filter_type: str = "exact"
    filter_patterns: Optional[List[str]] = None
# Request body for POST /update. Every field is forwarded by name as a
# keyword argument to update() in api_update; an empty JSON object is a
# valid payload because every field has a default.
class UpdateSchema(BaseModel):
    resume: Optional[float] = None
    only_new: Optional[bool] = None
    index_only: Optional[bool] = False
    overwrite: Optional[bool] = False
    before: Optional[float] = None
    after: Optional[float] = None
    status: Optional[StatusChoices] = None
    filter_type: Optional[str] = 'exact'
    filter_patterns: Optional[List[str]] = None
    extractors: Optional[str] = ""
# Request body for POST /list_all. Every field is forwarded by name as a
# keyword argument to list_all() in api_list_all; all fields are optional.
#
# NOTE(review): the `json` field shares its name with pydantic's
# BaseModel.json() method. Depending on the installed pydantic version this
# is presumably either rejected at class creation or reported as a shadowing
# warning -- confirm, and consider a single "format" choice field instead
# (api_list_all reads payload.json, so both must change together).
class ListAllSchema(BaseModel):
    filter_patterns: Optional[List[str]] = None
    filter_type: str = 'exact'
    status: Optional[StatusChoices] = None
    after: Optional[float] = None
    before: Optional[float] = None
    sort: Optional[str] = None

    # output-format selectors
    csv: Optional[str] = None
    json: bool = False
    html: bool = False
    with_headers: bool = False
# API Router: collects the archive endpoints declared below (/add, /remove,
# /update, /list_all). The URL config mounts it via
# api.add_router("/archive/", "api.archive.router").
router = Router()
# POST /add -- forward the validated payload to add() and wrap the outcome.
# Any exception is reported in the response body as
# {"status": "error", ...} instead of propagating to the framework.
@router.post("/add", response={200: dict})
def api_add(request, payload: AddURLSchema):
    option_names = (
        "urls",
        "tag",
        "depth",
        "update",
        "update_all",
        "index_only",
        "overwrite",
        "init",
        "extractors",
        "parser",
    )
    try:
        add_kwargs = {name: getattr(payload, name) for name in option_names}
        outcome = add(**add_kwargs)
        # Currently the add function returns a list of ALL items in the DB, ideally only return new items
        response_body = {
            "status": "success",
            "message": "URLs added successfully.",
            "result": str(outcome),
        }
        return response_body
    except Exception as err:
        # Handle exceptions raised by the add function or during processing
        return {"status": "error", "message": str(err)}
# POST /remove -- forward the validated payload to remove() and wrap the
# outcome. Any exception is reported in the response body as
# {"status": "error", ...} instead of propagating to the framework.
@router.post("/remove", response={200: dict})
def api_remove(request, payload: RemoveURLSchema):
    option_names = (
        "yes",
        "delete",
        "before",
        "after",
        "filter_type",
        "filter_patterns",
    )
    try:
        removal_kwargs = {name: getattr(payload, name) for name in option_names}
        outcome = remove(**removal_kwargs)
        response_body = {
            "status": "success",
            "message": "URLs removed successfully.",
            "result": outcome,
        }
        return response_body
    except Exception as err:
        # Handle exceptions raised by the remove function or during processing
        return {"status": "error", "message": str(err)}
# POST /update -- forward the validated payload to update() and wrap the
# outcome. Any exception is reported in the response body as
# {"status": "error", ...} instead of propagating to the framework.
@router.post("/update", response={200: dict})
def api_update(request, payload: UpdateSchema):
    option_names = (
        "resume",
        "only_new",
        "index_only",
        "overwrite",
        "before",
        "after",
        "status",
        "filter_type",
        "filter_patterns",
        "extractors",
    )
    try:
        update_kwargs = {name: getattr(payload, name) for name in option_names}
        outcome = update(**update_kwargs)
        response_body = {
            "status": "success",
            "message": "Archive updated successfully.",
            "result": outcome,
        }
        return response_body
    except Exception as err:
        # Handle exceptions raised by the update function or during processing
        return {"status": "error", "message": str(err)}
# POST /list_all -- forward the validated payload to list_all() and label the
# response with the first requested output format (json, then html, then
# csv). Any exception is reported in the response body as
# {"status": "error", ...} instead of propagating to the framework.
@router.post("/list_all", response={200: dict})
def api_list_all(request, payload: ListAllSchema):
    option_names = (
        "filter_patterns",
        "filter_type",
        "status",
        "after",
        "before",
        "sort",
        "csv",
        "json",
        "html",
        "with_headers",
    )
    try:
        listing_kwargs = {name: getattr(payload, name) for name in option_names}
        listing = list_all(**listing_kwargs)

        # TODO: This is kind of bad, make the format a choice field
        # Precedence matters: json wins over html, html wins over csv.
        requested_formats = (
            ("json", payload.json),
            ("html", payload.html),
            ("csv", payload.csv),
        )
        for format_name, wanted in requested_formats:
            if wanted:
                return {"status": "success", "format": format_name, "data": listing}

        return {
            "status": "success",
            "message": "List generated successfully.",
            "data": listing,
        }
    except Exception as err:
        # Handle exceptions raised by the list_all function or during processing
        return {"status": "error", "message": str(err)}

48
archivebox/api/auth.py Normal file
View file

@ -0,0 +1,48 @@
from django.contrib.auth import authenticate
from ninja import Form, Router, Schema
from ninja.security import HttpBearer
from api.models import Token
# Router for the token endpoints declared below (/authenticate,
# /validate_token). The URL config mounts it via
# api.add_router("/auth/", "api.auth.router").
router = Router()
class GlobalAuth(HttpBearer):
    """Bearer-token authentication backed by the Token model.

    Installed as the default auth for the whole NinjaAPI instance; individual
    endpoints opt out with ``auth=None``.
    """

    def authenticate(self, request, token):
        """Resolve a bearer token string to its owning user.

        Args:
            request: the incoming HTTP request (unused).
            token: the raw token string supplied by the client.

        Returns:
            The user owning the token, or ``None`` when the token is unknown
            or has passed its ``expiry`` timestamp.
        """
        # Imported locally so this block does not depend on a module-level import.
        from django.utils import timezone

        try:
            token_obj = Token.objects.get(token=token)
        except Token.DoesNotExist:
            return None

        # Tokens carry an optional expiry (null means "never expires"); a token
        # past its expiry must not authenticate anymore.
        if token_obj.expiry is not None and token_obj.expiry <= timezone.now():
            return None

        return token_obj.user
# Credentials body for POST /authenticate. Note that get_token passes `email`
# to Django's authenticate() as the `username` credential.
class AuthSchema(Schema):
    email: str
    password: str
# POST /authenticate -- exchange credentials for a freshly created API token.
# Returns {"token", "expires"} on success, {"error"} on bad credentials.
@router.post("/authenticate", auth=None)  # overriding global auth
def get_token(request, auth_data: AuthSchema):
    user = authenticate(username=auth_data.email, password=auth_data.password)
    if not user:
        return {"error": "Invalid credentials"}

    # Assuming a user can have multiple tokens and you want to create a new one every time
    issued = Token.objects.create(user=user)
    return {"token": issued.token, "expires": issued.expiry_as_iso8601}
# Request body for POST /validate_token: the raw token string to check.
class TokenValidationSchema(Schema):
    token: str
# POST /validate_token -- report whether a token string maps to a user.
# Always answers with {"status": "valid"} or {"status": "invalid"}.
@router.post("/validate_token", auth=None)  # No authentication required for this endpoint
def validate_token(request, token_data: TokenValidationSchema):
    try:
        # Reuse the global bearer-auth lookup on the supplied token
        owner = GlobalAuth().authenticate(request, token_data.token)
    except Token.DoesNotExist:
        return {"status": "invalid"}
    return {"status": "valid" if owner else "invalid"}

View file

@ -0,0 +1,28 @@
# Generated by Django 3.1.14 on 2024-04-09 18:52
import api.models
from django.conf import settings
from django.db import migrations, models
import django.db.models.deletion
class Migration(migrations.Migration):
    """Initial migration for the api app: creates the Token table."""

    initial = True

    dependencies = [
        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
    ]

    operations = [
        migrations.CreateModel(
            name='Token',
            fields=[
                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                # The default must be referenced through the module this file
                # actually imports (`import api.models`); the previous
                # `auth.models.hex_uuid` named an undefined `auth` and raised
                # NameError as soon as the migration was loaded.
                ('token', models.CharField(default=api.models.hex_uuid, max_length=32, unique=True)),
                ('created', models.DateTimeField(auto_now_add=True)),
                ('expiry', models.DateTimeField(blank=True, null=True)),
                ('user', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='tokens', to=settings.AUTH_USER_MODEL)),
            ],
        ),
    ]

View file

30
archivebox/api/models.py Normal file
View file

@ -0,0 +1,30 @@
import uuid
from datetime import timedelta
from django.conf import settings
from django.db import models
from django.utils import timezone
from django.utils.translation import gettext_lazy as _
def hex_uuid():
    """Return a fresh random UUID4 rendered as 32 lowercase hex characters."""
    random_id = uuid.uuid4()
    return '%032x' % random_id.int
# API access token owned by a user; a user may hold several (reachable via
# the "tokens" related name). `expiry` is optional: null means no expiry
# timestamp has been set.
class Token(models.Model):
    user = models.ForeignKey(
        settings.AUTH_USER_MODEL,
        on_delete=models.CASCADE,
        related_name="tokens",
    )
    token = models.CharField(max_length=32, default=hex_uuid, unique=True)
    created = models.DateTimeField(auto_now_add=True)
    expiry = models.DateTimeField(null=True, blank=True)

    @property
    def expiry_as_iso8601(self):
        """ISO 8601 string of the expiry, or of a moment 100 years from now when no expiry is set."""
        # `or` short-circuits, so the fallback is only computed for tokens without an expiry
        effective_expiry = self.expiry or (timezone.now() + timedelta(days=365 * 100))
        return effective_expiry.isoformat()

    def __str__(self):
        return self.token

27
archivebox/api/tests.py Normal file
View file

@ -0,0 +1,27 @@
from django.test import TestCase
from ninja.testing import TestClient
from archivebox.api.archive import router as archive_router
# Smoke tests for the archive router: each endpoint must answer HTTP 200 with
# a "success" status in the JSON body.
class ArchiveBoxAPITestCase(TestCase):
    def setUp(self):
        self.client = TestClient(archive_router)

    def _post_expecting_200(self, path, body):
        # Shared request helper: POST the JSON body, require HTTP 200, hand back the decoded payload
        response = self.client.post(path, json=body)
        self.assertEqual(response.status_code, 200)
        return response.json()

    def test_add_endpoint(self):
        data = self._post_expecting_200("/add", {"urls": ["http://example.com"], "tag": "test"})
        self.assertEqual(data["status"], "success")

    def test_remove_endpoint(self):
        data = self._post_expecting_200("/remove", {"filter_patterns": ["http://example.com"]})
        self.assertEqual(data["status"], "success")

    def test_update_endpoint(self):
        data = self._post_expecting_200("/update", {})
        self.assertEqual(data["status"], "success")

    def test_list_all_endpoint(self):
        data = self._post_expecting_200("/list_all", {})
        self.assertTrue("success" in data["status"])

View file

@ -68,6 +68,7 @@ INSTALLED_APPS = [
'core',
'api',
# Plugins

View file

@ -8,6 +8,18 @@ from django.views.generic.base import RedirectView
from core.views import HomepageView, SnapshotView, PublicIndexView, AddView, HealthCheckView
from ninja import NinjaAPI
from api.auth import GlobalAuth
api = NinjaAPI(auth=GlobalAuth())
api.add_router("/auth/", "api.auth.router")
api.add_router("/archive/", "api.archive.router")
# GLOBAL_CONTEXT doesn't work as-is, disabled for now: https://github.com/ArchiveBox/ArchiveBox/discussions/1306
# from config import VERSION, VERSIONS_AVAILABLE, CAN_UPGRADE
# GLOBAL_CONTEXT = {'VERSION': VERSION, 'VERSIONS_AVAILABLE': VERSIONS_AVAILABLE, 'CAN_UPGRADE': CAN_UPGRADE}
# print('DEBUG', settings.DEBUG)
urlpatterns = [
@ -35,6 +47,8 @@ urlpatterns = [
path('accounts/', include('django.contrib.auth.urls')),
path('admin/', archivebox_admin.urls),
path("api/", api.urls),
path('health/', HealthCheckView.as_view(), name='healthcheck'),
path('error/', lambda _: 1/0),

0
archivebox/index.sqlite3 Normal file
View file

View file

@ -494,12 +494,12 @@ def log_removal_started(links: List["Link"], yes: bool, delete: bool):
if delete:
file_counts = [link.num_outputs for link in links if Path(link.link_dir).exists()]
print(
f' {len(links)} Links will be de-listed from the main index, and their archived content folders will be deleted from disk.\n'
f' {len(links)} Links will be de-listed from the main index, and their archived content folders will be deleted from disk.\n' +
f' ({len(file_counts)} data folders with {sum(file_counts)} archived files will be deleted!)'
)
else:
print(
' Matching links will be de-listed from the main index, but their archived content folders will remain in place on disk.\n'
' Matching links will be de-listed from the main index, but their archived content folders will remain in place on disk.\n' +
' (Pass --delete if you also want to permanently delete the data folders)'
)
@ -638,17 +638,15 @@ def printable_folder_status(name: str, folder: Dict) -> str:
@enforce_types
def printable_dependency_version(name: str, dependency: Dict) -> str:
version = None
color, symbol, note, version = 'red', 'X', 'invalid', '?'
if dependency['enabled']:
if dependency['is_valid']:
color, symbol, note, version = 'green', '', 'valid', ''
color, symbol, note = 'green', '', 'valid'
parsed_version_num = re.search(r'[\d\.]+', dependency['version'])
if parsed_version_num:
version = f'v{parsed_version_num[0]}'
if not version:
color, symbol, note, version = 'red', 'X', 'invalid', '?'
else:
color, symbol, note, version = 'lightyellow', '-', 'disabled', '-'

2371
archivebox/package-lock.json generated Normal file

File diff suppressed because it is too large Load diff

View file

@ -1,6 +1,6 @@
{
"name": "archivebox",
"version": "0.7.3",
"version": "0.8.0",
"description": "ArchiveBox: The self-hosted internet archive",
"author": "Nick Sweeting <archivebox-npm@sweeting.me>",
"repository": "github:ArchiveBox/ArchiveBox",
@ -8,6 +8,6 @@
"dependencies": {
"@postlight/parser": "^2.2.3",
"readability-extractor": "github:ArchiveBox/readability-extractor",
"single-file-cli": "^1.1.46"
"single-file-cli": "^1.1.54"
}
}

View file

@ -57,19 +57,57 @@ short_ts = lambda ts: str(parse_date(ts).timestamp()).split('.')[0]
ts_to_date_str = lambda ts: ts and parse_date(ts).strftime('%Y-%m-%d %H:%M')
ts_to_iso = lambda ts: ts and parse_date(ts).isoformat()
COLOR_REGEX = re.compile(r'\[(?P<arg_1>\d+)(;(?P<arg_2>\d+)(;(?P<arg_3>\d+))?)?m')
# https://mathiasbynens.be/demo/url-regex
URL_REGEX = re.compile(
r'(?=('
r'http[s]?://' # start matching from allowed schemes
r'(?:[a-zA-Z]|[0-9]' # followed by allowed alphanum characters
r'|[-_$@.&+!*\(\),]' # or allowed symbols (keep hyphen first to match literal hyphen)
r'|(?:%[0-9a-fA-F][0-9a-fA-F]))' # or allowed unicode bytes
r'[^\]\[\(\)<>"\'\s]+' # stop parsing at these symbols
r'(?=(' +
r'http[s]?://' + # start matching from allowed schemes
r'(?:[a-zA-Z]|[0-9]' + # followed by allowed alphanum characters
r'|[-_$@.&+!*\(\),]' + # or allowed symbols (keep hyphen first to match literal hyphen)
r'|[^\u0000-\u007F])+' + # or allowed unicode bytes
r'[^\]\[<>"\'\s]+' + # stop parsing at these symbols
r'))',
re.IGNORECASE,
re.IGNORECASE | re.UNICODE,
)
COLOR_REGEX = re.compile(r'\[(?P<arg_1>\d+)(;(?P<arg_2>\d+)(;(?P<arg_3>\d+))?)?m')
def parens_are_matched(string: str, open_char='(', close_char=')'):
    """Return True when every close_char has an earlier open_char and none are left open."""
    depth = 0
    for char in string:
        if char == open_char:
            depth += 1
            continue
        if char != close_char:
            continue
        depth -= 1
        # a closer with nothing open can never be repaired by later characters
        if depth < 0:
            return False
    return depth == 0
def fix_url_from_markdown(url_str: str) -> str:
    """
    Strip dangling trailing text left over from markdown link syntax off a regex-parsed URL.

    Characters are dropped from the end until the parentheses balance, e.g.
        input:  https://wikipedia.org/en/some_article_(Disambiguation).html?abc=def).somemoretext
        result: https://wikipedia.org/en/some_article_(Disambiguation).html?abc=def

    The shortened string is only returned if it still matches URL_REGEX;
    otherwise the input is returned unchanged.
    """
    candidate = url_str

    # shorten by one trailing character per pass, e.g. /a(b)c).x(y)z -> /a(b)c
    # (always terminates: the empty string counts as balanced)
    while True:
        if parens_are_matched(candidate):
            break
        candidate = candidate[:-1]

    # fall back to the untouched input when trimming destroyed the URL
    still_a_url = re.findall(URL_REGEX, candidate)
    return candidate if still_a_url else url_str
def find_all_urls(urls_str: str):
    """Lazily yield each URL_REGEX match found in urls_str, cleaned by fix_url_from_markdown."""
    yield from map(fix_url_from_markdown, re.findall(URL_REGEX, urls_str))
def is_static_file(url: str):
# TODO: the proper way is with MIME type detection + ext, not only extension
@ -403,3 +441,48 @@ class ExtendedEncoder(pyjson.JSONEncoder):
return pyjson.JSONEncoder.default(self, obj)
### URL PARSING TESTS / ASSERTIONS
# they run at runtime because I like having them inline in this file,
# I like the peace of mind knowing it's enforced at runtime across all OS's (in case the regex engine ever has any weird locale-specific quirks),
# and these assertions are basically instant, so not a big performance cost to do it on startup
# fix_url_from_markdown only returns the trimmed string when it still matches
# URL_REGEX, which requires an http(s):// scheme. A scheme-less fixture such as
# '/a(b)c).x(y)z' is therefore returned untrimmed and could never satisfy this
# check, so the fixtures below use absolute URLs.
assert fix_url_from_markdown('http://example.com/a(b)c).x(y)z') == 'http://example.com/a(b)c'
assert fix_url_from_markdown('https://wikipedia.org/en/some_article_(Disambiguation).html?abc=def).link(with)_trailingtext') == 'https://wikipedia.org/en/some_article_(Disambiguation).html?abc=def'
# Fixture table for the URL extractor: each entry is
# (input text, list of URLs that find_all_urls is expected to yield, in order).
# A URL embedded inside another URL is expected as a separate, additional match.
URL_REGEX_TESTS = [
('https://example.com', ['https://example.com']),
('http://abc-file234example.com/abc?def=abc&23423=sdfsdf#abc=234&234=a234', ['http://abc-file234example.com/abc?def=abc&23423=sdfsdf#abc=234&234=a234']),
('https://twitter.com/share?url=https://akaao.success-corp.co.jp&text=ア@サ!ト&hashtags=ア%オ,元+ア.ア-オ_イ*シ$ロ abc', ['https://twitter.com/share?url=https://akaao.success-corp.co.jp&text=ア@サ!ト&hashtags=ア%オ,元+ア.ア-オ_イ*シ$ロ', 'https://akaao.success-corp.co.jp&text=ア@サ!ト&hashtags=ア%オ,元+ア.ア-オ_イ*シ$ロ']),
('<a href="https://twitter.com/share#url=https://akaao.success-corp.co.jp&text=ア@サ!ト?hashtags=ア%オ,元+ア&abc=.ア-オ_イ*シ$ロ"> abc', ['https://twitter.com/share#url=https://akaao.success-corp.co.jp&text=ア@サ!ト?hashtags=ア%オ,元+ア&abc=.ア-オ_イ*シ$ロ', 'https://akaao.success-corp.co.jp&text=ア@サ!ト?hashtags=ア%オ,元+ア&abc=.ア-オ_イ*シ$ロ']),
('///a', []),
('http://', []),
('http://../', ['http://../']),
('http://-error-.invalid/', ['http://-error-.invalid/']),
('https://a(b)c+1#2?3&4/', ['https://a(b)c+1#2?3&4/']),
('http://उदाहरण.परीक्षा', ['http://उदाहरण.परीक्षा']),
('http://例子.测试', ['http://例子.测试']),
('http://➡.ws/䨹 htps://abc.1243?234', ['http://➡.ws/䨹']),
('http://⌘.ws">https://exa+mple.com//:abc ', ['http://⌘.ws', 'https://exa+mple.com//:abc']),
('http://مثال.إختبار/abc?def=ت&ب=abc#abc=234', ['http://مثال.إختبار/abc?def=ت&ب=abc#abc=234']),
('http://-.~_!$&()*+,;=:%40:80%2f::::::@example.c\'om', ['http://-.~_!$&()*+,;=:%40:80%2f::::::@example.c']),
('http://us:pa@ex.co:42/http://ex.co:19/a?_d=4#-a=2.3', ['http://us:pa@ex.co:42/http://ex.co:19/a?_d=4#-a=2.3', 'http://ex.co:19/a?_d=4#-a=2.3']),
('http://code.google.com/events/#&product=browser', ['http://code.google.com/events/#&product=browser']),
('http://foo.bar?q=Spaces should be encoded', ['http://foo.bar?q=Spaces']),
('http://foo.com/blah_(wikipedia)#c(i)t[e]-1', ['http://foo.com/blah_(wikipedia)#c(i)t']),
('http://foo.com/(something)?after=parens', ['http://foo.com/(something)?after=parens']),
('http://foo.com/unicode_(✪)_in_parens) abc', ['http://foo.com/unicode_(✪)_in_parens']),
('http://foo.bar/?q=Test%20URL-encoded%20stuff', ['http://foo.bar/?q=Test%20URL-encoded%20stuff']),
('[xyz](http://a.b/?q=(Test)%20U)RL-encoded%20stuff', ['http://a.b/?q=(Test)%20U']),
('[xyz](http://a.b/?q=(Test)%20U)-ab https://abc+123', ['http://a.b/?q=(Test)%20U', 'https://abc+123']),
('[xyz](http://a.b/?q=(Test)%20U) https://a(b)c+12)3', ['http://a.b/?q=(Test)%20U', 'https://a(b)c+12']),
('[xyz](http://a.b/?q=(Test)a\nabchttps://a(b)c+12)3', ['http://a.b/?q=(Test)a', 'https://a(b)c+12']),
('http://foo.bar/?q=Test%20URL-encoded%20stuff', ['http://foo.bar/?q=Test%20URL-encoded%20stuff']),
]
# Check every fixture at import time; the first mismatch aborts the import
# with an AssertionError. Loop names are kept because they live at module scope.
for urls_str, expected_url_matches in URL_REGEX_TESTS:
    url_matches = [*find_all_urls(urls_str)]
    assert url_matches == expected_url_matches, 'FAILED URL_REGEX CHECK!'

6
archivebox/vendor/requirements.txt vendored Normal file
View file

@ -0,0 +1,6 @@
# this folder contains vendored versions of these packages
atomicwrites==1.4.0
pocket==0.3.7
django-taggit==1.3.0
base32-crockford==0.3.0