Fix API and add actors endpoints

This commit is contained in:
Nick Sweeting 2024-11-17 20:09:06 -08:00
parent c8e186f21b
commit 8f8fbbb7a2
No known key found for this signature in database
12 changed files with 229 additions and 52 deletions

View file

@ -1,15 +1,14 @@
__package__ = 'archivebox.api'
from typing import Any, Optional, cast
from typing import Optional, cast
from datetime import timedelta
from django.http import HttpRequest
from django.utils import timezone
from django.contrib.auth import login
from django.contrib.auth import authenticate
from django.contrib.auth.models import AbstractBaseUser
from ninja.security import HttpBearer, APIKeyQuery, APIKeyHeader, HttpBasicAuth, django_auth_superuser
from ninja.security import HttpBearer, APIKeyQuery, APIKeyHeader, HttpBasicAuth
from ninja.errors import HttpError

117
archivebox/api/v1_actors.py Normal file
View file

@ -0,0 +1,117 @@
__package__ = 'archivebox.api'
from uuid import UUID
from typing import List, Any
from datetime import datetime
from ninja import Router, Schema
from .auth import API_AUTH_METHODS
# Router for the endpoints defined in this module; it is tagged
# 'Workers and Tasks' and every route on it authenticates via API_AUTH_METHODS.
router = Router(tags=['Workers and Tasks'], auth=API_AUTH_METHODS)
class TaskSchema(Schema):
    """Serialized form of a single task, as listed in an actor's ``queue`` and ``past``."""

    TYPE: str

    # Identifiers
    id: UUID
    abid: str

    # Computed by resolve_description() below
    description: str

    # Scheduling state
    status: str
    retry_at: datetime | None

    # Bookkeeping
    created_at: datetime
    modified_at: datetime
    created_by_id: int

    @staticmethod
    def resolve_description(obj) -> str:
        """Return the ``str()`` form of the task object as its description."""
        text = str(obj)
        return text
class ActorSchema(Schema):
    """Serialized form of one actor (task consumer worker).

    Fields that have a matching ``resolve_<field>`` static method below are
    computed by that method from the actor object passed in as ``obj``.
    """

    # TYPE: str = 'actors.actor.ActorType'
    # name: str
    # pid: int | None

    idle_count: int
    launch_kwargs: dict[str, Any]
    mode: str

    model: str
    statemachine: str
    STATE_FIELD_NAME: str
    # ACTIVE_STATE: str
    FINAL_STATES: list[str]
    EVENT_NAME: str
    CLAIM_ORDER: list[str]
    CLAIM_FROM_TOP_N: int
    CLAIM_ATOMIC: bool
    MAX_TICK_TIME: int
    MAX_CONCURRENT_ACTORS: int

    queue: list[TaskSchema]
    past: list[TaskSchema]

    @staticmethod
    def resolve_model(obj) -> str:
        """Return the class name of the actor's ``Model``."""
        model_cls = obj.Model
        return model_cls.__name__

    @staticmethod
    def resolve_statemachine(obj) -> str:
        """Return the class name of the actor's ``StateMachineClass``."""
        machine_cls = obj.StateMachineClass
        return machine_cls.__name__

    @staticmethod
    def resolve_name(obj) -> str:
        """Return the ``str()`` form of the actor.

        NOTE(review): the ``name`` field is commented out above, so this
        resolver presumably has no effect on the output right now; confirm
        before removing it or re-enabling the field.
        """
        return str(obj)

    # @staticmethod
    # def resolve_ACTIVE_STATE(obj) -> str:
    #     return str(obj.ACTIVE_STATE)

    @staticmethod
    def resolve_FINAL_STATES(obj) -> list[str]:
        """Return every entry of ``obj.FINAL_STATES`` converted with ``str()``."""
        return list(map(str, obj.FINAL_STATES))

    @staticmethod
    def resolve_queue(obj) -> list[TaskSchema]:
        """Return objects matching any of the pending/future/active/stalled filters.

        Results are ordered by ``retry_at`` descending.
        """
        queryset = obj.qs
        unfinished = obj.pending_q | obj.future_q | obj.active_q | obj.stalled_q
        return list(queryset.filter(unfinished).order_by('-retry_at'))

    @staticmethod
    def resolve_past(obj) -> list[TaskSchema]:
        """Return objects matching the ``final_q`` filter, ordered by ``modified_at`` descending."""
        queryset = obj.qs
        finished = obj.final_q
        return list(queryset.filter(finished).order_by('-modified_at'))
class OrchestratorSchema(Schema):
    """Serialized form of an orchestrator together with its actors."""

    # TYPE: str = 'actors.orchestrator.Orchestrator'
    # pid: int | None

    exit_on_idle: bool
    mode: str

    # Computed by resolve_actors() below
    actors: list[ActorSchema]

    @staticmethod
    def resolve_actors(obj) -> list[ActorSchema]:
        """Return one new instance of each value in ``obj.actor_types``."""
        actor_classes = obj.actor_types.values()
        return [actor_cls() for actor_cls in actor_classes]
@router.get("/orchestrators", response=List[OrchestratorSchema], url_name="get_orchestrators")
def get_orchestrators(request):
    """List all the task orchestrators (aka Orchestrators) that are currently running"""
    # Imported inside the function rather than at module top level.
    from actors.orchestrator import Orchestrator

    # The response is always a one-element list holding a newly constructed
    # Orchestrator.
    # NOTE(review): whether a new Orchestrator() reflects the state of
    # already-running processes is not visible from this module; confirm.
    return [Orchestrator()]
@router.get("/actors", response=List[ActorSchema], url_name="get_actors")
def get_actors(request):
    """List all the task consumer workers (aka Actors) that are currently running"""
    # Imported inside the function rather than at module top level.
    from actors.orchestrator import Orchestrator

    orchestrator = Orchestrator()

    # Serialize actor *instances* in a concrete list, exactly as
    # OrchestratorSchema.resolve_actors does, so that /actors and the `actors`
    # field of /orchestrators give ActorSchema the same kind of object.
    # Previously this returned the raw dict view of actor classes.
    return [actor_type() for actor_type in orchestrator.actor_types.values()]

View file

@ -40,6 +40,7 @@ def register_urls(api: NinjaAPI) -> NinjaAPI:
api.add_router('/auth/', 'api.v1_auth.router')
api.add_router('/core/', 'api.v1_core.router')
api.add_router('/cli/', 'api.v1_cli.router')
api.add_router('/jobs/', 'api.v1_actors.router')
return api

View file

@ -1,5 +1,6 @@
__package__ = 'archivebox.api'
import json
from typing import List, Dict, Any, Optional
from enum import Enum
@ -30,6 +31,7 @@ class CLICommandResponseSchema(Schema):
success: bool
errors: List[str]
result: JSONType
result_format: str = 'str'
stdout: str
stderr: str
@ -97,7 +99,7 @@ class ListCommandSchema(Schema):
sort: str = 'bookmarked_at'
as_json: bool = True
as_html: bool = False
as_csv: str | bool = 'timestamp,url'
as_csv: str | None = 'timestamp,url'
with_headers: bool = False
class RemoveCommandSchema(Schema):
@ -182,7 +184,7 @@ def cli_schedule(request, args: ScheduleCommandSchema):
@router.post("/list", response=CLICommandResponseSchema, summary='archivebox list [args] [filter_patterns]')
@router.post("/list", response=CLICommandResponseSchema, summary='archivebox list [args] [filter_patterns] (use this endpoint with ?filter_type=search to search for snapshots)')
def cli_list(request, args: ListCommandSchema):
result = list_all(
filter_patterns=args.filter_patterns,
@ -200,6 +202,7 @@ def cli_list(request, args: ListCommandSchema):
result_format = 'txt'
if args.as_json:
result_format = "json"
result = json.loads(result)
elif args.as_html:
result_format = "html"
elif args.as_csv:

View file

@ -8,6 +8,7 @@ from datetime import datetime
from django.db.models import Q
from django.core.exceptions import ValidationError
from django.contrib.auth import get_user_model
from django.shortcuts import redirect
from ninja import Router, Schema, FilterSchema, Field, Query
from ninja.pagination import paginate, PaginationBase
@ -66,38 +67,36 @@ class MinimalArchiveResultSchema(Schema):
id: UUID
abid: str
modified_at: datetime
created_at: datetime
created_at: datetime | None
modified_at: datetime | None
created_by_id: str
created_by_username: str
extractor: str
cmd_version: Optional[str]
cmd: List[str]
pwd: str
status: str
output: str
retry_at: datetime | None
extractor: str
cmd_version: str | None
cmd: list[str] | None
pwd: str | None
output: str | None
start_ts: Optional[datetime]
end_ts: Optional[datetime]
start_ts: datetime | None
end_ts: datetime | None
@staticmethod
def resolve_created_by_id(obj):
return str(obj.created_by_id)
@staticmethod
def resolve_created_by_username(obj):
def resolve_created_by_username(obj) -> str:
User = get_user_model()
return User.objects.get(id=obj.created_by_id).username
return User.objects.filter(pk=obj.created_by_id).values_list('username', flat=True)[0]
@staticmethod
def resolve_abid(obj):
return str(obj.ABID)
@staticmethod
def resolve_created_at(obj):
return obj.start_ts
@staticmethod
def resolve_snapshot_timestamp(obj):
return obj.snapshot.timestamp
@ -203,6 +202,9 @@ class SnapshotSchema(Schema):
created_by_username: str
created_at: datetime
modified_at: datetime
status: str
retry_at: datetime | None
bookmarked_at: datetime
downloaded_at: Optional[datetime]
@ -421,6 +423,9 @@ class SeedSchema(Schema):
User = get_user_model()
return User.objects.get(id=obj.created_by_id).username
@router.get("/seeds", response=List[SeedSchema], url_name="get_seeds")
def get_seeds(request):
    """Get all Seeds."""
    # Unfiltered queryset with DISTINCT applied.
    all_seeds = Seed.objects.all()
    return all_seeds.distinct()
@router.get("/seed/{seed_id}", response=SeedSchema, url_name="get_seed")
def get_seed(request, seed_id: str):
@ -445,11 +450,12 @@ class CrawlSchema(Schema):
created_at: datetime
created_by_id: str
created_by_username: str
status: str
retry_at: datetime | None
seed: SeedSchema
max_depth: int
status: str
retry_at: datetime
# snapshots: List[SnapshotSchema]
@ -469,9 +475,14 @@ class CrawlSchema(Schema):
return Snapshot.objects.none()
@router.get("/crawls", response=List[CrawlSchema], url_name="get_crawls")
def get_crawls(request):
    """Get all Crawls."""
    # Unfiltered queryset with DISTINCT applied.
    all_crawls = Crawl.objects.all()
    return all_crawls.distinct()
@router.get("/crawl/{crawl_id}", response=CrawlSchema, url_name="get_crawl")
def get_crawl(request, crawl_id: str, with_snapshots: bool=False, with_archiveresults: bool=False):
"""Get a specific Crawl by id or abid."""
crawl = None
request.with_snapshots = with_snapshots
request.with_archiveresults = with_archiveresults
@ -488,9 +499,10 @@ def get_crawl(request, crawl_id: str, with_snapshots: bool=False, with_archivere
return crawl
# [..., CrawlSchema]
@router.get("/any/{abid}", response=Union[SnapshotSchema, ArchiveResultSchema, TagSchema], url_name="get_any")
@router.get("/any/{abid}", response=Union[SnapshotSchema, ArchiveResultSchema, TagSchema, SeedSchema, CrawlSchema], url_name="get_any", summary="Get any object by its ABID or ID (e.g. snapshot, archiveresult, tag, seed, crawl, etc.)")
def get_any(request, abid: str):
"""Get any object by its ABID or ID (e.g. snapshot, archiveresult, tag, seed, crawl, etc.)."""
request.with_snapshots = False
request.with_archiveresults = False
@ -516,12 +528,18 @@ def get_any(request, abid: str):
except Exception:
pass
# try:
# response = response or get_crawl(request, abid)
# except Exception:
# pass
try:
response = response or get_seed(request, abid)
except Exception:
pass
try:
response = response or get_crawl(request, abid)
except Exception:
pass
if response:
app_label, model_name = response._meta.app_label, response._meta.model_name
return redirect(f"/api/v1/{app_label}/{model_name}/{response.abid}?{request.META['QUERY_STRING']}")
if not response:
raise HttpError(404, 'Object with given ABID not found')
return response
raise HttpError(404, 'Object with given ABID not found')