mirror of https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-23 19:27:00 -04:00
add django_huey, huey_monitor, and replace Threads with huey tasks
This commit is contained in:
parent 4df90fbb40
commit 60154fba5f

19 changed files with 850 additions and 92 deletions
@@ -12,6 +12,7 @@ from plugantic.base_plugin import BasePlugin
from plugantic.base_configset import BaseConfigSet, ConfigSectionName
from plugantic.base_binary import BaseBinary, env
from plugantic.base_extractor import BaseExtractor
from plugantic.base_queue import BaseQueue
from plugantic.base_hook import BaseHook

# Depends on Other Plugins:
@@ -95,6 +96,13 @@ class SinglefileExtractor(BaseExtractor):
SINGLEFILE_BINARY = SinglefileBinary()
SINGLEFILE_EXTRACTOR = SinglefileExtractor()

class SinglefileQueue(BaseQueue):
    name: str = 'singlefile'

    binaries: List[InstanceOf[BaseBinary]] = [SINGLEFILE_BINARY]

SINGLEFILE_QUEUE = SinglefileQueue()

class SinglefilePlugin(BasePlugin):
    app_label: str = 'singlefile'
    verbose_name: str = 'SingleFile'

archivebox/builtin_plugins/singlefile/tasks.py (new file, +40 lines)
@@ -0,0 +1,40 @@
__package__ = 'archivebox.queues'

import time

from django.core.cache import cache

from huey import crontab
from django_huey import db_task, on_startup, db_periodic_task
from huey_monitor.models import TaskModel
from huey_monitor.tqdm import ProcessInfo

@db_task(queue="singlefile", context=True)
def extract(url, out_dir, config, task=None, parent_task_id=None):
    if task and parent_task_id:
        TaskModel.objects.set_parent_task(main_task_id=parent_task_id, sub_task_id=task.id)

    process_info = ProcessInfo(task, desc="extract_singlefile", parent_task_id=parent_task_id, total=1)

    time.sleep(5)

    process_info.update(n=1)
    return {'output': 'singlefile.html', 'status': 'succeeded'}


# @on_startup(queue='singlefile')
# def start_singlefile_queue():
#     print("[+] Starting singlefile worker...")
#     update_version.call_local()


# @db_periodic_task(crontab(minute='*/5'), queue='singlefile')
# def update_version():
#     print('[*] Updating singlefile version... 5 minute interval')
#     from django.conf import settings

#     bin = settings.BINARIES.SinglefileBinary.load()
#     if bin.version:
#         cache.set(f"bin:abspath:{bin.name}", bin.abspath)
#         cache.set(f"bin:version:{bin.name}:{bin.abspath}", bin.version)
#         print('[√] Updated singlefile version:', bin.version, bin.abspath)
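Note on usage: a huey db_task like extract() is enqueued simply by calling it, and the call returns a huey Result handle instead of running inline. A minimal sketch of dispatching the task above and blocking on its return value (the import path, URL, and out_dir are illustrative placeholders, and the 'singlefile' queue must already be registered with django_huey):

    # Sketch only: enqueue the extract() task above and wait for its result.
    from builtin_plugins.singlefile.tasks import extract   # hypothetical import path

    result = extract('https://example.com', '/data/archive/1234567890', config={})   # returns immediately with a Result handle
    output = result.get(blocking=True, timeout=60)   # e.g. {'output': 'singlefile.html', 'status': 'succeeded'}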
@@ -37,7 +37,7 @@ is_valid_cli_module = lambda module, subcommand: (
)


IGNORED_BG_THREADS = ('MainThread', 'ThreadPoolExecutor', 'IPythonHistorySavingThread')    # threads we don't have to wait for before exiting
IGNORED_BG_THREADS = ('MainThread', 'ThreadPoolExecutor', 'IPythonHistorySavingThread', 'Scheduler')    # threads we don't have to wait for before exiting


def wait_for_bg_threads_to_exit(thread_names: Iterable[str]=(), ignore_names: Iterable[str]=IGNORED_BG_THREADS, timeout: int=60) -> int:
@@ -30,6 +30,7 @@ from core.models import Snapshot, ArchiveResult, Tag
from core.mixins import SearchResultsAdminMixin
from api.models import APIToken
from abid_utils.admin import ABIDModelAdmin
from queues.tasks import bg_archive_links, bg_add

from index.html import snapshot_icons
from logging_util import printable_filesize
@@ -137,6 +138,8 @@ class CustomUserAdmin(UserAdmin):
        ) + f'<br/><a href="/admin/api/outboundwebhook/?created_by__id__exact={obj.pk}">{total_count} total records...<a>')




archivebox_admin = ArchiveBoxAdmin()
archivebox_admin.register(get_user_model(), CustomUserAdmin)
archivebox_admin.disable_action('delete_selected')
@@ -155,6 +158,28 @@ archivebox_admin.get_admin_data_urls = get_admin_data_urls.__get__(archivebox_admin, ArchiveBoxAdmin)
archivebox_admin.get_urls = get_urls(archivebox_admin.get_urls).__get__(archivebox_admin, ArchiveBoxAdmin)


from huey_monitor.apps import HueyMonitorConfig
HueyMonitorConfig.verbose_name = 'Background Workers'

from huey_monitor.admin import TaskModel, TaskModelAdmin, SignalInfoModel, SignalInfoModelAdmin
archivebox_admin.register(SignalInfoModel, SignalInfoModelAdmin)


class CustomTaskModelAdmin(TaskModelAdmin):
    actions = ["delete_selected"]

    def has_delete_permission(self, request, obj=None):
        codename = get_permission_codename("delete", self.opts)
        return request.user.has_perm("%s.%s" % (self.opts.app_label, codename))


archivebox_admin.register(TaskModel, CustomTaskModelAdmin)

def result_url(result: TaskModel) -> str:
    url = reverse("admin:huey_monitor_taskmodel_change", args=[str(result.id)])
    return format_html('<a href="{url}" class="fade-in-progress-url">See progress...</a>'.format(url=url))


class AccelleratedPaginator(Paginator):
    """
    Accelerated Paginator ignores DISTINCT when counting total number of rows.
@@ -515,65 +540,53 @@ class SnapshotAdmin(SearchResultsAdminMixin, ABIDModelAdmin):
            archive_links(links, overwrite=True, methods=('title','favicon'), out_dir=CONFIG.OUTPUT_DIR)
            messages.success(request, f"Title and favicon have been fetched and saved for {len(links)} URLs.")
        else:
            # otherwise run in a bg thread
            bg_thread = threading.Thread(
                target=archive_links,
                args=(links,),
                kwargs={"overwrite": True, "methods": ['title', 'favicon'], "out_dir": CONFIG.OUTPUT_DIR},
            # otherwise run in a background worker
            result = bg_archive_links((links,), kwargs={"overwrite": True, "methods": ["title", "favicon"], "out_dir": CONFIG.OUTPUT_DIR})
            messages.success(
                request,
                mark_safe(f"Title and favicon are updating in the background for {len(links)} URLs. {result_url(result)}"),
            )
            bg_thread.setDaemon(True)
            bg_thread.start()
            messages.success(request, f"Title and favicon are updating in the background for {len(links)} URLs. (refresh in a few minutes to see results)")

    @admin.action(
        description="⬇️ Get Missing"
    )
    def update_snapshots(self, request, queryset):
        links = [snapshot.as_link() for snapshot in queryset]
        bg_thread = threading.Thread(
            target=archive_links,
            args=(links,),
            kwargs={"overwrite": False, "out_dir": CONFIG.OUTPUT_DIR},
        )
        bg_thread.setDaemon(True)
        bg_thread.start()

        result = bg_archive_links((links,), kwargs={"overwrite": False, "out_dir": CONFIG.OUTPUT_DIR})

        messages.success(
            request, f"Re-trying any previously failed methods for {len(links)} URLs in the background. (refresh in a few minutes to see results)"
            request,
            mark_safe(f"Re-trying any previously failed methods for {len(links)} URLs in the background. {result_url(result)}"),
        )


    @admin.action(
        description="📑 Archive again"
        description="🆕 Archive Again"
    )
    def resnapshot_snapshot(self, request, queryset):
        for snapshot in queryset:
            timestamp = timezone.now().isoformat('T', 'seconds')
            new_url = snapshot.url.split('#')[0] + f'#{timestamp}'

            bg_thread = threading.Thread(target=add, args=(new_url,), kwargs={'tag': snapshot.tags_str()})
            bg_thread.setDaemon(True)
            bg_thread.start()
            result = bg_add({'urls': new_url, 'tag': snapshot.tags_str()})

        messages.success(
            request,
            f"Creating new fresh snapshots for {len(queryset.count())} URLs in the background. (refresh in a few minutes to see results)",
            mark_safe(f"Creating new fresh snapshots for {queryset.count()} URLs in the background. {result_url(result)}"),
        )

    @admin.action(
        description="♲ Redo"
        description="🔄 Redo"
    )
    def overwrite_snapshots(self, request, queryset):
        links = [snapshot.as_link() for snapshot in queryset]
        bg_thread = threading.Thread(
            target=archive_links,
            args=(links,),
            kwargs={"overwrite": True, "out_dir": CONFIG.OUTPUT_DIR},
        )
        bg_thread.setDaemon(True)
        bg_thread.start()

        result = bg_archive_links((links,), kwargs={"overwrite": True, "out_dir": CONFIG.OUTPUT_DIR})

        messages.success(
            request,
            f"Clearing all previous results and re-downloading {len(links)} URLs in the background. (refresh in a few minutes to see results)",
            mark_safe(f"Clearing all previous results and re-downloading {len(links)} URLs in the background. {result_url(result)}"),
        )

    @admin.action(
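A note on the calling convention above: bg_archive_links(args, kwargs=...) (added in archivebox/queues/tasks.py later in this commit) unpacks its arguments as archive_links(*args, **kwargs), which is why the links list is wrapped in a one-element tuple. Roughly:

    # Sketch: this background call...
    result = bg_archive_links((links,), kwargs={"overwrite": True, "out_dir": CONFIG.OUTPUT_DIR})
    # ...eventually executes this inside the worker process:
    archive_links(links, overwrite=True, out_dir=CONFIG.OUTPUT_DIR)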
@@ -583,7 +596,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ABIDModelAdmin):
        remove(snapshots=queryset, yes=True, delete=True, out_dir=CONFIG.OUTPUT_DIR)
        messages.success(
            request,
            f"Successfully deleted {len(queryset.count())} Snapshots. Don't forget to scrub URLs from import logs (data/sources) and error logs (data/logs) if needed.",
            mark_safe(f"Successfully deleted {queryset.count()} Snapshots. Don't forget to scrub URLs from import logs (data/sources) and error logs (data/logs) if needed."),
        )

@@ -597,7 +610,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ABIDModelAdmin):
            obj.tags.add(*tags)
        messages.success(
            request,
            f"Added {len(tags)} tags to {len(queryset.count())} Snapshots.",
            f"Added {len(tags)} tags to {queryset.count()} Snapshots.",
        )

@@ -611,7 +624,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ABIDModelAdmin):
            obj.tags.remove(*tags)
        messages.success(
            request,
            f"Removed {len(tags)} tags from {len(queryset.count())} Snapshots.",
            f"Removed {len(tags)} tags from {queryset.count()} Snapshots.",
        )

@@ -727,7 +740,6 @@ class ArchiveResultAdmin(ABIDModelAdmin):
        else:
            root_dir = str(snapshot_dir)


        # print(root_dir, str(list(os.walk(root_dir))))

        for root, dirs, files in os.walk(root_dir):
@@ -87,6 +87,7 @@ INSTALLED_APPS = [
    'django_object_actions',    # provides easy Django Admin action buttons on change views https://github.com/crccheck/django-object-actions

    # Our ArchiveBox-provided apps
    'queues',                   # handles starting and managing background workers and processes
    'abid_utils',               # handles ABID ID creation, handling, and models
    'plugantic',                # ArchiveBox plugin API definition + finding/registering/calling interface
    'core',                     # core django model with Snapshot, ArchiveResult, etc.
@@ -98,6 +99,9 @@ INSTALLED_APPS = [
    # 3rd-party apps from PyPI that need to be loaded last
    'admin_data_views',         # handles rendering some convenient automatic read-only views of data in Django admin
    'django_extensions',        # provides Django Debug Toolbar (and other non-debug helpers)
    'django_huey',              # provides multi-queue support for django huey https://github.com/gaiacoop/django-huey
    'bx_django_utils',          # needed for huey_monitor https://github.com/boxine/bx_django_utils
    'huey_monitor',             # adds an admin UI for monitoring background huey tasks https://github.com/boxine/django-huey-monitor
]


@@ -212,17 +216,28 @@ CACHE_DB_TABLE = 'django_cache'
DATABASE_FILE = Path(CONFIG.OUTPUT_DIR) / CONFIG.SQL_INDEX_FILENAME
DATABASE_NAME = os.environ.get("ARCHIVEBOX_DATABASE_NAME", str(DATABASE_FILE))

QUEUE_DATABASE_NAME = DATABASE_NAME.replace('index.sqlite3', 'queue.sqlite3')

DATABASES = {
    'default': {
        'ENGINE': 'django.db.backends.sqlite3',
        'NAME': DATABASE_NAME,
        'OPTIONS': {
            'timeout': 60,
            'check_same_thread': False,
    "default": {
        "ENGINE": "django.db.backends.sqlite3",
        "NAME": DATABASE_NAME,
        "OPTIONS": {
            "timeout": 60,
            "check_same_thread": False,
        },
        'TIME_ZONE': CONFIG.TIMEZONE,
        "TIME_ZONE": CONFIG.TIMEZONE,
        # DB setup is sometimes modified at runtime by setup_django() in config.py
    },
    "queue": {
        "ENGINE": "django.db.backends.sqlite3",
        "NAME": QUEUE_DATABASE_NAME,
        "OPTIONS": {
            "timeout": 60,
            "check_same_thread": False,
        },
        "TIME_ZONE": CONFIG.TIMEZONE,
    },
    # 'cache': {
    #     'ENGINE': 'django.db.backends.sqlite3',
    #     'NAME': CACHE_DB_PATH,
@@ -239,6 +254,64 @@ MIGRATION_MODULES = {'signal_webhooks': None}
DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField'


HUEY = {
    "huey_class": "huey.SqliteHuey",
    "filename": QUEUE_DATABASE_NAME,
    "name": "system_tasks",
    "results": True,
    "store_none": True,
    "immediate": False,
    "utc": True,
    "consumer": {
        "workers": 1,
        "worker_type": "thread",
        "initial_delay": 0.1,        # Smallest polling interval, same as -d.
        "backoff": 1.15,             # Exponential backoff using this rate, -b.
        "max_delay": 10.0,           # Max possible polling interval, -m.
        "scheduler_interval": 1,     # Check schedule every second, -s.
        "periodic": True,            # Enable crontab feature.
        "check_worker_health": True, # Enable worker health checks.
        "health_check_interval": 1,  # Check worker health every second.
    },
}

# https://huey.readthedocs.io/en/latest/contrib.html#setting-things-up
# https://github.com/gaiacoop/django-huey
DJANGO_HUEY = {
    "default": "system_tasks",
    "queues": {
        HUEY["name"]: HUEY.copy(),
        # more registered here at plugin import-time by BaseQueue.register()
    },
}

class HueyDBRouter:
    """A router to store all the Huey Monitor models in the queue.sqlite3 database."""

    route_app_labels = {"huey_monitor", "django_huey", "djhuey"}

    def db_for_read(self, model, **hints):
        if model._meta.app_label in self.route_app_labels:
            return "queue"
        return 'default'

    def db_for_write(self, model, **hints):
        if model._meta.app_label in self.route_app_labels:
            return "queue"
        return 'default'

    def allow_relation(self, obj1, obj2, **hints):
        if obj1._meta.app_label in self.route_app_labels or obj2._meta.app_label in self.route_app_labels:
            return obj1._meta.app_label == obj2._meta.app_label
        return None

    def allow_migrate(self, db, app_label, model_name=None, **hints):
        if app_label in self.route_app_labels:
            return db == "queue"
        return db == "default"

DATABASE_ROUTERS = ['core.settings.HueyDBRouter']

CACHES = {
    'default': {'BACKEND': 'django.core.cache.backends.locmem.LocMemCache'},
    # 'sqlite': {'BACKEND': 'django.core.cache.backends.db.DatabaseCache', 'LOCATION': 'cache'},
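To make the router's behavior concrete, a small sketch (assuming Django settings are already loaded so the models can be imported): huey_monitor models resolve to the 'queue' database while everything else stays on 'default':

    from huey_monitor.models import TaskModel
    from core.models import Snapshot

    router = HueyDBRouter()
    assert router.db_for_read(TaskModel) == "queue"
    assert router.db_for_read(Snapshot) == "default"
    assert router.allow_migrate("queue", "huey_monitor") is True
    assert router.allow_migrate("default", "huey_monitor") is False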
@@ -23,6 +23,9 @@ from admin_data_views.utils import render_with_table_view, render_with_item_view

from core.models import Snapshot
from core.forms import AddLinkForm
from core.admin import result_url

from queues.tasks import bg_add

from ..config import (
    OUTPUT_DIR,
@@ -478,15 +481,14 @@ class AddView(UserPassesTestMixin, FormView):
        if extractors:
            input_kwargs.update({"extractors": extractors})

        bg_thread = threading.Thread(target=add, kwargs=input_kwargs)
        bg_thread.setDaemon(True)
        bg_thread.start()
        result = bg_add(input_kwargs, parent_task_id=None)
        print('Started background add job:', result)

        rough_url_count = url.count('://')

        messages.success(
            self.request,
            f"Adding {rough_url_count} URLs in the background. (refresh in a few minutes to see results)",
            mark_safe(f"Adding {rough_url_count} URLs in the background. (refresh in a few minutes to see results) {result_url(result)}"),
        )

        return redirect("/admin/core/snapshot/")
@@ -148,17 +148,16 @@ def list_migrations(out_dir: Path=OUTPUT_DIR) -> List[Tuple[bool, str]]:
@enforce_types
def apply_migrations(out_dir: Path=OUTPUT_DIR) -> List[str]:
    from django.core.management import call_command
    null, out = StringIO(), StringIO()
    try:
        call_command("makemigrations", interactive=False, stdout=null)
    except Exception as e:
        print('[!] Failed to create some migrations. Please open an issue and copy paste this output for help: {}'.format(e))
        print()
    out1, out2 = StringIO(), StringIO()

    call_command("migrate", interactive=False, stdout=out)
    out.seek(0)
    call_command("migrate", interactive=False, database='default', stdout=out1)
    out1.seek(0)
    call_command("migrate", "huey_monitor", interactive=False, database='queue', stdout=out2)
    out2.seek(0)

    return [line.strip() for line in out.readlines() if line.strip()]
    return [
        line.strip() for line in out1.readlines() + out2.readlines() if line.strip()
    ]

@enforce_types
def get_admins(out_dir: Path=OUTPUT_DIR) -> List[str]:
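The two call_command('migrate', ...) calls above run the migrations once per database; roughly the same thing can be done by hand (a sketch using the archivebox manage passthrough):

    archivebox manage migrate --database=default --noinput
    archivebox manage migrate huey_monitor --database=queue --noinput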
@@ -1,8 +1,10 @@
__package__ = 'archivebox'

import os
import time
import sys
import shutil
import signal
import platform
import subprocess

@@ -1352,6 +1354,7 @@ def server(runserver_args: Optional[List[str]]=None,
    if reload or debug:
        call_command("runserver", *runserver_args)
    else:

        host = '127.0.0.1'
        port = '8000'

@@ -1367,12 +1370,52 @@ def server(
        except IndexError:
            pass

        from queues.supervisor_util import get_or_create_supervisord_process, start_worker, stop_worker, watch_worker

        print()
        supervisor = get_or_create_supervisord_process(daemonize=False)

        bg_workers = [
            {
                "name": "worker_system_tasks",
                "command": "archivebox manage djangohuey --queue system_tasks",
                "autostart": "true",
                "autorestart": "true",
                "stdout_logfile": "logs/worker_system_tasks.log",
                "redirect_stderr": "true",
            },
        ]
        fg_worker = {
            "name": "worker_daphne",
            "command": f"daphne --bind={host} --port={port} --application-close-timeout=600 archivebox.core.asgi:application",
            "autostart": "false",
            "autorestart": "true",
            "stdout_logfile": "logs/worker_daphne.log",
            "redirect_stderr": "true",
        }

        print()
        for worker in bg_workers:
            start_worker(supervisor, worker)

        print()
        start_worker(supervisor, fg_worker)
        print()

        try:
            subprocess.run(['daphne', '--bind', host, '--port', port, 'archivebox.core.asgi:application'])
        except (SystemExit, KeyboardInterrupt):
            watch_worker(supervisor, "worker_daphne")
        except KeyboardInterrupt:
            print("\n[🛑] Got Ctrl+C, stopping gracefully...")
        except SystemExit:
            pass
        except Exception as e:
            print(e)
        except BaseException as e:
            print(f"\n[🛑] Got {e.__class__.__name__} exception, stopping web server gracefully...")
            raise
        finally:
            stop_worker(supervisor, "worker_daphne")
            time.sleep(0.5)

            print("\n[🟩] ArchiveBox server shut down gracefully.")


@enforce_types
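For reference, each worker dict above is written out by create_worker_config() (added in archivebox/queues/supervisor_util.py below) as one INI block per worker under the tmp/workers/ directory, roughly like this for the default host/port:

    [program:worker_daphne]
    command=daphne --bind=127.0.0.1 --port=8000 --application-close-timeout=600 archivebox.core.asgi:application
    autostart=false
    autorestart=true
    stdout_logfile=logs/worker_daphne.log
    redirect_stderr=true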
@@ -12,13 +12,13 @@ from ..config_stubs import AttrDict

class BaseBinProvider(BaseHook, BinProvider):
    hook_type: HookType = 'BINPROVIDER'


    # def on_get_abspath(self, bin_name: BinName, **context) -> Optional[HostBinPath]:
    #     Class = super()
    #     get_abspath_func = lambda: Class.on_get_abspath(bin_name, **context)
    #     # return cache.get_or_set(f'bin:abspath:{bin_name}', get_abspath_func)
    #     return get_abspath_func()


    # def on_get_version(self, bin_name: BinName, abspath: Optional[HostBinPath]=None, **context) -> SemVer | None:
    #     Class = super()
    #     get_version_func = lambda: Class.on_get_version(bin_name, abspath, **context)
archivebox/plugantic/base_queue.py (new file, +143 lines)
@@ -0,0 +1,143 @@
__package__ = 'archivebox.plugantic'

import importlib

from typing import Dict, List, TYPE_CHECKING
from pydantic import Field, InstanceOf

if TYPE_CHECKING:
    from huey.api import TaskWrapper

from .base_hook import BaseHook, HookType
from .base_binary import BaseBinary
from ..config_stubs import AttrDict



class BaseQueue(BaseHook):
    hook_type: HookType = 'QUEUE'

    name: str = Field()       # e.g. 'singlefile'

    binaries: List[InstanceOf[BaseBinary]] = Field()

    @property
    def tasks(self) -> Dict[str, 'TaskWrapper']:
        """Return an AttrDict of all the background worker tasks defined in the plugin's tasks.py file."""
        tasks = importlib.import_module(f"{self.plugin_module}.tasks")

        all_tasks = {}

        for task_name, task in tasks.__dict__.items():
            # if attr is a Huey task and its queue_name matches our hook's queue name
            if hasattr(task, "task_class") and task.huey.name == self.name:
                all_tasks[task_name] = task

        return AttrDict(all_tasks)

    def get_huey_config(self, settings) -> dict:
        return {
            "huey_class": "huey.SqliteHuey",
            "filename": settings.QUEUE_DATABASE_NAME,
            "name": self.name,
            "results": True,
            "store_none": True,
            "immediate": False,
            "utc": True,
            "consumer": {
                "workers": 1,
                "worker_type": "thread",
                "initial_delay": 0.1,        # Smallest polling interval, same as -d.
                "backoff": 1.15,             # Exponential backoff using this rate, -b.
                "max_delay": 10.0,           # Max possible polling interval, -m.
                "scheduler_interval": 1,     # Check schedule every second, -s.
                "periodic": True,            # Enable crontab feature.
                "check_worker_health": True, # Enable worker health checks.
                "health_check_interval": 1,  # Check worker health every second.
            },
        }

    def get_supervisor_config(self, settings) -> dict:
        return {
            "name": f"worker_{self.name}",
            "command": f"archivebox manage djangohuey --queue {self.name}",
            "stdout_logfile": f"logs/worker_{self.name}.log",
            "redirect_stderr": "true",
            "autorestart": "true",
            "autostart": "false",
        }

    def start_supervisord_worker(self, settings, lazy=True):
        from queues.supervisor_util import get_or_create_supervisord_process, start_worker
        print()
        try:
            supervisor = get_or_create_supervisord_process(daemonize=False)
        except Exception as e:
            print(f"Error starting worker for queue {self.name}: {e}")
            return None
        print()
        worker = start_worker(supervisor, self.get_supervisor_config(settings), lazy=lazy)
        return worker

    def register(self, settings, parent_plugin=None):
        # self._plugin = parent_plugin    # for debugging only, never rely on this!

        # Side effect: register queue with django-huey multiqueue dict
        settings.DJANGO_HUEY = getattr(settings, "DJANGO_HUEY", None) or AttrDict({"queues": {}})
        settings.DJANGO_HUEY["queues"][self.name] = self.get_huey_config(settings)

        # Side effect: register some extra tasks with huey
        # on_startup(queue=self.name)(self.on_startup_task)
        # db_periodic_task(crontab(minute='*/5'))(self.on_periodic_task)

        # Side effect: start consumer worker process under supervisord
        settings.WORKERS = getattr(settings, "WORKERS", None) or AttrDict({})
        settings.WORKERS[self.id] = self.start_supervisord_worker(settings, lazy=True)

        # Install queue into settings.QUEUES
        settings.QUEUES = getattr(settings, "QUEUES", None) or AttrDict({})
        settings.QUEUES[self.id] = self

        # Record installed hook into settings.HOOKS
        super().register(settings, parent_plugin=parent_plugin)


# class WgetToggleConfig(ConfigSet):
#     section: ConfigSectionName = 'ARCHIVE_METHOD_TOGGLES'

#     SAVE_WGET: bool = True
#     SAVE_WARC: bool = True

# class WgetDependencyConfig(ConfigSet):
#     section: ConfigSectionName = 'DEPENDENCY_CONFIG'

#     WGET_BINARY: str = Field(default='wget')
#     WGET_ARGS: Optional[List[str]] = Field(default=None)
#     WGET_EXTRA_ARGS: List[str] = []
#     WGET_DEFAULT_ARGS: List[str] = ['--timeout={TIMEOUT-10}']

# class WgetOptionsConfig(ConfigSet):
#     section: ConfigSectionName = 'ARCHIVE_METHOD_OPTIONS'

#     # loaded from shared config
#     WGET_AUTO_COMPRESSION: bool = Field(default=True)
#     SAVE_WGET_REQUISITES: bool = Field(default=True)
#     WGET_USER_AGENT: str = Field(default='', alias='USER_AGENT')
#     WGET_TIMEOUT: int = Field(default=60, alias='TIMEOUT')
#     WGET_CHECK_SSL_VALIDITY: bool = Field(default=True, alias='CHECK_SSL_VALIDITY')
#     WGET_RESTRICT_FILE_NAMES: str = Field(default='windows', alias='RESTRICT_FILE_NAMES')
#     WGET_COOKIES_FILE: Optional[Path] = Field(default=None, alias='COOKIES_FILE')


# CONFIG = {
#     'CHECK_SSL_VALIDITY': False,
#     'SAVE_WARC': False,
#     'TIMEOUT': 999,
# }


# WGET_CONFIG = [
#     WgetToggleConfig(**CONFIG),
#     WgetDependencyConfig(**CONFIG),
#     WgetOptionsConfig(**CONFIG),
# ]
archivebox/queues/__init__.py (new empty file)

archivebox/queues/apps.py (new file, +6 lines)
@@ -0,0 +1,6 @@
from django.apps import AppConfig


class QueuesConfig(AppConfig):
    default_auto_field = 'django.db.models.BigAutoField'
    name = 'queues'
archivebox/queues/migrations/__init__.py (new empty file)

archivebox/queues/settings.py (new file, +18 lines)
@@ -0,0 +1,18 @@
from pathlib import Path

from django.conf import settings


OUTPUT_DIR = settings.CONFIG.OUTPUT_DIR
LOGS_DIR = settings.CONFIG.LOGS_DIR

TMP_DIR = OUTPUT_DIR / "tmp"

Path.mkdir(TMP_DIR, exist_ok=True)


CONFIG_FILE = TMP_DIR / "supervisord.conf"
PID_FILE = TMP_DIR / "supervisord.pid"
SOCK_FILE = TMP_DIR / "supervisord.sock"
LOG_FILE = TMP_DIR / "supervisord.log"
WORKER_DIR = TMP_DIR / "workers"
archivebox/queues/supervisor_util.py (new file, +261 lines)
@@ -0,0 +1,261 @@
__package__ = 'archivebox.queues'

import sys
import time
import signal
import psutil
import subprocess
from pathlib import Path
from rich.pretty import pprint

from typing import Dict, cast

from supervisor.xmlrpc import SupervisorTransport
from xmlrpc.client import ServerProxy

from .settings import CONFIG_FILE, PID_FILE, SOCK_FILE, LOG_FILE, WORKER_DIR, TMP_DIR, LOGS_DIR


def create_supervisord_config():
    config_content = f"""
[supervisord]
nodaemon = true
environment = IS_SUPERVISORD_PARENT="true"
pidfile = %(here)s/{PID_FILE.name}
logfile = %(here)s/../{LOGS_DIR.name}/{LOG_FILE.name}
childlogdir = %(here)s/../{LOGS_DIR.name}
directory = %(here)s/..
strip_ansi = true
nocleanup = true

[unix_http_server]
file = %(here)s/{SOCK_FILE.name}
chmod = 0700

[supervisorctl]
serverurl = unix://%(here)s/{SOCK_FILE.name}

[rpcinterface:supervisor]
supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface

[include]
files = %(here)s/{WORKER_DIR.name}/*.conf

"""
    with open(CONFIG_FILE, "w") as f:
        f.write(config_content)

def create_worker_config(daemon):
    Path.mkdir(WORKER_DIR, exist_ok=True)

    name = daemon['name']
    configfile = WORKER_DIR / f"{name}.conf"

    config_content = f"[program:{name}]\n"
    for key, value in daemon.items():
        if key == 'name': continue
        config_content += f"{key}={value}\n"
    config_content += "\n"

    with open(configfile, "w") as f:
        f.write(config_content)


def get_existing_supervisord_process():
    try:
        transport = SupervisorTransport(None, None, f"unix://{SOCK_FILE}")
        server = ServerProxy("http://localhost", transport=transport)
        current_state = cast(Dict[str, int | str], server.supervisor.getState())
        if current_state["statename"] == "RUNNING":
            pid = server.supervisor.getPID()
            print(f"[🦸♂️] Supervisord connected (pid={pid}) via unix://{str(SOCK_FILE).replace(str(TMP_DIR), 'tmp')}.")
            return server.supervisor
    except FileNotFoundError:
        return None
    except Exception as e:
        print(f"Error connecting to existing supervisord: {str(e)}")
        return None

def stop_existing_supervisord_process():
    try:
        pid = int(PID_FILE.read_text())
    except FileNotFoundError:
        return
    except ValueError:
        PID_FILE.unlink()
        return

    try:
        print(f"[🦸♂️] Stopping supervisord process (pid={pid})...")
        proc = psutil.Process(pid)
        proc.terminate()
        proc.wait()
    except Exception:
        raise
    try:
        PID_FILE.unlink()
    except FileNotFoundError:
        pass

def start_new_supervisord_process(daemonize=True):
    print(f"[🦸♂️] Supervisord starting{' in background' if daemonize else ''}...")
    # Create a config file in the current working directory
    create_supervisord_config()

    # Start supervisord
    subprocess.Popen(
        f"supervisord --configuration={CONFIG_FILE}",
        stdin=None,
        shell=True,
        start_new_session=daemonize,
    )

    def exit_signal_handler(signum, frame):
        if signum != 13:
            print(f"\n[🦸♂️] Supervisord got stop signal ({signal.strsignal(signum)}). Terminating child processes...")
        stop_existing_supervisord_process()
        raise SystemExit(0)

    # Monitor for termination signals and cleanup child processes
    if not daemonize:
        signal.signal(signal.SIGINT, exit_signal_handler)
        signal.signal(signal.SIGHUP, exit_signal_handler)
        signal.signal(signal.SIGPIPE, exit_signal_handler)
        signal.signal(signal.SIGTERM, exit_signal_handler)
    # otherwise supervisord will continue in the background even if the parent proc ends (aka daemon mode)

    time.sleep(2)

    return get_existing_supervisord_process()

def get_or_create_supervisord_process(daemonize=True):
    supervisor = get_existing_supervisord_process()
    if supervisor is None:
        stop_existing_supervisord_process()
        supervisor = start_new_supervisord_process(daemonize=daemonize)

    assert supervisor and supervisor.getPID(), "Failed to start supervisord or connect to it!"
    return supervisor

def start_worker(supervisor, daemon, lazy=False):
    assert supervisor.getPID()

    print(f"[🦸♂️] Supervisord starting new subprocess worker: {daemon['name']}...")
    create_worker_config(daemon)

    result = supervisor.reloadConfig()
    added, changed, removed = result[0]
    # print(f"Added: {added}, Changed: {changed}, Removed: {removed}")
    for removed in removed:
        supervisor.stopProcessGroup(removed)
        supervisor.removeProcessGroup(removed)
    for changed in changed:
        supervisor.stopProcessGroup(changed)
        supervisor.removeProcessGroup(changed)
        supervisor.addProcessGroup(changed)
    for added in added:
        supervisor.addProcessGroup(added)

    time.sleep(1)

    for _ in range(10):
        procs = supervisor.getAllProcessInfo()
        for proc in procs:
            if proc['name'] == daemon["name"]:
                # See process state diagram here: http://supervisord.org/subprocess.html
                if proc['statename'] == 'RUNNING':
                    print(f"  - Worker {daemon['name']}: already {proc['statename']} ({proc['description']})")
                    return proc
                else:
                    if not lazy:
                        supervisor.startProcessGroup(daemon["name"], True)
                    proc = supervisor.getProcessInfo(daemon["name"])
                    print(f"  - Worker {daemon['name']}: started {proc['statename']} ({proc['description']})")
                    return proc

        # retry in a second in case it's slow to launch
        time.sleep(0.5)

    raise Exception(f"Failed to start worker {daemon['name']}! Only found: {procs}")


def watch_worker(supervisor, daemon_name, interval=5):
    """loop continuously and monitor worker's health"""
    while True:
        proc = get_worker(supervisor, daemon_name)
        if not proc:
            raise Exception("Worker disappeared while running! " + daemon_name)

        if proc['statename'] == 'STOPPED':
            return proc

        if proc['statename'] == 'RUNNING':
            time.sleep(1)
            continue

        if proc['statename'] in ('STARTING', 'BACKOFF', 'FATAL', 'EXITED', 'STOPPING'):
            print(f'[🦸♂️] WARNING: Worker {daemon_name} {proc["statename"]} {proc["description"]}')
            time.sleep(interval)
            continue


def get_worker(supervisor, daemon_name):
    try:
        return supervisor.getProcessInfo(daemon_name)
    except Exception:
        pass
    return None

def stop_worker(supervisor, daemon_name):
    proc = get_worker(supervisor, daemon_name)

    for _ in range(10):
        if not proc:
            # worker does not exist (was never running or configured in the first place)
            return True

        # See process state diagram here: http://supervisord.org/subprocess.html
        if proc['statename'] == 'STOPPED':
            # worker was configured but has already stopped for some reason
            supervisor.removeProcessGroup(daemon_name)
            return True
        else:
            # worker was configured and is running, stop it now
            supervisor.stopProcessGroup(daemon_name)

        # wait 500ms and then re-check to make sure it's really stopped
        time.sleep(0.5)
        proc = get_worker(supervisor, daemon_name)

    raise Exception(f"Failed to stop worker {daemon_name}!")

def main(daemons):
    supervisor = get_or_create_supervisord_process(daemonize=True)

    worker = start_worker(supervisor, daemons["webworker"])
    pprint(worker)

    print("All processes started in background.")

    # Optionally you can block the main thread until an exit signal is received:
    # try:
    #     signal.pause()
    # except KeyboardInterrupt:
    #     pass
    # finally:
    #     stop_existing_supervisord_process()

# if __name__ == "__main__":

#     DAEMONS = {
#         "webworker": {
#             "name": "webworker",
#             "command": "python3 -m http.server 9000",
#             "directory": str(cwd),
#             "autostart": "true",
#             "autorestart": "true",
#             "stdout_logfile": cwd / "webworker.log",
#             "stderr_logfile": cwd / "webworker_error.log",
#         },
#     }
#     main(DAEMONS, cwd)
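A small usage sketch of the helpers above: connect to (or spawn) the supervisord instance and list the state of every registered worker. getAllProcessInfo() is the same XML-RPC call that start_worker() polls:

    from queues.supervisor_util import get_or_create_supervisord_process

    supervisor = get_or_create_supervisord_process(daemonize=True)
    for proc in supervisor.getAllProcessInfo():
        print(proc['name'], proc['statename'], proc['description'])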
archivebox/queues/tasks.py (new file, +41 lines)
@@ -0,0 +1,41 @@
__package__ = 'archivebox.queues'

from django_huey import db_task, task

from huey_monitor.models import TaskModel
from huey_monitor.tqdm import ProcessInfo

@db_task(queue="system_tasks", context=True)
def bg_add(add_kwargs, task=None, parent_task_id=None):
    from ..main import add

    if task and parent_task_id:
        TaskModel.objects.set_parent_task(main_task_id=parent_task_id, sub_task_id=task.id)

    assert add_kwargs and add_kwargs.get("urls")
    rough_url_count = add_kwargs["urls"].count("://")

    process_info = ProcessInfo(task, desc="add", parent_task_id=parent_task_id, total=rough_url_count)

    result = add(**add_kwargs)
    process_info.update(n=rough_url_count)
    return result


@task(queue="system_tasks", context=True)
def bg_archive_links(args, kwargs=None, task=None, parent_task_id=None):
    from ..extractors import archive_links

    if task and parent_task_id:
        TaskModel.objects.set_parent_task(main_task_id=parent_task_id, sub_task_id=task.id)

    assert args and args[0]
    kwargs = kwargs or {}

    rough_count = len(args[0])

    process_info = ProcessInfo(task, desc="archive_links", parent_task_id=parent_task_id, total=rough_count)

    result = archive_links(*args, **kwargs)
    process_info.update(n=rough_count)
    return result
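Both tasks accept a parent_task_id so huey_monitor can group sub-tasks under a parent task in the admin. A rough sketch of how a parent task could fan work out to the singlefile extract task defined earlier in this commit (hypothetical wiring, not something this commit does):

    # Sketch: a parent task (context=True provides the task argument) dispatching a linked sub-task.
    @db_task(queue="system_tasks", context=True)
    def bg_archive_snapshot(url, out_dir, task=None):
        from builtin_plugins.singlefile.tasks import extract   # hypothetical import path
        sub_result = extract(url, out_dir, config={}, parent_task_id=task.id)
        return sub_result.id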
@@ -329,3 +329,15 @@ tbody .output-link {
    box-shadow: 4px 4px 4px rgba(0,0,0,0.1);
}
tbody .output-link:hover {opacity: 1;}



@keyframes fadeIn {
    0% { opacity: 0; }
    20% { opacity: 0;}
    100% { opacity: 1; }
}

.fade-in-progress-url {
    animation: fadeIn 8s;
}
