mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-24 19:54:25 -04:00
add django_huey, huey_monitor, and replace Threads with huey tasks
This commit is contained in:
parent
4df90fbb40
commit
60154fba5f
19 changed files with 850 additions and 92 deletions
|
@ -30,6 +30,7 @@ from core.models import Snapshot, ArchiveResult, Tag
|
|||
from core.mixins import SearchResultsAdminMixin
|
||||
from api.models import APIToken
|
||||
from abid_utils.admin import ABIDModelAdmin
|
||||
from queues.tasks import bg_archive_links, bg_add
|
||||
|
||||
from index.html import snapshot_icons
|
||||
from logging_util import printable_filesize
|
||||
|
@ -137,6 +138,8 @@ class CustomUserAdmin(UserAdmin):
|
|||
) + f'<br/><a href="/admin/api/outboundwebhook/?created_by__id__exact={obj.pk}">{total_count} total records...<a>')
|
||||
|
||||
|
||||
|
||||
|
||||
archivebox_admin = ArchiveBoxAdmin()
|
||||
archivebox_admin.register(get_user_model(), CustomUserAdmin)
|
||||
archivebox_admin.disable_action('delete_selected')
|
||||
|
@ -155,6 +158,28 @@ archivebox_admin.get_admin_data_urls = get_admin_data_urls.__get__(archivebox_ad
|
|||
archivebox_admin.get_urls = get_urls(archivebox_admin.get_urls).__get__(archivebox_admin, ArchiveBoxAdmin)
|
||||
|
||||
|
||||
from huey_monitor.apps import HueyMonitorConfig
|
||||
HueyMonitorConfig.verbose_name = 'Background Workers'
|
||||
|
||||
from huey_monitor.admin import TaskModel, TaskModelAdmin, SignalInfoModel, SignalInfoModelAdmin
|
||||
archivebox_admin.register(SignalInfoModel, SignalInfoModelAdmin)
|
||||
|
||||
|
||||
class CustomTaskModelAdmin(TaskModelAdmin):
    """Admin for ``TaskModel`` rows that exposes the bulk "delete_selected" action."""

    actions = ["delete_selected"]

    def has_delete_permission(self, request, obj=None):
        """Return whether ``request.user`` holds this model's "delete" permission.

        ``obj`` is accepted for signature compatibility and is not consulted.
        """
        delete_codename = get_permission_codename("delete", self.opts)
        # Permission strings take the form "<app_label>.<codename>".
        permission_name = f"{self.opts.app_label}.{delete_codename}"
        return request.user.has_perm(permission_name)
|
||||
|
||||
|
||||
archivebox_admin.register(TaskModel, CustomTaskModelAdmin)
|
||||
|
||||
def result_url(result: TaskModel) -> str:
    """Build an HTML link to the admin change page of a background task.

    Args:
        result: Object whose ``id`` identifies the task; it is stringified and
            used as the argument of the ``admin:huey_monitor_taskmodel_change`` URL.

    Returns:
        A safe HTML ``<a>`` element labelled "See progress..." pointing at that page.
    """
    url = reverse("admin:huey_monitor_taskmodel_change", args=[str(result.id)])
    # Hand the URL to format_html() as an argument instead of pre-formatting the
    # string: that way the value is escaped on interpolation, and format_html()
    # is not called without args/kwargs (deprecated since Django 5.0).
    return format_html(
        '<a href="{url}" class="fade-in-progress-url">See progress...</a>',
        url=url,
    )
|
||||
|
||||
|
||||
class AccelleratedPaginator(Paginator):
|
||||
"""
|
||||
Accellerated Pagniator ignores DISTINCT when counting total number of rows.
|
||||
|
@ -515,65 +540,53 @@ class SnapshotAdmin(SearchResultsAdminMixin, ABIDModelAdmin):
|
|||
archive_links(links, overwrite=True, methods=('title','favicon'), out_dir=CONFIG.OUTPUT_DIR)
|
||||
messages.success(request, f"Title and favicon have been fetched and saved for {len(links)} URLs.")
|
||||
else:
|
||||
# otherwise run in a bg thread
|
||||
bg_thread = threading.Thread(
|
||||
target=archive_links,
|
||||
args=(links,),
|
||||
kwargs={"overwrite": True, "methods": ['title', 'favicon'], "out_dir": CONFIG.OUTPUT_DIR},
|
||||
# otherwise run in a background worker
|
||||
result = bg_archive_links((links,), kwargs={"overwrite": True, "methods": ["title", "favicon"], "out_dir": CONFIG.OUTPUT_DIR})
|
||||
messages.success(
|
||||
request,
|
||||
mark_safe(f"Title and favicon are updating in the background for {len(links)} URLs. {result_url(result)}"),
|
||||
)
|
||||
bg_thread.setDaemon(True)
|
||||
bg_thread.start()
|
||||
messages.success(request, f"Title and favicon are updating in the background for {len(links)} URLs. (refresh in a few minutes to see results)")
|
||||
|
||||
@admin.action(
|
||||
description="⬇️ Get Missing"
|
||||
)
|
||||
def update_snapshots(self, request, queryset):
|
||||
links = [snapshot.as_link() for snapshot in queryset]
|
||||
bg_thread = threading.Thread(
|
||||
target=archive_links,
|
||||
args=(links,),
|
||||
kwargs={"overwrite": False, "out_dir": CONFIG.OUTPUT_DIR},
|
||||
)
|
||||
bg_thread.setDaemon(True)
|
||||
bg_thread.start()
|
||||
|
||||
result = bg_archive_links((links,), kwargs={"overwrite": False, "out_dir": CONFIG.OUTPUT_DIR})
|
||||
|
||||
messages.success(
|
||||
request, f"Re-trying any previously failed methods for {len(links)} URLs in the background. (refresh in a few minutes to see results)"
|
||||
request,
|
||||
mark_safe(f"Re-trying any previously failed methods for {len(links)} URLs in the background. {result_url(result)}"),
|
||||
)
|
||||
|
||||
|
||||
@admin.action(
|
||||
description="📑 Archive again"
|
||||
description="🆕 Archive Again"
|
||||
)
|
||||
def resnapshot_snapshot(self, request, queryset):
|
||||
for snapshot in queryset:
|
||||
timestamp = timezone.now().isoformat('T', 'seconds')
|
||||
new_url = snapshot.url.split('#')[0] + f'#{timestamp}'
|
||||
|
||||
bg_thread = threading.Thread(target=add, args=(new_url,), kwargs={'tag': snapshot.tags_str()})
|
||||
bg_thread.setDaemon(True)
|
||||
bg_thread.start()
|
||||
result = bg_add({'urls': new_url, 'tag': snapshot.tags_str()})
|
||||
|
||||
messages.success(
|
||||
request,
|
||||
f"Creating new fresh snapshots for {len(queryset.count())} URLs in the background. (refresh in a few minutes to see results)",
|
||||
mark_safe(f"Creating new fresh snapshots for {queryset.count()} URLs in the background. {result_url(result)}"),
|
||||
)
|
||||
|
||||
@admin.action(
|
||||
description="♲ Redo"
|
||||
description="🔄 Redo"
|
||||
)
|
||||
def overwrite_snapshots(self, request, queryset):
|
||||
links = [snapshot.as_link() for snapshot in queryset]
|
||||
bg_thread = threading.Thread(
|
||||
target=archive_links,
|
||||
args=(links,),
|
||||
kwargs={"overwrite": True, "out_dir": CONFIG.OUTPUT_DIR},
|
||||
)
|
||||
bg_thread.setDaemon(True)
|
||||
bg_thread.start()
|
||||
|
||||
result = bg_archive_links((links,), kwargs={"overwrite": True, "out_dir": CONFIG.OUTPUT_DIR})
|
||||
|
||||
messages.success(
|
||||
request,
|
||||
f"Clearing all previous results and re-downloading {len(links)} URLs in the background. (refresh in a few minutes to see results)",
|
||||
mark_safe(f"Clearing all previous results and re-downloading {len(links)} URLs in the background. {result_url(result)}"),
|
||||
)
|
||||
|
||||
@admin.action(
|
||||
|
@ -583,7 +596,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ABIDModelAdmin):
|
|||
remove(snapshots=queryset, yes=True, delete=True, out_dir=CONFIG.OUTPUT_DIR)
|
||||
messages.success(
|
||||
request,
|
||||
f"Succesfully deleted {len(queryset.count())} Snapshots. Don't forget to scrub URLs from import logs (data/sources) and error logs (data/logs) if needed.",
|
||||
mark_safe(f"Succesfully deleted {queryset.count()} Snapshots. Don't forget to scrub URLs from import logs (data/sources) and error logs (data/logs) if needed."),
|
||||
)
|
||||
|
||||
|
||||
|
@ -597,7 +610,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ABIDModelAdmin):
|
|||
obj.tags.add(*tags)
|
||||
messages.success(
|
||||
request,
|
||||
f"Added {len(tags)} tags to {len(queryset.count())} Snapshots.",
|
||||
f"Added {len(tags)} tags to {queryset.count()} Snapshots.",
|
||||
)
|
||||
|
||||
|
||||
|
@ -611,7 +624,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ABIDModelAdmin):
|
|||
obj.tags.remove(*tags)
|
||||
messages.success(
|
||||
request,
|
||||
f"Removed {len(tags)} tags from {len(queryset.count())} Snapshots.",
|
||||
f"Removed {len(tags)} tags from {queryset.count()} Snapshots.",
|
||||
)
|
||||
|
||||
|
||||
|
@ -727,7 +740,6 @@ class ArchiveResultAdmin(ABIDModelAdmin):
|
|||
else:
|
||||
root_dir = str(snapshot_dir)
|
||||
|
||||
|
||||
# print(root_dir, str(list(os.walk(root_dir))))
|
||||
|
||||
for root, dirs, files in os.walk(root_dir):
|
||||
|
|
|
@ -87,6 +87,7 @@ INSTALLED_APPS = [
|
|||
'django_object_actions', # provides easy Django Admin action buttons on change views https://github.com/crccheck/django-object-actions
|
||||
|
||||
# Our ArchiveBox-provided apps
|
||||
'queues', # handles starting and managing background workers and processes
|
||||
'abid_utils', # handles ABID ID creation, handling, and models
|
||||
'plugantic', # ArchiveBox plugin API definition + finding/registering/calling interface
|
||||
'core', # core django model with Snapshot, ArchiveResult, etc.
|
||||
|
@ -98,6 +99,9 @@ INSTALLED_APPS = [
|
|||
# 3rd-party apps from PyPI that need to be loaded last
|
||||
'admin_data_views', # handles rendering some convenient automatic read-only views of data in Django admin
|
||||
'django_extensions', # provides Django Debug Toolbar (and other non-debug helpers)
|
||||
'django_huey', # provides multi-queue support for django huey https://github.com/gaiacoop/django-huey
|
||||
'bx_django_utils', # needed for huey_monitor https://github.com/boxine/bx_django_utils
|
||||
'huey_monitor', # adds an admin UI for monitoring background huey tasks https://github.com/boxine/django-huey-monitor
|
||||
]
|
||||
|
||||
|
||||
|
@ -212,17 +216,28 @@ CACHE_DB_TABLE = 'django_cache'
|
|||
DATABASE_FILE = Path(CONFIG.OUTPUT_DIR) / CONFIG.SQL_INDEX_FILENAME
|
||||
DATABASE_NAME = os.environ.get("ARCHIVEBOX_DATABASE_NAME", str(DATABASE_FILE))
|
||||
|
||||
QUEUE_DATABASE_NAME = DATABASE_NAME.replace('index.sqlite3', 'queue.sqlite3')
|
||||
|
||||
DATABASES = {
|
||||
'default': {
|
||||
'ENGINE': 'django.db.backends.sqlite3',
|
||||
'NAME': DATABASE_NAME,
|
||||
'OPTIONS': {
|
||||
'timeout': 60,
|
||||
'check_same_thread': False,
|
||||
"default": {
|
||||
"ENGINE": "django.db.backends.sqlite3",
|
||||
"NAME": DATABASE_NAME,
|
||||
"OPTIONS": {
|
||||
"timeout": 60,
|
||||
"check_same_thread": False,
|
||||
},
|
||||
'TIME_ZONE': CONFIG.TIMEZONE,
|
||||
"TIME_ZONE": CONFIG.TIMEZONE,
|
||||
# DB setup is sometimes modified at runtime by setup_django() in config.py
|
||||
},
|
||||
"queue": {
|
||||
"ENGINE": "django.db.backends.sqlite3",
|
||||
"NAME": QUEUE_DATABASE_NAME,
|
||||
"OPTIONS": {
|
||||
"timeout": 60,
|
||||
"check_same_thread": False,
|
||||
},
|
||||
"TIME_ZONE": CONFIG.TIMEZONE,
|
||||
},
|
||||
# 'cache': {
|
||||
# 'ENGINE': 'django.db.backends.sqlite3',
|
||||
# 'NAME': CACHE_DB_PATH,
|
||||
|
@ -239,6 +254,64 @@ MIGRATION_MODULES = {'signal_webhooks': None}
|
|||
DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField'
|
||||
|
||||
|
||||
# Settings for the huey queue named "system_tasks": it uses the huey.SqliteHuey
# class with QUEUE_DATABASE_NAME as its storage file.
HUEY = {
    "huey_class": "huey.SqliteHuey",
    "filename": QUEUE_DATABASE_NAME,
    "name": "system_tasks",
    "results": True,
    "store_none": True,
    # NOTE(review): presumably False means tasks are enqueued for the consumer
    # rather than executed inline by the caller -- confirm against the huey docs.
    "immediate": False,
    "utc": True,
    "consumer": {
        "workers": 1,
        "worker_type": "thread",
        "initial_delay": 0.1,  # Smallest polling interval, same as -d.
        "backoff": 1.15,  # Exponential backoff using this rate, -b.
        "max_delay": 10.0,  # Max possible polling interval, -m.
        "scheduler_interval": 1,  # Check schedule every second, -s.
        "periodic": True,  # Enable crontab feature.
        "check_worker_health": True,  # Enable worker health checks.
        "health_check_interval": 1,  # Check worker health every second.
    },
}
|
||||
|
||||
# https://huey.readthedocs.io/en/latest/contrib.html#setting-things-up
# https://github.com/gaiacoop/django-huey
# Queue registry for django-huey. "default" holds the same name as HUEY["name"],
# and "queues" maps each queue name to its settings dict.
DJANGO_HUEY = {
    "default": "system_tasks",
    "queues": {
        # dict.copy() is shallow: the nested "consumer" dict stays shared with HUEY.
        HUEY["name"]: HUEY.copy(),
        # more registered here at plugin import-time by BaseQueue.register()
    },
}
|
||||
|
||||
class HueyDBRouter:
    """Database router that keeps the Huey-related apps in the "queue" database.

    Models whose app label appears in ``route_app_labels`` are read from,
    written to and migrated on the "queue" connection; every other model is
    sent to "default".
    """

    route_app_labels = {"huey_monitor", "django_huey", "djhuey"}

    def db_for_read(self, model, **hints):
        """Return the database alias that ``model`` should be read from."""
        return "queue" if model._meta.app_label in self.route_app_labels else "default"

    def db_for_write(self, model, **hints):
        """Return the database alias that ``model`` should be written to."""
        return "queue" if model._meta.app_label in self.route_app_labels else "default"

    def allow_relation(self, obj1, obj2, **hints):
        """Decide whether a relation between ``obj1`` and ``obj2`` is allowed.

        Returns ``None`` (no opinion) when neither object belongs to a routed
        app; otherwise the relation is allowed only if both objects share the
        same app label.
        """
        first_label = obj1._meta.app_label
        second_label = obj2._meta.app_label
        if first_label not in self.route_app_labels and second_label not in self.route_app_labels:
            return None
        return first_label == second_label

    def allow_migrate(self, db, app_label, model_name=None, **hints):
        """Allow routed apps to migrate only on "queue", all others only on "default"."""
        target_db = "queue" if app_label in self.route_app_labels else "default"
        return db == target_db
|
||||
|
||||
DATABASE_ROUTERS = ['core.settings.HueyDBRouter']
|
||||
|
||||
CACHES = {
|
||||
'default': {'BACKEND': 'django.core.cache.backends.locmem.LocMemCache'},
|
||||
# 'sqlite': {'BACKEND': 'django.core.cache.backends.db.DatabaseCache', 'LOCATION': 'cache'},
|
||||
|
|
|
@ -23,6 +23,9 @@ from admin_data_views.utils import render_with_table_view, render_with_item_view
|
|||
|
||||
from core.models import Snapshot
|
||||
from core.forms import AddLinkForm
|
||||
from core.admin import result_url
|
||||
|
||||
from queues.tasks import bg_add
|
||||
|
||||
from ..config import (
|
||||
OUTPUT_DIR,
|
||||
|
@ -478,15 +481,14 @@ class AddView(UserPassesTestMixin, FormView):
|
|||
if extractors:
|
||||
input_kwargs.update({"extractors": extractors})
|
||||
|
||||
bg_thread = threading.Thread(target=add, kwargs=input_kwargs)
|
||||
bg_thread.setDaemon(True)
|
||||
bg_thread.start()
|
||||
result = bg_add(input_kwargs, parent_task_id=None)
|
||||
print('Started background add job:', result)
|
||||
|
||||
rough_url_count = url.count('://')
|
||||
|
||||
messages.success(
|
||||
self.request,
|
||||
f"Adding {rough_url_count} URLs in the background. (refresh in a few minutes to see results)",
|
||||
mark_safe(f"Adding {rough_url_count} URLs in the background. (refresh in a few minutes to see results) {result_url(result)}"),
|
||||
)
|
||||
|
||||
return redirect("/admin/core/snapshot/")
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue