add sessions and seeds models

This commit is contained in:
Nick Sweeting 2024-10-15 22:32:13 -07:00
parent 0866f4aaf3
commit c9f88f5875
No known key found for this signature in database
15 changed files with 169 additions and 2 deletions

View file

@ -100,10 +100,13 @@ INSTALLED_APPS = [
'django_object_actions', # provides easy Django Admin action buttons on change views https://github.com/crccheck/django-object-actions 'django_object_actions', # provides easy Django Admin action buttons on change views https://github.com/crccheck/django-object-actions
# Our ArchiveBox-provided apps # Our ArchiveBox-provided apps
'config', # ArchiveBox config settings (loaded as a plugin, don't need to add it here) # 'abid_utils', # handles ABID ID creation, handling, and models
'config', # ArchiveBox config settings (loaded as a plugin, don't need to add it here)
'machine', # handles collecting and storing information about the host machine, network interfaces, installed binaries, etc. 'machine', # handles collecting and storing information about the host machine, network interfaces, installed binaries, etc.
'queues', # handles starting and managing background workers and processes 'queues', # handles starting and managing background workers and processes
'abid_utils', # handles ABID ID creation, handling, and models 'seeds', # handles Seed model and URL source management
'crawls', # handles Crawl and CrawlSchedule models and management
'sessions', # handles Persona and session management
'core', # core django model with Snapshot, ArchiveResult, etc. 'core', # core django model with Snapshot, ArchiveResult, etc.
'api', # Django-Ninja-based Rest API interfaces, config, APIToken model, etc. 'api', # Django-Ninja-based Rest API interfaces, config, APIToken model, etc.

View file

View file

@ -0,0 +1,3 @@
from django.contrib import admin
# Register your models here.

6
archivebox/seeds/apps.py Normal file
View file

@ -0,0 +1,6 @@
from django.apps import AppConfig
class SeedsConfig(AppConfig):
    """Django application configuration for the ``seeds`` app."""

    # Module path of the application this config is attached to.
    name = "seeds"

    # Field class Django uses for primary keys that models do not declare explicitly.
    default_auto_field = "django.db.models.BigAutoField"

View file

View file

@ -0,0 +1,67 @@
__package__ = 'archivebox.seeds'
from datetime import datetime
from django_stubs_ext.db.models import TypedModelMeta
from django.db import models
from django.db.models import Q
from django.core.validators import MaxValueValidator, MinValueValidator
from django.conf import settings
from django.utils import timezone
from django.utils.functional import cached_property
from django.urls import reverse_lazy
from pathlib import Path
from abid_utils.models import ABIDModel, ABIDField, AutoDateTimeField, ModelWithHealthStats
class Seed(ABIDModel, ModelWithHealthStats):
    """
    A fountain that produces URLs (+metadata) each time it's queried e.g.
        - file:///data/sources/2024-01-02_11-57-51__cli_add.txt
        - file:///data/sources/2024-01-02_11-57-51__web_ui_add.txt
        - file:///Users/squash/Library/Application Support/Google/Chrome/Default/Bookmarks
        - https://getpocket.com/user/nikisweeting/feed
        - https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml
        - ...

    Each query of a Seed can produce the same list of URLs, or a different list each time.
    The list of URLs it returns is used to create a new Crawl and seed it with new pending Snapshots.

    When a crawl is created, a root_snapshot is initially created with a URI set to the Seed URI.
    The seed's preferred extractor is executed on that URI, which produces an ArchiveResult containing outlinks.
    The outlinks then get turned into new pending Snapshots under the same crawl,
    and the cycle repeats until Crawl.max_depth.

    Each consumption of a Seed by an Extractor can produce new urls, as Seeds can point to
    stateful remote services, files with contents that change, directories that have new files within, etc.
    """

    # Attribute expressions the ABID machinery reads to derive this model's ABID.
    abid_prefix = 'src_'
    abid_ts_src = 'self.created_at'
    abid_uri_src = 'self.uri'
    abid_subtype_src = 'self.extractor'
    abid_rand_src = 'self.id'
    abid_drift_allowed = True

    # Explicit UUID primary key + ABID column, declared the same way as on the Persona model
    # so that abid_rand_src ('self.id') refers to a UUID rather than an implicit auto-increment pk.
    id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
    abid = ABIDField(prefix=abid_prefix)

    # CharField rather than URLField: URLField's default validator only accepts
    # http/https/ftp/ftps, which would reject the file://, s3://, pocketapi://, etc.
    # URIs that a Seed is documented to hold.
    uri = models.CharField(max_length=255, blank=False, null=False, unique=True)      # unique source location where URLs will be loaded from
    extractor = models.CharField(default='auto', max_length=32)                       # suggested extractor to use to load this URL source
    tags_str = models.CharField(max_length=255, null=False, blank=True, default='')   # tags to attach to any URLs that come from this source
    config = models.JSONField(default=dict)                                           # extra config to put in scope when loading URLs from this source

    created_at = AutoDateTimeField(default=None, null=False, db_index=True)
    modified_at = models.DateTimeField(auto_now=True)
    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False)

    @property
    def source_type(self):
        """
        Return the lowercased scheme portion of ``uri`` (the text before the first ``://``).

        If ``uri`` contains no ``://`` the whole lowercased ``uri`` is returned.
        """
        # e.g. http/https://
        #      file://
        #      pocketapi://
        #      s3://
        #      etc..
        return self.uri.split('://', 1)[0].lower()

View file

@ -0,0 +1,3 @@
from django.test import TestCase
# Create your tests here.

View file

@ -0,0 +1,3 @@
from django.shortcuts import render
# Create your views here.

View file

View file

@ -0,0 +1,3 @@
from django.contrib import admin
# Register your models here.

View file

@ -0,0 +1,6 @@
from django.apps import AppConfig
class SessionsConfig(AppConfig):
    """Django application configuration for the ``sessions`` app."""

    # Module path of the application this config is attached to.
    # NOTE(review): with no explicit ``label``, Django derives the app label from the
    # last component of ``name`` ("sessions"), which is the same label used by
    # django.contrib.sessions. Confirm both are not installed together, or set a
    # distinct ``label`` here.
    name = "sessions"

    # Field class Django uses for primary keys that models do not declare explicitly.
    default_auto_field = "django.db.models.BigAutoField"

View file

@ -0,0 +1,67 @@
from django.db import models
from abid_utils.models import ABIDModel, ABIDField, AutoDateTimeField, ModelWithHealthStats
from django.conf import settings
class Persona(ABIDModel, ModelWithHealthStats):
    """Aka a "SessionType", it's a template for a crawler browsing session containing some config."""

    # Attribute expressions the ABID machinery reads to derive this model's ABID.
    abid_prefix = 'prs_'
    abid_ts_src = 'self.created_at'
    abid_uri_src = 'self.name'
    abid_subtype_src = 'self.created_by'
    abid_rand_src = 'self.id'

    id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
    abid = ABIDField(prefix=abid_prefix)

    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False)
    created_at = AutoDateTimeField(default=None, null=False, db_index=True)
    modified_at = models.DateTimeField(auto_now=True)

    name = models.CharField(max_length=100, blank=False, null=False, editable=False)

    # Always overwritten by clean() with settings.PERSONAS_DIR / name, hence not editable.
    persona_dir = models.FilePathField(path=settings.PERSONAS_DIR, allow_files=False, allow_folders=True, blank=True, null=False, editable=False)

    config = models.JSONField(default=dict)
    # e.g. {
    #    USER_AGENT: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36',
    #    COOKIES_TXT_FILE: '/path/to/cookies.txt',
    #    CHROME_USER_DATA_DIR: '/path/to/chrome/user/data/dir',
    #    CHECK_SSL_VALIDITY: False,
    #    SAVE_ARCHIVE_DOT_ORG: True,
    #    CHROME_BINARY: 'chromium'
    #    ...
    # }
    # domain_allowlist = models.CharField(max_length=1024, blank=True, null=False, default='')
    # domain_denylist = models.CharField(max_length=1024, blank=True, null=False, default='')

    class Meta:
        verbose_name = 'Session Type'
        verbose_name_plural = 'Session Types'
        # a given user cannot have two personas with the same name
        unique_together = (('created_by', 'name'),)

    def clean(self):
        """Set ``persona_dir`` to ``settings.PERSONAS_DIR / self.name``.

        Any previously assigned ``persona_dir`` value is discarded, so the stored
        directory always follows the persona's name.
        """
        self.persona_dir = settings.PERSONAS_DIR / self.name

        # TODO: make sure config keys all exist in FLAT_CONFIG
        # TODO: make sure config values all match expected types

    def save(self, *args, **kwargs):
        """Run ``full_clean()`` (which invokes ``clean()``) and then save the row.

        Raises whatever ``full_clean()`` raises when validation fails; in that case
        nothing is written.
        """
        self.full_clean()

        # TODO: make sure basic file structure is present in persona_dir:
        # - PERSONAS_DIR / self.name /
        #   - chrome_profile/
        #   - chrome_downloads/
        #   - chrome_extensions/
        #   - cookies.txt
        #   - auth.json
        #   - config.json    # json dump of the model

        super().save(*args, **kwargs)

View file

@ -0,0 +1,3 @@
from django.test import TestCase
# Create your tests here.

View file

@ -0,0 +1,3 @@
from django.shortcuts import render
# Create your views here.