From c9f88f58754578eb681ea548feefa8afd82b933b Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 15 Oct 2024 22:32:13 -0700 Subject: [PATCH] add sessions and seeds models --- archivebox/core/settings.py | 7 ++- archivebox/seeds/__init__.py | 0 archivebox/seeds/admin.py | 3 + archivebox/seeds/apps.py | 6 ++ archivebox/seeds/migrations/__init__.py | 0 archivebox/seeds/models.py | 67 ++++++++++++++++++++++ archivebox/seeds/tests.py | 3 + archivebox/seeds/views.py | 3 + archivebox/sessions/__init__.py | 0 archivebox/sessions/admin.py | 3 + archivebox/sessions/apps.py | 6 ++ archivebox/sessions/migrations/__init__.py | 0 archivebox/sessions/models.py | 67 ++++++++++++++++++++++ archivebox/sessions/tests.py | 3 + archivebox/sessions/views.py | 3 + 15 files changed, 169 insertions(+), 2 deletions(-) create mode 100644 archivebox/seeds/__init__.py create mode 100644 archivebox/seeds/admin.py create mode 100644 archivebox/seeds/apps.py create mode 100644 archivebox/seeds/migrations/__init__.py create mode 100644 archivebox/seeds/models.py create mode 100644 archivebox/seeds/tests.py create mode 100644 archivebox/seeds/views.py create mode 100644 archivebox/sessions/__init__.py create mode 100644 archivebox/sessions/admin.py create mode 100644 archivebox/sessions/apps.py create mode 100644 archivebox/sessions/migrations/__init__.py create mode 100644 archivebox/sessions/models.py create mode 100644 archivebox/sessions/tests.py create mode 100644 archivebox/sessions/views.py diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py index 5ec0b7e8..caa18f25 100644 --- a/archivebox/core/settings.py +++ b/archivebox/core/settings.py @@ -100,10 +100,13 @@ INSTALLED_APPS = [ 'django_object_actions', # provides easy Django Admin action buttons on change views https://github.com/crccheck/django-object-actions # Our ArchiveBox-provided apps - 'config', # ArchiveBox config settings (loaded as a plugin, don't need to add it here) + # 'abid_utils', # handles ABID ID 
creation, handling, and models + 'config', # ArchiveBox config settings (loaded as a plugin, don't need to add it here) 'machine', # handles collecting and storing information about the host machine, network interfaces, installed binaries, etc. 'queues', # handles starting and managing background workers and processes - 'abid_utils', # handles ABID ID creation, handling, and models + 'seeds', # handles Seed model and URL source management + 'crawls', # handles Crawl and CrawlSchedule models and management + 'sessions', # handles Persona and session management 'core', # core django model with Snapshot, ArchiveResult, etc. 'api', # Django-Ninja-based Rest API interfaces, config, APIToken model, etc. diff --git a/archivebox/seeds/__init__.py b/archivebox/seeds/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/archivebox/seeds/admin.py b/archivebox/seeds/admin.py new file mode 100644 index 00000000..8c38f3f3 --- /dev/null +++ b/archivebox/seeds/admin.py @@ -0,0 +1,3 @@ +from django.contrib import admin + +# Register your models here. 
diff --git a/archivebox/seeds/apps.py b/archivebox/seeds/apps.py
new file mode 100644
index 00000000..38eb4fde
--- /dev/null
+++ b/archivebox/seeds/apps.py
@@ -0,0 +1,8 @@
+from django.apps import AppConfig
+
+
+class SeedsConfig(AppConfig):
+    """Django app config for the seeds app (home of the Seed model)."""
+
+    default_auto_field = "django.db.models.BigAutoField"
+    name = "seeds"
diff --git a/archivebox/seeds/migrations/__init__.py b/archivebox/seeds/migrations/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/archivebox/seeds/models.py b/archivebox/seeds/models.py
new file mode 100644
index 00000000..b0d83b2e
--- /dev/null
+++ b/archivebox/seeds/models.py
@@ -0,0 +1,73 @@
+__package__ = 'archivebox.seeds'
+
+
+from datetime import datetime
+
+from django_stubs_ext.db.models import TypedModelMeta
+
+from django.db import models
+from django.db.models import Q
+from django.core.validators import MaxValueValidator, MinValueValidator
+from django.conf import settings
+from django.utils import timezone
+from django.utils.functional import cached_property
+from django.urls import reverse_lazy
+
+from pathlib import Path
+
+
+from abid_utils.models import ABIDModel, ABIDField, AutoDateTimeField, ModelWithHealthStats
+
+
+class Seed(ABIDModel, ModelWithHealthStats):
+    """
+    A fountain that produces URLs (+metadata) each time it's queried e.g.
+        - file:///data/sources/2024-01-02_11-57-51__cli_add.txt
+        - file:///data/sources/2024-01-02_11-57-51__web_ui_add.txt
+        - file:///Users/squash/Library/Application Support/Google/Chrome/Default/Bookmarks
+        - https://getpocket.com/user/nikisweeting/feed
+        - https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml
+        - ...
+    Each query of a Seed can produce the same list of URLs, or a different list each time.
+    The list of URLs it returns is used to create a new Crawl and seed it with new pending Snapshots.
+
+    When a crawl is created, a root_snapshot is initially created with a URI set to the Seed URI.
+    The seed's preferred extractor is executed on that URI, which produces an ArchiveResult containing outlinks.
+    The outlinks then get turned into new pending Snapshots under the same crawl,
+    and the cycle repeats until Crawl.max_depth.
+
+    Each consumption of a Seed by an Extractor can produce new urls, as Seeds can point to
+    stateful remote services, files with contents that change, directories that have new files within, etc.
+    """
+
+    # ABID settings, presumably consumed by ABIDModel (defined in abid_utils, not shown here)
+    abid_prefix = 'src_'
+    abid_ts_src = 'self.created_at'
+    abid_uri_src = 'self.uri'
+    abid_subtype_src = 'self.extractor'
+    abid_rand_src = 'self.id'
+    abid_drift_allowed = True
+
+    # declared explicitly (same declarations as the Persona model) so that self.id, which abid_rand_src reads, is a UUID primary key
+    id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
+    abid = ABIDField(prefix=abid_prefix)
+
+    # NOTE(review): Django's URLField only accepts http/https/ftp/ftps URLs when validated, so the
+    # file:// examples in the docstring would be rejected by forms / full_clean() -- confirm the intended field type
+    uri = models.URLField(max_length=255, blank=False, null=False, unique=True)              # unique source location where URLs will be loaded from
+
+    extractor = models.CharField(default='auto', max_length=32)   # suggested extractor to use to load this URL source
+    tags_str = models.CharField(max_length=255, null=False, blank=True, default='')  # tags to attach to any URLs that come from this source
+    config = models.JSONField(default=dict)                       # extra config to put in scope when loading URLs from this source
+
+    created_at = AutoDateTimeField(default=None, null=False, db_index=True)
+    modified_at = models.DateTimeField(auto_now=True)
+    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False)
+
+    @property
+    def source_type(self) -> str:
+        """Return the lowercased scheme of self.uri, e.g. 'https', 'file', 'pocketapi', 's3'."""
+        # Everything before the first '://' counts as the scheme; a URI with no
+        # '://' at all comes back whole (lowercased).
+        scheme, _sep, _rest = self.uri.partition('://')
+        return scheme.lower()
diff --git a/archivebox/seeds/tests.py b/archivebox/seeds/tests.py
new file mode 100644
index 00000000..7ce503c2
--- /dev/null
+++ b/archivebox/seeds/tests.py
@@ -0,0 +1,3 @@
+from django.test import TestCase
+
+# Create your tests here.
diff --git a/archivebox/seeds/views.py b/archivebox/seeds/views.py
new file mode 100644
index 00000000..91ea44a2
--- /dev/null
+++ b/archivebox/seeds/views.py
@@ -0,0 +1,3 @@
+from django.shortcuts import render
+
+# Create your views here.
diff --git a/archivebox/sessions/__init__.py b/archivebox/sessions/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/archivebox/sessions/admin.py b/archivebox/sessions/admin.py
new file mode 100644
index 00000000..8c38f3f3
--- /dev/null
+++ b/archivebox/sessions/admin.py
@@ -0,0 +1,3 @@
+from django.contrib import admin
+
+# Register your models here.
diff --git a/archivebox/sessions/apps.py b/archivebox/sessions/apps.py
new file mode 100644
index 00000000..cecec044
--- /dev/null
+++ b/archivebox/sessions/apps.py
@@ -0,0 +1,11 @@
+from django.apps import AppConfig
+
+
+class SessionsConfig(AppConfig):
+    """Django app config for the sessions app (home of the Persona model)."""
+
+    default_auto_field = "django.db.models.BigAutoField"
+    name = "sessions"
+    # django.contrib.sessions already uses the default app label "sessions"; Django refuses to
+    # start when two installed apps share a label, so this app registers under its own label.
+    label = "personas"
diff --git a/archivebox/sessions/migrations/__init__.py b/archivebox/sessions/migrations/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/archivebox/sessions/models.py b/archivebox/sessions/models.py
new file mode 100644
index 00000000..1c9c85d4
--- /dev/null
+++ b/archivebox/sessions/models.py
@@ -0,0 +1,73 @@
+from django.db import models
+from django.core.exceptions import ValidationError
+
+from abid_utils.models import ABIDModel, ABIDField, AutoDateTimeField, ModelWithHealthStats
+
+from django.conf import settings
+
+
+class Persona(ABIDModel, ModelWithHealthStats):
+    """Aka a "SessionType", it's a template for a crawler browsing session containing some config."""
+
+    # ABID settings, presumably consumed by ABIDModel (defined in abid_utils, not shown here)
+    abid_prefix = 'prs_'
+    abid_ts_src = 'self.created_at'
+    abid_uri_src = 'self.name'
+    abid_subtype_src = 'self.created_by'
+    abid_rand_src = 'self.id'
+
+    id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
+    abid = ABIDField(prefix=abid_prefix)
+
+    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False)
+    created_at = AutoDateTimeField(default=None, null=False, db_index=True)
+    modified_at = models.DateTimeField(auto_now=True)
+
+    name = models.CharField(max_length=100, blank=False, null=False, editable=False)
+
+    # never set directly: clean() always overwrites it with PERSONAS_DIR / name
+    persona_dir = models.FilePathField(path=settings.PERSONAS_DIR, allow_files=False, allow_folders=True, blank=True, null=False, editable=False)
+    config = models.JSONField(default=dict)
+    # e.g. {
+    #    USER_AGENT: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36',
+    #    COOKIES_TXT_FILE: '/path/to/cookies.txt',
+    #    CHROME_USER_DATA_DIR: '/path/to/chrome/user/data/dir',
+    #    CHECK_SSL_VALIDITY: False,
+    #    SAVE_ARCHIVE_DOT_ORG: True,
+    #    CHROME_BINARY: 'chromium'
+    #    ...
+    # }
+    # domain_allowlist = models.CharField(max_length=1024, blank=True, null=False, default='')
+    # domain_denylist = models.CharField(max_length=1024, blank=True, null=False, default='')
+
+    class Meta:
+        verbose_name = 'Session Type'
+        verbose_name_plural = 'Session Types'
+        unique_together = (('created_by', 'name'),)
+
+    def clean(self):
+        """Validate name, then derive persona_dir from it (PERSONAS_DIR / name)."""
+        # name is used as a directory name, so it has to be a single path component,
+        # otherwise persona_dir could end up outside of PERSONAS_DIR
+        if not self.name or self.name in ('.', '..') or '/' in self.name or '\\' in self.name:
+            raise ValidationError({'name': 'Persona name must be a single, non-empty path component (no slashes, not "." or "..")'})
+
+        self.persona_dir = settings.PERSONAS_DIR / self.name
+
+        # TODO: make sure config keys all exist in FLAT_CONFIG
+        # TODO: make sure config values all match expected types
+
+    def save(self, *args, **kwargs):
+        """Run full_clean() (which calls clean() above), then save; args are passed through to Model.save()."""
+        self.full_clean()
+
+        # TODO: make sure basic file structure is present in persona_dir:
+        # - PERSONAS_DIR / self.name /
+        #   - chrome_profile/
+        #   - chrome_downloads/
+        #   - chrome_extensions/
+        #   - cookies.txt
+        #   - auth.json
+        #   - config.json    # json dump of the model
+
+        super().save(*args, **kwargs)
diff --git a/archivebox/sessions/tests.py b/archivebox/sessions/tests.py
new file mode 100644
index 00000000..7ce503c2
--- /dev/null
+++ b/archivebox/sessions/tests.py
@@ -0,0 +1,3 @@
+from django.test import TestCase
+
+# Create your tests here.
diff --git a/archivebox/sessions/views.py b/archivebox/sessions/views.py new file mode 100644 index 00000000..91ea44a2 --- /dev/null +++ b/archivebox/sessions/views.py @@ -0,0 +1,3 @@ +from django.shortcuts import render + +# Create your views here.