move crawl model out of core

Nick Sweeting 2024-10-14 15:42:36 -07:00
parent f75ae805f8
commit a0bef4e27b


@@ -15,7 +15,6 @@ from django.utils.text import slugify
 from django.core.cache import cache
 from django.urls import reverse, reverse_lazy
 from django.db.models import Case, When, Value, IntegerField
-from django.core.validators import MaxValueValidator, MinValueValidator
 from django.contrib import admin
 from django.conf import settings
@@ -30,7 +29,6 @@ from archivebox.misc.util import parse_date, base_url
 from ..index.schema import Link
 from ..index.html import snapshot_icons
 from ..extractors import ARCHIVE_METHODS_INDEXING_PRECEDENCE, EXTRACTORS
-from ..parsers import PARSERS

 # class BaseModel(models.Model):
@@ -68,7 +66,7 @@ class Tag(ABIDModel):
     # slug is autoset on save from name, never set it manually
     snapshot_set: models.Manager['Snapshot']
-    crawl_set: models.Manager['Crawl']
+    # crawl_set: models.Manager['Crawl']

     class Meta(TypedModelMeta):
         verbose_name = "Tag"
@@ -136,69 +134,6 @@ class SnapshotTag(models.Model):
         # unique_together = [('crawl', 'tag')]

-class Crawl(ABIDModel):
-    abid_prefix = 'crl_'
-    abid_ts_src = 'self.created_at'
-    abid_uri_src = 'self.urls'
-    abid_subtype_src = 'self.crawler'
-    abid_rand_src = 'self.id'
-    abid_drift_allowed = True
-
-    # CRAWLER_CHOICES = (
-    #     ('breadth_first', 'Breadth-First'),
-    #     ('depth_first', 'Depth-First'),
-    # )
-    PARSER_CHOICES = (
-        ('auto', 'auto'),
-        *((parser_key, value[0]) for parser_key, value in PARSERS.items()),
-    )
-
-    id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
-    abid = ABIDField(prefix=abid_prefix)
-
-    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False, related_name='crawl_set')
-    created_at = AutoDateTimeField(default=None, null=False, db_index=True)
-    modified_at = models.DateTimeField(auto_now=True)
-
-    urls = models.TextField(blank=False, null=False)
-    depth = models.PositiveSmallIntegerField(default=1, validators=[MinValueValidator(0), MaxValueValidator(2)])
-    parser = models.CharField(choices=PARSER_CHOICES, default='auto', max_length=32)
-    # crawler = models.CharField(choices=CRAWLER_CHOICES, default='breadth_first', max_length=32)
-
-    # tags = models.ManyToManyField(Tag, blank=True, related_name='crawl_set', through='CrawlTag')
-    # schedule = models.JSONField()
-    # config = models.JSONField()
-
-    class Meta(TypedModelMeta):
-        verbose_name = 'Crawl'
-        verbose_name_plural = 'Crawls'
-
-    def __str__(self):
-        return self.parser
-
-    @cached_property
-    def crawl_dir(self):
-        return Path()
-
-    @property
-    def api_url(self) -> str:
-        # /api/v1/core/crawl/{uulid}
-        return reverse_lazy('api-1:get_crawl', args=[self.abid])  # + f'?api_key={get_or_create_api_token(request.user)}'
-
-    @property
-    def api_docs_url(self) -> str:
-        return '/api/v1/docs#/Core%20Models/api_v1_core_get_crawl'
-
-    # def get_absolute_url(self):
-    #     return f'/crawls/{self.abid}'
-
-    def crawl(self):
-        # write self.urls to sources/crawl__<user>__YYYYMMDDHHMMSS.txt
-        # run parse_links(sources/crawl__<user>__YYYYMMDDHHMMSS.txt, parser=self.parser) and for each resulting link:
-        #     create a Snapshot
-        #     enqueue task bg_archive_snapshot(snapshot)
-        pass
@@ -561,9 +496,10 @@ class ArchiveResult(ABIDModel):
         # return f'[{self.abid}] 📅 {self.start_ts.strftime("%Y-%m-%d %H:%M")} 📄 {self.extractor} {self.snapshot.url}'
         return self.extractor

-    @cached_property
-    def machine(self):
-        return self.iface.machine if self.iface else None
+    # TODO: finish connecting machine.models
+    # @cached_property
+    # def machine(self):
+    #     return self.iface.machine if self.iface else None

     @cached_property
     def snapshot_dir(self):
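Note: the Crawl.crawl() method removed above is only a stub whose intended behavior is described in its comments. A minimal sketch of that flow, assuming parse_links() and bg_archive_snapshot() work roughly as those comments describe; their exact signatures and return values, the Snapshot.objects.get_or_create() lookup, the username in the filename, and the sources/ path layout are illustrative assumptions, not part of this commit:

    # assumed imports: from pathlib import Path; from django.utils import timezone;
    # parse_links and bg_archive_snapshot as referenced in the original comments
    def crawl(self):
        # write self.urls to sources/crawl__<user>__YYYYMMDDHHMMSS.txt
        ts = timezone.now().strftime('%Y%m%d%H%M%S')
        source_file = Path('sources') / f'crawl__{self.created_by.username}__{ts}.txt'
        source_file.parent.mkdir(parents=True, exist_ok=True)
        source_file.write_text(self.urls)

        # parse the source file with the configured parser, then snapshot + archive each link
        links, _parser_name = parse_links(str(source_file), parser=self.parser)  # assumed return shape
        for link in links:
            snapshot, _created = Snapshot.objects.get_or_create(url=link.url)
            bg_archive_snapshot(snapshot)  # enqueue background archiving task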