mirror of https://github.com/ArchiveBox/ArchiveBox.git (synced 2025-06-02 15:49:51 -04:00)
move crawl model out of core

This commit is contained in:
  parent f75ae805f8
  commit a0bef4e27b

1 changed file with 5 additions and 69 deletions
archivebox/core/models.py

@@ -15,7 +15,6 @@ from django.utils.text import slugify
 from django.core.cache import cache
 from django.urls import reverse, reverse_lazy
 from django.db.models import Case, When, Value, IntegerField
-from django.core.validators import MaxValueValidator, MinValueValidator
 from django.contrib import admin
 from django.conf import settings

@@ -30,7 +29,6 @@ from archivebox.misc.util import parse_date, base_url
 from ..index.schema import Link
 from ..index.html import snapshot_icons
 from ..extractors import ARCHIVE_METHODS_INDEXING_PRECEDENCE, EXTRACTORS
-from ..parsers import PARSERS


 # class BaseModel(models.Model):

@@ -68,7 +66,7 @@ class Tag(ABIDModel):
     # slug is autoset on save from name, never set it manually

     snapshot_set: models.Manager['Snapshot']
-    crawl_set: models.Manager['Crawl']
+    # crawl_set: models.Manager['Crawl']

     class Meta(TypedModelMeta):
         verbose_name = "Tag"

@@ -136,69 +134,6 @@ class SnapshotTag(models.Model):
         # unique_together = [('crawl', 'tag')]


-class Crawl(ABIDModel):
-    abid_prefix = 'crl_'
-    abid_ts_src = 'self.created_at'
-    abid_uri_src = 'self.urls'
-    abid_subtype_src = 'self.crawler'
-    abid_rand_src = 'self.id'
-    abid_drift_allowed = True
-
-    # CRAWLER_CHOICES = (
-    #     ('breadth_first', 'Breadth-First'),
-    #     ('depth_first', 'Depth-First'),
-    # )
-    PARSER_CHOICES = (
-        ('auto', 'auto'),
-        *((parser_key, value[0]) for parser_key, value in PARSERS.items()),
-    )
-
-    id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
-    abid = ABIDField(prefix=abid_prefix)
-
-    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False, related_name='crawl_set')
-    created_at = AutoDateTimeField(default=None, null=False, db_index=True)
-    modified_at = models.DateTimeField(auto_now=True)
-
-    urls = models.TextField(blank=False, null=False)
-    depth = models.PositiveSmallIntegerField(default=1, validators=[MinValueValidator(0), MaxValueValidator(2)])
-    parser = models.CharField(choices=PARSER_CHOICES, default='auto', max_length=32)
-
-    # crawler = models.CharField(choices=CRAWLER_CHOICES, default='breadth_first', max_length=32)
-    # tags = models.ManyToManyField(Tag, blank=True, related_name='crawl_set', through='CrawlTag')
-    # schedule = models.JSONField()
-    # config = models.JSONField()
-
-
-    class Meta(TypedModelMeta):
-        verbose_name = 'Crawl'
-        verbose_name_plural = 'Crawls'
-
-    def __str__(self):
-        return self.parser
-
-    @cached_property
-    def crawl_dir(self):
-        return Path()
-
-    @property
-    def api_url(self) -> str:
-        # /api/v1/core/crawl/{uulid}
-        return reverse_lazy('api-1:get_crawl', args=[self.abid])  # + f'?api_key={get_or_create_api_token(request.user)}'
-
-    @property
-    def api_docs_url(self) -> str:
-        return '/api/v1/docs#/Core%20Models/api_v1_core_get_crawl'
-
-    # def get_absolute_url(self):
-    #     return f'/crawls/{self.abid}'
-
-    def crawl(self):
-        # write self.urls to sources/crawl__<user>__YYYYMMDDHHMMSS.txt
-        # run parse_links(sources/crawl__<user>__YYYYMMDDHHMMSS.txt, parser=self.parser) and for each resulting link:
-        #     create a Snapshot
-        #     enqueue task bg_archive_snapshot(snapshot)
-        pass



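For reference, the removed crawl() method was only a stub; its comments describe the intended flow (write self.urls to a sources/crawl__<user>__YYYYMMDDHHMMSS.txt file, parse that file with the configured parser, create a Snapshot per resulting link, and enqueue a background archiving task for each). A minimal sketch of that flow, assuming hypothetical signatures for parse_links() and bg_archive_snapshot() and hypothetical Snapshot field names:

    # Illustrative sketch only, based on the comments in the removed crawl() stub.
    # parse_links(), bg_archive_snapshot(), and the Snapshot fields used here are
    # assumptions, not ArchiveBox's actual implementation.
    def crawl(self):
        from datetime import datetime, timezone
        from pathlib import Path

        # 1. Write self.urls to sources/crawl__<user>__YYYYMMDDHHMMSS.txt
        ts = datetime.now(timezone.utc).strftime('%Y%m%d%H%M%S')
        source_file = Path('sources') / f'crawl__{self.created_by.username}__{ts}.txt'
        source_file.write_text(self.urls)

        # 2. Parse the saved source file with the parser configured on this Crawl
        links = parse_links(str(source_file), parser=self.parser)

        # 3. Create a Snapshot for each resulting link and enqueue a background archive task
        for link in links:
            snapshot = Snapshot.objects.create(url=link.url, created_by=self.created_by)
            bg_archive_snapshot(snapshot)
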
@@ -561,9 +496,10 @@ class ArchiveResult(ABIDModel):
         # return f'[{self.abid}] 📅 {self.start_ts.strftime("%Y-%m-%d %H:%M")} 📄 {self.extractor} {self.snapshot.url}'
         return self.extractor

-    @cached_property
-    def machine(self):
-        return self.iface.machine if self.iface else None
+    # TODO: finish connecting machine.models
+    # @cached_property
+    # def machine(self):
+    #     return self.iface.machine if self.iface else None

     @cached_property
     def snapshot_dir(self):