fix Crawl models

This commit is contained in:
Nick Sweeting 2024-10-21 00:38:29 -07:00
parent d48a99ab64
commit a97cc82979
No known key found for this signature in database

View file

@ -1,7 +1,5 @@
__package__ = 'archivebox.crawls' __package__ = 'archivebox.crawls'
import time
from django_stubs_ext.db.models import TypedModelMeta from django_stubs_ext.db.models import TypedModelMeta
from django.db import models from django.db import models
@ -9,11 +7,8 @@ from django.db.models import Q
from django.core.validators import MaxValueValidator, MinValueValidator from django.core.validators import MaxValueValidator, MinValueValidator
from django.conf import settings from django.conf import settings
from django.utils import timezone from django.utils import timezone
from django.utils.functional import cached_property
from django.urls import reverse_lazy from django.urls import reverse_lazy
from pathlib import Path
from seeds.models import Seed from seeds.models import Seed
from abid_utils.models import ABIDModel, ABIDField, AutoDateTimeField, ModelWithHealthStats from abid_utils.models import ABIDModel, ABIDField, AutoDateTimeField, ModelWithHealthStats
@ -116,7 +111,8 @@ class Outlink(models.Model):
src = models.URLField() # parent page where the outlink/href was found e.g. https://example.com/downloads src = models.URLField() # parent page where the outlink/href was found e.g. https://example.com/downloads
dst = models.URLField() # remote location the child outlink/href points to e.g. https://example.com/downloads/some_file.pdf dst = models.URLField() # remote location the child outlink/href points to e.g. https://example.com/downloads/some_file.pdf
via = models.ForeignKey(ArchiveResult, related_name='outlink_set') crawl = models.ForeignKey(Crawl, on_delete=models.CASCADE, null=False, blank=False, related_name='outlink_set')
via = models.ForeignKey('core.ArchiveResult', on_delete=models.SET_NULL, null=True, blank=True, related_name='outlink_set')
class Meta: class Meta:
unique_together = (('src', 'dst', 'via'),) unique_together = (('src', 'dst', 'via'),)
@ -125,44 +121,44 @@ class Outlink(models.Model):
@abx.hookimpl.on_archiveresult_created
def exec_archiveresult_extractor_effects(archiveresult):
    """Run the extractor for a newly created ArchiveResult and record the outcome.

    Registered as the ``on_archiveresult_created`` hook implementation.
    The result is either marked as skipped (when the ``should_run`` read
    hook says no), or the extractor is executed and the result is marked
    succeeded/failed. After an executed run, "updated" events are emitted
    for both the archiveresult and its snapshot.

    Args:
        archiveresult: the ArchiveResult instance that was just created.
    """
    # NOTE(review): ``...`` is passed literally (Ellipsis) -- this looks like a
    # placeholder for the real scope arguments; confirm before relying on it.
    config = get_scope_config(...)

    # NOTE: the "started" bookkeeping (update_archiveresult_started followed by
    # on_archiveresult_updated) was already disabled in the original code and
    # is intentionally still not performed here.

    should_run = abx.archivebox.reads.get_archiveresult_should_run(archiveresult, config)
    if not should_run:
        # Skipped results get their own status + event and no "updated" events.
        abx.archivebox.writes.update_archiveresult_skipped(archiveresult, status='skipped')
        abx.archivebox.events.on_archiveresult_skipped(archiveresult, config)
        return

    try:
        extractor_output = abx.archivebox.effects.exec_archiveresult_extractor(archiveresult, config)
        abx.archivebox.writes.update_archiveresult_succeeded(
            archiveresult,
            output=extractor_output,
            error=None,
            end_ts=timezone.now(),
        )
    except Exception as err:
        # Deliberately broad: any extractor (or save) failure is persisted on
        # the archiveresult instead of propagating out of the hook.
        abx.archivebox.writes.update_archiveresult_failed(
            archiveresult,
            error=err,
            end_ts=timezone.now(),
        )

    # Bump the modified time on both the archiveresult and its parent Snapshot.
    abx.archivebox.events.on_archiveresult_updated(archiveresult)
    abx.archivebox.events.on_snapshot_updated(archiveresult.snapshot)
@abx.hookimpl.reads.get_outlink_parents
def get_outlink_parents(url, crawl_pk=None, config=None):
    """Yield the chain of Outlinks that lead to *url*, nearest parent first.

    Starting from ``url``, look up an Outlink whose ``dst`` is the current
    URL, yield it, then continue from that Outlink's ``src`` until no parent
    is found.

    Args:
        url: the destination URL whose ancestry should be walked.
        crawl_pk: optional Crawl primary key; when truthy, only Outlinks
            discovered via that crawl (``via__snapshot__crawl_id``) are
            considered.
        config: accepted for hook-signature compatibility; not used by this
            implementation.

    Yields:
        Outlink instances, from the direct parent of ``url`` upwards.
    """
    # Track visited URLs so a link cycle (A -> B -> A) terminates instead of
    # looping forever (the previous recursive version recursed without bound).
    visited = set()
    current_url = url

    while current_url not in visited:
        visited.add(current_url)

        scope = Q(dst=current_url)
        if crawl_pk:
            # AND, not OR: the crawl filter must *narrow* the dst match.
            # OR-ing it in matched every outlink in the crawl regardless of dst.
            scope = scope & Q(via__snapshot__crawl_id=crawl_pk)

        # .first() fetches a single row rather than loading the whole result
        # set just to take element 0 (it orders by pk when no ordering is set).
        parent = Outlink.objects.filter(scope).first()
        if parent is None:
            # Reached the top of the chain: no more parents left.
            return

        yield parent
        current_url = parent.src