From a97cc8297935a34794114d5c50aeeeac565cce55 Mon Sep 17 00:00:00 2001
From: Nick Sweeting <github@sweeting.me>
Date: Mon, 21 Oct 2024 00:38:29 -0700
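Subject: [PATCH] fix Crawl models: add Outlink.crawl FK and on_delete for Outlink.via

- drop the time, cached_property and pathlib.Path imports from crawls/models.py
- add a required Outlink.crawl ForeignKey to Crawl (on_delete=CASCADE)
- point Outlink.via at 'core.ArchiveResult' by string reference and make it
  nullable with on_delete=SET_NULL
- comment out the exec_archiveresult_extractor_effects and get_outlink_parents
  abx hook implementations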

---
 archivebox/crawls/models.py | 70 +++++++++++++++++--------------------
 1 file changed, 33 insertions(+), 37 deletions(-)

diff --git a/archivebox/crawls/models.py b/archivebox/crawls/models.py
index 2addf521..a806d889 100644
--- a/archivebox/crawls/models.py
+++ b/archivebox/crawls/models.py
@@ -1,7 +1,5 @@
 __package__ = 'archivebox.crawls'
 
-import time
-
 from django_stubs_ext.db.models import TypedModelMeta
 
 from django.db import models
@@ -9,11 +7,8 @@ from django.db.models import Q
 from django.core.validators import MaxValueValidator, MinValueValidator 
 from django.conf import settings
 from django.utils import timezone
-from django.utils.functional import cached_property
 from django.urls import reverse_lazy
 
-from pathlib import Path
-
 from seeds.models import Seed
 
 from abid_utils.models import ABIDModel, ABIDField, AutoDateTimeField, ModelWithHealthStats
@@ -116,7 +111,8 @@ class Outlink(models.Model):
     src = models.URLField()   # parent page where the outlink/href was found       e.g. https://example.com/downloads
     dst = models.URLField()   # remote location the child outlink/href points to   e.g. https://example.com/downloads/some_file.pdf
     
-    via = models.ForeignKey(ArchiveResult, related_name='outlink_set')
+    crawl = models.ForeignKey(Crawl, on_delete=models.CASCADE, null=False, blank=False, related_name='outlink_set')
+    via = models.ForeignKey('core.ArchiveResult', on_delete=models.SET_NULL, null=True, blank=True, related_name='outlink_set')
 
     class Meta:
         unique_together = (('src', 'dst', 'via'),)
@@ -125,44 +121,44 @@ class Outlink(models.Model):
 
 
         
-@abx.hookimpl.on_archiveresult_created
-def exec_archiveresult_extractor_effects(archiveresult):
-    config = get_scope_config(...)
+# @abx.hookimpl.on_archiveresult_created
+# def exec_archiveresult_extractor_effects(archiveresult):
+#     config = get_scope_config(...)
     
-    # abx.archivebox.writes.update_archiveresult_started(archiveresult, start_ts=timezone.now())
-    # abx.archivebox.events.on_archiveresult_updated(archiveresult)
+#     # abx.archivebox.writes.update_archiveresult_started(archiveresult, start_ts=timezone.now())
+#     # abx.archivebox.events.on_archiveresult_updated(archiveresult)
     
-    # check if it should be skipped
-    if not abx.archivebox.reads.get_archiveresult_should_run(archiveresult, config):
-        abx.archivebox.writes.update_archiveresult_skipped(archiveresult, status='skipped')
-        abx.archivebox.events.on_archiveresult_skipped(archiveresult, config)
-        return
+#     # check if it should be skipped
+#     if not abx.archivebox.reads.get_archiveresult_should_run(archiveresult, config):
+#         abx.archivebox.writes.update_archiveresult_skipped(archiveresult, status='skipped')
+#         abx.archivebox.events.on_archiveresult_skipped(archiveresult, config)
+#         return
     
-    # run the extractor method and save the output back to the archiveresult
-    try:
-        output = abx.archivebox.effects.exec_archiveresult_extractor(archiveresult, config)
-        abx.archivebox.writes.update_archiveresult_succeeded(archiveresult, output=output, error=None, end_ts=timezone.now())
-    except Exception as e:
-        abx.archivebox.writes.update_archiveresult_failed(archiveresult, error=e, end_ts=timezone.now())
+#     # run the extractor method and save the output back to the archiveresult
+#     try:
+#         output = abx.archivebox.effects.exec_archiveresult_extractor(archiveresult, config)
+#         abx.archivebox.writes.update_archiveresult_succeeded(archiveresult, output=output, error=None, end_ts=timezone.now())
+#     except Exception as e:
+#         abx.archivebox.writes.update_archiveresult_failed(archiveresult, error=e, end_ts=timezone.now())
     
-    # bump the modified time on the archiveresult and Snapshot
-    abx.archivebox.events.on_archiveresult_updated(archiveresult)
-    abx.archivebox.events.on_snapshot_updated(archiveresult.snapshot)
+#     # bump the modified time on the archiveresult and Snapshot
+#     abx.archivebox.events.on_archiveresult_updated(archiveresult)
+#     abx.archivebox.events.on_snapshot_updated(archiveresult.snapshot)
     
 
-@abx.hookimpl.reads.get_outlink_parents
-def get_outlink_parents(url, crawl_pk=None, config=None):
-    scope = Q(dst=url)
-    if crawl_pk:
-        scope = scope | Q(via__snapshot__crawl_id=crawl_pk)
+# @abx.hookimpl.reads.get_outlink_parents
+# def get_outlink_parents(url, crawl_pk=None, config=None):
+#     scope = Q(dst=url)
+#     if crawl_pk:
+#         scope = scope | Q(via__snapshot__crawl_id=crawl_pk)
     
-    parent = list(Outlink.objects.filter(scope))
-    if not parent:
-        # base case: we reached the top of the chain, no more parents left
-        return []
+#     parent = list(Outlink.objects.filter(scope))
+#     if not parent:
+#         # base case: we reached the top of the chain, no more parents left
+#         return []
     
-    # recursive case: there is another parent above us, get its parents
-    yield parent[0]
-    yield from get_outlink_parents(parent[0].src, crawl_pk=crawl_pk, config=config)
+#     # recursive case: there is another parent above us, get its parents
+#     yield parent[0]
+#     yield from get_outlink_parents(parent[0].src, crawl_pk=crawl_pk, config=config)