mirror of https://github.com/ArchiveBox/ArchiveBox.git
synced 2025-05-20 18:05:19 -04:00

fix ABID and uniqueness for new Seed models

This commit is contained in:
parent 59b669691f
commit 2ebd28aebd

1 changed file with 41 additions and 13 deletions
@@ -28,21 +28,32 @@ from ..extractors import EXTRACTOR_CHOICES
 class Seed(ABIDModel, ModelWithHealthStats):
     """
-    A fountain that produces URLs (+metadata) e.g.
-        - file://data/sources/2024-01-02_11-57-51__cli_add.txt
-        - file://data/sources/2024-01-02_11-57-51__web_ui_add.txt
+    A fountain that produces URLs (+metadata) each time it's queried e.g.
+        - file:///data/sources/2024-01-02_11-57-51__cli_add.txt
+        - file:///data/sources/2024-01-02_11-57-51__web_ui_add.txt
         - file:///Users/squash/Library/Application Support/Google/Chrome/Default/Bookmarks
         - https://getpocket.com/user/nikisweeting/feed
+        - https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml
         - ...
+
+    Each query of a Seed can produce the same list of URLs, or a different list each time.
+    The list of URLs it returns is used to create a new Crawl and seed it with new pending Snapshots.
 
-    When a crawl is created, a root_snapshot is initially created whos URI is the Seed URI.
-    The seed's preferred extractor is executed on the Snapshot, which produces an ArchiveResult.
-    The ArchiveResult (ideally) then contains some outlink URLs, which get turned into new Snapshots.
-    Then the cycle repeats up until Crawl.max_depth.
+    When a crawl is created, a root_snapshot is initially created with a URI set to the Seed URI.
+    The seed's preferred extractor is executed on that URI, which produces an ArchiveResult containing outlinks.
+    The outlinks then get turned into new pending Snapshots under the same crawl,
+    and the cycle repeats until Crawl.max_depth.
 
     Each consumption of a Seed by an Extractor can produce new urls, as Seeds can point to
-    stateful remote services, files whos contents change, etc.
+    stateful remote services, files with contents that change, directories that have new files within, etc.
     """
 
+    abid_prefix = 'src_'
+    abid_ts_src = 'self.created_at'
+    abid_uri_src = 'self.uri'
+    abid_subtype_src = 'self.extractor'
+    abid_rand_src = 'self.id'
+    abid_drift_allowed = True
+
     uri = models.URLField(max_length=255, blank=False, null=False, unique=True)  # unique source location where URLs will be loaded from
 
     extractor = models.CharField(choices=EXTRACTOR_CHOICES, default='auto', max_length=32)  # suggested extractor to use to load this URL source
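
The block of abid_* attributes added above is what the commit title refers to: every ABIDModel subclass declares where each component of its public ABID comes from. As a rough mental model only, here is a minimal sketch of how such components could be assembled, assuming a typeid-style prefix + timestamp + hashed-uri + subtype + random-suffix layout (the real encoding lives in ArchiveBox's ABID utilities and differs in its details; the widths and hash choices below are illustrative, not the actual implementation):

import hashlib
from datetime import datetime, timezone

def build_abid(prefix: str, created_at: datetime, uri: str, subtype: str, rand: str) -> str:
    # hypothetical layout, NOT ArchiveBox's actual encoding
    ts = created_at.strftime('%y%m%d%H%M%S')                  # abid_ts_src      -> self.created_at
    uri_hash = hashlib.sha256(uri.encode()).hexdigest()[:12]  # abid_uri_src     -> self.uri
    sub = hashlib.sha256(subtype.encode()).hexdigest()[:2]    # abid_subtype_src -> self.extractor
    suffix = rand.replace('-', '')[-6:]                       # abid_rand_src    -> self.id
    return f'{prefix}{ts}_{uri_hash}{sub}{suffix}'

print(build_abid('src_', datetime(2024, 1, 2, 11, 57, 51, tzinfo=timezone.utc),
                 'file:///data/sources/2024-01-02_11-57-51__cli_add.txt',
                 'auto', '0191a2b3-c4d5-e6f7-a8b9-c0d1e2f3a4b5'))

Since uri is declared unique=True on the model, no two Seeds can feed the same value into the uri component, and abid_drift_allowed = True presumably permits the ABID to be recomputed if these inputs are later corrected.
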
@@ -60,7 +71,7 @@ class Seed(ABIDModel, ModelWithHealthStats):
         # pocketapi://
         # s3://
         # etc..
-        return self.uri.split(':')[0]
+        return self.uri.split('://')[0].lower()
 
 
 class CrawlSchedule(ABIDModel, ModelWithHealthStats):
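
The source_type fix is easy to sanity-check standalone. The sample URIs below are made up (two are borrowed from the docstring above), and urlsplit is shown only as a reference point for the expected normalization:

from urllib.parse import urlsplit

uris = [
    'file:///data/sources/2024-01-02_11-57-51__cli_add.txt',
    'HTTPS://getpocket.com/user/nikisweeting/feed',
    'pocketapi://nikisweeting',
]
for uri in uris:
    old = uri.split(':')[0]            # old behavior: scheme keeps whatever case the user typed
    new = uri.split('://')[0].lower()  # new behavior: normalized to lowercase
    print(old, '->', new, '| urlsplit says:', urlsplit(uri).scheme)

For 'HTTPS://...' the old code returned 'HTTPS', so comparisons against lowercase literals like 's3' or 'pocketapi' would silently fail; the new version agrees with urllib's own scheme normalization.
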
@@ -72,8 +83,8 @@ class CrawlSchedule(ABIDModel, ModelWithHealthStats):
     """
     abid_prefix = 'sch_'
     abid_ts_src = 'self.created_at'
-    abid_uri_src = 'self.crawl.abid'
-    abid_subtype_src = '"04"'
+    abid_uri_src = 'self.created_by_id'
+    abid_subtype_src = 'self.schedule'
    abid_rand_src = 'self.id'
 
     schedule = models.CharField(max_length=64, blank=False, null=False)
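
The CrawlSchedule change follows the same logic: its ABID must be computable when the row is first saved, but the old abid_uri_src = 'self.crawl.abid' appears to depend on a related Crawl (the model only has a reverse crawl_set, so that lookup was fragile at best), and the '"04"' constant gave every schedule an identical subtype. Reusing the hypothetical build_abid() sketch from above, with all values made up:

from datetime import datetime, timezone

abid = build_abid(
    prefix='sch_',
    created_at=datetime(2024, 1, 2, 11, 57, 51, tzinfo=timezone.utc),
    uri='1',                # self.created_by_id: the creating user's pk, known at creation time
    subtype='0 */6 * * *',  # self.schedule: the schedule string, distinct per schedule
    rand='0191a2b3-c4d5-e6f7-a8b9-c0d1e2f3a4b5',  # self.id
)
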
@@ -83,6 +94,13 @@ class CrawlSchedule(ABIDModel, ModelWithHealthStats):
     modified_at = models.DateTimeField(auto_now=True)
     created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False)
 
+    crawl_set: models.Manager['Crawl']
+
+    @property
+    def template(self):
+        """The base crawl that each new scheduled job should copy as a template"""
+        return self.crawl_set.first()
+
 
 class Crawl(ABIDModel, ModelWithHealthStats):
     """
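
A sketch of how the new CrawlSchedule.template property would be used from a Django shell; the queryset filter is an assumption for illustration, not an API appearing in this diff:

schedule = CrawlSchedule.objects.filter(schedule='0 */6 * * *').first()
if schedule and schedule.template:
    base = schedule.template  # the first Crawl ever attached to this schedule
    print(base.seed.uri)      # new scheduled runs would copy their settings from it
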
@@ -94,7 +112,7 @@ class Crawl(ABIDModel, ModelWithHealthStats):
     """
     abid_prefix = 'crl_'
     abid_ts_src = 'self.created_at'
-    abid_uri_src = 'self.seed_id'
+    abid_uri_src = 'self.seed.uri'
     abid_subtype_src = 'self.persona_id'
     abid_rand_src = 'self.id'
     abid_drift_allowed = True
@@ -126,6 +144,13 @@ class Crawl(ABIDModel, ModelWithHealthStats):
         verbose_name = 'Crawl'
         verbose_name_plural = 'Crawls'
 
+    @property
+    def template(self):
+        """If this crawl was created under a ScheduledCrawl, returns the original template Crawl it was based off"""
+        if not self.schedule:
+            return None
+        return self.schedule.template
+
     @property
     def api_url(self) -> str:
         # /api/v1/core/crawl/{uulid}
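
And the inverse direction on Crawl, going from a spawned crawl back to its template. This assumes Crawl has a schedule foreign key, which the property body implies but this diff does not show:

crawl = Crawl.objects.exclude(schedule=None).first()
if crawl is not None:
    assert crawl.template == crawl.schedule.crawl_set.first()  # walks crawl.schedule.template

For crawls created directly (no schedule), template simply returns None.
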
@@ -138,6 +163,7 @@ class Crawl(ABIDModel, ModelWithHealthStats):
 
 
 class Outlink(models.Model):
+    """A record of a link found on a page, pointing to another page."""
     id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
 
     src = models.URLField() # parent page where the outlink/href was found e.g. https://example.com/downloads
@@ -145,6 +171,8 @@ class Outlink(models.Model):
 
     via = models.ForeignKey(ArchiveResult, related_name='outlink_set')
 
+    class Meta:
+        unique_together = (('src', 'dst', 'via'),)
 
 
 def scheduler_runloop():
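
The new unique_together is the uniqueness half of the commit: the same (src, dst, via) edge can only be recorded once, so repeated discoveries can go through get_or_create. A sketch; the dst value and the archiveresult variable are hypothetical, and id must be supplied explicitly because the field declares default=None:

from uuid import uuid4

outlink, created = Outlink.objects.get_or_create(
    src='https://example.com/downloads',                        # page the href was found on
    dst='https://example.com/downloads/archivebox-1.0.tar.gz',  # where the href points
    via=archiveresult,         # the ArchiveResult whose output surfaced this link
    defaults={'id': uuid4()},  # only used when creating, not when matching
)
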
@@ -182,7 +210,7 @@ def create_crawl_from_ui_action(urls, extractor, credentials, depth, tags_str, p
 
 
 @abx.hookimpl.on_crawl_schedule_tick
-def create_crawl_from_crawl_schedule_if_due(crawl_schedule):
+def create_crawl_from_crawlschedule_if_due(crawl_schedule):
     # make sure it's not too early to run this scheduled import (makes this function idempotent / safe to call multiple times / every second)
     if timezone.now() < crawl_schedule.next_run_at:
         # it's not time to run it yet, wait for the next tick
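
Given that early-return guard, the hook is idempotent and therefore safe to fire on every tick. A plausible shape for the caller, purely as a sketch (scheduler_runloop's real body is not shown in this diff):

import time

def scheduler_runloop():
    while True:
        for crawl_schedule in CrawlSchedule.objects.all():
            # a no-op until timezone.now() passes crawl_schedule.next_run_at
            create_crawl_from_crawlschedule_if_due(crawl_schedule)
        time.sleep(1)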