From d0fefc02793d3205582a004f1ea2f86d726bddce Mon Sep 17 00:00:00 2001
From: Nick Sweeting
Date: Tue, 27 Aug 2024 19:28:00 -0700
Subject: [PATCH] add chunk_size=500 to more iterator calls

---
 .../core/migrations/0027_update_snapshot_ids.py      |  4 ++--
 ..._archiveresult_old_id_alter_archiveresult_uuid.py |  2 +-
 ...hottag_snapshot_alter_snapshottag_snapshot_old.py |  2 +-
 archivebox/core/migrations/0059_tag_id.py            |  2 +-
 ...0063_snapshottag_tag_alter_snapshottag_old_tag.py |  2 +-
 archivebox/extractors/__init__.py                    |  2 +-
 archivebox/index/__init__.py                         | 12 ++++++------
 7 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/archivebox/core/migrations/0027_update_snapshot_ids.py b/archivebox/core/migrations/0027_update_snapshot_ids.py
index ad197c04..6b8dcf4a 100644
--- a/archivebox/core/migrations/0027_update_snapshot_ids.py
+++ b/archivebox/core/migrations/0027_update_snapshot_ids.py
@@ -52,7 +52,7 @@ def update_snapshot_ids(apps, schema_editor):
     Snapshot = apps.get_model("core", "Snapshot")
     num_total = Snapshot.objects.all().count()
     print(f' Updating {num_total} Snapshot.id, Snapshot.uuid values in place...')
-    for idx, snapshot in enumerate(Snapshot.objects.all().only('abid').iterator()):
+    for idx, snapshot in enumerate(Snapshot.objects.all().only('abid').iterator(chunk_size=500)):
         assert snapshot.abid
         snapshot.abid_prefix = 'snp_'
         snapshot.abid_ts_src = 'self.added'
@@ -72,7 +72,7 @@ def update_archiveresult_ids(apps, schema_editor):
     ArchiveResult = apps.get_model("core", "ArchiveResult")
     num_total = ArchiveResult.objects.all().count()
     print(f' Updating {num_total} ArchiveResult.id, ArchiveResult.uuid values in place... (may take an hour or longer for large collections...)')
-    for idx, result in enumerate(ArchiveResult.objects.all().only('abid', 'snapshot_id').iterator()):
+    for idx, result in enumerate(ArchiveResult.objects.all().only('abid', 'snapshot_id').iterator(chunk_size=500)):
         assert result.abid
         result.abid_prefix = 'res_'
         result.snapshot = Snapshot.objects.get(pk=result.snapshot_id)
diff --git a/archivebox/core/migrations/0034_alter_archiveresult_old_id_alter_archiveresult_uuid.py b/archivebox/core/migrations/0034_alter_archiveresult_old_id_alter_archiveresult_uuid.py
index 121a2154..dd6da1f5 100644
--- a/archivebox/core/migrations/0034_alter_archiveresult_old_id_alter_archiveresult_uuid.py
+++ b/archivebox/core/migrations/0034_alter_archiveresult_old_id_alter_archiveresult_uuid.py
@@ -11,7 +11,7 @@ def update_archiveresult_ids(apps, schema_editor):
     ArchiveResult = apps.get_model("core", "ArchiveResult")
     num_total = ArchiveResult.objects.all().count()
     print(f' Updating {num_total} ArchiveResult.id, ArchiveResult.uuid values in place... (may take an hour or longer for large collections...)')
-    for idx, result in enumerate(ArchiveResult.objects.all().only('abid').iterator()):
+    for idx, result in enumerate(ArchiveResult.objects.all().only('abid').iterator(chunk_size=500)):
         assert result.abid
         result.uuid = ABID.parse(result.abid).uuid
         result.save(update_fields=["uuid"])
diff --git a/archivebox/core/migrations/0051_snapshottag_snapshot_alter_snapshottag_snapshot_old.py b/archivebox/core/migrations/0051_snapshottag_snapshot_alter_snapshottag_snapshot_old.py
index ddb7afbb..9866f69c 100644
--- a/archivebox/core/migrations/0051_snapshottag_snapshot_alter_snapshottag_snapshot_old.py
+++ b/archivebox/core/migrations/0051_snapshottag_snapshot_alter_snapshottag_snapshot_old.py
@@ -9,7 +9,7 @@ def update_snapshottag_ids(apps, schema_editor):
     SnapshotTag = apps.get_model("core", "SnapshotTag")
     num_total = SnapshotTag.objects.all().count()
     print(f' Updating {num_total} SnapshotTag.snapshot_id values in place... (may take an hour or longer for large collections...)')
-    for idx, snapshottag in enumerate(SnapshotTag.objects.all().only('snapshot_old_id').iterator()):
+    for idx, snapshottag in enumerate(SnapshotTag.objects.all().only('snapshot_old_id').iterator(chunk_size=500)):
         assert snapshottag.snapshot_old_id
         snapshot = Snapshot.objects.get(old_id=snapshottag.snapshot_old_id)
         snapshottag.snapshot_id = snapshot.id
diff --git a/archivebox/core/migrations/0059_tag_id.py b/archivebox/core/migrations/0059_tag_id.py
index f09e9ffb..a81e022f 100644
--- a/archivebox/core/migrations/0059_tag_id.py
+++ b/archivebox/core/migrations/0059_tag_id.py
@@ -49,7 +49,7 @@ def update_archiveresult_ids(apps, schema_editor):
     Tag = apps.get_model("core", "Tag")
     num_total = Tag.objects.all().count()
     print(f' Updating {num_total} Tag.id, ArchiveResult.uuid values in place...')
-    for idx, tag in enumerate(Tag.objects.all().iterator()):
+    for idx, tag in enumerate(Tag.objects.all().iterator(chunk_size=500)):
         if not tag.slug:
             tag.slug = tag.name.lower().replace(' ', '_')
         if not tag.name:
diff --git a/archivebox/core/migrations/0063_snapshottag_tag_alter_snapshottag_old_tag.py b/archivebox/core/migrations/0063_snapshottag_tag_alter_snapshottag_old_tag.py
index 6c574669..bb067acf 100644
--- a/archivebox/core/migrations/0063_snapshottag_tag_alter_snapshottag_old_tag.py
+++ b/archivebox/core/migrations/0063_snapshottag_tag_alter_snapshottag_old_tag.py
@@ -9,7 +9,7 @@ def update_snapshottag_ids(apps, schema_editor):
     SnapshotTag = apps.get_model("core", "SnapshotTag")
     num_total = SnapshotTag.objects.all().count()
     print(f' Updating {num_total} SnapshotTag.tag_id values in place... (may take an hour or longer for large collections...)')
-    for idx, snapshottag in enumerate(SnapshotTag.objects.all().only('old_tag_id').iterator()):
+    for idx, snapshottag in enumerate(SnapshotTag.objects.all().only('old_tag_id').iterator(chunk_size=500)):
         assert snapshottag.old_tag_id
         tag = Tag.objects.get(old_id=snapshottag.old_tag_id)
         snapshottag.tag_id = tag.id
diff --git a/archivebox/extractors/__init__.py b/archivebox/extractors/__init__.py
index 036ff73c..22d6a405 100644
--- a/archivebox/extractors/__init__.py
+++ b/archivebox/extractors/__init__.py
@@ -218,7 +218,7 @@ def archive_links(all_links: Union[Iterable[Link], QuerySet], overwrite: bool=Fa
     if type(all_links) is QuerySet:
         num_links: int = all_links.count()
         get_link = lambda x: x.as_link_with_details()
-        all_links = all_links.iterator()
+        all_links = all_links.iterator(chunk_size=500)
     else:
         num_links: int = len(all_links)
         get_link = lambda x: x
diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py
index 1bc5a104..1edd3caf 100644
--- a/archivebox/index/__init__.py
+++ b/archivebox/index/__init__.py
@@ -407,7 +407,7 @@ def snapshot_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type
 
 def get_indexed_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """indexed links without checking archive status or data directory validity"""
-    links = (snapshot.as_link() for snapshot in snapshots.iterator())
+    links = (snapshot.as_link() for snapshot in snapshots.iterator(chunk_size=500))
     return {
         link.link_dir: link
         for link in links
@@ -415,7 +415,7 @@ def get_indexed_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Option
 
 def get_archived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """indexed links that are archived with a valid data directory"""
-    links = (snapshot.as_link() for snapshot in snapshots.iterator())
+    links = (snapshot.as_link() for snapshot in snapshots.iterator(chunk_size=500))
     return {
         link.link_dir: link
         for link in filter(is_archived, links)
@@ -423,7 +423,7 @@ def get_archived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optio
 
 def get_unarchived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """indexed links that are unarchived with no data directory or an empty data directory"""
-    links = (snapshot.as_link() for snapshot in snapshots.iterator())
+    links = (snapshot.as_link() for snapshot in snapshots.iterator(chunk_size=500))
     return {
         link.link_dir: link
         for link in filter(is_unarchived, links)
@@ -448,7 +448,7 @@ def get_present_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Option
 
 def get_valid_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """dirs with a valid index matched to the main index and archived content"""
-    links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()]
+    links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator(chunk_size=500)]
    return {
         link.link_dir: link
         for link in filter(is_valid, links)
@@ -475,7 +475,7 @@ def get_duplicate_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Opti
         if entry.is_dir() and not snapshots.filter(timestamp=entry.name).exists()
     )
 
-    for path in chain(snapshots.iterator(), data_folders):
+    for path in chain(snapshots.iterator(chunk_size=500), data_folders):
         link = None
         if type(path) is not str:
             path = path.as_link().link_dir
@@ -518,7 +518,7 @@ def get_orphaned_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optio
 def get_corrupted_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """dirs that don't contain a valid index and aren't listed in the main index"""
     corrupted = {}
-    for snapshot in snapshots.iterator():
+    for snapshot in snapshots.iterator(chunk_size=500):
         link = snapshot.as_link()
         if is_corrupt(link):
             corrupted[link.link_dir] = link
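
Background on the idiom this patch standardizes: Django's QuerySet.iterator() streams
rows instead of caching the full result set on the queryset, and chunk_size caps how
many rows are pulled from the database driver per fetch (Django's default is 2000).
On SQLite, which has no server-side cursors, iterator() fetches results in batches of
chunk_size via the driver, so a smaller chunk puts a lower ceiling on peak memory while
these migrations rewrite every row of a large collection. A minimal sketch of the
pattern follows; update_rows() and the per-row mutation are hypothetical stand-ins,
and only .only() plus .iterator(chunk_size=...) mirror the real calls above.

# Minimal sketch of the batched-iteration pattern used throughout this patch.
# update_rows() and the .strip() fix-up below are hypothetical; Snapshot's import
# path is assumed to match the app label used in the migrations above.
from core.models import Snapshot

def update_rows():
    num_total = Snapshot.objects.all().count()
    print(f' Updating {num_total} Snapshot rows in place...')
    # .only('abid') defers every other column, so each fetched row object stays small;
    # .iterator(chunk_size=500) fetches 500 rows per driver read instead of loading
    # and caching the entire table in memory at once.
    for idx, snapshot in enumerate(Snapshot.objects.all().only('abid').iterator(chunk_size=500)):
        snapshot.abid = snapshot.abid.strip()      # hypothetical per-row fix-up
        snapshot.save(update_fields=['abid'])      # write back only the touched column
        if idx % 500 == 0:
            print(f'    {idx}/{num_total}...')

The trade-off is round trips versus memory: 500 means roughly four times as many
fetches as Django's 2000-row default, but each batch of model instances is
correspondingly smaller, which is the safer choice for the multi-hour runs warned
about in the print statements above.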