Mirror of https://github.com/ArchiveBox/ArchiveBox.git (synced 2025-05-13 14:44:29 -04:00)
add chunk_size=500 to more iterator calls
parent 44849e1ba2
commit d0fefc0279
7 changed files with 13 additions and 13 deletions
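
Context: Django's QuerySet.iterator() streams rows from the database cursor instead of caching the whole result set on the queryset, and its chunk_size argument (default 2000) controls how many rows are fetched per batch, so passing chunk_size=500 lowers peak memory per fetch during these long-running migrations. A minimal sketch of the pattern this commit applies, assuming ArchiveBox's core.models.Snapshot model and a hypothetical process() helper:

    # Iterate a large table in fixed-size batches without caching results.
    # chunk_size=500 fetches 500 rows per round trip (Django's default is 2000).
    from core.models import Snapshot

    def process(snapshot):  # hypothetical per-row work, not from this commit
        print(snapshot.abid)

    for idx, snapshot in enumerate(Snapshot.objects.all().only('abid').iterator(chunk_size=500)):
        process(snapshot)

The tradeoff is round trips vs. memory: smaller chunks mean more queries but a smaller working set, which matters when each row also triggers extra lookups and saves, as these migrations do.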
@@ -52,7 +52,7 @@ def update_snapshot_ids(apps, schema_editor):
     Snapshot = apps.get_model("core", "Snapshot")
     num_total = Snapshot.objects.all().count()
     print(f' Updating {num_total} Snapshot.id, Snapshot.uuid values in place...')
-    for idx, snapshot in enumerate(Snapshot.objects.all().only('abid').iterator()):
+    for idx, snapshot in enumerate(Snapshot.objects.all().only('abid').iterator(chunk_size=500)):
         assert snapshot.abid
         snapshot.abid_prefix = 'snp_'
         snapshot.abid_ts_src = 'self.added'
@@ -72,7 +72,7 @@ def update_archiveresult_ids(apps, schema_editor):
     ArchiveResult = apps.get_model("core", "ArchiveResult")
     num_total = ArchiveResult.objects.all().count()
     print(f' Updating {num_total} ArchiveResult.id, ArchiveResult.uuid values in place... (may take an hour or longer for large collections...)')
-    for idx, result in enumerate(ArchiveResult.objects.all().only('abid', 'snapshot_id').iterator()):
+    for idx, result in enumerate(ArchiveResult.objects.all().only('abid', 'snapshot_id').iterator(chunk_size=500)):
         assert result.abid
         result.abid_prefix = 'res_'
         result.snapshot = Snapshot.objects.get(pk=result.snapshot_id)

@@ -11,7 +11,7 @@ def update_archiveresult_ids(apps, schema_editor):
     ArchiveResult = apps.get_model("core", "ArchiveResult")
     num_total = ArchiveResult.objects.all().count()
     print(f' Updating {num_total} ArchiveResult.id, ArchiveResult.uuid values in place... (may take an hour or longer for large collections...)')
-    for idx, result in enumerate(ArchiveResult.objects.all().only('abid').iterator()):
+    for idx, result in enumerate(ArchiveResult.objects.all().only('abid').iterator(chunk_size=500)):
         assert result.abid
         result.uuid = ABID.parse(result.abid).uuid
         result.save(update_fields=["uuid"])

@@ -9,7 +9,7 @@ def update_snapshottag_ids(apps, schema_editor):
     SnapshotTag = apps.get_model("core", "SnapshotTag")
     num_total = SnapshotTag.objects.all().count()
     print(f' Updating {num_total} SnapshotTag.snapshot_id values in place... (may take an hour or longer for large collections...)')
-    for idx, snapshottag in enumerate(SnapshotTag.objects.all().only('snapshot_old_id').iterator()):
+    for idx, snapshottag in enumerate(SnapshotTag.objects.all().only('snapshot_old_id').iterator(chunk_size=500)):
         assert snapshottag.snapshot_old_id
         snapshot = Snapshot.objects.get(old_id=snapshottag.snapshot_old_id)
         snapshottag.snapshot_id = snapshot.id

@@ -49,7 +49,7 @@ def update_archiveresult_ids(apps, schema_editor):
     Tag = apps.get_model("core", "Tag")
     num_total = Tag.objects.all().count()
     print(f' Updating {num_total} Tag.id, ArchiveResult.uuid values in place...')
-    for idx, tag in enumerate(Tag.objects.all().iterator()):
+    for idx, tag in enumerate(Tag.objects.all().iterator(chunk_size=500)):
         if not tag.slug:
             tag.slug = tag.name.lower().replace(' ', '_')
         if not tag.name:

@@ -9,7 +9,7 @@ def update_snapshottag_ids(apps, schema_editor):
     SnapshotTag = apps.get_model("core", "SnapshotTag")
     num_total = SnapshotTag.objects.all().count()
     print(f' Updating {num_total} SnapshotTag.tag_id values in place... (may take an hour or longer for large collections...)')
-    for idx, snapshottag in enumerate(SnapshotTag.objects.all().only('old_tag_id').iterator()):
+    for idx, snapshottag in enumerate(SnapshotTag.objects.all().only('old_tag_id').iterator(chunk_size=500)):
         assert snapshottag.old_tag_id
         tag = Tag.objects.get(old_id=snapshottag.old_tag_id)
         snapshottag.tag_id = tag.id

@@ -218,7 +218,7 @@ def archive_links(all_links: Union[Iterable[Link], QuerySet], overwrite: bool=Fa
     if type(all_links) is QuerySet:
         num_links: int = all_links.count()
         get_link = lambda x: x.as_link_with_details()
-        all_links = all_links.iterator()
+        all_links = all_links.iterator(chunk_size=500)
     else:
         num_links: int = len(all_links)
         get_link = lambda x: x

@@ -407,7 +407,7 @@ def snapshot_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type
 
 def get_indexed_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """indexed links without checking archive status or data directory validity"""
-    links = (snapshot.as_link() for snapshot in snapshots.iterator())
+    links = (snapshot.as_link() for snapshot in snapshots.iterator(chunk_size=500))
     return {
         link.link_dir: link
         for link in links
@@ -415,7 +415,7 @@ def get_indexed_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Option
 
 def get_archived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """indexed links that are archived with a valid data directory"""
-    links = (snapshot.as_link() for snapshot in snapshots.iterator())
+    links = (snapshot.as_link() for snapshot in snapshots.iterator(chunk_size=500))
     return {
         link.link_dir: link
         for link in filter(is_archived, links)
@@ -423,7 +423,7 @@ def get_archived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optio
 
 def get_unarchived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """indexed links that are unarchived with no data directory or an empty data directory"""
-    links = (snapshot.as_link() for snapshot in snapshots.iterator())
+    links = (snapshot.as_link() for snapshot in snapshots.iterator(chunk_size=500))
     return {
         link.link_dir: link
         for link in filter(is_unarchived, links)
@@ -448,7 +448,7 @@ def get_present_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Option
 
 def get_valid_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """dirs with a valid index matched to the main index and archived content"""
-    links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()]
+    links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator(chunk_size=500)]
     return {
         link.link_dir: link
         for link in filter(is_valid, links)
@@ -475,7 +475,7 @@ def get_duplicate_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Opti
         if entry.is_dir() and not snapshots.filter(timestamp=entry.name).exists()
     )
 
-    for path in chain(snapshots.iterator(), data_folders):
+    for path in chain(snapshots.iterator(chunk_size=500), data_folders):
         link = None
         if type(path) is not str:
             path = path.as_link().link_dir
@@ -518,7 +518,7 @@ def get_orphaned_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optio
 def get_corrupted_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """dirs that don't contain a valid index and aren't listed in the main index"""
     corrupted = {}
-    for snapshot in snapshots.iterator():
+    for snapshot in snapshots.iterator(chunk_size=500):
         link = snapshot.as_link()
         if is_corrupt(link):
             corrupted[link.link_dir] = link