Update file_migrations.py
Some checks failed
Build Pip package / build (push) Has been cancelled
CodeQL / Analyze (python) (push) Has been cancelled
Build Debian package / build (push) Has been cancelled
Deploy static content to Pages / deploy (push) Has been cancelled
Build Homebrew package / build (push) Has been cancelled
Build GitHub Pages website / build (push) Has been cancelled
Run linters / lint (push) Has been cancelled
Run tests / python_tests (ubuntu-22.04, 3.11) (push) Has been cancelled
Run tests / docker_tests (push) Has been cancelled
Build GitHub Pages website / deploy (push) Has been cancelled

This commit is contained in:
Nick Sweeting 2025-01-02 23:58:59 -08:00 committed by GitHub
parent a851ad4c87
commit 55a347c32e
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@ -1,99 +1,99 @@
__package__ = 'archivebox.filestore' # __package__ = 'archivebox.filestore'
import re # import re
from pathlib import Path # from pathlib import Path
from functools import wraps # from functools import wraps
from enum import Enum # from enum import Enum
import archivebox # import archivebox
from archivebox import CONSTANTS # from archivebox import CONSTANTS
from core.models import Snapshot # from core.models import Snapshot
from .models import File # from .models import File
class FilestoreVersion(Enum):
    """Filestore versions that migrations move files between.

    Each member's value is the corresponding version string.
    """

    v0_7_2 = 'v0.7.2'
    v0_8_6 = 'v0.8.6'
    v0_9_0 = 'v0.9.0'


# Version used as the default migration target.
LATEST_VERSION = FilestoreVersion.v0_9_0
def migration(src_ver: FilestoreVersion, dst_ver: FilestoreVersion, pattern: str, timeout_seconds: int = 600):
    """Decorator for a migration function that only runs on matching files.

    The wrapped function is invoked only for files whose ``version`` equals
    ``src_ver`` and whose ``file.name`` matches ``pattern`` (via ``re.match``,
    i.e. anchored at the start only). On success the file's version is set to
    ``dst_ver``; on failure the error is printed and the version is reset to
    ``src_ver``. In both cases the lock is released and the file is saved.

    Args:
        src_ver: Version a file must currently be at for the migration to run.
        dst_ver: Version assigned to the file after a successful migration.
        pattern: Regular expression matched against ``file.file.name``.
        timeout_seconds: Value passed through to ``file.acquire_lock()``.

    Returns:
        A decorator. The resulting wrapper takes a ``File``, always returns
        None, and exposes ``src_ver``, ``dst_ver``, ``pattern`` and
        ``timeout_seconds`` as attributes.
    """
    def decorator(migration_func):
        @wraps(migration_func)
        def wrapper(file: File) -> None:
            # Skip if this migration doesn't apply to this file.
            if file.version != src_ver:
                return None
            if not re.match(pattern, file.file.name):
                return None

            # Acquire the lock *before* entering the guarded section: if the
            # acquisition itself fails we must not fall through to
            # release_lock()/save() for a lock we never held.
            # NOTE(review): presumably release_lock() would otherwise clear a
            # lock owned by another worker -- confirm against File.release_lock.
            try:
                file.acquire_lock(timeout_seconds)
            except Exception as e:
                # Stay best-effort, like a failed migration: report and skip.
                print(f"Failed to migrate file {file.id}: {e}")
                return None

            # Run migration + update version, then unlock and persist.
            try:
                migration_func(file)
                file.version = dst_ver
            except Exception as e:
                # logger.error(f"Failed to migrate file {file.id}: {e}")
                print(f"Failed to migrate file {file.id}: {e}")
                file.version = src_ver  # roll back version to original version
            finally:
                file.release_lock()
                file.save()
            return None

        # Expose the migration's parameters on the wrapper for introspection.
        wrapper.src_ver = src_ver  # type: ignore
        wrapper.dst_ver = dst_ver  # type: ignore
        wrapper.pattern = pattern  # type: ignore
        wrapper.timeout_seconds = timeout_seconds  # type: ignore
        return wrapper
    return decorator
def detect_archiveresult(path: Path) -> 'ArchiveResult | None':
    """Find the ArchiveResult that produced the file at a relative archive path.

    Args:
        path: Relative path of the form ``archive/<timestamp>/<output>``,
            e.g. ``archive/1723423525.0/singlefile.html``.

    Returns:
        The last ArchiveResult (of the last Snapshot with that timestamp)
        whose ``output`` equals the file name, or None when the path is too
        short to contain a timestamp or no Snapshot/ArchiveResult matches.
    """
    # The return annotation is quoted as a whole on purpose: the expression
    # `'ArchiveResult' | None` is evaluated at definition time and raises
    # TypeError (str | NoneType), which would break importing this module.
    # NOTE(review): ArchiveResult is not among the imports visible here, so
    # the annotation must stay a string unless it gets imported.
    parts = path.parts
    if len(parts) < 2:
        # No '<timestamp>' component -> nothing to look up.
        return None

    # archive/1723423525.0/singlefile.html
    timestamp = parts[1]
    snapshot = Snapshot.objects.filter(timestamp=timestamp).last()
    if not snapshot:
        return None

    result = snapshot.archiveresult_set.filter(output=path.name).last()
    if not result:
        return None
    return result
# @hookimpl(hook_name='migrate_file') # # @hookimpl(hook_name='migrate_file')
@migration(FilestoreVersion.v0_7_2, FilestoreVersion.v0_8_6, r'archive/([0-9\.]+)/.+', timeout_seconds=600)
def migrate_v07_to_v08_singlefile(file: File) -> None:
    """Move a v0.7.2 archive file to ``index.html`` in its ArchiveResult's OUTPUT_DIR.

    Raises:
        LookupError: If no ArchiveResult can be found for ``file.relpath``.
    """
    # NOTE(review): the pattern matches any file under archive/<timestamp>/,
    # not only singlefile.html as the function name suggests -- confirm intent.
    result = detect_archiveresult(file.relpath)
    if result is None:
        # Fail with an explicit message rather than an AttributeError on None.
        raise LookupError(f"No ArchiveResult found for file {file.relpath}")
    new_path = result.OUTPUT_DIR / 'index.html'
    file.move_to(new_path)
# @hookimpl(hook_name='migrate_file') # # @hookimpl(hook_name='migrate_file')
@migration(FilestoreVersion.v0_8_6, FilestoreVersion.v0_9_0, r'archive/([0-9\.]+)/singlefile.html', timeout_seconds=600)
def migrate_v08_to_v09_singlefile(file: File) -> None:
    """Move a v0.8.6 ``singlefile.html`` to ``index.html`` in its ArchiveResult's OUTPUT_DIR.

    Raises:
        LookupError: If no ArchiveResult can be found for ``file.relpath``.
    """
    result = detect_archiveresult(file.relpath)
    if result is None:
        # Fail with an explicit message rather than an AttributeError on None.
        raise LookupError(f"No ArchiveResult found for file {file.relpath}")
    new_path = result.OUTPUT_DIR / 'index.html'
    file.move_to(new_path)
def migrate_all_files(target: FilestoreVersion = LATEST_VERSION, batch_size: int = 100) -> None:
    """Run the ``migrate_file`` hook on every unlocked File not yet at ``target``.

    Expired locks are released first, then the pending files are streamed
    from the database and handed to the hook one at a time.

    Args:
        target: Files already at this version are excluded.
        batch_size: ``chunk_size`` passed to the queryset iterator.
    """
    File.release_expired_locks()

    pending_files = (
        File.objects
        .filter(status='unlocked')
        .exclude(version=target)
        .iterator(chunk_size=batch_size)
    )

    for file in pending_files:
        try:
            archivebox.pm.hook.migrate_file(file=file)
        except Exception as e:
            # Best-effort: report the failure and continue with the next file.
            print(f"Failed to migrate file {file.id}: {e}")