From 5418e70526a501df54267ce1dd2c5d27ff22aa06 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 3 Jun 2024 22:13:48 +0000 Subject: [PATCH 01/47] Bump uuid6 from 2023.5.2 to 2024.1.12 Bumps [uuid6](https://github.com/oittaa/uuid6-python) from 2023.5.2 to 2024.1.12. - [Release notes](https://github.com/oittaa/uuid6-python/releases) - [Commits](https://github.com/oittaa/uuid6-python/compare/2023.05.02...2024.01.12) --- updated-dependencies: - dependency-name: uuid6 dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index f6b110cc..95fa38f7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -73,7 +73,7 @@ tzdata==2024.1; sys_platform == "win32" or platform_system == "Windows" tzlocal==5.2 ulid-py==1.1.0 urllib3==2.2.1 -uuid6==2023.5.2 +uuid6==2024.1.12 w3lib==2.1.2 wcwidth==0.2.13 websockets==12.0 From de489d3c604727946202d49a3960c83cd3961193 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 3 Jun 2024 04:00:18 -0700 Subject: [PATCH 02/47] minor snapshot details ui fixes and migrations log msg improvements --- .../core/migrations/0024_auto_20240513_1143.py | 3 +++ archivebox/core/views.py | 14 ++++++++------ archivebox/templates/core/snapshot_live.html | 14 +++++++------- 3 files changed, 18 insertions(+), 13 deletions(-) diff --git a/archivebox/core/migrations/0024_auto_20240513_1143.py b/archivebox/core/migrations/0024_auto_20240513_1143.py index 31f1e773..95652a07 100644 --- a/archivebox/core/migrations/0024_auto_20240513_1143.py +++ b/archivebox/core/migrations/0024_auto_20240513_1143.py @@ -47,12 +47,14 @@ def calculate_abid(self): def copy_snapshot_uuids(apps, schema_editor): + print(' Copying snapshot.id -> snapshot.uuid...') Snapshot = apps.get_model("core", "Snapshot") for snapshot in Snapshot.objects.all(): 
snapshot.uuid = snapshot.id snapshot.save(update_fields=["uuid"]) def generate_snapshot_abids(apps, schema_editor): + print(' Generating snapshot.abid values...') Snapshot = apps.get_model("core", "Snapshot") for snapshot in Snapshot.objects.all(): snapshot.abid_prefix = 'snp_' @@ -65,6 +67,7 @@ def generate_snapshot_abids(apps, schema_editor): snapshot.save(update_fields=["abid"]) def generate_archiveresult_abids(apps, schema_editor): + print(' Generating ArchiveResult.abid values... (may take an hour or longer for large collections...)') ArchiveResult = apps.get_model("core", "ArchiveResult") Snapshot = apps.get_model("core", "Snapshot") for result in ArchiveResult.objects.all(): diff --git a/archivebox/core/views.py b/archivebox/core/views.py index efaca2f5..3b491b8e 100644 --- a/archivebox/core/views.py +++ b/archivebox/core/views.py @@ -90,7 +90,7 @@ class SnapshotView(View): archiveresults[result.extractor] = result_info existing_files = {result['path'] for result in archiveresults.values()} - min_size_threshold = 128 # bytes + min_size_threshold = 10_000 # bytes allowed_extensions = { 'txt', 'html', @@ -108,12 +108,14 @@ class SnapshotView(View): 'md', } + # iterate through all the files in the snapshot dir and add the biggest ones to the result list - for result_file in Path(snapshot.link_dir).glob('*/*/*'): + snap_dir = Path(snapshot.link_dir) + for result_file in (*snap_dir.glob('*'), *snap_dir.glob('*/*')): extension = result_file.suffix.lstrip('.').lower() if result_file.is_dir() or result_file.name.startswith('.') or extension not in allowed_extensions: continue - if result_file.name in existing_files: + if result_file.name in existing_files or result_file.name == 'index.html': continue file_size = result_file.stat().st_size or 0 @@ -121,7 +123,7 @@ class SnapshotView(View): if file_size > min_size_threshold: archiveresults[result_file.name] = { 'name': result_file.stem, - 'path': result_file.relative_to(snapshot.link_dir), + 'path': 
result_file.relative_to(snap_dir), 'ts': ts_to_date_str(result_file.stat().st_mtime or 0), 'size': file_size, } @@ -140,7 +142,7 @@ class SnapshotView(View): link_info = link._asdict(extended=True) try: - warc_path = 'warc/' + list(Path(snapshot.link_dir).glob('warc/*.warc.*'))[0].name + warc_path = 'warc/' + list(Path(snap_dir).glob('warc/*.warc.*'))[0].name except IndexError: warc_path = 'warc/' @@ -160,7 +162,7 @@ class SnapshotView(View): 'warc_path': warc_path, 'SAVE_ARCHIVE_DOT_ORG': SAVE_ARCHIVE_DOT_ORG, 'PREVIEW_ORIGINALS': PREVIEW_ORIGINALS, - 'archiveresults': sorted(archiveresults.values(), key=lambda r: all_types.index(r['name'])), + 'archiveresults': sorted(archiveresults.values(), key=lambda r: all_types.index(r['name']) if r['name'] in all_types else -r['size']), 'best_result': best_result, # 'tags_str': 'somealskejrewlkrjwer,werlmwrwlekrjewlkrjwer324m532l,4m32,23m324234', } diff --git a/archivebox/templates/core/snapshot_live.html b/archivebox/templates/core/snapshot_live.html index 32957516..73af92a5 100644 --- a/archivebox/templates/core/snapshot_live.html +++ b/archivebox/templates/core/snapshot_live.html @@ -401,13 +401,13 @@ {% endfor %} @@ -419,7 +419,7 @@

Headers, JSON, etc.

- + @@ -430,7 +430,7 @@ - + @@ -444,9 +444,9 @@ this.src = this.src + '#toolbar=0' } this.onload = function() { - if (this.src.endsWith('.pdf')) { + if (this.src.includes('.pdf')) { this.removeAttribute('sandbox') - this.src = this.src + '#toolbar=0' + this.src = this.src.split('?autoplay=')[0] + '#toolbar=0' } try { // doesnt work if frame origin rules prevent accessing its DOM via JS From d11173eaa4ee129f714dbf2004fc5c2e434426d4 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 3 Jun 2024 04:09:55 -0700 Subject: [PATCH 03/47] fix dockerign --- .dockerignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.dockerignore b/.dockerignore index b5c3c630..9f03a946 100644 --- a/.dockerignore +++ b/.dockerignore @@ -35,3 +35,5 @@ docker/ data/ data*/ output/ +index.sqlite3 +index.sqlite3-wal From 5b369246fdf83cea8ff8567c906ab18845fac69d Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 4 Jun 2024 04:14:59 -0700 Subject: [PATCH 04/47] update gitignore and attrs --- .gitattributes | 2 ++ .gitignore | 3 +++ 2 files changed, 5 insertions(+) create mode 100644 .gitattributes diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 00000000..afb03617 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,2 @@ +**/*.lock +**/*-lock.json diff --git a/.gitignore b/.gitignore index 7e3fbe26..7d771164 100644 --- a/.gitignore +++ b/.gitignore @@ -30,6 +30,9 @@ data/ data*/ output/ index.sqlite3 +*.sqlite* +data.* # vim *.sw? 
+.vscode From 99e6f0c93fdc4ab63813f541b6fb24927aa4ab78 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 4 Jun 2024 04:16:52 -0700 Subject: [PATCH 05/47] bump versions --- archivebox/package-lock.json | 6 +++--- package-lock.json | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/archivebox/package-lock.json b/archivebox/package-lock.json index f1f0bc14..cbd7a8ea 100644 --- a/archivebox/package-lock.json +++ b/archivebox/package-lock.json @@ -236,9 +236,9 @@ "license": "MIT" }, "node_modules/@types/node": { - "version": "20.14.0", - "resolved": "https://registry.npmjs.org/@types/node/-/node-20.14.0.tgz", - "integrity": "sha512-5cHBxFGJx6L4s56Bubp4fglrEpmyJypsqI6RgzMfBHWUJQGWAAi8cWcgetEbZXHYXo9C2Fa4EEds/uSyS4cxmA==", + "version": "20.14.1", + "resolved": "https://registry.npmjs.org/@types/node/-/node-20.14.1.tgz", + "integrity": "sha512-T2MzSGEu+ysB/FkWfqmhV3PLyQlowdptmmgD20C6QxsS8Fmv5SjpZ1ayXaEC0S21/h5UJ9iA6W/5vSNU5l00OA==", "license": "MIT", "optional": true, "dependencies": { diff --git a/package-lock.json b/package-lock.json index f1f0bc14..cbd7a8ea 100644 --- a/package-lock.json +++ b/package-lock.json @@ -236,9 +236,9 @@ "license": "MIT" }, "node_modules/@types/node": { - "version": "20.14.0", - "resolved": "https://registry.npmjs.org/@types/node/-/node-20.14.0.tgz", - "integrity": "sha512-5cHBxFGJx6L4s56Bubp4fglrEpmyJypsqI6RgzMfBHWUJQGWAAi8cWcgetEbZXHYXo9C2Fa4EEds/uSyS4cxmA==", + "version": "20.14.1", + "resolved": "https://registry.npmjs.org/@types/node/-/node-20.14.1.tgz", + "integrity": "sha512-T2MzSGEu+ysB/FkWfqmhV3PLyQlowdptmmgD20C6QxsS8Fmv5SjpZ1ayXaEC0S21/h5UJ9iA6W/5vSNU5l00OA==", "license": "MIT", "optional": true, "dependencies": { From ba14ee0e5e359ab67f7251c2c41ed12dc363251a Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 4 Jun 2024 04:17:26 -0700 Subject: [PATCH 06/47] fix ghcr image names --- bin/build_docker.sh | 10 +++++----- bin/release_docker.sh | 8 ++++---- 2 files changed, 9 insertions(+), 9 deletions(-) 
diff --git a/bin/build_docker.sh b/bin/build_docker.sh index ac8cca65..2be9a28d 100755 --- a/bin/build_docker.sh +++ b/bin/build_docker.sh @@ -84,8 +84,8 @@ docker buildx build --platform "$SELECTED_PLATFORMS" --load . \ -t archivebox/archivebox:$GIT_SHA \ -t nikisweeting/archivebox:$TAG_NAME \ -t nikisweeting/archivebox:$GIT_SHA \ - -t ghcr.io/archivebox/archivebox/archivebox:$TAG_NAME \ - -t ghcr.io/archivebox/archivebox/archivebox:$GIT_SHA + -t ghcr.io/archivebox/archivebox:$TAG_NAME \ + -t ghcr.io/archivebox/archivebox:$GIT_SHA # -t archivebox/archivebox \ # -t archivebox/archivebox:$VERSION \ # -t archivebox/archivebox:$SHORT_VERSION \ @@ -94,6 +94,6 @@ docker buildx build --platform "$SELECTED_PLATFORMS" --load . \ # -t nikisweeting/archivebox:$VERSION \ # -t nikisweeting/archivebox:$SHORT_VERSION \ # -t nikisweeting/archivebox:latest \ - # -t ghcr.io/archivebox/archivebox/archivebox:$VERSION \ - # -t ghcr.io/archivebox/archivebox/archivebox:$SHORT_VERSION \ - # -t ghcr.io/archivebox/archivebox/archivebox:latest + # -t ghcr.io/archivebox/archivebox:$VERSION \ + # -t ghcr.io/archivebox/archivebox:$SHORT_VERSION \ + # -t ghcr.io/archivebox/archivebox:latest diff --git a/bin/release_docker.sh b/bin/release_docker.sh index f3095c44..a40e0c4a 100755 --- a/bin/release_docker.sh +++ b/bin/release_docker.sh @@ -35,8 +35,8 @@ docker buildx build --platform "$SELECTED_PLATFORMS" --push . \ -t archivebox/archivebox:$GIT_SHA \ -t nikisweeting/archivebox:$TAG_NAME \ -t nikisweeting/archivebox:$GIT_SHA \ - -t ghcr.io/archivebox/archivebox/archivebox:$TAG_NAME \ - -t ghcr.io/archivebox/archivebox/archivebox:$GIT_SHA + -t ghcr.io/archivebox/archivebox:$TAG_NAME \ + -t ghcr.io/archivebox/archivebox:$GIT_SHA # -t archivebox/archivebox \ # -t archivebox/archivebox:$VERSION \ # -t archivebox/archivebox:$SHORT_VERSION \ @@ -45,6 +45,6 @@ docker buildx build --platform "$SELECTED_PLATFORMS" --push . 
\ # -t nikisweeting/archivebox:$VERSION \ # -t nikisweeting/archivebox:$SHORT_VERSION \ # -t nikisweeting/archivebox:latest \ - # -t ghcr.io/archivebox/archivebox/archivebox:$VERSION \ - # -t ghcr.io/archivebox/archivebox/archivebox:$SHORT_VERSION \ + # -t ghcr.io/archivebox/archivebox:$VERSION \ + # -t ghcr.io/archivebox/archivebox:$SHORT_VERSION \ From 1b39430951c2539e33673405c940a281fcfc9eec Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Thu, 11 Jul 2024 16:21:06 -0700 Subject: [PATCH 07/47] Update README.md add saltbox app platform link --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 44e8796c..a2bbdc8e 100644 --- a/README.md +++ b/README.md @@ -411,6 +411,7 @@ See below for usage examples using the CLI, W
  • UnRaid
  • Yunohost
  • Cloudron
  • +
  • Saltbox
  • AppImage
  • Runtipi
  • Umbrel (need contributors...)
  • From c384d15d5d48d45c7a96dfeda37c83d1c06f98a1 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Thu, 11 Jul 2024 18:00:56 -0700 Subject: [PATCH 08/47] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a2bbdc8e..5136d29f 100644 --- a/README.md +++ b/README.md @@ -568,7 +568,7 @@ ls ./archive/*/index.html # or inspect snapshot data directly on the filesystem
    -🖥  Web UI Usage +🖥  Web UI & API Usage
    
     # Start the server on bare metal (pip/apt/brew/etc):
     archivebox manage createsuperuser              # create a new admin user via CLI
    
    From 4642ccdae85113736554fe7e954c239b27c5991b Mon Sep 17 00:00:00 2001
    From: Nick Sweeting 
    Date: Thu, 11 Jul 2024 18:04:47 -0700
    Subject: [PATCH 09/47] Update README.md
    
    ---
     README.md | 4 ++--
     1 file changed, 2 insertions(+), 2 deletions(-)
    
    diff --git a/README.md b/README.md
    index 5136d29f..22199840 100644
    --- a/README.md
    +++ b/README.md
    @@ -757,8 +757,8 @@ The configuration is documented here: **[Configuration Wiki](https://github.com/
     # e.g. archivebox config --set TIMEOUT=120
     # or   docker compose run archivebox config --set TIMEOUT=120
     
    -TIMEOUT=120 # default: 60 add more seconds on slower networks -CHECK_SSL_VALIDITY=True # default: False True = allow saving URLs w/ bad SSL +TIMEOUT=240 # default: 60 add more seconds on slower networks +CHECK_SSL_VALIDITY=False # default: True False = allow saving URLs w/ bad SSL SAVE_ARCHIVE_DOT_ORG=False # default: True False = disable Archive.org saving MAX_MEDIA_SIZE=1500m # default: 750m raise/lower youtubedl output size
    From 62eb87c51a01a083bbc5d0f7ff2ab74867fc351a Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Thu, 11 Jul 2024 18:06:50 -0700 Subject: [PATCH 10/47] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 22199840..6a22b13a 100644 --- a/README.md +++ b/README.md @@ -777,7 +777,7 @@ CURL_USER_AGENT="Mozilla/5.0 ..." To achieve high-fidelity archives in as many situations as possible, ArchiveBox depends on a variety of 3rd-party libraries and tools that specialize in extracting different types of content. -> Under-the-hood, ArchiveBox uses [Django](https://www.djangoproject.com/start/overview/) to power its [Web UI](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#ui-usage) and [SQlite](https://www.sqlite.org/locrsf.html) + the filesystem to provide [fast & durable metadata storage](https://www.sqlite.org/locrsf.html) w/ [determinisitc upgrades](https://stackoverflow.com/a/39976321/2156113). +> Under-the-hood, ArchiveBox uses [Django](https://www.djangoproject.com/start/overview/) to power its [Web UI](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#ui-usage), [Django Ninja](https://django-ninja.dev/) for the REST API, and [SQlite](https://www.sqlite.org/locrsf.html) + the filesystem to provide [fast & durable metadata storage](https://www.sqlite.org/locrsf.html) w/ [determinisitc upgrades](https://stackoverflow.com/a/39976321/2156113). ArchiveBox bundles industry-standard tools like [Google Chrome](https://github.com/ArchiveBox/ArchiveBox/wiki/Chromium-Install), [`wget`, `yt-dlp`, `readability`, etc.](#dependencies) internally, and its operation can be [tuned, secured, and extended](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration) as-needed for many different applications. 
From 201a5e625d87b32a71f723cde7d36ff5d69aa90b Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Thu, 11 Jul 2024 18:10:41 -0700 Subject: [PATCH 11/47] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 6a22b13a..ba334428 100644 --- a/README.md +++ b/README.md @@ -786,7 +786,7 @@ ArchiveBox bundles industry-standard tools like [Google Chrome](https://github.c Expand to learn more about ArchiveBox's internals & dependencies...
    -

    TIP: For better security, easier updating, and to avoid polluting your host system with extra dependencies,it is strongly recommended to use the ⭐️ official Docker image with everything pre-installed for the best experience.

    +

    TIP: For better security while running ArchiveBox, and to avoid polluting your host system with a bunch of sub-dependencies that you need to keep up-to-date, it is strongly recommended to use the ⭐️ official Docker image which provides everything in an easy container with simple one-liner upgrades.

    These optional dependencies used for archiving sites include: From fb210e279b8dc5fb02c9d0f167594671a717159c Mon Sep 17 00:00:00 2001 From: Ben Harris Date: Sun, 4 Aug 2024 14:11:28 -0400 Subject: [PATCH 12/47] fix typo --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index ba334428..09202d70 100644 --- a/README.md +++ b/README.md @@ -777,7 +777,7 @@ CURL_USER_AGENT="Mozilla/5.0 ..." To achieve high-fidelity archives in as many situations as possible, ArchiveBox depends on a variety of 3rd-party libraries and tools that specialize in extracting different types of content. -> Under-the-hood, ArchiveBox uses [Django](https://www.djangoproject.com/start/overview/) to power its [Web UI](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#ui-usage), [Django Ninja](https://django-ninja.dev/) for the REST API, and [SQlite](https://www.sqlite.org/locrsf.html) + the filesystem to provide [fast & durable metadata storage](https://www.sqlite.org/locrsf.html) w/ [determinisitc upgrades](https://stackoverflow.com/a/39976321/2156113). +> Under-the-hood, ArchiveBox uses [Django](https://www.djangoproject.com/start/overview/) to power its [Web UI](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#ui-usage), [Django Ninja](https://django-ninja.dev/) for the REST API, and [SQlite](https://www.sqlite.org/locrsf.html) + the filesystem to provide [fast & durable metadata storage](https://www.sqlite.org/locrsf.html) w/ [deterministic upgrades](https://stackoverflow.com/a/39976321/2156113). ArchiveBox bundles industry-standard tools like [Google Chrome](https://github.com/ArchiveBox/ArchiveBox/wiki/Chromium-Install), [`wget`, `yt-dlp`, `readability`, etc.](#dependencies) internally, and its operation can be [tuned, secured, and extended](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration) as-needed for many different applications. 
From 15ea392864238c19d969b0ef49348ee7b8a17ae5 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 12 Aug 2024 17:23:24 -0400 Subject: [PATCH 13/47] Clarify 501c3 status is as an FSP --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 09202d70..84221ed2 100644 --- a/README.md +++ b/README.md @@ -154,7 +154,7 @@ ArchiveBox is free for everyone to self-host, but we also provide support, secur > ***[Contact us](https://zulip.archivebox.io/#narrow/stream/167-enterprise/topic/welcome/near/1191102)** if your org wants help using ArchiveBox professionally.* (we are also seeking [grant funding](https://github.com/ArchiveBox/ArchiveBox/issues/1126#issuecomment-1487431394)) > We offer: setup & support, CAPTCHA/ratelimit unblocking, SSO, audit logging/chain-of-custody, and more -> *ArchiveBox has 🏛️ 501(c)(3) [nonprofit status](https://hackclub.com/hcb/) and all our work supports open-source development.* +> *ArchiveBox is a 🏛️ 501(c)(3) [nonprofit FSP](https://hackclub.com/hcb/) and all our work supports open-source development.*
    @@ -1609,7 +1609,7 @@ Extractors take the URL of a page to archive, write their output to the filesyst  
    -ArchiveBox operates as a US 501(c)(3) nonprofit (sponsored by HCB), direct donations are tax-deductible. +ArchiveBox operates as a US 501(c)(3) nonprofit FSP (sponsored by HCB), direct donations are tax-deductible.

        From fbb58c51a7bf8488a6ec019009da081228b9cb42 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 12 Aug 2024 23:36:45 -0700 Subject: [PATCH 14/47] Add portainer template to install options list --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 84221ed2..93ade3b0 100644 --- a/README.md +++ b/README.md @@ -412,6 +412,7 @@ See below for usage examples using the CLI, W
  • Yunohost
  • Cloudron
  • Saltbox
  • +
  • Portainer
  • AppImage
  • Runtipi
  • Umbrel (need contributors...)
  • From b7745ae37a7e0d68be06490078ca1ecb69408986 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Sat, 17 Aug 2024 19:30:58 -0700 Subject: [PATCH 15/47] add uri salt when creating ABID --- archivebox/abid_utils/abid.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/archivebox/abid_utils/abid.py b/archivebox/abid_utils/abid.py index 48597813..23d6dec5 100644 --- a/archivebox/abid_utils/abid.py +++ b/archivebox/abid_utils/abid.py @@ -21,6 +21,11 @@ ABID_RAND_LEN = 6 DEFAULT_ABID_PREFIX = 'obj_' +# allows people to keep their uris secret on a per-instance basis by changing the salt. +# the default means everyone can share the same namespace for URI hashes, +# meaning anyone who has a URI and wants to check if you have it can guess the ABID +DEFAULT_ABID_URI_SALT = '687c2fff14e3a7780faa5a40c237b19b5b51b089' + class ABID(NamedTuple): """ @@ -97,7 +102,7 @@ class ABID(NamedTuple): #################################################### -def uri_hash(uri: Union[str, bytes]) -> str: +def uri_hash(uri: Union[str, bytes], salt: str=DEFAULT_ABID_URI_SALT) -> str: """ 'E4A5CCD9AF4ED2A6E0954DF19FD274E9CDDB4853051F033FD518BFC90AA1AC25' """ @@ -115,7 +120,7 @@ def uri_hash(uri: Union[str, bytes]) -> str: except AttributeError: pass - uri_bytes = uri_str.encode('utf-8') + uri_bytes = uri_str.encode('utf-8') + salt.encode('utf-8') return hashlib.sha256(uri_bytes).hexdigest().upper() @@ -130,12 +135,12 @@ def abid_part_from_prefix(prefix: Optional[str]) -> str: assert len(prefix) == 3 return prefix + '_' -def abid_part_from_uri(uri: str) -> str: +def abid_part_from_uri(uri: str, salt: str=DEFAULT_ABID_URI_SALT) -> str: """ 'E4A5CCD9' # takes first 8 characters of sha256(url) """ uri = str(uri) - return uri_hash(uri)[:ABID_URI_LEN] + return uri_hash(uri, salt=salt)[:ABID_URI_LEN] def abid_part_from_ts(ts: Optional[datetime]) -> str: """ From 5e0cc926f1ecb23956b7c4b307be89dcceb54700 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Sat, 17 Aug 2024 
19:31:18 -0700 Subject: [PATCH 16/47] show ulid in archivebox admin --- archivebox/api/models.py | 5 +++++ archivebox/core/admin.py | 30 +++++++++++++++++------------- archivebox/core/models.py | 2 +- 3 files changed, 23 insertions(+), 14 deletions(-) diff --git a/archivebox/api/models.py b/archivebox/api/models.py index 177b275f..d8598002 100644 --- a/archivebox/api/models.py +++ b/archivebox/api/models.py @@ -56,6 +56,7 @@ class APIToken(ABIDModel): return { "TYPE": "APIToken", "uuid": str(self.id), + "ulid": str(self.ulid), "abid": str(self.get_abid()), "user_id": str(self.user.id), "user_username": self.user.username, @@ -64,6 +65,10 @@ class APIToken(ABIDModel): "expires": self.expires_as_iso8601, } + @property + def ulid(self): + return self.get_abid().ulid + @property def expires_as_iso8601(self): """Returns the expiry date of the token in ISO 8601 format or a date 100 years in the future if none.""" diff --git a/archivebox/core/admin.py b/archivebox/core/admin.py index 4bcbc222..7e1aa7f9 100644 --- a/archivebox/core/admin.py +++ b/archivebox/core/admin.py @@ -168,26 +168,30 @@ def get_abid_info(self, obj): return format_html( # URL Hash: {}
    ''' -     ABID:  {}
    -     TS:                  {} ({})
    -     URI:                 {} ({})
    -     SUBTYPE:       {} ({})
    -     RAND:              {} ({})

    -     ABID AS UUID:  {}    

    - -     .uuid:                 {}    
    -     .id:                      {}    
    -     .pk:                     {}    

    +     DB ID:      {}
    +        .id:                      {}    
    +        .uuid:                  {}    
    +
    +
    +     ABID:       {}
    +         TS:                  {} ({})
    +         URI:                 {} ({})
    +         SUBTYPE:       {} ({})     +         RAND:              {} ({})

    +         as ULID:               {}
    +         as UUID:              {}

    +
    ''', + obj.pk, + obj.id, + obj.uuid, obj.abid, obj.ABID.ts, obj.abid_values['ts'].isoformat() if isinstance(obj.abid_values['ts'], datetime) else obj.abid_values['ts'], obj.ABID.uri, str(obj.abid_values['uri']), obj.ABID.subtype, str(obj.abid_values['subtype']), obj.ABID.rand, str(obj.abid_values['rand'])[-7:], + obj.ABID.ulid, obj.ABID.uuid, - obj.uuid, - obj.id, - obj.pk, ) diff --git a/archivebox/core/models.py b/archivebox/core/models.py index 1b896217..2c9a9969 100644 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -326,8 +326,8 @@ class ArchiveResult(ABIDModel): abid_rand_src = 'self.uuid' EXTRACTOR_CHOICES = EXTRACTOR_CHOICES + id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID') # legacy pk TODO: move to UUIDField # id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False) - id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID') # legacy pk uuid = models.UUIDField(blank=True, null=True, editable=True, unique=True) abid = ABIDField(prefix=abid_prefix) From 3a87a7fb8cbb4542c3e510c41725ebe6d3cab2e6 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Sat, 17 Aug 2024 19:31:34 -0700 Subject: [PATCH 17/47] re-arrange snapshot detail page preference and add opus support --- archivebox/core/views.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/archivebox/core/views.py b/archivebox/core/views.py index 3b491b8e..7e14e8c1 100644 --- a/archivebox/core/views.py +++ b/archivebox/core/views.py @@ -104,12 +104,13 @@ class SnapshotView(View): 'webm', 'mp4', 'mp3', + 'opus', 'pdf', 'md', } - # iterate through all the files in the snapshot dir and add the biggest ones to the result list + # iterate through all the files in the snapshot dir and add the biggest ones to1 the result list snap_dir = Path(snapshot.link_dir) for result_file in (*snap_dir.glob('*'), *snap_dir.glob('*/*')): extension = result_file.suffix.lstrip('.').lower() @@ -128,7 +129,7 @@ class 
SnapshotView(View): 'size': file_size, } - preferred_types = ('singlefile', 'wget', 'screenshot', 'dom', 'media', 'pdf', 'readability', 'mercury') + preferred_types = ('singlefile', 'screenshot', 'wget', 'dom', 'media', 'pdf', 'readability', 'mercury') all_types = preferred_types + tuple(result_type for result_type in archiveresults.keys() if result_type not in preferred_types) best_result = {'path': 'None'} From 6a33de671524e9b721b05c8107b7c6206ecf4c15 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Sat, 17 Aug 2024 19:31:43 -0700 Subject: [PATCH 18/47] remove archive.today link and add JSON --- archivebox/templates/core/snapshot_live.html | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/archivebox/templates/core/snapshot_live.html b/archivebox/templates/core/snapshot_live.html index 73af92a5..b28c11c5 100644 --- a/archivebox/templates/core/snapshot_live.html +++ b/archivebox/templates/core/snapshot_live.html @@ -385,9 +385,10 @@
    From fba3995d86e225eea17c43194934c175af517f17 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Sat, 17 Aug 2024 19:38:51 -0700 Subject: [PATCH 19/47] reorder version output --- archivebox/config.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/archivebox/config.py b/archivebox/config.py index 0151c3c2..afa334c6 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -1036,6 +1036,11 @@ def get_data_locations(config: ConfigDict) -> ConfigValue: 'enabled': True, 'is_valid': config['SOURCES_DIR'].exists(), }, + 'PERSONAS_DIR': { + 'path': config['PERSONAS_DIR'].resolve(), + 'enabled': True, + 'is_valid': config['PERSONAS_DIR'].exists(), + }, 'LOGS_DIR': { 'path': config['LOGS_DIR'].resolve(), 'enabled': True, @@ -1051,11 +1056,6 @@ def get_data_locations(config: ConfigDict) -> ConfigValue: 'enabled': bool(config['CUSTOM_TEMPLATES_DIR']), 'is_valid': config['CUSTOM_TEMPLATES_DIR'] and Path(config['CUSTOM_TEMPLATES_DIR']).exists(), }, - 'PERSONAS_DIR': { - 'path': config['PERSONAS_DIR'].resolve(), - 'enabled': True, - 'is_valid': config['PERSONAS_DIR'].exists(), - }, # managed by bin/docker_entrypoint.sh and python-crontab: # 'CRONTABS_DIR': { # 'path': config['CRONTABS_DIR'].resolve(), From 4d0bbfccfc78d41efd0aa86514460e06064668d0 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Sat, 17 Aug 2024 21:56:23 -0700 Subject: [PATCH 20/47] add uri salt and fix api url namespaces --- archivebox/abid_utils/abid.py | 4 ++++ archivebox/api/v1_api.py | 2 +- archivebox/api/v1_core.py | 18 +++++++++--------- archivebox/core/admin.py | 23 +++++++++++++---------- archivebox/core/models.py | 28 +++++++++++++++++++++++++++- archivebox/core/urls.py | 2 +- 6 files changed, 55 insertions(+), 22 deletions(-) diff --git a/archivebox/abid_utils/abid.py b/archivebox/abid_utils/abid.py index 23d6dec5..f6f9d153 100644 --- a/archivebox/abid_utils/abid.py +++ b/archivebox/abid_utils/abid.py @@ -72,6 +72,10 @@ class ABID(NamedTuple): 
subtype=suffix[18:20].upper(), rand=suffix[20:26].upper(), ) + + @property + def uri_salt(self) -> str: + return DEFAULT_ABID_URI_SALT @property def suffix(self): diff --git a/archivebox/api/v1_api.py b/archivebox/api/v1_api.py index 4fa5d94b..546ef8a0 100644 --- a/archivebox/api/v1_api.py +++ b/archivebox/api/v1_api.py @@ -63,7 +63,7 @@ api = NinjaAPIWithIOCapture( version='1.0.0', csrf=False, auth=API_AUTH_METHODS, - urls_namespace="api", + urls_namespace="api-1", docs=Swagger(settings={"persistAuthorization": True}), # docs_decorator=login_required, # renderer=ORJSONRenderer(), diff --git a/archivebox/api/v1_core.py b/archivebox/api/v1_core.py index 9046c361..595ec047 100644 --- a/archivebox/api/v1_core.py +++ b/archivebox/api/v1_core.py @@ -33,7 +33,7 @@ class ArchiveResultSchema(Schema): snapshot_tags: str extractor: str - cmd_version: str + cmd_version: Optional[str] cmd: List[str] pwd: str status: str @@ -93,16 +93,16 @@ class ArchiveResultFilterSchema(FilterSchema): created__lt: Optional[datetime] = Field(None, q='updated__lt') -@router.get("/archiveresults", response=List[ArchiveResultSchema]) +@router.get("/archiveresults", response=List[ArchiveResultSchema], url_name="get_archiveresult") @paginate -def list_archiveresults(request, filters: ArchiveResultFilterSchema = Query(...)): +def get_archiveresults(request, filters: ArchiveResultFilterSchema = Query(...)): """List all ArchiveResult entries matching these filters.""" qs = ArchiveResult.objects.all() results = filters.filter(qs) return results -@router.get("/archiveresult/{archiveresult_id}", response=ArchiveResultSchema) +@router.get("/archiveresult/{archiveresult_id}", response=ArchiveResultSchema, url_name="get_archiveresult") def get_archiveresult(request, archiveresult_id: str): """Get a specific ArchiveResult by abid, uuid, or pk.""" return ArchiveResult.objects.get(Q(pk__icontains=archiveresult_id) | Q(abid__icontains=archiveresult_id) | Q(uuid__icontains=archiveresult_id)) @@ -211,9 +211,9 @@ 
class SnapshotFilterSchema(FilterSchema): -@router.get("/snapshots", response=List[SnapshotSchema]) +@router.get("/snapshots", response=List[SnapshotSchema], url_name="get_snapshots") @paginate -def list_snapshots(request, filters: SnapshotFilterSchema = Query(...), with_archiveresults: bool=True): +def get_snapshots(request, filters: SnapshotFilterSchema = Query(...), with_archiveresults: bool=True): """List all Snapshot entries matching these filters.""" request.with_archiveresults = with_archiveresults @@ -221,7 +221,7 @@ def list_snapshots(request, filters: SnapshotFilterSchema = Query(...), with_arc results = filters.filter(qs) return results -@router.get("/snapshot/{snapshot_id}", response=SnapshotSchema) +@router.get("/snapshot/{snapshot_id}", response=SnapshotSchema, url_name="get_snapshot") def get_snapshot(request, snapshot_id: str, with_archiveresults: bool=True): """Get a specific Snapshot by abid, uuid, or pk.""" request.with_archiveresults = with_archiveresults @@ -286,6 +286,6 @@ class TagSchema(Schema): def resolve_created_by_id(obj): return str(obj.created_by_id) -@router.get("/tags", response=List[TagSchema]) -def list_tags(request): +@router.get("/tags", response=List[TagSchema], url_name="get_tags") +def get_tags(request): return Tag.objects.all() diff --git a/archivebox/core/admin.py b/archivebox/core/admin.py index 7e1aa7f9..3fa86816 100644 --- a/archivebox/core/admin.py +++ b/archivebox/core/admin.py @@ -168,28 +168,31 @@ def get_abid_info(self, obj): return format_html( # URL Hash: {}
    ''' -     DB ID:      {}
    -        .id:                      {}    
    -        .uuid:                  {}    
    +     DB ID:                {}
    +        .id:                    {}    
    +        .uuid:                {}    

    -     ABID:       {}
    -         TS:                  {} ({})
    -         URI:                 {} ({})
    +     ABID:         {}_{}                            /api/v1 GET JSON     API DOCS
    +         TS:                  {}        ({})
    +         URI:                 {}           ({})
            SUBTYPE:       {} ({})     -         RAND:              {} ({})

    -         as ULID:               {}
    -         as UUID:              {}

    +   RAND:   {} ({})     +   SALT:   {} +

    +         .ulid:                    {}
    +         .uuid:                   {}

    ''', obj.pk, obj.id, obj.uuid, - obj.abid, + *obj.abid.split('_', 1), obj.api_url, obj.api_docs_url, obj.ABID.ts, obj.abid_values['ts'].isoformat() if isinstance(obj.abid_values['ts'], datetime) else obj.abid_values['ts'], obj.ABID.uri, str(obj.abid_values['uri']), obj.ABID.subtype, str(obj.abid_values['subtype']), obj.ABID.rand, str(obj.abid_values['rand'])[-7:], + obj.ABID.uri_salt, obj.ABID.ulid, obj.ABID.uuid, ) diff --git a/archivebox/core/models.py b/archivebox/core/models.py index 2c9a9969..dfd826f2 100644 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -14,7 +14,7 @@ from django.db import models from django.utils.functional import cached_property from django.utils.text import slugify from django.core.cache import cache -from django.urls import reverse +from django.urls import reverse, reverse_lazy from django.db.models import Case, When, Value, IntegerField from django.contrib.auth.models import User # noqa @@ -103,6 +103,15 @@ class Tag(ABIDModel): i = 1 if i is None else i+1 else: return super().save(*args, **kwargs) + + @property + def api_url(self) -> str: + # /api/v1/core/snapshot/{uulid} + return reverse_lazy('api-1:get_tag', args=[self.abid]) + + @property + def api_docs_url(self) -> str: + return f'/api/v1/docs#/Core%20Models/api_v1_core_get_tag' class Snapshot(ABIDModel): @@ -167,6 +176,15 @@ class Snapshot(ABIDModel): def icons(self) -> str: return snapshot_icons(self) + + @property + def api_url(self) -> str: + # /api/v1/core/snapshot/{uulid} + return reverse_lazy('api-1:get_snapshot', args=[self.abid]) + + @property + def api_docs_url(self) -> str: + return f'/api/v1/docs#/Core%20Models/api_v1_core_get_snapshot' @cached_property def extension(self) -> str: @@ -353,6 +371,14 @@ class ArchiveResult(ABIDModel): def snapshot_dir(self): return Path(self.snapshot.link_dir) + @property + def api_url(self) -> str: + # /api/v1/core/archiveresult/{uulid} + return reverse_lazy('api-1:get_archiveresult', args=[self.abid]) + + 
@property + def api_docs_url(self) -> str: + return f'/api/v1/docs#/Core%20Models/api_v1_core_get_archiveresult' @property def extractor_module(self): diff --git a/archivebox/core/urls.py b/archivebox/core/urls.py index 14b3d774..04382c99 100644 --- a/archivebox/core/urls.py +++ b/archivebox/core/urls.py @@ -38,7 +38,7 @@ urlpatterns = [ path('accounts/', include('django.contrib.auth.urls')), path('admin/', archivebox_admin.urls), - path("api/", include('api.urls')), + path("api/", include('api.urls'), name='api'), path('health/', HealthCheckView.as_view(), name='healthcheck'), path('error/', lambda *_: 1/0), From 033ec08d0cea4909fed04ad876cdcb8039e547f4 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Sat, 17 Aug 2024 21:56:45 -0700 Subject: [PATCH 21/47] save snapshot ids during migration --- archivebox/core/migrations/0024_auto_20240513_1143.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/archivebox/core/migrations/0024_auto_20240513_1143.py b/archivebox/core/migrations/0024_auto_20240513_1143.py index 95652a07..e2192794 100644 --- a/archivebox/core/migrations/0024_auto_20240513_1143.py +++ b/archivebox/core/migrations/0024_auto_20240513_1143.py @@ -64,7 +64,9 @@ def generate_snapshot_abids(apps, schema_editor): snapshot.abid_rand_src = 'self.uuid' snapshot.abid = calculate_abid(snapshot) - snapshot.save(update_fields=["abid"]) + snapshot.uuid = snapshot.abid.uuid + snapshot.id = snapshot.abid.uuid + snapshot.save(update_fields=["abid", "uuid", "id"]) def generate_archiveresult_abids(apps, schema_editor): print(' Generating ArchiveResult.abid values... 
(may take an hour or longer for large collections...)') From f72debfdb2ed02262002e7d896477d9005488802 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Sat, 17 Aug 2024 22:58:35 -0700 Subject: [PATCH 22/47] migrate ArchiveResult.id to old_id, and make uuid main id --- archivebox/abid_utils/models.py | 4 ++-- archivebox/core/admin.py | 6 +++--- archivebox/core/models.py | 19 +++++++++++++------ 3 files changed, 18 insertions(+), 11 deletions(-) diff --git a/archivebox/abid_utils/models.py b/archivebox/abid_utils/models.py index de8b3c87..07fd3b3a 100644 --- a/archivebox/abid_utils/models.py +++ b/archivebox/abid_utils/models.py @@ -69,8 +69,8 @@ class ABIDModel(models.Model): abid_subtype_src = 'None' # e.g. 'self.extractor' abid_rand_src = 'None' # e.g. 'self.uuid' or 'self.id' - id = models.UUIDField(primary_key=True, default=uuid4, editable=True) - uuid = models.UUIDField(blank=True, null=True, editable=True, unique=True) + # id = models.UUIDField(primary_key=True, default=uuid4, editable=True) + # uuid = models.UUIDField(blank=True, null=True, editable=True, unique=True) abid = ABIDField(prefix=abid_prefix) created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk) diff --git a/archivebox/core/admin.py b/archivebox/core/admin.py index 3fa86816..cc773f78 100644 --- a/archivebox/core/admin.py +++ b/archivebox/core/admin.py @@ -185,8 +185,8 @@ def get_abid_info(self, obj): ''', obj.pk, - obj.id, - obj.uuid, + getattr(obj, 'id', str(getattr(obj, 'old_id', '')) + ' (.old_id)'), + getattr(obj, 'uuid', str(getattr(obj, 'id', '')) +' (.id)'), *obj.abid.split('_', 1), obj.api_url, obj.api_docs_url, obj.ABID.ts, obj.abid_values['ts'].isoformat() if isinstance(obj.abid_values['ts'], datetime) else obj.abid_values['ts'], obj.ABID.uri, str(obj.abid_values['uri']), @@ -204,8 +204,8 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin): sort_fields = ('title_str', 'url_str', 'added', 'files') 
readonly_fields = ('admin_actions', 'status_info', 'bookmarked', 'added', 'updated', 'created', 'modified', 'identifiers') search_fields = ('id', 'url', 'abid', 'uuid', 'timestamp', 'title', 'tags__name') - fields = ('url', 'timestamp', 'created_by', 'tags', 'title', *readonly_fields) list_filter = ('added', 'updated', 'tags', 'archiveresult__status', 'created_by') + fields = ('url', 'timestamp', 'created_by', 'tags', 'title', *readonly_fields) ordering = ['-added'] actions = ['add_tags', 'remove_tags', 'update_titles', 'update_snapshots', 'resnapshot_snapshot', 'overwrite_snapshots', 'delete_snapshots'] autocomplete_fields = ['tags'] diff --git a/archivebox/core/models.py b/archivebox/core/models.py index dfd826f2..3f496bcc 100644 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -5,6 +5,7 @@ from typing import Optional, List, Dict from django_stubs_ext.db.models import TypedModelMeta import json +import random import uuid from uuid import uuid4 @@ -60,7 +61,7 @@ class Tag(ABIDModel): # id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=True) id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID') - uuid = models.UUIDField(blank=True, null=True, editable=True, unique=True) + uuid = models.UUIDField(default=uuid.uuid4, editable=True, unique=True) abid = ABIDField(prefix=abid_prefix) @@ -122,7 +123,7 @@ class Snapshot(ABIDModel): abid_rand_src = 'self.id' id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False) # legacy pk - uuid = models.UUIDField(blank=True, null=True, editable=True, unique=True) + uuid = models.UUIDField(default=uuid.uuid4, editable=True, unique=True) abid = ABIDField(prefix=abid_prefix) url = models.URLField(unique=True, db_index=True) @@ -335,18 +336,19 @@ class ArchiveResultManager(models.Manager): qs = qs.annotate(indexing_precedence=Case(*precedence, default=Value(1000),output_field=IntegerField())).order_by('indexing_precedence') return qs +def rand_int_id(): + return 
random.getrandbits(32) class ArchiveResult(ABIDModel): abid_prefix = 'res_' abid_ts_src = 'self.snapshot.added' abid_uri_src = 'self.snapshot.url' abid_subtype_src = 'self.extractor' - abid_rand_src = 'self.uuid' + abid_rand_src = 'self.id' EXTRACTOR_CHOICES = EXTRACTOR_CHOICES - id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID') # legacy pk TODO: move to UUIDField - # id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False) - uuid = models.UUIDField(blank=True, null=True, editable=True, unique=True) + old_id = models.BigIntegerField(default=rand_int_id, serialize=False, verbose_name='ID') + id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=True, unique=True) abid = ABIDField(prefix=abid_prefix) snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE) @@ -363,10 +365,15 @@ class ArchiveResult(ABIDModel): class Meta(TypedModelMeta): verbose_name = 'Result' + def __str__(self): return self.extractor + def save(self, *args, **kwargs): + super().save(*args, **kwargs) + assert str(self.id) == str(self.abid.uuid) + @cached_property def snapshot_dir(self): return Path(self.snapshot.link_dir) From a271bcb4cecfeb242d258f14fd8b73ad66ea2563 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Sat, 17 Aug 2024 23:03:43 -0700 Subject: [PATCH 23/47] use new ids for ArchiveResult API --- archivebox/api/v1_core.py | 12 ++++++------ archivebox/core/admin.py | 5 ++++- archivebox/core/models.py | 4 ++-- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/archivebox/api/v1_core.py b/archivebox/api/v1_core.py index 595ec047..2bff6fa4 100644 --- a/archivebox/api/v1_core.py +++ b/archivebox/api/v1_core.py @@ -22,8 +22,9 @@ router = Router(tags=['Core Models']) class ArchiveResultSchema(Schema): abid: str - uuid: UUID - pk: str + id: UUID + # old_id: int + modified: datetime created: datetime created_by_id: str @@ -73,8 +74,7 @@ class ArchiveResultSchema(Schema): class ArchiveResultFilterSchema(FilterSchema): 
- uuid: Optional[UUID] = Field(None, q='uuid') - # abid: Optional[str] = Field(None, q='abid') + id: Optional[UUID] = Field(None, q='id') search: Optional[str] = Field(None, q=['snapshot__url__icontains', 'snapshot__title__icontains', 'snapshot__tags__name__icontains', 'extractor', 'output__icontains']) snapshot_uuid: Optional[UUID] = Field(None, q='snapshot_uuid__icontains') @@ -104,8 +104,8 @@ def get_archiveresults(request, filters: ArchiveResultFilterSchema = Query(...)) @router.get("/archiveresult/{archiveresult_id}", response=ArchiveResultSchema, url_name="get_archiveresult") def get_archiveresult(request, archiveresult_id: str): - """Get a specific ArchiveResult by abid, uuid, or pk.""" - return ArchiveResult.objects.get(Q(pk__icontains=archiveresult_id) | Q(abid__icontains=archiveresult_id) | Q(uuid__icontains=archiveresult_id)) + """Get a specific ArchiveResult by pk, abid, or old_id.""" + return ArchiveResult.objects.get(Q(pk__icontains=archiveresult_id) | Q(abid__icontains=archiveresult_id) | Q(old_id__icontains=archiveresult_id)) # @router.post("/archiveresult", response=ArchiveResultSchema) diff --git a/archivebox/core/admin.py b/archivebox/core/admin.py index cc773f78..9ee2930c 100644 --- a/archivebox/core/admin.py +++ b/archivebox/core/admin.py @@ -511,7 +511,10 @@ class ArchiveResultAdmin(admin.ModelAdmin): ) def identifiers(self, obj): - return get_abid_info(self, obj) + try: + return get_abid_info(self, obj) + except Exception as e: + return str(e) @admin.display( description='Snapshot Tags' diff --git a/archivebox/core/models.py b/archivebox/core/models.py index 3f496bcc..605cf3e8 100644 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -347,8 +347,8 @@ class ArchiveResult(ABIDModel): abid_rand_src = 'self.id' EXTRACTOR_CHOICES = EXTRACTOR_CHOICES - old_id = models.BigIntegerField(default=rand_int_id, serialize=False, verbose_name='ID') - id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=True, unique=True) + 
old_id = models.BigIntegerField(default=rand_int_id, serialize=False, verbose_name='Old ID') + id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=True, unique=True, verbose_name='ID') abid = ABIDField(prefix=abid_prefix) snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE) From 951025228f2a10cdea8c07afe7597e6b769b0bad Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Sun, 18 Aug 2024 00:24:14 -0700 Subject: [PATCH 24/47] add tag endpoint --- archivebox/api/v1_core.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/archivebox/api/v1_core.py b/archivebox/api/v1_core.py index 2bff6fa4..9881fb97 100644 --- a/archivebox/api/v1_core.py +++ b/archivebox/api/v1_core.py @@ -289,3 +289,7 @@ class TagSchema(Schema): @router.get("/tags", response=List[TagSchema], url_name="get_tags") def get_tags(request): return Tag.objects.all() + +@router.get("/tag/{tag_id}", response=TagSchema, url_name="get_tag") +def get_tag(request, tag_id: str): + return Tag.objects.get(id=tag_id) From 8c50257fe991a34b39d403ca248edb9d1f7bfcd8 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Sun, 18 Aug 2024 00:24:38 -0700 Subject: [PATCH 25/47] move snapshot id to old_id --- archivebox/core/admin.py | 18 +++++++++++++++++- archivebox/core/models.py | 12 ++++++++---- archivebox/index/schema.py | 2 +- 3 files changed, 26 insertions(+), 6 deletions(-) diff --git a/archivebox/core/admin.py b/archivebox/core/admin.py index 9ee2930c..e48e95a6 100644 --- a/archivebox/core/admin.py +++ b/archivebox/core/admin.py @@ -20,7 +20,7 @@ from signal_webhooks.admin import WebhookAdmin, get_webhook_model from ..util import htmldecode, urldecode, ansi_to_html -from core.models import Snapshot, ArchiveResult, Tag +from core.models import Snapshot, ArchiveResult, Tag, SnapshotTag from core.forms import AddLinkForm from core.mixins import SearchResultsAdminMixin @@ -125,9 +125,14 @@ archivebox_admin.get_urls = get_urls(archivebox_admin.get_urls).__get__(archiveb class 
ArchiveResultInline(admin.TabularInline): model = ArchiveResult + fk_name = 'snapshot' class TagInline(admin.TabularInline): model = Snapshot.tags.through + # fk_name = 'snapshottag' + + def identifiers(self, obj): + return '-' from django.contrib.admin.helpers import ActionForm from django.contrib.admin.widgets import AutocompleteSelectMultiple @@ -449,6 +454,17 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin): +# @admin.register(SnapshotTag, site=archivebox_admin) +# class SnapshotTagAdmin(admin.ModelAdmin): +# list_display = ('id', 'snapshot', 'tag') +# sort_fields = ('id', 'snapshot', 'tag') +# search_fields = ('id', 'snapshot_id', 'tag_id') +# fields = ('snapshot', 'id') +# actions = ['delete_selected'] +# ordering = ['-id'] + +# def identifiers(self, obj): +# return get_abid_info(self, obj) @admin.register(Tag, site=archivebox_admin) diff --git a/archivebox/core/models.py b/archivebox/core/models.py index 605cf3e8..5844c9a6 100644 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -114,16 +114,19 @@ class Tag(ABIDModel): def api_docs_url(self) -> str: return f'/api/v1/docs#/Core%20Models/api_v1_core_get_tag' +class SnapshotTag(models.Model): + snapshot = models.OneToOneField('Snapshot', primary_key=True, on_delete=models.CASCADE, to_field='id') + tag = models.ForeignKey(Tag, on_delete=models.CASCADE, to_field='id') class Snapshot(ABIDModel): abid_prefix = 'snp_' abid_ts_src = 'self.added' abid_uri_src = 'self.url' abid_subtype_src = '"01"' - abid_rand_src = 'self.id' + abid_rand_src = 'self.old_id' - id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False) # legacy pk - uuid = models.UUIDField(default=uuid.uuid4, editable=True, unique=True) + old_id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False) # legacy pk + id = models.UUIDField(default=uuid.uuid4, editable=True, unique=True) abid = ABIDField(prefix=abid_prefix) url = models.URLField(unique=True, db_index=True) @@ -351,7 +354,8 @@ 
class ArchiveResult(ABIDModel): id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=True, unique=True, verbose_name='ID') abid = ABIDField(prefix=abid_prefix) - snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE) + snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE, to_field='id') + extractor = models.CharField(choices=EXTRACTOR_CHOICES, max_length=32) cmd = models.JSONField() pwd = models.CharField(max_length=256) diff --git a/archivebox/index/schema.py b/archivebox/index/schema.py index c2644eb2..b30a9de8 100644 --- a/archivebox/index/schema.py +++ b/archivebox/index/schema.py @@ -266,7 +266,7 @@ class Link: @cached_property def snapshot(self): from core.models import Snapshot - return Snapshot.objects.only('uuid').get(url=self.url) + return Snapshot.objects.only('id').get(url=self.url) @cached_property def snapshot_id(self): From 57d31b2b14b28d461b74ad4016153e1f288ab6e0 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Sun, 18 Aug 2024 01:07:21 -0700 Subject: [PATCH 26/47] fix snapshot uuid --- archivebox/api/v1_core.py | 6 +++--- archivebox/core/admin.py | 5 ++++- archivebox/core/models.py | 5 ++++- archivebox/index/schema.py | 2 +- 4 files changed, 12 insertions(+), 6 deletions(-) diff --git a/archivebox/api/v1_core.py b/archivebox/api/v1_core.py index 9881fb97..fd944901 100644 --- a/archivebox/api/v1_core.py +++ b/archivebox/api/v1_core.py @@ -77,7 +77,7 @@ class ArchiveResultFilterSchema(FilterSchema): id: Optional[UUID] = Field(None, q='id') search: Optional[str] = Field(None, q=['snapshot__url__icontains', 'snapshot__title__icontains', 'snapshot__tags__name__icontains', 'extractor', 'output__icontains']) - snapshot_uuid: Optional[UUID] = Field(None, q='snapshot_uuid__icontains') + snapshot_id: Optional[UUID] = Field(None, q='snapshot_id__icontains') snapshot_url: Optional[str] = Field(None, q='snapshot__url__icontains') snapshot_tag: Optional[str] = Field(None, q='snapshot__tags__name__icontains') @@ -227,7 
+227,7 @@ def get_snapshot(request, snapshot_id: str, with_archiveresults: bool=True): request.with_archiveresults = with_archiveresults snapshot = None try: - snapshot = Snapshot.objects.get(Q(uuid__startswith=snapshot_id) | Q(abid__startswith=snapshot_id)| Q(pk__startswith=snapshot_id)) + snapshot = Snapshot.objects.get(Q(abid__startswith=snapshot_id)| Q(pk__startswith=snapshot_id)) except Snapshot.DoesNotExist: pass @@ -237,7 +237,7 @@ def get_snapshot(request, snapshot_id: str, with_archiveresults: bool=True): pass try: - snapshot = snapshot or Snapshot.objects.get(Q(uuid__icontains=snapshot_id) | Q(abid__icontains=snapshot_id)) + snapshot = snapshot or Snapshot.objects.get(Q(pk__icontains=snapshot_id) | Q(abid__icontains=snapshot_id)) except Snapshot.DoesNotExist: pass diff --git a/archivebox/core/admin.py b/archivebox/core/admin.py index e48e95a6..f441b9e5 100644 --- a/archivebox/core/admin.py +++ b/archivebox/core/admin.py @@ -294,7 +294,10 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin): ) def identifiers(self, obj): - return get_abid_info(self, obj) + try: + return get_abid_info(self, obj) + except Exception as e: + return str(e) @admin.display( description='Title', diff --git a/archivebox/core/models.py b/archivebox/core/models.py index 5844c9a6..09ad94fa 100644 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -140,6 +140,9 @@ class Snapshot(ABIDModel): keys = ('url', 'timestamp', 'title', 'tags', 'updated') + @property + def uuid(self): + return self.id def __repr__(self) -> str: title = self.title or '-' @@ -354,7 +357,7 @@ class ArchiveResult(ABIDModel): id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=True, unique=True, verbose_name='ID') abid = ABIDField(prefix=abid_prefix) - snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE, to_field='id') + snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE, to_field='id', db_column='snapshot_id') extractor = 
models.CharField(choices=EXTRACTOR_CHOICES, max_length=32) cmd = models.JSONField() diff --git a/archivebox/index/schema.py b/archivebox/index/schema.py index b30a9de8..5dfe4630 100644 --- a/archivebox/index/schema.py +++ b/archivebox/index/schema.py @@ -274,7 +274,7 @@ class Link: @cached_property def snapshot_uuid(self): - return str(self.snapshot.uuid) + return str(self.snapshot.id) @cached_property def snapshot_abid(self): From ff992a3edb6fe42f10a7b32d807ed74e3a6ee8f2 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Sun, 18 Aug 2024 09:47:42 -0400 Subject: [PATCH 27/47] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 93ade3b0..29512cfb 100644 --- a/README.md +++ b/README.md @@ -407,7 +407,7 @@ See below for usage examples using the CLI, W > *Warning: These are contributed by external volunteers and may lag behind the official `pip` channel.*
    From 18ebaed7ed28b43c5fa8a4c505ba8653e3c7f061 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 19 Aug 2024 18:34:01 -0700 Subject: [PATCH 30/47] fix admin UI display of tags --- archivebox/core/admin.py | 52 +++++++++++++++++++++++++--------------- 1 file changed, 33 insertions(+), 19 deletions(-) diff --git a/archivebox/core/admin.py b/archivebox/core/admin.py index f441b9e5..20f7ae39 100644 --- a/archivebox/core/admin.py +++ b/archivebox/core/admin.py @@ -126,13 +126,18 @@ archivebox_admin.get_urls = get_urls(archivebox_admin.get_urls).__get__(archiveb class ArchiveResultInline(admin.TabularInline): model = ArchiveResult fk_name = 'snapshot' + extra = 1 -class TagInline(admin.TabularInline): - model = Snapshot.tags.through - # fk_name = 'snapshottag' - - def identifiers(self, obj): - return '-' +class TagInline(admin.StackedInline): + model = SnapshotTag + # fk_name = 'snapshot' + fields = ('id', 'tag') + extra = 1 + # min_num = 1 + max_num = 1000 + autocomplete_fields = ( + 'tag', + ) from django.contrib.admin.helpers import ActionForm from django.contrib.admin.widgets import AutocompleteSelectMultiple @@ -173,7 +178,8 @@ def get_abid_info(self, obj): return format_html( # URL Hash: {}
    ''' -     DB ID:                {}
    +     DB PK:              {}
    +        .old_id:            {}    
           .id:                    {}    
           .uuid:                {}    

    @@ -190,8 +196,9 @@ def get_abid_info(self, obj): ''', obj.pk, - getattr(obj, 'id', str(getattr(obj, 'old_id', '')) + ' (.old_id)'), - getattr(obj, 'uuid', str(getattr(obj, 'id', '')) +' (.id)'), + getattr(obj, 'old_id', ''), + getattr(obj, 'id', ''), + getattr(obj, 'uuid', ''), *obj.abid.split('_', 1), obj.api_url, obj.api_docs_url, obj.ABID.ts, obj.abid_values['ts'].isoformat() if isinstance(obj.abid_values['ts'], datetime) else obj.abid_values['ts'], obj.ABID.uri, str(obj.abid_values['uri']), @@ -207,21 +214,28 @@ def get_abid_info(self, obj): class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin): list_display = ('added', 'title_str', 'files', 'size', 'url_str') sort_fields = ('title_str', 'url_str', 'added', 'files') - readonly_fields = ('admin_actions', 'status_info', 'bookmarked', 'added', 'updated', 'created', 'modified', 'identifiers') - search_fields = ('id', 'url', 'abid', 'uuid', 'timestamp', 'title', 'tags__name') - list_filter = ('added', 'updated', 'tags', 'archiveresult__status', 'created_by') - fields = ('url', 'timestamp', 'created_by', 'tags', 'title', *readonly_fields) + readonly_fields = ('tags', 'admin_actions', 'status_info', 'bookmarked', 'added', 'updated', 'created', 'modified', 'identifiers') + search_fields = ('id', 'url', 'abid', 'old_id', 'timestamp', 'title', 'tags__name') + list_filter = ('added', 'updated', 'archiveresult__status', 'created_by', 'tags') + fields = ('url', 'timestamp', 'created_by', 'title', *readonly_fields) ordering = ['-added'] actions = ['add_tags', 'remove_tags', 'update_titles', 'update_snapshots', 'resnapshot_snapshot', 'overwrite_snapshots', 'delete_snapshots'] autocomplete_fields = ['tags'] + # inlines = [TagInline, ArchiveResultInline] inlines = [ArchiveResultInline] list_per_page = SNAPSHOTS_PER_PAGE action_form = SnapshotActionForm + save_on_top = True + def changelist_view(self, request, extra_context=None): extra_context = extra_context or {} - return super().changelist_view(request, 
extra_context | GLOBAL_CONTEXT) + try: + return super().changelist_view(request, extra_context | GLOBAL_CONTEXT) + except Exception as e: + self.message_user(request, f'Error occurred while loading the page: {str(e)} {request.GET} {request.POST}') + return super().changelist_view(request, GLOBAL_CONTEXT) def get_urls(self): urls = super().get_urls() @@ -472,11 +486,11 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin): @admin.register(Tag, site=archivebox_admin) class TagAdmin(admin.ModelAdmin): - list_display = ('slug', 'name', 'num_snapshots', 'snapshots', 'abid') + list_display = ('slug', 'name', 'num_snapshots', 'snapshots', 'abid', 'id') sort_fields = ('id', 'name', 'slug', 'abid') - readonly_fields = ('created', 'modified', 'identifiers', 'num_snapshots', 'snapshots') + readonly_fields = ('id', 'uuid', 'abid', 'created', 'modified', 'identifiers', 'num_snapshots', 'snapshots') search_fields = ('id', 'abid', 'uuid', 'name', 'slug') - fields = ('name', 'slug', 'created_by', *readonly_fields, ) + fields = ('name', 'slug', 'created_by', *readonly_fields) actions = ['delete_selected'] ordering = ['-id'] @@ -508,9 +522,9 @@ class TagAdmin(admin.ModelAdmin): class ArchiveResultAdmin(admin.ModelAdmin): list_display = ('start_ts', 'snapshot_info', 'tags_str', 'extractor', 'cmd_str', 'status', 'output_str') sort_fields = ('start_ts', 'extractor', 'status') - readonly_fields = ('snapshot_info', 'tags_str', 'created_by', 'created', 'modified', 'identifiers') + readonly_fields = ('snapshot_info', 'tags_str', 'created', 'modified', 'identifiers') search_fields = ('id', 'uuid', 'abid', 'snapshot__url', 'extractor', 'output', 'cmd_version', 'cmd', 'snapshot__timestamp') - fields = ('snapshot', 'extractor', 'status', 'output', 'pwd', 'cmd', 'start_ts', 'end_ts', 'cmd_version', *readonly_fields) + fields = ('snapshot', 'extractor', 'status', 'output', 'pwd', 'cmd', 'start_ts', 'end_ts', 'created_by', 'cmd_version', *readonly_fields) autocomplete_fields = 
['snapshot'] list_filter = ('status', 'extractor', 'start_ts', 'cmd_version') From 2c157f0b5bd41f5fb77fa71ecb438340fa34d7ca Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 19 Aug 2024 18:34:52 -0700 Subject: [PATCH 31/47] increase max POST field limit for admin --- archivebox/core/settings.py | 1 + 1 file changed, 1 insertion(+) diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py index 870c5681..be530e6f 100644 --- a/archivebox/core/settings.py +++ b/archivebox/core/settings.py @@ -463,6 +463,7 @@ SIGNAL_WEBHOOKS = { }, } +DATA_UPLOAD_MAX_NUMBER_FIELDS = None ADMIN_DATA_VIEWS = { "NAME": "Environment", From 3148d2a3ef5714f00a6c23aed02c750c6e4f0bc5 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 19 Aug 2024 18:35:07 -0700 Subject: [PATCH 32/47] add squashmigrations to allowed mgmt command list --- archivebox/manage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/archivebox/manage.py b/archivebox/manage.py index 413a4cfb..6e8c578a 100755 --- a/archivebox/manage.py +++ b/archivebox/manage.py @@ -7,7 +7,7 @@ if __name__ == '__main__': # versions of ./manage.py commands whenever possible. When that's not possible # (e.g.
makemigrations), you can comment out this check temporarily - if not ('makemigrations' in sys.argv or 'migrate' in sys.argv or 'startapp' in sys.argv): + if not ('makemigrations' in sys.argv or 'migrate' in sys.argv or 'startapp' in sys.argv or 'squashmigrations' in sys.argv): print("[X] Don't run ./manage.py directly (unless you are a developer running makemigrations):") print() print(' Hint: Use these archivebox CLI commands instead of the ./manage.py equivalents:') From 7164fb961c87d962a76716c020640fb2e8e5cb2a Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 19 Aug 2024 18:35:35 -0700 Subject: [PATCH 33/47] hotlink to snapshots using generic search query link instead of id__startswith --- archivebox/templates/admin/base.html | 2 +- archivebox/templates/core/snapshot.html | 2 +- archivebox/templates/core/snapshot_live.html | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/archivebox/templates/admin/base.html b/archivebox/templates/admin/base.html index 897a26d5..9bcf053a 100644 --- a/archivebox/templates/admin/base.html +++ b/archivebox/templates/admin/base.html @@ -281,7 +281,7 @@ // if we arrive at the index with a url like ??id__startswith=... // we were hotlinked here with the intention of making it easy for the user to perform some // actions on the given snapshot. 
therefore we should preselect the snapshot to save them a click - if (window.location.search.startsWith('?id__startswith=') || window.location.search.startsWith('?id__exact=')) { + if (window.location.search.startsWith('?')) { const result_checkboxes = [...document.querySelectorAll('#result_list .action-checkbox input[type=checkbox]')] if (result_checkboxes.length === 1) { result_checkboxes[0].click() diff --git a/archivebox/templates/core/snapshot.html b/archivebox/templates/core/snapshot.html index 4dac0beb..d3f4081d 100644 --- a/archivebox/templates/core/snapshot.html +++ b/archivebox/templates/core/snapshot.html @@ -351,7 +351,7 @@ WARC | Media | Git | - Actions | + Actions | Admin | See all files...
    diff --git a/archivebox/templates/core/snapshot_live.html b/archivebox/templates/core/snapshot_live.html index b28c11c5..4b219c29 100644 --- a/archivebox/templates/core/snapshot_live.html +++ b/archivebox/templates/core/snapshot_live.html @@ -349,7 +349,7 @@ From cf2faecf61bd8229d563c262a1306708e90251ad Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 19 Aug 2024 18:36:20 -0700 Subject: [PATCH 34/47] add migrations for SnapshotTag through model --- .../migrations/0027_update_snapshot_ids.py | 46 +++++++++++++++++++ .../0028_alter_archiveresult_uuid.py | 19 ++++++++ .../migrations/0029_alter_archiveresult_id.py | 18 ++++++++ .../0030_alter_archiveresult_uuid.py | 18 ++++++++ ...lt_id_alter_archiveresult_uuid_and_more.py | 34 ++++++++++++++ .../migrations/0032_alter_archiveresult_id.py | 19 ++++++++ .../0033_rename_id_archiveresult_old_id.py | 18 ++++++++ ...eresult_old_id_alter_archiveresult_uuid.py | 41 +++++++++++++++++ ...ove_archiveresult_uuid_archiveresult_id.py | 19 ++++++++ ...iveresult_id_alter_archiveresult_old_id.py | 25 ++++++++++ .../0037_rename_id_snapshot_old_id.py | 18 ++++++++ .../0038_rename_uuid_snapshot_id.py | 18 ++++++++ ...ame_snapshot_archiveresult_snapshot_old.py | 18 ++++++++ .../migrations/0040_archiveresult_snapshot.py | 34 ++++++++++++++ ...1_alter_archiveresult_snapshot_and_more.py | 24 ++++++++++ .../0042_remove_archiveresult_snapshot_old.py | 17 +++++++ ...ult_snapshot_alter_snapshot_id_and_more.py | 20 ++++++++ ...result_snapshot_alter_tag_uuid_and_more.py | 40 ++++++++++++++++ archivebox/core/models.py | 17 +++++-- 19 files changed, 460 insertions(+), 3 deletions(-) create mode 100644 archivebox/core/migrations/0027_update_snapshot_ids.py create mode 100644 archivebox/core/migrations/0028_alter_archiveresult_uuid.py create mode 100644 archivebox/core/migrations/0029_alter_archiveresult_id.py create mode 100644 archivebox/core/migrations/0030_alter_archiveresult_uuid.py create mode 100644 
archivebox/core/migrations/0031_alter_archiveresult_id_alter_archiveresult_uuid_and_more.py create mode 100644 archivebox/core/migrations/0032_alter_archiveresult_id.py create mode 100644 archivebox/core/migrations/0033_rename_id_archiveresult_old_id.py create mode 100644 archivebox/core/migrations/0034_alter_archiveresult_old_id_alter_archiveresult_uuid.py create mode 100644 archivebox/core/migrations/0035_remove_archiveresult_uuid_archiveresult_id.py create mode 100644 archivebox/core/migrations/0036_alter_archiveresult_id_alter_archiveresult_old_id.py create mode 100644 archivebox/core/migrations/0037_rename_id_snapshot_old_id.py create mode 100644 archivebox/core/migrations/0038_rename_uuid_snapshot_id.py create mode 100644 archivebox/core/migrations/0039_rename_snapshot_archiveresult_snapshot_old.py create mode 100644 archivebox/core/migrations/0040_archiveresult_snapshot.py create mode 100644 archivebox/core/migrations/0041_alter_archiveresult_snapshot_and_more.py create mode 100644 archivebox/core/migrations/0042_remove_archiveresult_snapshot_old.py create mode 100644 archivebox/core/migrations/0043_alter_archiveresult_snapshot_alter_snapshot_id_and_more.py create mode 100644 archivebox/core/migrations/0044_alter_archiveresult_snapshot_alter_tag_uuid_and_more.py diff --git a/archivebox/core/migrations/0027_update_snapshot_ids.py b/archivebox/core/migrations/0027_update_snapshot_ids.py new file mode 100644 index 00000000..9b97782d --- /dev/null +++ b/archivebox/core/migrations/0027_update_snapshot_ids.py @@ -0,0 +1,46 @@ +# Generated by Django 5.0.6 on 2024-08-18 02:48 + +from django.db import migrations + +from django.db import migrations +from datetime import datetime +from abid_utils.abid import ABID + + +def update_snapshot_ids(apps, schema_editor): + Snapshot = apps.get_model("core", "Snapshot") + num_total = Snapshot.objects.all().count() + print(f' Updating {num_total} Snapshot.id, Snapshot.uuid values in place...') + for idx, snapshot in 
enumerate(Snapshot.objects.all().only('abid').iterator()): + assert snapshot.abid + snapshot.uuid = ABID.parse(snapshot.abid).uuid + snapshot.save(update_fields=["uuid"]) + assert str(ABID.parse(snapshot.abid).uuid) == str(snapshot.uuid) + if idx % 1000 == 0: + print(f'Migrated {idx}/{num_total} Snapshot objects...') + +def update_archiveresult_ids(apps, schema_editor): + ArchiveResult = apps.get_model("core", "ArchiveResult") + num_total = ArchiveResult.objects.all().count() + print(f' Updating {num_total} ArchiveResult.id, ArchiveResult.uuid values in place... (may take an hour or longer for large collections...)') + for idx, result in enumerate(ArchiveResult.objects.all().only('abid').iterator()): + assert result.abid + result.uuid = ABID.parse(result.abid).uuid + result.save(update_fields=["uuid"]) + assert str(ABID.parse(result.abid).uuid) == str(result.uuid) + if idx % 5000 == 0: + print(f'Migrated {idx}/{num_total} ArchiveResult objects...') + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0026_archiveresult_created_archiveresult_created_by_and_more'), + ] + + operations = [ + migrations.RunPython(update_snapshot_ids, reverse_code=migrations.RunPython.noop), + migrations.RunPython(update_archiveresult_ids, reverse_code=migrations.RunPython.noop), + ] + + diff --git a/archivebox/core/migrations/0028_alter_archiveresult_uuid.py b/archivebox/core/migrations/0028_alter_archiveresult_uuid.py new file mode 100644 index 00000000..9b10f044 --- /dev/null +++ b/archivebox/core/migrations/0028_alter_archiveresult_uuid.py @@ -0,0 +1,19 @@ +# Generated by Django 5.0.6 on 2024-08-18 04:28 + +import uuid +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0027_update_snapshot_ids'), + ] + + operations = [ + migrations.AlterField( + model_name='archiveresult', + name='uuid', + field=models.UUIDField(default=uuid.uuid4), + ), + ] diff --git 
a/archivebox/core/migrations/0029_alter_archiveresult_id.py b/archivebox/core/migrations/0029_alter_archiveresult_id.py new file mode 100644 index 00000000..7464a670 --- /dev/null +++ b/archivebox/core/migrations/0029_alter_archiveresult_id.py @@ -0,0 +1,18 @@ +# Generated by Django 5.0.6 on 2024-08-18 04:28 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0028_alter_archiveresult_uuid'), + ] + + operations = [ + migrations.AlterField( + model_name='archiveresult', + name='id', + field=models.BigIntegerField(primary_key=True, serialize=False, verbose_name='ID'), + ), + ] diff --git a/archivebox/core/migrations/0030_alter_archiveresult_uuid.py b/archivebox/core/migrations/0030_alter_archiveresult_uuid.py new file mode 100644 index 00000000..3c1ad788 --- /dev/null +++ b/archivebox/core/migrations/0030_alter_archiveresult_uuid.py @@ -0,0 +1,18 @@ +# Generated by Django 5.0.6 on 2024-08-18 05:00 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0029_alter_archiveresult_id'), + ] + + operations = [ + migrations.AlterField( + model_name='archiveresult', + name='uuid', + field=models.UUIDField(unique=True), + ), + ] diff --git a/archivebox/core/migrations/0031_alter_archiveresult_id_alter_archiveresult_uuid_and_more.py b/archivebox/core/migrations/0031_alter_archiveresult_id_alter_archiveresult_uuid_and_more.py new file mode 100644 index 00000000..64fd6cbe --- /dev/null +++ b/archivebox/core/migrations/0031_alter_archiveresult_id_alter_archiveresult_uuid_and_more.py @@ -0,0 +1,34 @@ +# Generated by Django 5.0.6 on 2024-08-18 05:09 + +import uuid +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0030_alter_archiveresult_uuid'), + ] + + operations = [ + migrations.AlterField( + model_name='archiveresult', + name='id', + field=models.IntegerField(default=uuid.uuid4, 
primary_key=True, serialize=False, verbose_name='ID'), + ), + migrations.AlterField( + model_name='archiveresult', + name='uuid', + field=models.UUIDField(default=uuid.uuid4, unique=True), + ), + migrations.AlterField( + model_name='snapshot', + name='uuid', + field=models.UUIDField(default=uuid.uuid4, unique=True), + ), + migrations.AlterField( + model_name='tag', + name='uuid', + field=models.UUIDField(default=uuid.uuid4, null=True, unique=True), + ), + ] diff --git a/archivebox/core/migrations/0032_alter_archiveresult_id.py b/archivebox/core/migrations/0032_alter_archiveresult_id.py new file mode 100644 index 00000000..98299a31 --- /dev/null +++ b/archivebox/core/migrations/0032_alter_archiveresult_id.py @@ -0,0 +1,19 @@ +# Generated by Django 5.0.6 on 2024-08-18 05:20 + +import core.models +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0031_alter_archiveresult_id_alter_archiveresult_uuid_and_more'), + ] + + operations = [ + migrations.AlterField( + model_name='archiveresult', + name='id', + field=models.BigIntegerField(default=core.models.rand_int_id, primary_key=True, serialize=False, verbose_name='ID'), + ), + ] diff --git a/archivebox/core/migrations/0033_rename_id_archiveresult_old_id.py b/archivebox/core/migrations/0033_rename_id_archiveresult_old_id.py new file mode 100644 index 00000000..ebced58e --- /dev/null +++ b/archivebox/core/migrations/0033_rename_id_archiveresult_old_id.py @@ -0,0 +1,18 @@ +# Generated by Django 5.0.6 on 2024-08-18 05:34 + +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0032_alter_archiveresult_id'), + ] + + operations = [ + migrations.RenameField( + model_name='archiveresult', + old_name='id', + new_name='old_id', + ), + ] diff --git a/archivebox/core/migrations/0034_alter_archiveresult_old_id_alter_archiveresult_uuid.py 
b/archivebox/core/migrations/0034_alter_archiveresult_old_id_alter_archiveresult_uuid.py new file mode 100644 index 00000000..121a2154 --- /dev/null +++ b/archivebox/core/migrations/0034_alter_archiveresult_old_id_alter_archiveresult_uuid.py @@ -0,0 +1,41 @@ +# Generated by Django 5.0.6 on 2024-08-18 05:37 + +import core.models +import uuid +from django.db import migrations, models + +from abid_utils.abid import ABID + + +def update_archiveresult_ids(apps, schema_editor): + ArchiveResult = apps.get_model("core", "ArchiveResult") + num_total = ArchiveResult.objects.all().count() + print(f' Updating {num_total} ArchiveResult.id, ArchiveResult.uuid values in place... (may take an hour or longer for large collections...)') + for idx, result in enumerate(ArchiveResult.objects.all().only('abid').iterator()): + assert result.abid + result.uuid = ABID.parse(result.abid).uuid + result.save(update_fields=["uuid"]) + assert str(ABID.parse(result.abid).uuid) == str(result.uuid) + if idx % 2500 == 0: + print(f'Migrated {idx}/{num_total} ArchiveResult objects...') + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0033_rename_id_archiveresult_old_id'), + ] + + operations = [ + migrations.AlterField( + model_name='archiveresult', + name='old_id', + field=models.BigIntegerField(default=core.models.rand_int_id, serialize=False, verbose_name='ID'), + ), + migrations.RunPython(update_archiveresult_ids, reverse_code=migrations.RunPython.noop), + migrations.AlterField( + model_name='archiveresult', + name='uuid', + field=models.UUIDField(default=uuid.uuid4, primary_key=True, serialize=False, unique=True), + ), + ] diff --git a/archivebox/core/migrations/0035_remove_archiveresult_uuid_archiveresult_id.py b/archivebox/core/migrations/0035_remove_archiveresult_uuid_archiveresult_id.py new file mode 100644 index 00000000..26287e3c --- /dev/null +++ b/archivebox/core/migrations/0035_remove_archiveresult_uuid_archiveresult_id.py @@ -0,0 +1,19 @@ +# Generated by 
Django 5.0.6 on 2024-08-18 05:49 + +import uuid +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0034_alter_archiveresult_old_id_alter_archiveresult_uuid'), + ] + + operations = [ + migrations.RenameField( + model_name='archiveresult', + old_name='uuid', + new_name='id', + ), + ] diff --git a/archivebox/core/migrations/0036_alter_archiveresult_id_alter_archiveresult_old_id.py b/archivebox/core/migrations/0036_alter_archiveresult_id_alter_archiveresult_old_id.py new file mode 100644 index 00000000..10b4f9c6 --- /dev/null +++ b/archivebox/core/migrations/0036_alter_archiveresult_id_alter_archiveresult_old_id.py @@ -0,0 +1,25 @@ +# Generated by Django 5.0.6 on 2024-08-18 05:59 + +import core.models +import uuid +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0035_remove_archiveresult_uuid_archiveresult_id'), + ] + + operations = [ + migrations.AlterField( + model_name='archiveresult', + name='id', + field=models.UUIDField(default=uuid.uuid4, primary_key=True, serialize=False, unique=True, verbose_name='ID'), + ), + migrations.AlterField( + model_name='archiveresult', + name='old_id', + field=models.BigIntegerField(default=core.models.rand_int_id, serialize=False, verbose_name='Old ID'), + ), + ] diff --git a/archivebox/core/migrations/0037_rename_id_snapshot_old_id.py b/archivebox/core/migrations/0037_rename_id_snapshot_old_id.py new file mode 100644 index 00000000..7d901d96 --- /dev/null +++ b/archivebox/core/migrations/0037_rename_id_snapshot_old_id.py @@ -0,0 +1,18 @@ +# Generated by Django 5.0.6 on 2024-08-18 06:08 + +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0036_alter_archiveresult_id_alter_archiveresult_old_id'), + ] + + operations = [ + migrations.RenameField( + model_name='snapshot', + old_name='id', + new_name='old_id', + ), + ] diff --git 
a/archivebox/core/migrations/0038_rename_uuid_snapshot_id.py b/archivebox/core/migrations/0038_rename_uuid_snapshot_id.py new file mode 100644 index 00000000..d22a8fc4 --- /dev/null +++ b/archivebox/core/migrations/0038_rename_uuid_snapshot_id.py @@ -0,0 +1,18 @@ +# Generated by Django 5.0.6 on 2024-08-18 06:09 + +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0037_rename_id_snapshot_old_id'), + ] + + operations = [ + migrations.RenameField( + model_name='snapshot', + old_name='uuid', + new_name='id', + ), + ] diff --git a/archivebox/core/migrations/0039_rename_snapshot_archiveresult_snapshot_old.py b/archivebox/core/migrations/0039_rename_snapshot_archiveresult_snapshot_old.py new file mode 100644 index 00000000..7c2a4e29 --- /dev/null +++ b/archivebox/core/migrations/0039_rename_snapshot_archiveresult_snapshot_old.py @@ -0,0 +1,18 @@ +# Generated by Django 5.0.6 on 2024-08-18 06:25 + +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0038_rename_uuid_snapshot_id'), + ] + + operations = [ + migrations.RenameField( + model_name='archiveresult', + old_name='snapshot', + new_name='snapshot_old', + ), + ] diff --git a/archivebox/core/migrations/0040_archiveresult_snapshot.py b/archivebox/core/migrations/0040_archiveresult_snapshot.py new file mode 100644 index 00000000..fa04a9d4 --- /dev/null +++ b/archivebox/core/migrations/0040_archiveresult_snapshot.py @@ -0,0 +1,34 @@ +# Generated by Django 5.0.6 on 2024-08-18 06:46 + +import django.db.models.deletion +from django.db import migrations, models + +def update_archiveresult_snapshot_ids(apps, schema_editor): + ArchiveResult = apps.get_model("core", "ArchiveResult") + Snapshot = apps.get_model("core", "Snapshot") + num_total = ArchiveResult.objects.all().count() + print(f' Updating {num_total} ArchiveResult.snapshot_id values in place... 
(may take an hour or longer for large collections...)') + for idx, result in enumerate(ArchiveResult.objects.all().only('snapshot_old_id').iterator()): + assert result.snapshot_old_id + snapshot = Snapshot.objects.get(old_id=result.snapshot_old_id) + result.snapshot_id = snapshot.id + result.save(update_fields=["snapshot_id"]) + assert str(result.snapshot_id) == str(snapshot.id) + if idx % 5000 == 0: + print(f'Migrated {idx}/{num_total} ArchiveResult objects...') + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0039_rename_snapshot_archiveresult_snapshot_old'), + ] + + operations = [ + migrations.AddField( + model_name='archiveresult', + name='snapshot', + field=models.ForeignKey(null=True, on_delete=django.db.models.deletion.CASCADE, related_name='archiveresults', to='core.snapshot', to_field='id'), + ), + migrations.RunPython(update_archiveresult_snapshot_ids, reverse_code=migrations.RunPython.noop), + ] diff --git a/archivebox/core/migrations/0041_alter_archiveresult_snapshot_and_more.py b/archivebox/core/migrations/0041_alter_archiveresult_snapshot_and_more.py new file mode 100644 index 00000000..d4be8875 --- /dev/null +++ b/archivebox/core/migrations/0041_alter_archiveresult_snapshot_and_more.py @@ -0,0 +1,24 @@ +# Generated by Django 5.0.6 on 2024-08-18 06:50 + +import django.db.models.deletion +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0040_archiveresult_snapshot'), + ] + + operations = [ + migrations.AlterField( + model_name='archiveresult', + name='snapshot', + field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='core.snapshot', to_field='id'), + ), + migrations.AlterField( + model_name='archiveresult', + name='snapshot_old', + field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='archiveresults_old', to='core.snapshot'), + ), + ] diff --git 
a/archivebox/core/migrations/0042_remove_archiveresult_snapshot_old.py b/archivebox/core/migrations/0042_remove_archiveresult_snapshot_old.py new file mode 100644 index 00000000..3fe9f316 --- /dev/null +++ b/archivebox/core/migrations/0042_remove_archiveresult_snapshot_old.py @@ -0,0 +1,17 @@ +# Generated by Django 5.0.6 on 2024-08-18 06:51 + +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0041_alter_archiveresult_snapshot_and_more'), + ] + + operations = [ + migrations.RemoveField( + model_name='archiveresult', + name='snapshot_old', + ), + ] diff --git a/archivebox/core/migrations/0043_alter_archiveresult_snapshot_alter_snapshot_id_and_more.py b/archivebox/core/migrations/0043_alter_archiveresult_snapshot_alter_snapshot_id_and_more.py new file mode 100644 index 00000000..c0acddb0 --- /dev/null +++ b/archivebox/core/migrations/0043_alter_archiveresult_snapshot_alter_snapshot_id_and_more.py @@ -0,0 +1,20 @@ +# Generated by Django 5.0.6 on 2024-08-18 06:52 + +import django.db.models.deletion +import uuid +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0042_remove_archiveresult_snapshot_old'), + ] + + operations = [ + migrations.AlterField( + model_name='archiveresult', + name='snapshot', + field=models.ForeignKey(db_column='snapshot_id', on_delete=django.db.models.deletion.CASCADE, to='core.snapshot', to_field='id'), + ), + ] diff --git a/archivebox/core/migrations/0044_alter_archiveresult_snapshot_alter_tag_uuid_and_more.py b/archivebox/core/migrations/0044_alter_archiveresult_snapshot_alter_tag_uuid_and_more.py new file mode 100644 index 00000000..d981dca9 --- /dev/null +++ b/archivebox/core/migrations/0044_alter_archiveresult_snapshot_alter_tag_uuid_and_more.py @@ -0,0 +1,40 @@ +# Generated by Django 5.0.6 on 2024-08-19 23:01 + +import django.db.models.deletion +import uuid +from django.db import migrations, models + + + +class 
Migration(migrations.Migration): + + dependencies = [ + ('core', '0043_alter_archiveresult_snapshot_alter_snapshot_id_and_more'), + ] + + operations = [ + migrations.SeparateDatabaseAndState( + database_operations=[ + # No-op, SnapshotTag model already exists in DB + ], + state_operations=[ + migrations.CreateModel( + name='SnapshotTag', + fields=[ + ('id', models.AutoField(primary_key=True, serialize=False)), + ('snapshot', models.OneToOneField(on_delete=django.db.models.deletion.CASCADE, to='core.snapshot')), + ('tag', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='core.tag')), + ], + options={ + 'db_table': 'core_snapshot_tags', + 'unique_together': {('snapshot', 'tag')}, + }, + ), + migrations.AlterField( + model_name='snapshot', + name='tags', + field=models.ManyToManyField(blank=True, related_name='snapshot_set', through='core.SnapshotTag', to='core.tag'), + ), + ], + ), + ] diff --git a/archivebox/core/models.py b/archivebox/core/models.py index 09ad94fa..9ccd6145 100644 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -61,7 +61,7 @@ class Tag(ABIDModel): # id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=True) id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID') - uuid = models.UUIDField(default=uuid.uuid4, editable=True, unique=True) + uuid = models.UUIDField(default=uuid.uuid4, null=True, unique=True) abid = ABIDField(prefix=abid_prefix) @@ -77,6 +77,10 @@ class Tag(ABIDModel): def __str__(self): return self.name + @property + def old_id(self): + return self.id + def slugify(self, tag, i=None): slug = slugify(tag) if i is not None: @@ -115,9 +119,15 @@ class Tag(ABIDModel): return f'/api/v1/docs#/Core%20Models/api_v1_core_get_tag' class SnapshotTag(models.Model): - snapshot = models.OneToOneField('Snapshot', primary_key=True, on_delete=models.CASCADE, to_field='id') + id = models.AutoField(primary_key=True) + + snapshot = models.OneToOneField('Snapshot', 
on_delete=models.CASCADE, to_field='old_id') tag = models.ForeignKey(Tag, on_delete=models.CASCADE, to_field='id') + class Meta: + db_table = 'core_snapshot_tags' + unique_together = [('snapshot', 'tag')] + class Snapshot(ABIDModel): abid_prefix = 'snp_' abid_ts_src = 'self.added' @@ -133,10 +143,11 @@ class Snapshot(ABIDModel): timestamp = models.CharField(max_length=32, unique=True, db_index=True) title = models.CharField(max_length=512, null=True, blank=True, db_index=True) + + tags = models.ManyToManyField(Tag, blank=True, through=SnapshotTag, related_name='snapshot_set', through_fields=('snapshot', 'tag')) added = models.DateTimeField(auto_now_add=True, db_index=True) updated = models.DateTimeField(auto_now=True, blank=True, null=True, db_index=True) - tags = models.ManyToManyField(Tag, blank=True) keys = ('url', 'timestamp', 'title', 'tags', 'updated') From a49739b41c539901afab3521135197b1204948ff Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 19 Aug 2024 18:36:35 -0700 Subject: [PATCH 35/47] add hacky workaround for TagInline not showing in admin Snapshot change view --- archivebox/templates/admin/base.html | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/archivebox/templates/admin/base.html b/archivebox/templates/admin/base.html index 9bcf053a..c0d9ac5b 100644 --- a/archivebox/templates/admin/base.html +++ b/archivebox/templates/admin/base.html @@ -265,7 +265,11 @@ }) console.log('Converted', buttons.children().length, 'admin actions from dropdown to buttons') } - + function fixInlineAddRow() { + $('#id_snapshottag-MAX_NUM_FORMS').val('1000') + $('.add-row').show() + } + function setupSnapshotGridListToggle() { $("#snapshot-view-list").click(selectSnapshotListView) $("#snapshot-view-grid").click(selectSnapshotGridView) @@ -290,6 +294,7 @@ } $(document).ready(function() { fix_actions() + fixInlineAddRow() setupSnapshotGridListToggle() setTimeOffset() selectSnapshotIfHotlinked() From c68a66f74ef253e281e880cbd2a80dd2d9ae99e3 Mon 
Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 19 Aug 2024 19:41:43 -0700 Subject: [PATCH 36/47] add salt kwarg to abid generation funcs --- archivebox/abid_utils/abid.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/archivebox/abid_utils/abid.py b/archivebox/abid_utils/abid.py index f6f9d153..66472f10 100644 --- a/archivebox/abid_utils/abid.py +++ b/archivebox/abid_utils/abid.py @@ -184,7 +184,7 @@ def abid_part_from_rand(rand: Union[str, UUID, None, int]) -> str: return str(rand)[-ABID_RAND_LEN:].upper() -def abid_from_values(prefix, ts, uri, subtype, rand) -> ABID: +def abid_from_values(prefix, ts, uri, subtype, rand, salt=DEFAULT_ABID_URI_SALT) -> ABID: """ Return a freshly derived ABID (assembled from attrs defined in ABIDModel.abid_*_src). """ @@ -192,7 +192,7 @@ def abid_from_values(prefix, ts, uri, subtype, rand) -> ABID: abid = ABID( prefix=abid_part_from_prefix(prefix), ts=abid_part_from_ts(ts), - uri=abid_part_from_uri(uri), + uri=abid_part_from_uri(uri, salt=salt), subtype=abid_part_from_subtype(subtype), rand=abid_part_from_rand(rand), ) From 74c11d41be295228f68c0db248b5bcda1deac785 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 19 Aug 2024 19:42:01 -0700 Subject: [PATCH 37/47] dont allow modifying Snapshot.timestamp as it would break folder dirs --- archivebox/core/admin.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/archivebox/core/admin.py b/archivebox/core/admin.py index 20f7ae39..8b9c126c 100644 --- a/archivebox/core/admin.py +++ b/archivebox/core/admin.py @@ -214,10 +214,10 @@ def get_abid_info(self, obj): class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin): list_display = ('added', 'title_str', 'files', 'size', 'url_str') sort_fields = ('title_str', 'url_str', 'added', 'files') - readonly_fields = ('tags', 'admin_actions', 'status_info', 'bookmarked', 'added', 'updated', 'created', 'modified', 'identifiers') + readonly_fields = ('tags', 'timestamp', 'admin_actions', 
'status_info', 'bookmarked', 'added', 'updated', 'created', 'modified', 'identifiers') search_fields = ('id', 'url', 'abid', 'old_id', 'timestamp', 'title', 'tags__name') list_filter = ('added', 'updated', 'archiveresult__status', 'created_by', 'tags') - fields = ('url', 'timestamp', 'created_by', 'title', *readonly_fields) + fields = ('url', 'created_by', 'title', *readonly_fields) ordering = ['-added'] actions = ['add_tags', 'remove_tags', 'update_titles', 'update_snapshots', 'resnapshot_snapshot', 'overwrite_snapshots', 'delete_snapshots'] autocomplete_fields = ['tags'] From 344e902fc6973c434786f2bee3d611e3027b77ff Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 19 Aug 2024 19:42:25 -0700 Subject: [PATCH 38/47] migrate SnapshotTag to use new snapshot id --- ...result_snapshot_alter_tag_uuid_and_more.py | 2 +- .../migrations/0045_alter_snapshot_old_id.py | 19 +++++++++ ...ult_snapshot_alter_snapshot_id_and_more.py | 30 ++++++++++++++ ...er_snapshottag_unique_together_and_more.py | 24 +++++++++++ ...8_alter_archiveresult_snapshot_and_more.py | 24 +++++++++++ ...pshot_snapshottag_snapshot_old_and_more.py | 22 ++++++++++ .../0050_alter_snapshottag_snapshot_old.py | 19 +++++++++ ...snapshot_alter_snapshottag_snapshot_old.py | 40 +++++++++++++++++++ ...er_snapshottag_unique_together_and_more.py | 27 +++++++++++++ .../0053_remove_snapshottag_snapshot_old.py | 17 ++++++++ .../0054_alter_snapshot_timestamp.py | 18 +++++++++ archivebox/core/models.py | 15 ++++--- 12 files changed, 251 insertions(+), 6 deletions(-) create mode 100644 archivebox/core/migrations/0045_alter_snapshot_old_id.py create mode 100644 archivebox/core/migrations/0046_alter_archiveresult_snapshot_alter_snapshot_id_and_more.py create mode 100644 archivebox/core/migrations/0047_alter_snapshottag_unique_together_and_more.py create mode 100644 archivebox/core/migrations/0048_alter_archiveresult_snapshot_and_more.py create mode 100644 
archivebox/core/migrations/0049_rename_snapshot_snapshottag_snapshot_old_and_more.py create mode 100644 archivebox/core/migrations/0050_alter_snapshottag_snapshot_old.py create mode 100644 archivebox/core/migrations/0051_snapshottag_snapshot_alter_snapshottag_snapshot_old.py create mode 100644 archivebox/core/migrations/0052_alter_snapshottag_unique_together_and_more.py create mode 100644 archivebox/core/migrations/0053_remove_snapshottag_snapshot_old.py create mode 100644 archivebox/core/migrations/0054_alter_snapshot_timestamp.py diff --git a/archivebox/core/migrations/0044_alter_archiveresult_snapshot_alter_tag_uuid_and_more.py b/archivebox/core/migrations/0044_alter_archiveresult_snapshot_alter_tag_uuid_and_more.py index d981dca9..b7531233 100644 --- a/archivebox/core/migrations/0044_alter_archiveresult_snapshot_alter_tag_uuid_and_more.py +++ b/archivebox/core/migrations/0044_alter_archiveresult_snapshot_alter_tag_uuid_and_more.py @@ -22,7 +22,7 @@ class Migration(migrations.Migration): name='SnapshotTag', fields=[ ('id', models.AutoField(primary_key=True, serialize=False)), - ('snapshot', models.OneToOneField(on_delete=django.db.models.deletion.CASCADE, to='core.snapshot')), + ('snapshot', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='core.snapshot')), ('tag', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='core.tag')), ], options={ diff --git a/archivebox/core/migrations/0045_alter_snapshot_old_id.py b/archivebox/core/migrations/0045_alter_snapshot_old_id.py new file mode 100644 index 00000000..7dc1a26a --- /dev/null +++ b/archivebox/core/migrations/0045_alter_snapshot_old_id.py @@ -0,0 +1,19 @@ +# Generated by Django 5.0.6 on 2024-08-20 01:54 + +import uuid +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0044_alter_archiveresult_snapshot_alter_tag_uuid_and_more'), + ] + + operations = [ + migrations.AlterField( + model_name='snapshot', + 
name='old_id', + field=models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False, unique=True), + ), + ] diff --git a/archivebox/core/migrations/0046_alter_archiveresult_snapshot_alter_snapshot_id_and_more.py b/archivebox/core/migrations/0046_alter_archiveresult_snapshot_alter_snapshot_id_and_more.py new file mode 100644 index 00000000..39216ec5 --- /dev/null +++ b/archivebox/core/migrations/0046_alter_archiveresult_snapshot_alter_snapshot_id_and_more.py @@ -0,0 +1,30 @@ +# Generated by Django 5.0.6 on 2024-08-20 01:55 + +import django.db.models.deletion +import uuid +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0045_alter_snapshot_old_id'), + ] + + operations = [ + migrations.AlterField( + model_name='archiveresult', + name='snapshot', + field=models.ForeignKey(db_column='snapshot_id', on_delete=django.db.models.deletion.CASCADE, to='core.snapshot', to_field='id'), + ), + migrations.AlterField( + model_name='snapshot', + name='id', + field=models.UUIDField(default=uuid.uuid4, primary_key=True, serialize=False, unique=True), + ), + migrations.AlterField( + model_name='snapshot', + name='old_id', + field=models.UUIDField(default=uuid.uuid4, editable=False, unique=True), + ), + ] diff --git a/archivebox/core/migrations/0047_alter_snapshottag_unique_together_and_more.py b/archivebox/core/migrations/0047_alter_snapshottag_unique_together_and_more.py new file mode 100644 index 00000000..b1c845f8 --- /dev/null +++ b/archivebox/core/migrations/0047_alter_snapshottag_unique_together_and_more.py @@ -0,0 +1,24 @@ +# Generated by Django 5.0.6 on 2024-08-20 02:16 + +import django.db.models.deletion +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0046_alter_archiveresult_snapshot_alter_snapshot_id_and_more'), + ] + + operations = [ + migrations.AlterField( + model_name='archiveresult', + name='snapshot', + 
field=models.ForeignKey(db_column='snapshot_id', on_delete=django.db.models.deletion.CASCADE, to='core.snapshot', to_field='id'), + ), + migrations.AlterField( + model_name='snapshottag', + name='tag', + field=models.ForeignKey(db_column='tag_id', on_delete=django.db.models.deletion.CASCADE, to='core.tag'), + ), + ] diff --git a/archivebox/core/migrations/0048_alter_archiveresult_snapshot_and_more.py b/archivebox/core/migrations/0048_alter_archiveresult_snapshot_and_more.py new file mode 100644 index 00000000..81bc8a06 --- /dev/null +++ b/archivebox/core/migrations/0048_alter_archiveresult_snapshot_and_more.py @@ -0,0 +1,24 @@ +# Generated by Django 5.0.6 on 2024-08-20 02:17 + +import django.db.models.deletion +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0047_alter_snapshottag_unique_together_and_more'), + ] + + operations = [ + migrations.AlterField( + model_name='archiveresult', + name='snapshot', + field=models.ForeignKey(db_column='snapshot_id', on_delete=django.db.models.deletion.CASCADE, to='core.snapshot'), + ), + migrations.AlterField( + model_name='snapshottag', + name='snapshot', + field=models.ForeignKey(db_column='snapshot_id', on_delete=django.db.models.deletion.CASCADE, to='core.snapshot', to_field='old_id'), + ), + ] diff --git a/archivebox/core/migrations/0049_rename_snapshot_snapshottag_snapshot_old_and_more.py b/archivebox/core/migrations/0049_rename_snapshot_snapshottag_snapshot_old_and_more.py new file mode 100644 index 00000000..aa0c5b39 --- /dev/null +++ b/archivebox/core/migrations/0049_rename_snapshot_snapshottag_snapshot_old_and_more.py @@ -0,0 +1,22 @@ +# Generated by Django 5.0.6 on 2024-08-20 02:26 + +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0048_alter_archiveresult_snapshot_and_more'), + ] + + operations = [ + migrations.RenameField( + model_name='snapshottag', + old_name='snapshot', + 
new_name='snapshot_old', + ), + migrations.AlterUniqueTogether( + name='snapshottag', + unique_together={('snapshot_old', 'tag')}, + ), + ] diff --git a/archivebox/core/migrations/0050_alter_snapshottag_snapshot_old.py b/archivebox/core/migrations/0050_alter_snapshottag_snapshot_old.py new file mode 100644 index 00000000..4bff827c --- /dev/null +++ b/archivebox/core/migrations/0050_alter_snapshottag_snapshot_old.py @@ -0,0 +1,19 @@ +# Generated by Django 5.0.6 on 2024-08-20 02:30 + +import django.db.models.deletion +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0049_rename_snapshot_snapshottag_snapshot_old_and_more'), + ] + + operations = [ + migrations.AlterField( + model_name='snapshottag', + name='snapshot_old', + field=models.ForeignKey(db_column='snapshot_old_id', on_delete=django.db.models.deletion.CASCADE, to='core.snapshot', to_field='old_id'), + ), + ] diff --git a/archivebox/core/migrations/0051_snapshottag_snapshot_alter_snapshottag_snapshot_old.py b/archivebox/core/migrations/0051_snapshottag_snapshot_alter_snapshottag_snapshot_old.py new file mode 100644 index 00000000..ddb7afbb --- /dev/null +++ b/archivebox/core/migrations/0051_snapshottag_snapshot_alter_snapshottag_snapshot_old.py @@ -0,0 +1,40 @@ +# Generated by Django 5.0.6 on 2024-08-20 02:31 + +import django.db.models.deletion +from django.db import migrations, models + + +def update_snapshottag_ids(apps, schema_editor): + Snapshot = apps.get_model("core", "Snapshot") + SnapshotTag = apps.get_model("core", "SnapshotTag") + num_total = SnapshotTag.objects.all().count() + print(f' Updating {num_total} SnapshotTag.snapshot_id values in place... 
(may take an hour or longer for large collections...)') + for idx, snapshottag in enumerate(SnapshotTag.objects.all().only('snapshot_old_id').iterator()): + assert snapshottag.snapshot_old_id + snapshot = Snapshot.objects.get(old_id=snapshottag.snapshot_old_id) + snapshottag.snapshot_id = snapshot.id + snapshottag.save(update_fields=["snapshot_id"]) + assert str(snapshottag.snapshot_id) == str(snapshot.id) + if idx % 100 == 0: + print(f'Migrated {idx}/{num_total} SnapshotTag objects...') + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0050_alter_snapshottag_snapshot_old'), + ] + + operations = [ + migrations.AddField( + model_name='snapshottag', + name='snapshot', + field=models.ForeignKey(blank=True, db_column='snapshot_id', null=True, on_delete=django.db.models.deletion.CASCADE, to='core.snapshot'), + ), + migrations.AlterField( + model_name='snapshottag', + name='snapshot_old', + field=models.ForeignKey(db_column='snapshot_old_id', on_delete=django.db.models.deletion.CASCADE, related_name='snapshottag_old_set', to='core.snapshot', to_field='old_id'), + ), + migrations.RunPython(update_snapshottag_ids, reverse_code=migrations.RunPython.noop), + ] diff --git a/archivebox/core/migrations/0052_alter_snapshottag_unique_together_and_more.py b/archivebox/core/migrations/0052_alter_snapshottag_unique_together_and_more.py new file mode 100644 index 00000000..e11000bc --- /dev/null +++ b/archivebox/core/migrations/0052_alter_snapshottag_unique_together_and_more.py @@ -0,0 +1,27 @@ +# Generated by Django 5.0.6 on 2024-08-20 02:37 + +import django.db.models.deletion +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0051_snapshottag_snapshot_alter_snapshottag_snapshot_old'), + ] + + operations = [ + migrations.AlterUniqueTogether( + name='snapshottag', + unique_together=set(), + ), + migrations.AlterField( + model_name='snapshottag', + name='snapshot', + 
field=models.ForeignKey(db_column='snapshot_id', on_delete=django.db.models.deletion.CASCADE, to='core.snapshot'), + ), + migrations.AlterUniqueTogether( + name='snapshottag', + unique_together={('snapshot', 'tag')}, + ), + ] diff --git a/archivebox/core/migrations/0053_remove_snapshottag_snapshot_old.py b/archivebox/core/migrations/0053_remove_snapshottag_snapshot_old.py new file mode 100644 index 00000000..cf50fc2c --- /dev/null +++ b/archivebox/core/migrations/0053_remove_snapshottag_snapshot_old.py @@ -0,0 +1,17 @@ +# Generated by Django 5.0.6 on 2024-08-20 02:38 + +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0052_alter_snapshottag_unique_together_and_more'), + ] + + operations = [ + migrations.RemoveField( + model_name='snapshottag', + name='snapshot_old', + ), + ] diff --git a/archivebox/core/migrations/0054_alter_snapshot_timestamp.py b/archivebox/core/migrations/0054_alter_snapshot_timestamp.py new file mode 100644 index 00000000..6febe7c3 --- /dev/null +++ b/archivebox/core/migrations/0054_alter_snapshot_timestamp.py @@ -0,0 +1,18 @@ +# Generated by Django 5.0.6 on 2024-08-20 02:40 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0053_remove_snapshottag_snapshot_old'), + ] + + operations = [ + migrations.AlterField( + model_name='snapshot', + name='timestamp', + field=models.CharField(db_index=True, editable=False, max_length=32, unique=True), + ), + ] diff --git a/archivebox/core/models.py b/archivebox/core/models.py index 9ccd6145..61a62714 100644 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -121,8 +121,8 @@ class Tag(ABIDModel): class SnapshotTag(models.Model): id = models.AutoField(primary_key=True) - snapshot = models.OneToOneField('Snapshot', on_delete=models.CASCADE, to_field='old_id') - tag = models.ForeignKey(Tag, on_delete=models.CASCADE, to_field='id') + snapshot = 
models.ForeignKey('Snapshot', db_column='snapshot_id', on_delete=models.CASCADE, to_field='id') + tag = models.ForeignKey(Tag, db_column='tag_id', on_delete=models.CASCADE, to_field='id') class Meta: db_table = 'core_snapshot_tags' @@ -135,12 +135,12 @@ class Snapshot(ABIDModel): abid_subtype_src = '"01"' abid_rand_src = 'self.old_id' - old_id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False) # legacy pk - id = models.UUIDField(default=uuid.uuid4, editable=True, unique=True) + old_id = models.UUIDField(default=uuid.uuid4, editable=False, unique=True) # legacy pk + id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=True, unique=True) abid = ABIDField(prefix=abid_prefix) url = models.URLField(unique=True, db_index=True) - timestamp = models.CharField(max_length=32, unique=True, db_index=True) + timestamp = models.CharField(max_length=32, unique=True, db_index=True, editable=False) title = models.CharField(max_length=512, null=True, blank=True, db_index=True) @@ -365,6 +365,7 @@ class ArchiveResult(ABIDModel): EXTRACTOR_CHOICES = EXTRACTOR_CHOICES old_id = models.BigIntegerField(default=rand_int_id, serialize=False, verbose_name='Old ID') + id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=True, unique=True, verbose_name='ID') abid = ABIDField(prefix=abid_prefix) @@ -392,6 +393,10 @@ class ArchiveResult(ABIDModel): super().save(*args, **kwargs) assert str(self.id) == str(self.abid.uuid) + @property + def uuid(self): + return self.id + @cached_property def snapshot_dir(self): return Path(self.snapshot.link_dir) From c4ef2993b2aa457334430fb9c7b9677de769cd5c Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 19 Aug 2024 20:00:04 -0700 Subject: [PATCH 39/47] update REST API and Admin UI to use new id and old_id exclusively --- archivebox/api/v1_core.py | 23 +++++++++++++---------- archivebox/core/admin.py | 22 ++++++++-------------- archivebox/core/models.py | 6 +++++- 3 files changed, 26 insertions(+), 25 
deletions(-) diff --git a/archivebox/api/v1_core.py b/archivebox/api/v1_core.py index fd944901..56e9d22a 100644 --- a/archivebox/api/v1_core.py +++ b/archivebox/api/v1_core.py @@ -21,9 +21,9 @@ router = Router(tags=['Core Models']) ### ArchiveResult ######################################################################### class ArchiveResultSchema(Schema): - abid: str id: UUID - # old_id: int + old_id: int + abid: str modified: datetime created: datetime @@ -105,7 +105,7 @@ def get_archiveresults(request, filters: ArchiveResultFilterSchema = Query(...)) @router.get("/archiveresult/{archiveresult_id}", response=ArchiveResultSchema, url_name="get_archiveresult") def get_archiveresult(request, archiveresult_id: str): """Get a specific ArchiveResult by pk, abid, or old_id.""" - return ArchiveResult.objects.get(Q(pk__icontains=archiveresult_id) | Q(abid__icontains=archiveresult_id) | Q(old_id__icontains=archiveresult_id)) + return ArchiveResult.objects.get(Q(id__icontains=archiveresult_id) | Q(abid__icontains=archiveresult_id) | Q(old_id__icontains=archiveresult_id)) # @router.post("/archiveresult", response=ArchiveResultSchema) @@ -137,9 +137,10 @@ def get_archiveresult(request, archiveresult_id: str): class SnapshotSchema(Schema): + id: UUID + old_id: UUID abid: str - uuid: UUID - pk: str + modified: datetime created: datetime created_by_id: str @@ -189,10 +190,12 @@ class SnapshotSchema(Schema): class SnapshotFilterSchema(FilterSchema): + id: Optional[str] = Field(None, q='id__icontains') + old_id: Optional[str] = Field(None, q='old_id__icontains') abid: Optional[str] = Field(None, q='abid__icontains') - uuid: Optional[str] = Field(None, q='uuid__icontains') - pk: Optional[str] = Field(None, q='pk__icontains') + created_by_id: str = Field(None, q='created_by_id__icontains') + created__gte: datetime = Field(None, q='created__gte') created__lt: datetime = Field(None, q='created__lt') created: datetime = Field(None, q='created') @@ -200,7 +203,7 @@ class 
SnapshotFilterSchema(FilterSchema): modified__gte: datetime = Field(None, q='modified__gte') modified__lt: datetime = Field(None, q='modified__lt') - search: Optional[str] = Field(None, q=['url__icontains', 'title__icontains', 'tags__name__icontains', 'abid__icontains', 'uuid__icontains']) + search: Optional[str] = Field(None, q=['url__icontains', 'title__icontains', 'tags__name__icontains', 'id__icontains', 'abid__icontains', 'old_id__icontains']) url: Optional[str] = Field(None, q='url') tag: Optional[str] = Field(None, q='tags__name') title: Optional[str] = Field(None, q='title__icontains') @@ -227,7 +230,7 @@ def get_snapshot(request, snapshot_id: str, with_archiveresults: bool=True): request.with_archiveresults = with_archiveresults snapshot = None try: - snapshot = Snapshot.objects.get(Q(abid__startswith=snapshot_id)| Q(pk__startswith=snapshot_id)) + snapshot = Snapshot.objects.get(Q(abid__startswith=snapshot_id) | Q(id__startswith=snapshot_id) | Q(old_id__startswith=snapshot_id)) except Snapshot.DoesNotExist: pass @@ -237,7 +240,7 @@ def get_snapshot(request, snapshot_id: str, with_archiveresults: bool=True): pass try: - snapshot = snapshot or Snapshot.objects.get(Q(pk__icontains=snapshot_id) | Q(abid__icontains=snapshot_id)) + snapshot = snapshot or Snapshot.objects.get(Q(abid__icontains=snapshot_id) | Q(id__icontains=snapshot_id) | Q(old_id__icontains=snapshot_id)) except Snapshot.DoesNotExist: pass diff --git a/archivebox/core/admin.py b/archivebox/core/admin.py index 8b9c126c..ad10ef18 100644 --- a/archivebox/core/admin.py +++ b/archivebox/core/admin.py @@ -178,11 +178,6 @@ def get_abid_info(self, obj): return format_html( # URL Hash: {}
    ''' -     DB PK:              {}
    -        .old_id:            {}    
    -        .id:                    {}    
    -        .uuid:                {}    
    -
        ABID:         {}_{}                            /api/v1 GET JSON     API DOCS
            TS:                  {}        ({})
    @@ -191,22 +186,18 @@ def get_abid_info(self, obj):   RAND:   {} ({})       SALT:   {}

    -         .ulid:                    {}
    -         .uuid:                   {}

    +         .uuid:                   {}
    +         .old_id:                {}
    ''', - obj.pk, - getattr(obj, 'old_id', ''), - getattr(obj, 'id', ''), - getattr(obj, 'uuid', ''), - *obj.abid.split('_', 1), obj.api_url, obj.api_docs_url, + *str(obj.abid or obj.get_abid()).split('_', 1), obj.api_url, obj.api_docs_url, obj.ABID.ts, obj.abid_values['ts'].isoformat() if isinstance(obj.abid_values['ts'], datetime) else obj.abid_values['ts'], obj.ABID.uri, str(obj.abid_values['uri']), obj.ABID.subtype, str(obj.abid_values['subtype']), obj.ABID.rand, str(obj.abid_values['rand'])[-7:], obj.ABID.uri_salt, - obj.ABID.ulid, obj.ABID.uuid, + getattr(obj, 'old_id', ''), ) @@ -495,7 +486,10 @@ class TagAdmin(admin.ModelAdmin): ordering = ['-id'] def identifiers(self, obj): - return get_abid_info(self, obj) + try: + return get_abid_info(self, obj) + except Exception as e: + return str(e) def num_snapshots(self, tag): return format_html( diff --git a/archivebox/core/models.py b/archivebox/core/models.py index 61a62714..183697a2 100644 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -163,6 +163,10 @@ class Snapshot(ABIDModel): title = self.title or '-' return f'[{self.timestamp}] {self.url[:64]} ({title[:64]})' + def save(self, *args, **kwargs): + super().save(*args, **kwargs) + assert str(self.id) == str(self.abid.uuid) == str(self.uuid) + @classmethod def from_json(cls, info: dict): info = {k: v for k, v in info.items() if k in cls.keys} @@ -391,7 +395,7 @@ class ArchiveResult(ABIDModel): def save(self, *args, **kwargs): super().save(*args, **kwargs) - assert str(self.id) == str(self.abid.uuid) + assert str(self.id) == str(self.abid.uuid) == str(self.uuid) @property def uuid(self): From 850448b42c46c2790330077f5b11fa5a3e8f5701 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 20 Aug 2024 01:56:07 -0700 Subject: [PATCH 40/47] add salt args in more places --- archivebox/abid_utils/abid.py | 2 ++ archivebox/abid_utils/models.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/archivebox/abid_utils/abid.py 
b/archivebox/abid_utils/abid.py index 66472f10..3c90e83c 100644 --- a/archivebox/abid_utils/abid.py +++ b/archivebox/abid_utils/abid.py @@ -36,6 +36,8 @@ class ABID(NamedTuple): uri: str # e.g. E4A5CCD9 subtype: str # e.g. 01 rand: str # e.g. ZYEBQE + + # salt: str = DEFAULT_ABID_URI_SALT def __getattr__(self, attr: str) -> Any: return getattr(self.ulid, attr) diff --git a/archivebox/abid_utils/models.py b/archivebox/abid_utils/models.py index 07fd3b3a..9d0ab1d5 100644 --- a/archivebox/abid_utils/models.py +++ b/archivebox/abid_utils/models.py @@ -26,6 +26,7 @@ from .abid import ( ABID_RAND_LEN, ABID_SUFFIX_LEN, DEFAULT_ABID_PREFIX, + DEFAULT_ABID_URI_SALT, abid_part_from_prefix, abid_from_values ) @@ -132,6 +133,7 @@ class ABIDModel(models.Model): uri=uri, subtype=subtype, rand=rand, + salt=DEFAULT_ABID_URI_SALT, ) assert abid.ulid and abid.uuid and abid.typeid, f'Failed to calculate {prefix}_ABID for {self.__class__.__name__}' return abid From 54acfd9f8605d2bb026d8a9c4cbd74c4c62e8185 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 20 Aug 2024 01:56:37 -0700 Subject: [PATCH 41/47] improve REST API filter parameters and pagination --- archivebox/api/v1_core.py | 176 ++++++++++++++++++++++++++++++++------ 1 file changed, 151 insertions(+), 25 deletions(-) diff --git a/archivebox/api/v1_core.py b/archivebox/api/v1_core.py index 56e9d22a..0c701104 100644 --- a/archivebox/api/v1_core.py +++ b/archivebox/api/v1_core.py @@ -1,14 +1,17 @@ __package__ = 'archivebox.api' +import math from uuid import UUID -from typing import List, Optional +from typing import List, Optional, Union, Any from datetime import datetime from django.db.models import Q from django.shortcuts import get_object_or_404 +from django.core.exceptions import ValidationError +from django.contrib.auth import get_user_model from ninja import Router, Schema, FilterSchema, Field, Query -from ninja.pagination import paginate +from ninja.pagination import paginate, PaginationBase from core.models import 
Snapshot, ArchiveResult, Tag from abid_utils.abid import ABID @@ -17,10 +20,45 @@ router = Router(tags=['Core Models']) +class CustomPagination(PaginationBase): + class Input(Schema): + limit: int = 200 + offset: int = 0 + page: int = 0 + + + class Output(Schema): + total_items: int + total_pages: int + page: int + limit: int + offset: int + num_items: int + items: List[Any] + + def paginate_queryset(self, queryset, pagination: Input, **params): + limit = min(pagination.limit, 500) + offset = pagination.offset or (pagination.page * limit) + total = queryset.count() + total_pages = math.ceil(total / limit) + current_page = math.ceil(offset / (limit + 1)) + items = queryset[offset : offset + limit] + return { + 'total_items': total, + 'total_pages': total_pages, + 'page': current_page, + 'limit': limit, + 'offset': offset, + 'num_items': len(items), + 'items': items, + } + ### ArchiveResult ######################################################################### class ArchiveResultSchema(Schema): + TYPE: str = 'core.models.ArchiveResult' + id: UUID old_id: int abid: str @@ -28,8 +66,10 @@ class ArchiveResultSchema(Schema): modified: datetime created: datetime created_by_id: str + created_by_username: str snapshot_abid: str + snapshot_timestamp: str snapshot_url: str snapshot_tags: str @@ -43,6 +83,11 @@ class ArchiveResultSchema(Schema): @staticmethod def resolve_created_by_id(obj): return str(obj.created_by_id) + + @staticmethod + def resolve_created_by_username(obj): + User = get_user_model() + return User.objects.get(id=obj.created_by_id).username @staticmethod def resolve_pk(obj): @@ -60,6 +105,10 @@ class ArchiveResultSchema(Schema): def resolve_created(obj): return obj.start_ts + @staticmethod + def resolve_snapshot_timestamp(obj): + return obj.snapshot.timestamp + @staticmethod def resolve_snapshot_url(obj): return obj.snapshot.url @@ -74,10 +123,10 @@ class ArchiveResultSchema(Schema): class ArchiveResultFilterSchema(FilterSchema): - id: Optional[UUID] = 
Field(None, q='id') + id: Optional[str] = Field(None, q=['id__startswith', 'abid__icontains', 'old_id__startswith', 'snapshot__id__startswith', 'snapshot__abid__icontains', 'snapshot__timestamp__startswith']) - search: Optional[str] = Field(None, q=['snapshot__url__icontains', 'snapshot__title__icontains', 'snapshot__tags__name__icontains', 'extractor', 'output__icontains']) - snapshot_id: Optional[UUID] = Field(None, q='snapshot_id__icontains') + search: Optional[str] = Field(None, q=['snapshot__url__icontains', 'snapshot__title__icontains', 'snapshot__tags__name__icontains', 'extractor', 'output__icontains', 'id__startswith', 'abid__icontains', 'old_id__startswith', 'snapshot__id__startswith', 'snapshot__abid__icontains', 'snapshot__timestamp__startswith']) + snapshot_id: Optional[str] = Field(None, q=['snapshot__id__startswith', 'snapshot__abid__icontains', 'snapshot__timestamp__startswith']) snapshot_url: Optional[str] = Field(None, q='snapshot__url__icontains') snapshot_tag: Optional[str] = Field(None, q='snapshot__tags__name__icontains') @@ -94,11 +143,11 @@ class ArchiveResultFilterSchema(FilterSchema): @router.get("/archiveresults", response=List[ArchiveResultSchema], url_name="get_archiveresult") -@paginate +@paginate(CustomPagination) def get_archiveresults(request, filters: ArchiveResultFilterSchema = Query(...)): """List all ArchiveResult entries matching these filters.""" qs = ArchiveResult.objects.all() - results = filters.filter(qs) + results = filters.filter(qs).distinct() return results @@ -137,6 +186,8 @@ def get_archiveresult(request, archiveresult_id: str): class SnapshotSchema(Schema): + TYPE: str = 'core.models.Snapshot' + id: UUID old_id: UUID abid: str @@ -144,6 +195,7 @@ class SnapshotSchema(Schema): modified: datetime created: datetime created_by_id: str + created_by_username: str url: str tags: str @@ -161,6 +213,11 @@ class SnapshotSchema(Schema): @staticmethod def resolve_created_by_id(obj): return str(obj.created_by_id) + + 
@staticmethod + def resolve_created_by_username(obj): + User = get_user_model() + return User.objects.get(id=obj.created_by_id).username @staticmethod def resolve_pk(obj): @@ -190,11 +247,13 @@ class SnapshotSchema(Schema): class SnapshotFilterSchema(FilterSchema): - id: Optional[str] = Field(None, q='id__icontains') + id: Optional[str] = Field(None, q=['id__icontains', 'abid__icontains', 'old_id__icontains', 'timestamp__startswith']) + old_id: Optional[str] = Field(None, q='old_id__icontains') abid: Optional[str] = Field(None, q='abid__icontains') - created_by_id: str = Field(None, q='created_by_id__icontains') + created_by_id: str = Field(None, q='created_by_id') + created_by_username: str = Field(None, q='created_by__username__icontains') created__gte: datetime = Field(None, q='created__gte') created__lt: datetime = Field(None, q='created__lt') @@ -203,7 +262,7 @@ class SnapshotFilterSchema(FilterSchema): modified__gte: datetime = Field(None, q='modified__gte') modified__lt: datetime = Field(None, q='modified__lt') - search: Optional[str] = Field(None, q=['url__icontains', 'title__icontains', 'tags__name__icontains', 'id__icontains', 'abid__icontains', 'old_id__icontains']) + search: Optional[str] = Field(None, q=['url__icontains', 'title__icontains', 'tags__name__icontains', 'id__icontains', 'abid__icontains', 'old_id__icontains', 'timestamp__startswith']) url: Optional[str] = Field(None, q='url') tag: Optional[str] = Field(None, q='tags__name') title: Optional[str] = Field(None, q='title__icontains') @@ -215,13 +274,13 @@ class SnapshotFilterSchema(FilterSchema): @router.get("/snapshots", response=List[SnapshotSchema], url_name="get_snapshots") -@paginate -def get_snapshots(request, filters: SnapshotFilterSchema = Query(...), with_archiveresults: bool=True): +@paginate(CustomPagination) +def get_snapshots(request, filters: SnapshotFilterSchema = Query(...), with_archiveresults: bool=False): """List all Snapshot entries matching these filters.""" 
request.with_archiveresults = with_archiveresults qs = Snapshot.objects.all() - results = filters.filter(qs) + results = filters.filter(qs).distinct() return results @router.get("/snapshot/{snapshot_id}", response=SnapshotSchema, url_name="get_snapshot") @@ -230,12 +289,7 @@ def get_snapshot(request, snapshot_id: str, with_archiveresults: bool=True): request.with_archiveresults = with_archiveresults snapshot = None try: - snapshot = Snapshot.objects.get(Q(abid__startswith=snapshot_id) | Q(id__startswith=snapshot_id) | Q(old_id__startswith=snapshot_id)) - except Snapshot.DoesNotExist: - pass - - try: - snapshot = snapshot or Snapshot.objects.get() + snapshot = Snapshot.objects.get(Q(abid__startswith=snapshot_id) | Q(id__startswith=snapshot_id) | Q(old_id__startswith=snapshot_id) | Q(timestamp__startswith=snapshot_id)) except Snapshot.DoesNotExist: pass @@ -244,6 +298,9 @@ def get_snapshot(request, snapshot_id: str, with_archiveresults: bool=True): except Snapshot.DoesNotExist: pass + if not snapshot: + raise Snapshot.DoesNotExist + return snapshot @@ -274,25 +331,94 @@ def get_snapshot(request, snapshot_id: str, with_archiveresults: bool=True): class TagSchema(Schema): - abid: Optional[UUID] = Field(None, q='abid') - uuid: Optional[UUID] = Field(None, q='uuid') - pk: Optional[UUID] = Field(None, q='pk') + TYPE: str = 'core.models.Tag' + + id: UUID + old_id: str + abid: str + modified: datetime created: datetime created_by_id: str + created_by_username: str name: str slug: str + num_snapshots: int + snapshots: List[SnapshotSchema] + @staticmethod + def resolve_old_id(obj): + return str(obj.old_id) @staticmethod def resolve_created_by_id(obj): return str(obj.created_by_id) + + @staticmethod + def resolve_created_by_username(obj): + User = get_user_model() + return User.objects.get(id=obj.created_by_id).username + + @staticmethod + def resolve_num_snapshots(obj, context): + return obj.snapshot_set.all().distinct().count() + + @staticmethod + def resolve_snapshots(obj, 
context): + if context['request'].with_snapshots: + return obj.snapshot_set.all().distinct() + return Snapshot.objects.none() @router.get("/tags", response=List[TagSchema], url_name="get_tags") +@paginate(CustomPagination) def get_tags(request): - return Tag.objects.all() + request.with_snapshots = False + request.with_archiveresults = False + return Tag.objects.all().distinct() @router.get("/tag/{tag_id}", response=TagSchema, url_name="get_tag") -def get_tag(request, tag_id: str): - return Tag.objects.get(id=tag_id) +def get_tag(request, tag_id: str, with_snapshots: bool=True): + request.with_snapshots = with_snapshots + request.with_archiveresults = False + tag = None + try: + tag = tag or Tag.objects.get(old_id__icontains=tag_id) + except (Tag.DoesNotExist, ValidationError, ValueError): + pass + + try: + tag = Tag.objects.get(abid__icontains=tag_id) + except (Tag.DoesNotExist, ValidationError): + pass + + try: + tag = tag or Tag.objects.get(id__icontains=tag_id) + except (Tag.DoesNotExist, ValidationError): + pass + return tag + + + +@router.get("/any/{abid}", response=Union[SnapshotSchema, ArchiveResultSchema, TagSchema], url_name="get_any") +def get_any(request, abid: str): + request.with_snapshots = False + request.with_archiveresults = False + + response = None + try: + response = response or get_snapshot(request, abid) + except Exception: + pass + + try: + response = response or get_archiveresult(request, abid) + except Exception: + pass + + try: + response = response or get_tag(request, abid) + except Exception: + pass + + return response From 506b3d28d4eaee38f825d8cbf8da1ed9ad3ab2fd Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 20 Aug 2024 01:57:07 -0700 Subject: [PATCH 42/47] fix admin UI TagInline and ArchiveResultInline form POST handling --- archivebox/core/admin.py | 120 ++++++++++++++++++++++++++++----------- 1 file changed, 86 insertions(+), 34 deletions(-) diff --git a/archivebox/core/admin.py b/archivebox/core/admin.py index 
ad10ef18..8f5ac72c 100644 --- a/archivebox/core/admin.py +++ b/archivebox/core/admin.py @@ -1,17 +1,19 @@ __package__ = 'archivebox.core' +import json from io import StringIO from pathlib import Path from contextlib import redirect_stdout from datetime import datetime, timezone from django.contrib import admin -from django.db.models import Count -from django.urls import path +from django.db.models import Count, Q +from django.urls import path, reverse from django.utils.html import format_html from django.utils.safestring import mark_safe from django.shortcuts import render, redirect from django.contrib.auth import get_user_model +from django.core.exceptions import ValidationError from django import forms @@ -124,12 +126,25 @@ archivebox_admin.get_urls = get_urls(archivebox_admin.get_urls).__get__(archiveb class ArchiveResultInline(admin.TabularInline): + name = 'Archive Results Log' model = ArchiveResult - fk_name = 'snapshot' + # fk_name = 'snapshot' extra = 1 + readonly_fields = ('result_id', 'start_ts', 'end_ts', 'extractor', 'command', 'cmd_version') + fields = ('id', *readonly_fields, 'status', 'output') + show_change_link = True + # # classes = ['collapse'] + # # list_display_links = ['abid'] -class TagInline(admin.StackedInline): - model = SnapshotTag + def result_id(self, obj): + return format_html('[{}]', reverse('admin:core_archiveresult_change', args=(obj.id,)), obj.abid) + + def command(self, obj): + return format_html('{}', " ".join(obj.cmd or [])) + + +class TagInline(admin.TabularInline): + model = Tag.snapshot_set.through # fk_name = 'snapshot' fields = ('id', 'tag') extra = 1 @@ -178,42 +193,51 @@ def get_abid_info(self, obj): return format_html( # URL Hash: {}
    ''' + {}     📖 API DOCS +

    -     ABID:         {}_{}                            /api/v1 GET JSON     API DOCS
    -         TS:                  {}        ({})
    -         URI:                 {}           ({})
    -         SUBTYPE:       {} ({})     +     TS:                  {}        ({})
    +     URI:                 {}           ({})
    +     SUBTYPE:       {} ({})       RAND:   {} ({})       SALT:   {}

    -         .uuid:                   {}
    -         .old_id:                {} +     .abid:                   {}
    +     .abid.uuid:           {}
    +     .id:                       {}
    +     .old_id:                {}
    ''', - *str(obj.abid or obj.get_abid()).split('_', 1), obj.api_url, obj.api_docs_url, + obj.api_url, obj.api_url, obj.api_docs_url, obj.ABID.ts, obj.abid_values['ts'].isoformat() if isinstance(obj.abid_values['ts'], datetime) else obj.abid_values['ts'], obj.ABID.uri, str(obj.abid_values['uri']), obj.ABID.subtype, str(obj.abid_values['subtype']), obj.ABID.rand, str(obj.abid_values['rand'])[-7:], obj.ABID.uri_salt, - obj.ABID.uuid, + str(obj.abid), + str(obj.ABID.uuid), + obj.id, getattr(obj, 'old_id', ''), ) @admin.register(Snapshot, site=archivebox_admin) class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin): + class Meta: + model = Snapshot + list_display = ('added', 'title_str', 'files', 'size', 'url_str') + # list_editable = ('title',) sort_fields = ('title_str', 'url_str', 'added', 'files') - readonly_fields = ('tags', 'timestamp', 'admin_actions', 'status_info', 'bookmarked', 'added', 'updated', 'created', 'modified', 'identifiers') + readonly_fields = ('tags', 'timestamp', 'admin_actions', 'status_info', 'bookmarked', 'added', 'updated', 'created', 'modified', 'API', 'link_dir') search_fields = ('id', 'url', 'abid', 'old_id', 'timestamp', 'title', 'tags__name') list_filter = ('added', 'updated', 'archiveresult__status', 'created_by', 'tags') fields = ('url', 'created_by', 'title', *readonly_fields) ordering = ['-added'] actions = ['add_tags', 'remove_tags', 'update_titles', 'update_snapshots', 'resnapshot_snapshot', 'overwrite_snapshots', 'delete_snapshots'] - autocomplete_fields = ['tags'] - # inlines = [TagInline, ArchiveResultInline] - inlines = [ArchiveResultInline] + # autocomplete_fields = ['tags'] + inlines = [TagInline, ArchiveResultInline] + # inlines = [ArchiveResultInline] list_per_page = SNAPSHOTS_PER_PAGE action_form = SnapshotActionForm @@ -228,6 +252,35 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin): self.message_user(request, f'Error occurred while loading the page: {str(e)} {request.GET} {request.POST}') return 
super().changelist_view(request, GLOBAL_CONTEXT) + def change_view(self, request, object_id, form_url="", extra_context=None): + snapshot = None + + try: + snapshot = snapshot or Snapshot.objects.get(id=object_id) + except (Snapshot.DoesNotExist, Snapshot.MultipleObjectsReturned, ValidationError): + pass + + try: + snapshot = snapshot or Snapshot.objects.get(abid=Snapshot.abid_prefix + object_id.split('_', 1)[-1]) + except (Snapshot.DoesNotExist, ValidationError): + pass + + + try: + snapshot = snapshot or Snapshot.objects.get(old_id=object_id) + except (Snapshot.DoesNotExist, Snapshot.MultipleObjectsReturned, ValidationError): + pass + + if snapshot: + object_id = str(snapshot.id) + + return super().change_view( + request, + object_id, + form_url, + extra_context=extra_context, + ) + def get_urls(self): urls = super().get_urls() custom_urls = [ @@ -237,7 +290,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin): def get_queryset(self, request): self.request = request - return super().get_queryset(request).prefetch_related('tags').annotate(archiveresult_count=Count('archiveresult')) + return super().get_queryset(request).prefetch_related('tags', 'archiveresult_set').annotate(archiveresult_count=Count('archiveresult')) def tag_list(self, obj): return ', '.join(obj.tags.values_list('name', flat=True)) @@ -298,7 +351,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin): obj.extension or '-', ) - def identifiers(self, obj): + def API(self, obj): try: return get_abid_info(self, obj) except Exception as e: @@ -471,21 +524,21 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin): # actions = ['delete_selected'] # ordering = ['-id'] -# def identifiers(self, obj): +# def API(self, obj): # return get_abid_info(self, obj) @admin.register(Tag, site=archivebox_admin) class TagAdmin(admin.ModelAdmin): - list_display = ('slug', 'name', 'num_snapshots', 'snapshots', 'abid', 'id') - sort_fields = ('id', 'name', 'slug', 'abid') - 
readonly_fields = ('id', 'uuid', 'abid', 'created', 'modified', 'identifiers', 'num_snapshots', 'snapshots') - search_fields = ('id', 'abid', 'uuid', 'name', 'slug') - fields = ('name', 'slug', 'created_by', *readonly_fields) + list_display = ('abid', 'name', 'created', 'created_by', 'num_snapshots', 'snapshots') + sort_fields = ('name', 'slug', 'abid', 'created_by', 'created') + readonly_fields = ('slug', 'abid', 'created', 'modified', 'API', 'num_snapshots', 'snapshots') + search_fields = ('abid', 'name', 'slug') + fields = ('name', 'created_by', *readonly_fields) actions = ['delete_selected'] - ordering = ['-id'] + ordering = ['-created'] - def identifiers(self, obj): + def API(self, obj): try: return get_abid_info(self, obj) except Exception as e: @@ -502,11 +555,10 @@ class TagAdmin(admin.ModelAdmin): total_count = tag.snapshot_set.count() return mark_safe('
    '.join( format_html( - '{} [{}] {}', - snap.updated.strftime('%Y-%m-%d %H:%M') if snap.updated else 'pending...', + '[{}] {}', snap.pk, - snap.abid, - snap.url, + snap.updated.strftime('%Y-%m-%d %H:%M') if snap.updated else 'pending...', + snap.url[:64], ) for snap in tag.snapshot_set.order_by('-updated')[:10] ) + (f'
    and {total_count-10} more...' if tag.snapshot_set.count() > 10 else '')) @@ -516,8 +568,8 @@ class TagAdmin(admin.ModelAdmin): class ArchiveResultAdmin(admin.ModelAdmin): list_display = ('start_ts', 'snapshot_info', 'tags_str', 'extractor', 'cmd_str', 'status', 'output_str') sort_fields = ('start_ts', 'extractor', 'status') - readonly_fields = ('snapshot_info', 'tags_str', 'created', 'modified', 'identifiers') - search_fields = ('id', 'uuid', 'abid', 'snapshot__url', 'extractor', 'output', 'cmd_version', 'cmd', 'snapshot__timestamp') + readonly_fields = ('snapshot_info', 'tags_str', 'created', 'modified', 'API') + search_fields = ('id', 'old_id', 'abid', 'snapshot__url', 'extractor', 'output', 'cmd_version', 'cmd', 'snapshot__timestamp') fields = ('snapshot', 'extractor', 'status', 'output', 'pwd', 'cmd', 'start_ts', 'end_ts', 'created_by', 'cmd_version', *readonly_fields) autocomplete_fields = ['snapshot'] @@ -537,7 +589,7 @@ class ArchiveResultAdmin(admin.ModelAdmin): result.snapshot.url[:128], ) - def identifiers(self, obj): + def API(self, obj): try: return get_abid_info(self, obj) except Exception as e: From 9273db528e722b7ed258287debc5a27b5ca37f8a Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 20 Aug 2024 01:58:19 -0700 Subject: [PATCH 43/47] fix abid generation migrations to be historically consistent --- .../migrations/0024_auto_20240513_1143.py | 6 +- .../migrations/0027_update_snapshot_ids.py | 70 +++++++++++++++++-- .../migrations/0040_archiveresult_snapshot.py | 4 +- archivebox/core/models.py | 47 +++++++------ archivebox/core/settings.py | 2 +- archivebox/core/views.py | 1 + 6 files changed, 99 insertions(+), 31 deletions(-) diff --git a/archivebox/core/migrations/0024_auto_20240513_1143.py b/archivebox/core/migrations/0024_auto_20240513_1143.py index e2192794..f8cf645c 100644 --- a/archivebox/core/migrations/0024_auto_20240513_1143.py +++ b/archivebox/core/migrations/0024_auto_20240513_1143.py @@ -2,7 +2,7 @@ from django.db import 
migrations from datetime import datetime -from abid_utils.abid import abid_from_values +from abid_utils.abid import abid_from_values, DEFAULT_ABID_URI_SALT def calculate_abid(self): @@ -41,6 +41,7 @@ def calculate_abid(self): uri=uri, subtype=subtype, rand=rand, + salt=DEFAULT_ABID_URI_SALT, ) assert abid.ulid and abid.uuid and abid.typeid, f'Failed to calculate {prefix}_ABID for {self.__class__.__name__}' return abid @@ -65,8 +66,7 @@ def generate_snapshot_abids(apps, schema_editor): snapshot.abid = calculate_abid(snapshot) snapshot.uuid = snapshot.abid.uuid - snapshot.id = snapshot.abid.uuid - snapshot.save(update_fields=["abid", "uuid", "id"]) + snapshot.save(update_fields=["abid", "uuid"]) def generate_archiveresult_abids(apps, schema_editor): print(' Generating ArchiveResult.abid values... (may take an hour or longer for large collections...)') diff --git a/archivebox/core/migrations/0027_update_snapshot_ids.py b/archivebox/core/migrations/0027_update_snapshot_ids.py index 9b97782d..ad197c04 100644 --- a/archivebox/core/migrations/0027_update_snapshot_ids.py +++ b/archivebox/core/migrations/0027_update_snapshot_ids.py @@ -4,29 +4,89 @@ from django.db import migrations from django.db import migrations from datetime import datetime -from abid_utils.abid import ABID +from abid_utils.abid import ABID, abid_from_values, DEFAULT_ABID_URI_SALT +def calculate_abid(self): + """ + Return a freshly derived ABID (assembled from attrs defined in ABIDModel.abid_*_src). + """ + prefix = self.abid_prefix + ts = eval(self.abid_ts_src) + uri = eval(self.abid_uri_src) + subtype = eval(self.abid_subtype_src) + rand = eval(self.abid_rand_src) + + if (not prefix) or prefix == 'obj_': + suggested_abid = self.__class__.__name__[:3].lower() + raise Exception(f'{self.__class__.__name__}.abid_prefix must be defined to calculate ABIDs (suggested: {suggested_abid})') + + if not ts: + ts = datetime.utcfromtimestamp(0) + print(f'[!] 
WARNING: Generating ABID with ts=0000000000 placeholder because {self.__class__.__name__}.abid_ts_src={self.abid_ts_src} is unset!', ts.isoformat()) + + if not uri: + uri = str(self) + print(f'[!] WARNING: Generating ABID with uri=str(self) placeholder because {self.__class__.__name__}.abid_uri_src={self.abid_uri_src} is unset!', uri) + + if not subtype: + subtype = self.__class__.__name__ + print(f'[!] WARNING: Generating ABID with subtype={subtype} placeholder because {self.__class__.__name__}.abid_subtype_src={self.abid_subtype_src} is unset!', subtype) + + if not rand: + rand = getattr(self, 'uuid', None) or getattr(self, 'id', None) or getattr(self, 'pk') + print(f'[!] WARNING: Generating ABID with rand=self.id placeholder because {self.__class__.__name__}.abid_rand_src={self.abid_rand_src} is unset!', rand) + + abid = abid_from_values( + prefix=prefix, + ts=ts, + uri=uri, + subtype=subtype, + rand=rand, + salt=DEFAULT_ABID_URI_SALT, + ) + assert abid.ulid and abid.uuid and abid.typeid, f'Failed to calculate {prefix}_ABID for {self.__class__.__name__}' + return abid + def update_snapshot_ids(apps, schema_editor): Snapshot = apps.get_model("core", "Snapshot") num_total = Snapshot.objects.all().count() print(f' Updating {num_total} Snapshot.id, Snapshot.uuid values in place...') for idx, snapshot in enumerate(Snapshot.objects.all().only('abid').iterator()): assert snapshot.abid - snapshot.uuid = ABID.parse(snapshot.abid).uuid - snapshot.save(update_fields=["uuid"]) + snapshot.abid_prefix = 'snp_' + snapshot.abid_ts_src = 'self.added' + snapshot.abid_uri_src = 'self.url' + snapshot.abid_subtype_src = '"01"' + snapshot.abid_rand_src = 'self.uuid' + + snapshot.abid = calculate_abid(snapshot) + snapshot.uuid = snapshot.abid.uuid + snapshot.save(update_fields=["abid", "uuid"]) assert str(ABID.parse(snapshot.abid).uuid) == str(snapshot.uuid) if idx % 1000 == 0: print(f'Migrated {idx}/{num_total} Snapshot objects...') def update_archiveresult_ids(apps, schema_editor): 
+ Snapshot = apps.get_model("core", "Snapshot") ArchiveResult = apps.get_model("core", "ArchiveResult") num_total = ArchiveResult.objects.all().count() print(f' Updating {num_total} ArchiveResult.id, ArchiveResult.uuid values in place... (may take an hour or longer for large collections...)') - for idx, result in enumerate(ArchiveResult.objects.all().only('abid').iterator()): + for idx, result in enumerate(ArchiveResult.objects.all().only('abid', 'snapshot_id').iterator()): assert result.abid + result.abid_prefix = 'res_' + result.snapshot = Snapshot.objects.get(pk=result.snapshot_id) + result.snapshot_added = result.snapshot.added + result.snapshot_url = result.snapshot.url + result.abid_ts_src = 'self.snapshot_added' + result.abid_uri_src = 'self.snapshot_url' + result.abid_subtype_src = 'self.extractor' + result.abid_rand_src = 'self.id' + + result.abid = calculate_abid(result) + result.uuid = result.abid.uuid result.uuid = ABID.parse(result.abid).uuid - result.save(update_fields=["uuid"]) + result.save(update_fields=["abid", "uuid"]) assert str(ABID.parse(result.abid).uuid) == str(result.uuid) if idx % 5000 == 0: print(f'Migrated {idx}/{num_total} ArchiveResult objects...') diff --git a/archivebox/core/migrations/0040_archiveresult_snapshot.py b/archivebox/core/migrations/0040_archiveresult_snapshot.py index fa04a9d4..8c09d079 100644 --- a/archivebox/core/migrations/0040_archiveresult_snapshot.py +++ b/archivebox/core/migrations/0040_archiveresult_snapshot.py @@ -8,9 +8,9 @@ def update_archiveresult_snapshot_ids(apps, schema_editor): Snapshot = apps.get_model("core", "Snapshot") num_total = ArchiveResult.objects.all().count() print(f' Updating {num_total} ArchiveResult.snapshot_id values in place... 
(may take an hour or longer for large collections...)') - for idx, result in enumerate(ArchiveResult.objects.all().only('snapshot_old_id').iterator()): + for idx, result in enumerate(ArchiveResult.objects.all().only('snapshot_old_id').iterator(chunk_size=5000)): assert result.snapshot_old_id - snapshot = Snapshot.objects.get(old_id=result.snapshot_old_id) + snapshot = Snapshot.objects.only('id').get(old_id=result.snapshot_old_id) result.snapshot_id = snapshot.id result.save(update_fields=["snapshot_id"]) assert str(result.snapshot_id) == str(snapshot.id) diff --git a/archivebox/core/models.py b/archivebox/core/models.py index 183697a2..f3b5211e 100644 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -17,7 +17,6 @@ from django.utils.text import slugify from django.core.cache import cache from django.urls import reverse, reverse_lazy from django.db.models import Case, When, Value, IntegerField -from django.contrib.auth.models import User # noqa from abid_utils.models import ABIDModel, ABIDField @@ -36,6 +35,8 @@ STATUS_CHOICES = [ ("skipped", "skipped") ] +def rand_int_id(): + return random.getrandbits(32) # class BaseModel(models.Model): @@ -49,24 +50,26 @@ STATUS_CHOICES = [ # abstract = True + + class Tag(ABIDModel): """ Based on django-taggit model + ABID base. 
""" abid_prefix = 'tag_' abid_ts_src = 'self.created' # TODO: add created/modified time - abid_uri_src = 'self.name' + abid_uri_src = 'self.slug' abid_subtype_src = '"03"' - abid_rand_src = 'self.id' + abid_rand_src = 'self.old_id' - # id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=True) - id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID') - uuid = models.UUIDField(default=uuid.uuid4, null=True, unique=True) + old_id = models.BigIntegerField(unique=True, default=rand_int_id, serialize=False, verbose_name='Old ID') # legacy PK + + id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False, unique=True) abid = ABIDField(prefix=abid_prefix) name = models.CharField(unique=True, blank=False, max_length=100) - slug = models.SlugField(unique=True, blank=True, max_length=100) + slug = models.SlugField(unique=True, blank=False, max_length=100, editable=False) # slug is autoset on save from name, never set it manually @@ -77,9 +80,9 @@ class Tag(ABIDModel): def __str__(self): return self.name - @property - def old_id(self): - return self.id + # @property + # def old_id(self): + # return self.id def slugify(self, tag, i=None): slug = slugify(tag) @@ -156,16 +159,19 @@ class Snapshot(ABIDModel): return self.id def __repr__(self) -> str: - title = self.title or '-' - return f'[{self.timestamp}] {self.url[:64]} ({title[:64]})' + title = (self.title_stripped or '-')[:64] + return f'[{self.timestamp}] {self.url[:64]} ({title})' def __str__(self) -> str: - title = self.title or '-' - return f'[{self.timestamp}] {self.url[:64]} ({title[:64]})' + title = (self.title_stripped or '-')[:64] + return f'[{self.timestamp}] {self.url[:64]} ({title})' def save(self, *args, **kwargs): super().save(*args, **kwargs) - assert str(self.id) == str(self.abid.uuid) == str(self.uuid) + try: + assert str(self.id) == str(self.ABID.uuid) == str(self.uuid), f'Snapshot.id ({self.id}) does not match .ABID.uuid ({self.ABID.uuid})' + except 
AssertionError as e: + print(e) @classmethod def from_json(cls, info: dict): @@ -357,9 +363,6 @@ class ArchiveResultManager(models.Manager): qs = qs.annotate(indexing_precedence=Case(*precedence, default=Value(1000),output_field=IntegerField())).order_by('indexing_precedence') return qs -def rand_int_id(): - return random.getrandbits(32) - class ArchiveResult(ABIDModel): abid_prefix = 'res_' abid_ts_src = 'self.snapshot.added' @@ -387,7 +390,8 @@ class ArchiveResult(ABIDModel): objects = ArchiveResultManager() class Meta(TypedModelMeta): - verbose_name = 'Result' + verbose_name = 'Archive Result' + verbose_name_plural = 'Archive Results Log' def __str__(self): @@ -395,7 +399,10 @@ class ArchiveResult(ABIDModel): def save(self, *args, **kwargs): super().save(*args, **kwargs) - assert str(self.id) == str(self.abid.uuid) == str(self.uuid) + try: + assert str(self.id) == str(self.ABID.uuid) == str(self.uuid), f'ArchiveResult.id ({self.id}) does not match .ABID.uuid ({self.ABID.uuid})' + except AssertionError as e: + print(e) @property def uuid(self): diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py index be530e6f..0faeb570 100644 --- a/archivebox/core/settings.py +++ b/archivebox/core/settings.py @@ -83,7 +83,7 @@ INSTALLED_APPS = [ 'django.contrib.staticfiles', 'django.contrib.admin', 'django_jsonform', - + 'signal_webhooks', 'abid_utils', 'plugantic', diff --git a/archivebox/core/views.py b/archivebox/core/views.py index 7e14e8c1..ab0c2fa1 100644 --- a/archivebox/core/views.py +++ b/archivebox/core/views.py @@ -181,6 +181,7 @@ class SnapshotView(View): except (IndexError, ValueError): slug, archivefile = path.split('/', 1)[0], 'index.html' + # slug is a timestamp if slug.replace('.','').isdigit(): From 52a813aa80ee9c4b81389a71f12c53812c78b6a3 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 20 Aug 2024 01:58:36 -0700 Subject: [PATCH 44/47] fix title display in admin UI and abid filter matching in urls --- archivebox/core/models.py | 4 
++++ archivebox/core/settings.py | 2 ++ archivebox/core/views.py | 33 +++++++++++++++++++++++++++++---- 3 files changed, 35 insertions(+), 4 deletions(-) diff --git a/archivebox/core/models.py b/archivebox/core/models.py index f3b5211e..c2b6d4e6 100644 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -213,6 +213,10 @@ class Snapshot(ABIDModel): @property def api_docs_url(self) -> str: return f'/api/v1/docs#/Core%20Models/api_v1_core_get_snapshot' + + @cached_property + def title_stripped(self) -> str: + return (self.title or '').replace("\n", " ").replace("\r", "") @cached_property def extension(self) -> str: diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py index 0faeb570..cac65ee6 100644 --- a/archivebox/core/settings.py +++ b/archivebox/core/settings.py @@ -120,6 +120,8 @@ MIDDLEWARE = [ ### Authentication Settings ################################################################################ +# AUTH_USER_MODEL = 'auth.User' # cannot be easily changed unfortunately + AUTHENTICATION_BACKENDS = [ 'django.contrib.auth.backends.RemoteUserBackend', 'django.contrib.auth.backends.ModelBackend', diff --git a/archivebox/core/views.py b/archivebox/core/views.py index ab0c2fa1..1b322d39 100644 --- a/archivebox/core/views.py +++ b/archivebox/core/views.py @@ -228,7 +228,7 @@ class SnapshotView(View): snap.timestamp, snap.timestamp, snap.url, - snap.title or '', + snap.title_stripped[:64] or '', ) for snap in Snapshot.objects.filter(timestamp__startswith=slug).only('url', 'timestamp', 'title', 'added').order_by('-added') ) @@ -279,12 +279,35 @@ class SnapshotView(View): content_type="text/html", status=404, ) + + # # slud is an ID + # ulid = slug.split('_', 1)[-1] + # try: + # try: + # snapshot = snapshot or Snapshot.objects.get(Q(abid=ulid) | Q(id=ulid) | Q(old_id=ulid)) + # except Snapshot.DoesNotExist: + # pass + + # try: + # snapshot = Snapshot.objects.get(Q(abid__startswith=slug) | Q(abid__startswith=Snapshot.abid_prefix + slug) 
| Q(id__startswith=slug) | Q(old_id__startswith=slug)) + # except (Snapshot.DoesNotExist, Snapshot.MultipleObjectsReturned): + # pass + + # try: + # snapshot = snapshot or Snapshot.objects.get(Q(abid__icontains=snapshot_id) | Q(id__icontains=snapshot_id) | Q(old_id__icontains=snapshot_id)) + # except Snapshot.DoesNotExist: + # pass + # return redirect(f'/archive/{snapshot.timestamp}/index.html') + # except Snapshot.DoesNotExist: + # pass + # slug is a URL try: try: - # try exact match on full url first + # try exact match on full url / ABID first snapshot = Snapshot.objects.get( Q(url='http://' + path) | Q(url='https://' + path) | Q(id__startswith=path) + | Q(abid__icontains=path) | Q(id__icontains=path) | Q(old_id__icontains=path) ) except Snapshot.DoesNotExist: # fall back to match on exact base_url @@ -318,15 +341,17 @@ class SnapshotView(View): except Snapshot.MultipleObjectsReturned: snapshot_hrefs = mark_safe('
    ').join( format_html( - '{}
    {} {} {}', + '{} {} {} {} {}', snap.added.strftime('%Y-%m-%d %H:%M:%S'), + snap.abid, snap.timestamp, snap.timestamp, snap.url, - snap.title or '', + snap.title_stripped[:64] or '', ) for snap in Snapshot.objects.filter( Q(url__startswith='http://' + base_url(path)) | Q(url__startswith='https://' + base_url(path)) + | Q(abid__icontains=path) | Q(id__icontains=path) | Q(old_id__icontains=path) ).only('url', 'timestamp', 'title', 'added').order_by('-added') ) return HttpResponse( From 849b4963a1e308aa085885ea210ef430ec8a8d29 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 20 Aug 2024 01:58:44 -0700 Subject: [PATCH 45/47] add migrations --- .../core/migrations/0055_alter_tag_slug.py | 18 +++++ .../core/migrations/0056_remove_tag_uuid.py | 17 ++++ .../migrations/0057_rename_id_tag_old_id.py | 18 +++++ .../core/migrations/0058_alter_tag_old_id.py | 19 +++++ archivebox/core/migrations/0059_tag_id.py | 81 +++++++++++++++++++ .../core/migrations/0060_alter_tag_id.py | 19 +++++ ...rename_tag_snapshottag_old_tag_and_more.py | 22 +++++ .../0062_alter_snapshottag_old_tag.py | 19 +++++ ...apshottag_tag_alter_snapshottag_old_tag.py | 40 +++++++++ ...er_snapshottag_unique_together_and_more.py | 27 +++++++ .../0065_remove_snapshottag_old_tag.py | 17 ++++ ...ottag_tag_alter_tag_id_alter_tag_old_id.py | 31 +++++++ .../migrations/0067_alter_snapshottag_tag.py | 19 +++++ .../0068_alter_archiveresult_options.py | 17 ++++ 14 files changed, 364 insertions(+) create mode 100644 archivebox/core/migrations/0055_alter_tag_slug.py create mode 100644 archivebox/core/migrations/0056_remove_tag_uuid.py create mode 100644 archivebox/core/migrations/0057_rename_id_tag_old_id.py create mode 100644 archivebox/core/migrations/0058_alter_tag_old_id.py create mode 100644 archivebox/core/migrations/0059_tag_id.py create mode 100644 archivebox/core/migrations/0060_alter_tag_id.py create mode 100644 archivebox/core/migrations/0061_rename_tag_snapshottag_old_tag_and_more.py create mode 100644 
archivebox/core/migrations/0062_alter_snapshottag_old_tag.py create mode 100644 archivebox/core/migrations/0063_snapshottag_tag_alter_snapshottag_old_tag.py create mode 100644 archivebox/core/migrations/0064_alter_snapshottag_unique_together_and_more.py create mode 100644 archivebox/core/migrations/0065_remove_snapshottag_old_tag.py create mode 100644 archivebox/core/migrations/0066_alter_snapshottag_tag_alter_tag_id_alter_tag_old_id.py create mode 100644 archivebox/core/migrations/0067_alter_snapshottag_tag.py create mode 100644 archivebox/core/migrations/0068_alter_archiveresult_options.py diff --git a/archivebox/core/migrations/0055_alter_tag_slug.py b/archivebox/core/migrations/0055_alter_tag_slug.py new file mode 100644 index 00000000..741b1365 --- /dev/null +++ b/archivebox/core/migrations/0055_alter_tag_slug.py @@ -0,0 +1,18 @@ +# Generated by Django 5.0.6 on 2024-08-20 03:24 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0054_alter_snapshot_timestamp'), + ] + + operations = [ + migrations.AlterField( + model_name='tag', + name='slug', + field=models.SlugField(editable=False, max_length=100, unique=True), + ), + ] diff --git a/archivebox/core/migrations/0056_remove_tag_uuid.py b/archivebox/core/migrations/0056_remove_tag_uuid.py new file mode 100644 index 00000000..9c01507e --- /dev/null +++ b/archivebox/core/migrations/0056_remove_tag_uuid.py @@ -0,0 +1,17 @@ +# Generated by Django 5.0.6 on 2024-08-20 03:25 + +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0055_alter_tag_slug'), + ] + + operations = [ + migrations.RemoveField( + model_name='tag', + name='uuid', + ), + ] diff --git a/archivebox/core/migrations/0057_rename_id_tag_old_id.py b/archivebox/core/migrations/0057_rename_id_tag_old_id.py new file mode 100644 index 00000000..ebe20b01 --- /dev/null +++ b/archivebox/core/migrations/0057_rename_id_tag_old_id.py @@ -0,0 
+1,18 @@ +# Generated by Django 5.0.6 on 2024-08-20 03:29 + +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0056_remove_tag_uuid'), + ] + + operations = [ + migrations.RenameField( + model_name='tag', + old_name='id', + new_name='old_id', + ), + ] diff --git a/archivebox/core/migrations/0058_alter_tag_old_id.py b/archivebox/core/migrations/0058_alter_tag_old_id.py new file mode 100644 index 00000000..4cc291c0 --- /dev/null +++ b/archivebox/core/migrations/0058_alter_tag_old_id.py @@ -0,0 +1,19 @@ +# Generated by Django 5.0.6 on 2024-08-20 03:30 + +import core.models +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0057_rename_id_tag_old_id'), + ] + + operations = [ + migrations.AlterField( + model_name='tag', + name='old_id', + field=models.BigIntegerField(default=core.models.rand_int_id, primary_key=True, serialize=False, verbose_name='Old ID'), + ), + ] diff --git a/archivebox/core/migrations/0059_tag_id.py b/archivebox/core/migrations/0059_tag_id.py new file mode 100644 index 00000000..004ac541 --- /dev/null +++ b/archivebox/core/migrations/0059_tag_id.py @@ -0,0 +1,81 @@ +# Generated by Django 5.0.6 on 2024-08-20 03:33 + +from django.db import migrations, models +from abid_utils.models import ABID, abid_from_values + + +def calculate_abid(self): + """ + Return a freshly derived ABID (assembled from attrs defined in ABIDModel.abid_*_src). + """ + prefix = self.abid_prefix + ts = eval(self.abid_ts_src) + uri = eval(self.abid_uri_src) + subtype = eval(self.abid_subtype_src) + rand = eval(self.abid_rand_src) + + if (not prefix) or prefix == 'obj_': + suggested_abid = self.__class__.__name__[:3].lower() + raise Exception(f'{self.__class__.__name__}.abid_prefix must be defined to calculate ABIDs (suggested: {suggested_abid})') + + if not ts: + ts = datetime.utcfromtimestamp(0) + print(f'[!] 
WARNING: Generating ABID with ts=0000000000 placeholder because {self.__class__.__name__}.abid_ts_src={self.abid_ts_src} is unset!', ts.isoformat()) + + if not uri: + uri = str(self) + print(f'[!] WARNING: Generating ABID with uri=str(self) placeholder because {self.__class__.__name__}.abid_uri_src={self.abid_uri_src} is unset!', uri) + + if not subtype: + subtype = self.__class__.__name__ + print(f'[!] WARNING: Generating ABID with subtype={subtype} placeholder because {self.__class__.__name__}.abid_subtype_src={self.abid_subtype_src} is unset!', subtype) + + if not rand: + rand = getattr(self, 'uuid', None) or getattr(self, 'id', None) or getattr(self, 'pk') + print(f'[!] WARNING: Generating ABID with rand=self.id placeholder because {self.__class__.__name__}.abid_rand_src={self.abid_rand_src} is unset!', rand) + + abid = abid_from_values( + prefix=prefix, + ts=ts, + uri=uri, + subtype=subtype, + rand=rand, + ) + assert abid.ulid and abid.uuid and abid.typeid, f'Failed to calculate {prefix}_ABID for {self.__class__.__name__}' + return abid + + +def update_archiveresult_ids(apps, schema_editor): + Tag = apps.get_model("core", "Tag") + num_total = Tag.objects.all().count() + print(f' Updating {num_total} Tag.id, ArchiveResult.uuid values in place...') + for idx, tag in enumerate(Tag.objects.all().iterator()): + assert tag.name + tag.abid_prefix = 'tag_' + tag.abid_ts_src = 'self.created' + tag.abid_uri_src = 'self.slug' + tag.abid_subtype_src = '"03"' + tag.abid_rand_src = 'self.old_id' + tag.abid = calculate_abid(tag) + tag.id = tag.abid.uuid + tag.save(update_fields=["abid", "id"]) + assert str(ABID.parse(tag.abid).uuid) == str(tag.id) + if idx % 10 == 0: + print(f'Migrated {idx}/{num_total} Tag objects...') + + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0058_alter_tag_old_id'), + ] + + operations = [ + migrations.AddField( + model_name='tag', + name='id', + field=models.UUIDField(blank=True, null=True), + ), + 
migrations.RunPython(update_archiveresult_ids, reverse_code=migrations.RunPython.noop), + ] diff --git a/archivebox/core/migrations/0060_alter_tag_id.py b/archivebox/core/migrations/0060_alter_tag_id.py new file mode 100644 index 00000000..aeabefdc --- /dev/null +++ b/archivebox/core/migrations/0060_alter_tag_id.py @@ -0,0 +1,19 @@ +# Generated by Django 5.0.6 on 2024-08-20 03:42 + +import uuid +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0059_tag_id'), + ] + + operations = [ + migrations.AlterField( + model_name='tag', + name='id', + field=models.UUIDField(default=uuid.uuid4, editable=False, unique=True), + ), + ] diff --git a/archivebox/core/migrations/0061_rename_tag_snapshottag_old_tag_and_more.py b/archivebox/core/migrations/0061_rename_tag_snapshottag_old_tag_and_more.py new file mode 100644 index 00000000..e29c8081 --- /dev/null +++ b/archivebox/core/migrations/0061_rename_tag_snapshottag_old_tag_and_more.py @@ -0,0 +1,22 @@ +# Generated by Django 5.0.6 on 2024-08-20 03:43 + +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0060_alter_tag_id'), + ] + + operations = [ + migrations.RenameField( + model_name='snapshottag', + old_name='tag', + new_name='old_tag', + ), + migrations.AlterUniqueTogether( + name='snapshottag', + unique_together={('snapshot', 'old_tag')}, + ), + ] diff --git a/archivebox/core/migrations/0062_alter_snapshottag_old_tag.py b/archivebox/core/migrations/0062_alter_snapshottag_old_tag.py new file mode 100644 index 00000000..561d739c --- /dev/null +++ b/archivebox/core/migrations/0062_alter_snapshottag_old_tag.py @@ -0,0 +1,19 @@ +# Generated by Django 5.0.6 on 2024-08-20 03:44 + +import django.db.models.deletion +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0061_rename_tag_snapshottag_old_tag_and_more'), + ] + + operations = [ + 
migrations.AlterField( + model_name='snapshottag', + name='old_tag', + field=models.ForeignKey(db_column='old_tag_id', on_delete=django.db.models.deletion.CASCADE, to='core.tag'), + ), + ] diff --git a/archivebox/core/migrations/0063_snapshottag_tag_alter_snapshottag_old_tag.py b/archivebox/core/migrations/0063_snapshottag_tag_alter_snapshottag_old_tag.py new file mode 100644 index 00000000..6c574669 --- /dev/null +++ b/archivebox/core/migrations/0063_snapshottag_tag_alter_snapshottag_old_tag.py @@ -0,0 +1,40 @@ +# Generated by Django 5.0.6 on 2024-08-20 03:45 + +import django.db.models.deletion +from django.db import migrations, models + + +def update_snapshottag_ids(apps, schema_editor): + Tag = apps.get_model("core", "Tag") + SnapshotTag = apps.get_model("core", "SnapshotTag") + num_total = SnapshotTag.objects.all().count() + print(f' Updating {num_total} SnapshotTag.tag_id values in place... (may take an hour or longer for large collections...)') + for idx, snapshottag in enumerate(SnapshotTag.objects.all().only('old_tag_id').iterator()): + assert snapshottag.old_tag_id + tag = Tag.objects.get(old_id=snapshottag.old_tag_id) + snapshottag.tag_id = tag.id + snapshottag.save(update_fields=["tag_id"]) + assert str(snapshottag.tag_id) == str(tag.id) + if idx % 100 == 0: + print(f'Migrated {idx}/{num_total} SnapshotTag objects...') + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0062_alter_snapshottag_old_tag'), + ] + + operations = [ + migrations.AddField( + model_name='snapshottag', + name='tag', + field=models.ForeignKey(blank=True, db_column='tag_id', null=True, on_delete=django.db.models.deletion.CASCADE, to='core.tag', to_field='id'), + ), + migrations.AlterField( + model_name='snapshottag', + name='old_tag', + field=models.ForeignKey(db_column='old_tag_id', on_delete=django.db.models.deletion.CASCADE, related_name='snapshottags_old', to='core.tag'), + ), + migrations.RunPython(update_snapshottag_ids, 
reverse_code=migrations.RunPython.noop), + ] diff --git a/archivebox/core/migrations/0064_alter_snapshottag_unique_together_and_more.py b/archivebox/core/migrations/0064_alter_snapshottag_unique_together_and_more.py new file mode 100644 index 00000000..911bf68b --- /dev/null +++ b/archivebox/core/migrations/0064_alter_snapshottag_unique_together_and_more.py @@ -0,0 +1,27 @@ +# Generated by Django 5.0.6 on 2024-08-20 03:50 + +import django.db.models.deletion +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0063_snapshottag_tag_alter_snapshottag_old_tag'), + ] + + operations = [ + migrations.AlterUniqueTogether( + name='snapshottag', + unique_together=set(), + ), + migrations.AlterField( + model_name='snapshottag', + name='tag', + field=models.ForeignKey(db_column='tag_id', on_delete=django.db.models.deletion.CASCADE, to='core.tag', to_field='id'), + ), + migrations.AlterUniqueTogether( + name='snapshottag', + unique_together={('snapshot', 'tag')}, + ), + ] diff --git a/archivebox/core/migrations/0065_remove_snapshottag_old_tag.py b/archivebox/core/migrations/0065_remove_snapshottag_old_tag.py new file mode 100644 index 00000000..16b2eea0 --- /dev/null +++ b/archivebox/core/migrations/0065_remove_snapshottag_old_tag.py @@ -0,0 +1,17 @@ +# Generated by Django 5.0.6 on 2024-08-20 03:51 + +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0064_alter_snapshottag_unique_together_and_more'), + ] + + operations = [ + migrations.RemoveField( + model_name='snapshottag', + name='old_tag', + ), + ] diff --git a/archivebox/core/migrations/0066_alter_snapshottag_tag_alter_tag_id_alter_tag_old_id.py b/archivebox/core/migrations/0066_alter_snapshottag_tag_alter_tag_id_alter_tag_old_id.py new file mode 100644 index 00000000..e6022eab --- /dev/null +++ b/archivebox/core/migrations/0066_alter_snapshottag_tag_alter_tag_id_alter_tag_old_id.py @@ -0,0 +1,31 @@ 
+# Generated by Django 5.0.6 on 2024-08-20 03:52 + +import core.models +import django.db.models.deletion +import uuid +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0065_remove_snapshottag_old_tag'), + ] + + operations = [ + migrations.AlterField( + model_name='snapshottag', + name='tag', + field=models.ForeignKey(db_column='tag_id', on_delete=django.db.models.deletion.CASCADE, to='core.tag', to_field='id'), + ), + migrations.AlterField( + model_name='tag', + name='id', + field=models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False, unique=True), + ), + migrations.AlterField( + model_name='tag', + name='old_id', + field=models.BigIntegerField(default=core.models.rand_int_id, serialize=False, unique=True, verbose_name='Old ID'), + ), + ] diff --git a/archivebox/core/migrations/0067_alter_snapshottag_tag.py b/archivebox/core/migrations/0067_alter_snapshottag_tag.py new file mode 100644 index 00000000..b1c9f6a5 --- /dev/null +++ b/archivebox/core/migrations/0067_alter_snapshottag_tag.py @@ -0,0 +1,19 @@ +# Generated by Django 5.0.6 on 2024-08-20 03:53 + +import django.db.models.deletion +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0066_alter_snapshottag_tag_alter_tag_id_alter_tag_old_id'), + ] + + operations = [ + migrations.AlterField( + model_name='snapshottag', + name='tag', + field=models.ForeignKey(db_column='tag_id', on_delete=django.db.models.deletion.CASCADE, to='core.tag'), + ), + ] diff --git a/archivebox/core/migrations/0068_alter_archiveresult_options.py b/archivebox/core/migrations/0068_alter_archiveresult_options.py new file mode 100644 index 00000000..d5606592 --- /dev/null +++ b/archivebox/core/migrations/0068_alter_archiveresult_options.py @@ -0,0 +1,17 @@ +# Generated by Django 5.0.6 on 2024-08-20 07:26 + +from django.db import migrations + + +class 
Migration(migrations.Migration): + + dependencies = [ + ('core', '0067_alter_snapshottag_tag'), + ] + + operations = [ + migrations.AlterModelOptions( + name='archiveresult', + options={'verbose_name': 'Archive Result', 'verbose_name_plural': 'Archive Results Log'}, + ), + ] From 267964881c7a53941cec4feb41ac59e97b116cef Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 20 Aug 2024 03:29:57 -0700 Subject: [PATCH 46/47] fix tags editor in Snapshot list view --- archivebox/core/admin.py | 26 +++++++++++++------------- archivebox/templates/admin/base.html | 8 ++++++++ archivebox/templates/static/admin.css | 7 +++---- 3 files changed, 24 insertions(+), 17 deletions(-) diff --git a/archivebox/core/admin.py b/archivebox/core/admin.py index 8f5ac72c..78b6bdf8 100644 --- a/archivebox/core/admin.py +++ b/archivebox/core/admin.py @@ -155,25 +155,26 @@ class TagInline(admin.TabularInline): ) from django.contrib.admin.helpers import ActionForm -from django.contrib.admin.widgets import AutocompleteSelectMultiple +from django.contrib.admin.widgets import FilteredSelectMultiple -class AutocompleteTags: - model = Tag - search_fields = ['name'] - name = 'tags' - remote_field = TagInline +# class AutocompleteTags: +# model = Tag +# search_fields = ['name'] +# name = 'name' +# # source_field = 'name' +# remote_field = Tag._meta.get_field('name') -class AutocompleteTagsAdminStub: - name = 'admin' +# class AutocompleteTagsAdminStub: +# name = 'admin' class SnapshotActionForm(ActionForm): tags = forms.ModelMultipleChoiceField( queryset=Tag.objects.all(), required=False, - widget=AutocompleteSelectMultiple( - AutocompleteTags(), - AutocompleteTagsAdminStub(), + widget=FilteredSelectMultiple( + 'core_tag__name', + False, ), ) @@ -235,9 +236,8 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin): fields = ('url', 'created_by', 'title', *readonly_fields) ordering = ['-added'] actions = ['add_tags', 'remove_tags', 'update_titles', 'update_snapshots', 'resnapshot_snapshot', 
'overwrite_snapshots', 'delete_snapshots'] - # autocomplete_fields = ['tags'] + autocomplete_fields = ['tags'] inlines = [TagInline, ArchiveResultInline] - # inlines = [ArchiveResultInline] list_per_page = SNAPSHOTS_PER_PAGE action_form = SnapshotActionForm diff --git a/archivebox/templates/admin/base.html b/archivebox/templates/admin/base.html index c0d9ac5b..00e2f205 100644 --- a/archivebox/templates/admin/base.html +++ b/archivebox/templates/admin/base.html @@ -45,6 +45,13 @@ {% endif %} {% endblock %} + + + +