diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index a01ef8ad..c6375313 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -3,6 +3,7 @@ on: [push]
env:
MAX_LINE_LENGTH: 110
+ DOCKER_IMAGE: archivebox-ci
jobs:
lint:
@@ -118,12 +119,12 @@ jobs:
- name: Build image
run: |
- docker build . -t archivebox
+ docker build . -t "$DOCKER_IMAGE"
- name: Init data dir
run: |
mkdir data
- docker run -v "$PWD"/data:/data archivebox init
+ docker run -v "$PWD"/data:/data "$DOCKER_IMAGE" init
- name: Run test server
run: |
@@ -132,7 +133,7 @@ jobs:
- name: Add link
run: |
- docker run -v "$PWD"/data:/data --network host archivebox add http://www.test-nginx-1.local
+ docker run -v "$PWD"/data:/data --network host "$DOCKER_IMAGE" add http://www.test-nginx-1.local
- name: Add stdin link
run: |
@@ -140,8 +141,8 @@ jobs:
- name: List links
run: |
- docker run -v "$PWD"/data:/data archivebox list | grep -q "www.test-nginx-1.local" || { echo "The site 1 isn't in the list"; exit 1; }
- docker run -v "$PWD"/data:/data archivebox list | grep -q "www.test-nginx-2.local" || { echo "The site 2 isn't in the list"; exit 1; }
+ docker run -v "$PWD"/data:/data "$DOCKER_IMAGE" list | grep -q "www.test-nginx-1.local" || { echo "The site 1 isn't in the list"; exit 1; }
+ docker run -v "$PWD"/data:/data "$DOCKER_IMAGE" list | grep -q "www.test-nginx-2.local" || { echo "The site 2 isn't in the list"; exit 1; }
- name: Start docker-compose stack
run: |
diff --git a/archivebox/core/admin.py b/archivebox/core/admin.py
index 4337e4a3..55c68e16 100644
--- a/archivebox/core/admin.py
+++ b/archivebox/core/admin.py
@@ -9,9 +9,10 @@ from django.utils.html import format_html
from django.utils.safestring import mark_safe
from django.shortcuts import render, redirect
from django.contrib.auth import get_user_model
+from django import forms
from core.models import Snapshot
-from core.forms import AddLinkForm
+from core.forms import AddLinkForm, TagField
from core.utils import get_icons
from util import htmldecode, urldecode, ansi_to_html
@@ -55,6 +56,32 @@ def delete_snapshots(modeladmin, request, queryset):
delete_snapshots.short_description = "Delete"
+class SnapshotAdminForm(forms.ModelForm):
+    """Admin form for ``Snapshot`` that edits tags through a single ``TagField``."""
+    tags = TagField(required=False)
+
+    class Meta:
+        model = Snapshot
+        fields = "__all__"
+
+    def save(self, commit=True):
+        """Save the snapshot and arrange for its tags to be stored via ``save_tags``.
+
+        With ``commit=False`` the unsaved instance is returned and the caller
+        must call ``save_m2m()`` after saving it. With ``commit=True`` the
+        instance and its tags are written immediately.
+        """
+        # Based on: https://stackoverflow.com/a/49933068/3509554
+
+        # Get the unsaved instance
+        instance = super().save(commit=False)
+        # The cleaned value is a list of tag names produced by TagField, not
+        # Tag rows, so take it out of cleaned_data and route it through
+        # Snapshot.save_tags() instead of the stock m2m handling.
+        tags = self.cleaned_data.pop("tags")
+        stock_save_m2m = self.save_m2m
+
+        # Replacement save_m2m: keep the stock behaviour, then store the tags
+        def new_save_m2m():
+            stock_save_m2m()
+            instance.save_tags(tags)
+
+        self.save_m2m = new_save_m2m
+        if commit:
+            instance.save()
+            # A committed save must also write the many-to-many data,
+            # otherwise the submitted tags are silently dropped.
+            self.save_m2m()
+
+        return instance
+
+
class SnapshotAdmin(admin.ModelAdmin):
list_display = ('added', 'title_str', 'url_str', 'files', 'size')
sort_fields = ('title_str', 'url_str', 'added')
@@ -65,6 +92,13 @@ class SnapshotAdmin(admin.ModelAdmin):
ordering = ['-added']
actions = [delete_snapshots, overwrite_snapshots, update_snapshots, update_titles, verify_snapshots]
actions_template = 'admin/actions_as_select.html'
+ form = SnapshotAdminForm
+
+    def get_queryset(self, request):
+        """Return the admin queryset with each snapshot's tags prefetched."""
+        snapshots = super().get_queryset(request)
+        return snapshots.prefetch_related('tags')
+
+    def tag_list(self, obj):
+        """Return the snapshot's tag names as one ', '-separated string.
+
+        Iterates ``obj.tags.all()`` instead of calling ``values_list()`` so
+        the rows loaded by ``prefetch_related('tags')`` in ``get_queryset``
+        are reused; ``values_list()`` builds a new query and would hit the
+        database once per listed snapshot.
+        """
+        return ', '.join(tag.name for tag in obj.tags.all())
def id_str(self, obj):
return format_html(
@@ -75,9 +109,9 @@ class SnapshotAdmin(admin.ModelAdmin):
def title_str(self, obj):
canon = obj.as_link().canonical_outputs()
tags = ''.join(
- format_html('{}', tag.strip())
- for tag in obj.tags.split(',')
- ) if obj.tags else ''
+ format_html(' {} ', tag.id, tag)
+ for tag in obj.tags.all()
+ )
return format_html(
''
'
'
diff --git a/archivebox/core/forms.py b/archivebox/core/forms.py
index f641298a..8f48929b 100644
--- a/archivebox/core/forms.py
+++ b/archivebox/core/forms.py
@@ -3,6 +3,7 @@ __package__ = 'archivebox.core'
from django import forms
from ..util import URL_REGEX
+from .utils_taggit import edit_string_for_tags, parse_tags
CHOICES = (
('0', 'depth = 0 (archive just these URLs)'),
@@ -12,3 +13,44 @@ CHOICES = (
class AddLinkForm(forms.Form):
url = forms.RegexField(label="URLs (one per line)", regex=URL_REGEX, min_length='6', strip=True, widget=forms.Textarea, required=True)
depth = forms.ChoiceField(label="Archive depth", choices=CHOICES, widget=forms.RadioSelect, initial='0')
+
+
+class TagWidgetMixin:
+    """Widget mixin that renders a collection of tags as an editable string."""
+
+    def format_value(self, value):
+        """Convert any non-string, non-``None`` value with ``edit_string_for_tags``."""
+        already_text = value is None or isinstance(value, str)
+        if not already_text:
+            value = edit_string_for_tags(value)
+        return super().format_value(value)
+
+class TagWidget(TagWidgetMixin, forms.TextInput):
+    """Single-line text input whose value is formatted by ``TagWidgetMixin``."""
+
+class TagField(forms.CharField):
+    """Char field whose cleaned value is the list of names returned by ``parse_tags``."""
+    widget = TagWidget
+
+    def clean(self, value):
+        """Run the ``CharField`` cleaning, then parse the text into tag names.
+
+        Raises ``forms.ValidationError`` when ``parse_tags`` raises ``ValueError``.
+        """
+        cleaned = super().clean(value)
+        try:
+            parsed = parse_tags(cleaned)
+        except ValueError:
+            raise forms.ValidationError(
+                "Please provide a comma-separated list of tags."
+            )
+        return parsed
+
+    def has_changed(self, initial_value, data_value):
+        """Report whether the submitted tags differ from the initial tags."""
+        # A disabled field always binds to its initial value, so it can
+        # never count as changed.
+        if self.disabled:
+            return False
+
+        try:
+            data_value = self.clean(data_value)
+        except forms.ValidationError:
+            # Keep the raw input; it is compared as-is below.
+            pass
+
+        # The initial value holds objects with a ``name``; compare sorted names.
+        initial_tags = [] if initial_value is None else initial_value
+        initial_names = sorted(tag.name for tag in initial_tags)
+
+        return initial_names != data_value
diff --git a/archivebox/core/migrations/0006_auto_20201012_1520.py b/archivebox/core/migrations/0006_auto_20201012_1520.py
new file mode 100644
index 00000000..694c9908
--- /dev/null
+++ b/archivebox/core/migrations/0006_auto_20201012_1520.py
@@ -0,0 +1,70 @@
+# Generated by Django 3.0.8 on 2020-10-12 15:20
+
+from django.db import migrations, models
+from django.utils.text import slugify
+
+def forwards_func(apps, schema_editor):
+    """Copy each snapshot's comma-separated ``tags_old`` string into Tag rows.
+
+    Every distinct, non-empty, stripped name becomes (or reuses) one ``Tag``
+    row that is linked to the snapshot through the new ``tags`` relation.
+    All queries run on the database the migration is being applied to.
+    """
+    SnapshotModel = apps.get_model("core", "Snapshot")
+    TagModel = apps.get_model("core", "Tag")
+
+    db_alias = schema_editor.connection.alias
+    tag_rows = TagModel.objects.using(db_alias)
+
+    for snapshot in SnapshotModel.objects.using(db_alias).all():
+        tag_names = {name.strip() for name in (snapshot.tags_old or '').split(',')}
+        tag_names.discard("")
+
+        # Sorted so that slug suffixes are assigned in a reproducible order.
+        for name in sorted(tag_names):
+            tag = tag_rows.filter(name=name).first()
+            if tag is None:
+                # ``slug`` is unique and different names can slugify to the
+                # same value (or to ''), so pick the first free "_<n>" variant
+                # instead of failing the migration on the unique constraint.
+                base_slug = slugify(name)
+                slug, suffix = base_slug, 0
+                while tag_rows.filter(slug=slug).exists():
+                    suffix += 1
+                    slug = "%s_%d" % (base_slug, suffix)
+                tag = tag_rows.create(name=name, slug=slug)
+            snapshot.tags.add(tag)
+
+
+def reverse_func(apps, schema_editor):
+    """Write each snapshot's tag names back into the ``tags_old`` string column.
+
+    The names are joined with commas. All queries run on the database the
+    migration is being reversed on.
+    """
+    SnapshotModel = apps.get_model("core", "Snapshot")
+
+    db_alias = schema_editor.connection.alias
+    for snapshot in SnapshotModel.objects.using(db_alias).all():
+        tag_names = snapshot.tags.values_list("name", flat=True)
+        snapshot.tags_old = ",".join(tag_names)
+        # Only the restored column changed, so only that column is written.
+        snapshot.save(update_fields=["tags_old"])
+
+
+class Migration(migrations.Migration):
+    """Replace the ``Snapshot.tags`` column with a many-to-many relation to ``Tag``."""
+
+    dependencies = [
+        ('core', '0005_auto_20200728_0326'),
+    ]
+
+    operations = [
+        # 1. Keep the existing values under a temporary name so the name
+        #    ``tags`` is free for the new relation.
+        migrations.RenameField(
+            model_name='snapshot',
+            old_name='tags',
+            new_name='tags_old',
+        ),
+        # 2. Create the Tag table.
+        migrations.CreateModel(
+            name='Tag',
+            fields=[
+                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
+                ('name', models.CharField(max_length=100, unique=True, verbose_name='name')),
+                ('slug', models.SlugField(max_length=100, unique=True, verbose_name='slug')),
+            ],
+            options={
+                'verbose_name': 'Tag',
+                'verbose_name_plural': 'Tags',
+            },
+        ),
+        # 3. Add the many-to-many relation under the original field name.
+        migrations.AddField(
+            model_name='snapshot',
+            name='tags',
+            field=models.ManyToManyField(to='core.Tag'),
+        ),
+        # 4. Copy the data across (forwards_func), or back into ``tags_old``
+        #    when the migration is reversed (reverse_func).
+        migrations.RunPython(forwards_func, reverse_func),
+        # 5. Drop the temporary column now that its data has been copied.
+        migrations.RemoveField(
+            model_name='snapshot',
+            name='tags_old',
+        ),
+    ]
diff --git a/archivebox/core/models.py b/archivebox/core/models.py
index 313dd67d..7d0c799f 100644
--- a/archivebox/core/models.py
+++ b/archivebox/core/models.py
@@ -2,13 +2,55 @@ __package__ = 'archivebox.core'
import uuid
-from django.db import models
+from django.db import models, transaction
from django.utils.functional import cached_property
+from django.utils.text import slugify
from ..util import parse_date
from ..index.schema import Link
+class Tag(models.Model):
+    """
+    A uniquely named tag with a unique slug (based on the django-taggit model).
+    """
+    name = models.CharField(verbose_name="name", unique=True, blank=False, max_length=100)
+    slug = models.SlugField(verbose_name="slug", unique=True, max_length=100)
+
+    class Meta:
+        verbose_name = "Tag"
+        verbose_name_plural = "Tags"
+
+    def __str__(self):
+        return self.name
+
+    def slugify(self, tag, i=None):
+        """Return the slug for ``tag``, with ``_<i>`` appended when ``i`` is given."""
+        base = slugify(tag)
+        if i is None:
+            return base
+        return base + "_%d" % i
+
+    def save(self, *args, **kwargs):
+        """Save the tag, generating an unused slug for new tags that have none."""
+        # Existing rows, and new rows with an explicit slug, are saved as they are.
+        if not self._state.adding or self.slug:
+            return super().save(*args, **kwargs)
+
+        self.slug = self.slugify(self.name)
+
+        with transaction.atomic():
+            # Every stored slug that could clash with the candidates tried below.
+            clashing = type(self)._default_manager.filter(slug__startswith=self.slug)
+            taken = set(clashing.values_list("slug", flat=True))
+
+            # Try the bare slug first, then "_1", "_2", ... until one is free.
+            suffix = None
+            candidate = self.slugify(self.name, suffix)
+            while candidate in taken:
+                suffix = 1 if suffix is None else suffix + 1
+                candidate = self.slugify(self.name, suffix)
+
+            self.slug = candidate
+            return super().save(*args, **kwargs)
+
class Snapshot(models.Model):
id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)
@@ -16,11 +58,10 @@ class Snapshot(models.Model):
timestamp = models.CharField(max_length=32, unique=True, db_index=True)
title = models.CharField(max_length=128, null=True, blank=True, db_index=True)
- tags = models.CharField(max_length=256, null=True, blank=True, db_index=True)
added = models.DateTimeField(auto_now_add=True, db_index=True)
updated = models.DateTimeField(null=True, blank=True, db_index=True)
- # bookmarked = models.DateTimeField()
+ tags = models.ManyToManyField(Tag)
keys = ('url', 'timestamp', 'title', 'tags', 'updated')
@@ -41,7 +82,8 @@ class Snapshot(models.Model):
args = args or self.keys
return {
key: getattr(self, key)
- for key in args
+ if key != 'tags' else self.get_tags_str()
+ for key in args
}
def as_link(self) -> Link:
@@ -50,6 +92,13 @@ class Snapshot(models.Model):
def as_link_with_details(self) -> Link:
from ..index import load_link_details
return load_link_details(self.as_link())
+
+    def get_tags_str(self) -> str:
+        """Return this snapshot's tag names joined by commas ('' when it has none)."""
+        # join() over an empty queryset already gives '', so no separate
+        # emptiness check (which would evaluate the queryset a second time)
+        # is needed.
+        return ','.join(tag.name for tag in self.tags.all())
@cached_property
def bookmarked(self):
@@ -96,3 +145,10 @@ class Snapshot(models.Model):
and self.history['title'][-1].output.strip()):
return self.history['title'][-1].output.strip()
return None
+
+    def save_tags(self, tags=()):
+        """Replace this snapshot's tags with the given tag names.
+
+        ``tags`` may be an iterable of names, a single comma-separated string,
+        or ``None``. Names are stripped; blank and duplicate names are
+        dropped. ``Tag`` rows that do not exist yet are created.
+        """
+        if tags is None:
+            tags = ()
+        elif isinstance(tags, str):
+            # Iterating a str directly would create one tag per character.
+            tags = tags.split(',')
+
+        names = []
+        for raw_name in tags:
+            name = raw_name.strip()
+            if name and name not in names:
+                names.append(name)
+
+        # One transaction, so a failure cannot leave the snapshot with its
+        # old tags cleared and the new ones not yet attached.
+        with transaction.atomic():
+            tags_id = [Tag.objects.get_or_create(name=name)[0].id for name in names]
+            self.tags.clear()
+            self.tags.add(*tags_id)
diff --git a/archivebox/core/utils_taggit.py b/archivebox/core/utils_taggit.py
new file mode 100644
index 00000000..5a2d511d
--- /dev/null
+++ b/archivebox/core/utils_taggit.py
@@ -0,0 +1,113 @@
+# Taken from https://github.com/jazzband/django-taggit/blob/3b56adb637ab95aca5036c37a358402c825a367c/taggit/utils.py
+
+def parse_tags(tagstring):
+    """
+    Parses tag input, with multiple word input being activated and
+    delineated by commas and double quotes. Quotes take precedence, so
+    they may contain commas.
+
+    Returns a sorted list of unique tag names.
+
+    Ported from Jonathan Buchanan's `django-tagging
+    <http://django-tagging.googlecode.com/>`_
+    """
+    if not tagstring:
+        return []
+
+    # Special case: with no commas or double quotes in the input, spaces
+    # are the only possible delimiter, so a plain split is enough.
+    if "," not in tagstring and '"' not in tagstring:
+        words = list(set(split_strip(tagstring, " ")))
+        words.sort()
+        return words
+
+    # Finished tag names (quoted sections are added here directly).
+    words = []
+    # Characters collected for the section currently being read.
+    buffer = []
+    # Defer splitting of non-quoted sections until we know if there are
+    # any unquoted commas.
+    to_be_split = []
+    saw_loose_comma = False
+    open_quote = False
+    i = iter(tagstring)
+    try:
+        while True:
+            c = next(i)
+            if c == '"':
+                # A quote ends the unquoted section collected so far.
+                if buffer:
+                    to_be_split.append("".join(buffer))
+                    buffer = []
+                # Find the matching quote
+                open_quote = True
+                c = next(i)
+                while c != '"':
+                    buffer.append(c)
+                    c = next(i)
+                # The quoted text is one tag, whatever it contains.
+                if buffer:
+                    word = "".join(buffer).strip()
+                    if word:
+                        words.append(word)
+                    buffer = []
+                open_quote = False
+            else:
+                # Remember the first comma seen outside quotes; it decides
+                # the delimiter for the unquoted sections below.
+                if not saw_loose_comma and c == ",":
+                    saw_loose_comma = True
+                buffer.append(c)
+    except StopIteration:
+        # End of input. If we were parsing an open quote which was never
+        # closed treat the buffer as unquoted.
+        if buffer:
+            if open_quote and "," in buffer:
+                saw_loose_comma = True
+            to_be_split.append("".join(buffer))
+    if to_be_split:
+        # Unquoted sections split on commas if any loose comma was seen,
+        # otherwise on spaces.
+        if saw_loose_comma:
+            delimiter = ","
+        else:
+            delimiter = " "
+        for chunk in to_be_split:
+            words.extend(split_strip(chunk, delimiter))
+    # De-duplicate, then sort.
+    words = list(set(words))
+    words.sort()
+    return words
+
+
+def split_strip(string, delimiter=","):
+    """
+    Split ``string`` on ``delimiter`` and return the pieces with their
+    surrounding whitespace removed, leaving out pieces that end up empty.
+
+    Ported from Jonathan Buchanan's `django-tagging
+    <http://django-tagging.googlecode.com/>`_
+    """
+    if not string:
+        return []
+
+    stripped = (piece.strip() for piece in string.split(delimiter))
+    return [piece for piece in stripped if piece]
+
+
+def edit_string_for_tags(tags):
+    """
+    Build the editable text form of ``tags`` (objects with a ``name``
+    attribute), meant to be parsed back into the same tag names when it
+    is submitted unchanged.
+
+    Names containing a comma or a space are wrapped in double quotes.
+    The resulting names are sorted and joined with ", ".
+
+    Ported from Jonathan Buchanan's `django-tagging
+    <http://django-tagging.googlecode.com/>`_
+    """
+    def quote_if_needed(name):
+        # Quoting keeps a name with a delimiter in it together as one tag.
+        if "," in name or " " in name:
+            return '"%s"' % name
+        return name
+
+    return ", ".join(sorted(quote_if_needed(tag.name) for tag in tags))
diff --git a/archivebox/index/sql.py b/archivebox/index/sql.py
index b3ca7231..aa7c8817 100644
--- a/archivebox/index/sql.py
+++ b/archivebox/index/sql.py
@@ -34,13 +34,19 @@ def remove_from_sql_main_index(snapshots: QuerySet, out_dir: Path=OUTPUT_DIR) ->
def write_link_to_sql_index(link: Link):
from core.models import Snapshot
info = {k: v for k, v in link._asdict().items() if k in Snapshot.keys}
+ tags = info.pop("tags")
+ if tags is None:
+ tags = []
+
try:
info["timestamp"] = Snapshot.objects.get(url=link.url).timestamp
except Snapshot.DoesNotExist:
while Snapshot.objects.filter(timestamp=info["timestamp"]).exists():
info["timestamp"] = str(float(info["timestamp"]) + 1.0)
- return Snapshot.objects.update_or_create(url=link.url, defaults=info)[0]
+ snapshot, _ = Snapshot.objects.update_or_create(url=link.url, defaults=info)
+ snapshot.save_tags(tags)
+ return snapshot
@enforce_types
@@ -65,8 +71,14 @@ def write_sql_link_details(link: Link, out_dir: Path=OUTPUT_DIR) -> None:
except Snapshot.DoesNotExist:
snap = write_link_to_sql_index(link)
snap.title = link.title
- snap.tags = link.tags
+
+ tag_set = (
+ set(tag.strip() for tag in (link.tags or '').split(','))
+ )
+ tag_list = list(tag_set) or []
+
snap.save()
+ snap.save_tags(tag_list)
diff --git a/archivebox/themes/default/static/admin.css b/archivebox/themes/default/static/admin.css
index b2b58d64..932f380b 100644
--- a/archivebox/themes/default/static/admin.css
+++ b/archivebox/themes/default/static/admin.css
@@ -222,3 +222,11 @@ body.model-snapshot.change-list #content .object-tools {
0% { transform: rotate(0deg); }
100% { transform: rotate(360deg); }
}
+
+/* Tag badge: bordered, rounded box on a light grey background, applied to
+   .tag elements directly inside links that sit directly inside .tags */
+.tags > a > .tag {
+    border: 1px solid;
+    border-radius: 10px;
+    background-color: #f3f3f3;
+    padding: 3px;
+}
+
diff --git a/docker-compose.yml b/docker-compose.yml
index a209e959..f9a75748 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -12,7 +12,7 @@ version: '3.7'
services:
archivebox:
# build: .
- image: nikisweeting/archivebox:latest
+ image: ${DOCKER_IMAGE:-nikisweeting/archivebox:latest}
command: server 0.0.0.0:8000
stdin_open: true
tty: true
diff --git a/tests/tags_migration/index.sqlite3 b/tests/tags_migration/index.sqlite3
new file mode 100755
index 00000000..04d35a71
Binary files /dev/null and b/tests/tags_migration/index.sqlite3 differ
diff --git a/tests/test_init.py b/tests/test_init.py
index d162fa80..ae07e5da 100644
--- a/tests/test_init.py
+++ b/tests/test_init.py
@@ -4,7 +4,7 @@
import os
import subprocess
from pathlib import Path
-import json
+import json, shutil
import sqlite3
from archivebox.config import OUTPUT_PERMISSIONS
@@ -131,4 +131,42 @@ def test_unrecognized_folders(tmp_path, process, disable_extractors_dict):
init_process = subprocess.run(['archivebox', 'init'], capture_output=True, env=disable_extractors_dict)
assert "Skipped adding 1 invalid link data directories" in init_process.stdout.decode("utf-8")
- assert init_process.returncode == 0
\ No newline at end of file
+ assert init_process.returncode == 0
+
+def test_tags_migration(tmp_path, disable_extractors_dict):
+    """Check that ``archivebox init`` migrates the old tags column into tag rows.
+
+    Copies the pre-migration index from ``tests/tags_migration`` into the
+    temp dir, records each snapshot's comma-separated tags, runs ``init``
+    and compares the tag rows now linked to each snapshot with those names.
+    """
+
+    def split_tags(tags_str):
+        # Split a stored comma-separated tags string into a set of names.
+        names = {name.strip() for name in (tags_str or '').split(',')}
+        names.discard('')
+        return names
+
+    def fetch_all(query):
+        # Always close the connection, even when the query fails.
+        conn = sqlite3.connect("index.sqlite3")
+        try:
+            conn.row_factory = sqlite3.Row
+            return conn.execute(query).fetchall()
+        finally:
+            conn.close()
+
+    base_sqlite_path = Path(__file__).parent / 'tags_migration'
+
+    # The destination is removed first so copytree() can create it itself.
+    if os.path.exists(tmp_path):
+        shutil.rmtree(tmp_path)
+    shutil.copytree(str(base_sqlite_path), tmp_path)
+    os.chdir(tmp_path)
+
+    expected = {
+        row['id']: split_tags(row['tags'])
+        for row in fetch_all("SELECT id, tags from core_snapshot")
+    }
+
+    init_process = subprocess.run(['archivebox', 'init'], capture_output=True, env=disable_extractors_dict)
+    assert init_process.returncode == 0, init_process.stderr.decode("utf-8")
+
+    migrated = {}
+    for row in fetch_all("""
+        SELECT core_snapshot.id, core_tag.name from core_snapshot
+        JOIN core_snapshot_tags on core_snapshot_tags.snapshot_id=core_snapshot.id
+        JOIN core_tag on core_tag.id=core_snapshot_tags.tag_id
+    """):
+        migrated.setdefault(row["id"], set()).add(row["name"])
+
+    # Compare whole tag names per snapshot, in both directions: every migrated
+    # tag must come from the old field and every old tag must be migrated.
+    # Comparing against the full mapping also fails if nothing was migrated.
+    tagged = {snapshot_id: names for snapshot_id, names in expected.items() if names}
+    assert migrated == tagged
\ No newline at end of file