From 62c9028212f8c96d5faa99ea0061a011a4237249 Mon Sep 17 00:00:00 2001 From: Angel Rey Date: Mon, 21 Sep 2020 11:50:26 -0500 Subject: [PATCH 01/11] Improved tags --- archivebox.egg-info/requires.txt | 1 + archivebox/core/admin.py | 12 ++- .../migrations/0006_auto_20200915_2006.py | 89 ++++++++++++++++++ archivebox/core/models.py | 11 ++- archivebox/core/settings.py | 1 + archivebox/index/__init__.py | 11 ++- archivebox/index/schema.py | 3 +- archivebox/index/sql.py | 9 +- setup.py | 1 + tests/tags_migration/index.sqlite3 | Bin 0 -> 167936 bytes tests/test_init.py | 44 ++++++++- 11 files changed, 172 insertions(+), 10 deletions(-) create mode 100644 archivebox/core/migrations/0006_auto_20200915_2006.py create mode 100755 tests/tags_migration/index.sqlite3 diff --git a/archivebox.egg-info/requires.txt b/archivebox.egg-info/requires.txt index 71dc253d..ca279875 100644 --- a/archivebox.egg-info/requires.txt +++ b/archivebox.egg-info/requires.txt @@ -4,6 +4,7 @@ mypy-extensions==0.4.3 base32-crockford==0.3.0 django==3.0.8 django-extensions==3.0.3 +django-taggit==1.3.0 dateparser ipython youtube-dl diff --git a/archivebox/core/admin.py b/archivebox/core/admin.py index 4337e4a3..a35d589b 100644 --- a/archivebox/core/admin.py +++ b/archivebox/core/admin.py @@ -66,6 +66,12 @@ class SnapshotAdmin(admin.ModelAdmin): actions = [delete_snapshots, overwrite_snapshots, update_snapshots, update_titles, verify_snapshots] actions_template = 'admin/actions_as_select.html' + def get_queryset(self, request): + return super().get_queryset(request).prefetch_related('tags') + + def tag_list(self, obj): + return u", ".join(o.name for o in obj.tags.all()) + def id_str(self, obj): return format_html( '{}', @@ -75,9 +81,9 @@ class SnapshotAdmin(admin.ModelAdmin): def title_str(self, obj): canon = obj.as_link().canonical_outputs() tags = ''.join( - format_html('{}', tag.strip()) - for tag in obj.tags.split(',') - ) if obj.tags else '' + format_html(' {} ', tag) + for tag in obj.tags.all() + ) if obj.tags.all() else '' return format_html( '' '' diff --git a/archivebox/core/migrations/0006_auto_20200915_2006.py b/archivebox/core/migrations/0006_auto_20200915_2006.py new file mode 100644 index 00000000..59bb111e --- /dev/null +++ b/archivebox/core/migrations/0006_auto_20200915_2006.py @@ -0,0 +1,89 @@ +# Generated by Django 3.0.8 on 2020-09-15 20:06 + +from django.db import migrations, models +from django.contrib.contenttypes.models import ContentType +from django.utils.text import slugify +import django.db.models.deletion +import taggit.managers + +def forwards_func(apps, schema_editor): + SnapshotModel = apps.get_model("core", "Snapshot") + TaggedItemModel = apps.get_model("core", "TaggedItem") + TagModel = apps.get_model("taggit", "Tag") + contents = ContentType.objects.all() + try: + ct = ContentType.objects.filter(app_label="core", model="snapshot") + except model.DoesNotExist: # Be explicit about exceptions + ct = None + + db_alias = schema_editor.connection.alias + snapshots = SnapshotModel.objects.all() + for snapshot in snapshots: + tags = snapshot.tags + tag_set = ( + set(tag.strip() for tag in (snapshot.tags_old or '').split(',')) + ) + tag_list = list(tag_set) or [] + + for tag in tag_list: + new_tag, created = TagModel.objects.get_or_create(name=tag, slug=slugify(tag)) + TaggedItemModel.objects.get_or_create( + content_type_id=ct[0].id, + object_id=snapshot.id, + tag=new_tag + ) + + +def reverse_func(apps, schema_editor): + SnapshotModel = apps.get_model("core", "Snapshot") + TaggedItemModel = apps.get_model("core", "TaggedItem") + TagModel = apps.get_model("taggit", "Tag") + ct = ContentType.objects.get(app_label="core", model="snapshot") + + db_alias = schema_editor.connection.alias + snapshots = SnapshotModel.objects.all() + for snapshot in snapshots: + for tag in tags: + tagged_items = TaggedItemModel.objects.filter( + object_id=snapshot.id, + ).delete() + + +class Migration(migrations.Migration): + + dependencies = [ + ('contenttypes', '0002_remove_content_type_name'), + ('taggit', '0003_taggeditem_add_unique_index'), + ('core', '0005_auto_20200728_0326'), + ] + + operations = [ + migrations.RenameField( + model_name='snapshot', + old_name='tags', + new_name='tags_old', + ), + migrations.CreateModel( + name='TaggedItem', + fields=[ + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('object_id', models.UUIDField(db_index=True, verbose_name='object ID')), + ('content_type', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='core_taggeditem_tagged_items', to='contenttypes.ContentType', verbose_name='content type')), + ('tag', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='core_taggeditem_items', to='taggit.Tag')), + ], + options={ + 'verbose_name': 'Tag', + 'verbose_name_plural': 'Tags', + }, + ), + migrations.AddField( + model_name='snapshot', + name='tags', + field=taggit.managers.TaggableManager(help_text='A comma-separated list of tags.', through='core.TaggedItem', to='taggit.Tag', verbose_name='Tags'), + ), + migrations.RunPython(forwards_func, reverse_func), + migrations.RemoveField( + model_name='snapshot', + name='tags_old', + ), + ] diff --git a/archivebox/core/models.py b/archivebox/core/models.py index 313dd67d..b7719b2e 100644 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -5,10 +5,19 @@ import uuid from django.db import models from django.utils.functional import cached_property +from taggit.managers import TaggableManager +from taggit.models import GenericUUIDTaggedItemBase, TaggedItemBase + from ..util import parse_date from ..index.schema import Link + +class TaggedItem(GenericUUIDTaggedItemBase, TaggedItemBase): + class Meta: + verbose_name = "Tag" + verbose_name_plural = "Tags" + class Snapshot(models.Model): id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False) @@ -16,7 +25,7 @@ class Snapshot(models.Model): timestamp = models.CharField(max_length=32, unique=True, db_index=True) title = models.CharField(max_length=128, null=True, blank=True, db_index=True) - tags = models.CharField(max_length=256, null=True, blank=True, db_index=True) + tags = TaggableManager(through=TaggedItem) added = models.DateTimeField(auto_now_add=True, db_index=True) updated = models.DateTimeField(null=True, blank=True, db_index=True) diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py index 14b3b369..6ae2b6af 100644 --- a/archivebox/core/settings.py +++ b/archivebox/core/settings.py @@ -31,6 +31,7 @@ INSTALLED_APPS = [ 'core', 'django_extensions', + 'taggit', ] diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py index 06832dbc..f93a4ab8 100644 --- a/archivebox/index/__init__.py +++ b/archivebox/index/__init__.py @@ -86,9 +86,16 @@ def merge_links(a: Link, b: Link) -> Link: ) # all unique, truthy tags + tags_a = [] + if a.tags: + tags_a = a.tags.all() + tags_b = [] + if b.tags: + tags_b = b.tags.all() + tags_set = ( - set(tag.strip() for tag in (a.tags or '').split(',')) - | set(tag.strip() for tag in (b.tags or '').split(',')) + set(tag.name.strip() for tag in tags_a) + | set(tag.name.strip() for tag in tags_b) ) tags = ','.join(tags_set) or None diff --git a/archivebox/index/schema.py b/archivebox/index/schema.py index 7508890d..7ed44e74 100644 --- a/archivebox/index/schema.py +++ b/archivebox/index/schema.py @@ -157,7 +157,8 @@ class Link: assert isinstance(self.url, str) and '://' in self.url assert self.updated is None or isinstance(self.updated, datetime) assert self.title is None or (isinstance(self.title, str) and self.title) - assert self.tags is None or isinstance(self.tags, str) + #for tag in self.tags.all(): + # assert tag is None or isinstance(tag, TaggedItem) assert isinstance(self.sources, list) assert all(isinstance(source, str) and source for source in self.sources) assert isinstance(self.history, dict) diff --git a/archivebox/index/sql.py b/archivebox/index/sql.py index b3ca7231..bd3664da 100644 --- a/archivebox/index/sql.py +++ b/archivebox/index/sql.py @@ -65,7 +65,14 @@ def write_sql_link_details(link: Link, out_dir: Path=OUTPUT_DIR) -> None: except Snapshot.DoesNotExist: snap = write_link_to_sql_index(link) snap.title = link.title - snap.tags = link.tags + + tag_set = ( + set(tag.strip() for tag in (link.tags or '').split(',')) + ) + tag_list = list(tag_set) or [] + + for tag in tag_list: + snap.tags.add(tag) snap.save() diff --git a/setup.py b/setup.py index db83e9bf..0272f565 100755 --- a/setup.py +++ b/setup.py @@ -80,6 +80,7 @@ setuptools.setup( "base32-crockford==0.3.0", "django==3.0.8", "django-extensions==3.0.3", + "django-taggit==1.3.0", "dateparser", "ipython", diff --git a/tests/tags_migration/index.sqlite3 b/tests/tags_migration/index.sqlite3 new file mode 100755 index 0000000000000000000000000000000000000000..04d35a71e68e8460936ae8f525bcfc169e53e967 GIT binary patch literal 167936 zcmeI5du$xXeaClik31g9yU~N>*%l?OC{fgbqPUNH(s629p)JOiWb0u^F^ui+`bYx!k0J;XBZ!L@ND$OPizY>zf6}B)(FSPz zM^d0OvwOFXJ3gc+jxBx<;_7zxH^2GLXC8Mmvp0M3sV7QJOJ1$jD`r#Hg*}2O3J=S& zAPBG1zohdg(*Iwke_{I9d9kqVUat$MCjV-HT8ZSR%r;^Fw_P+`2LTWO0T2KI5C8!X z009sH0T2KI5V(5;#t+4)vx(sw0{J@m1Sygu@o&d}JN`)Q|6+d=yB150et-1Sqfd|S z9r?z{r$$yrjt~EE_%En2ULXJhAOHd&00JNY0w8d^1oVjk;o$UQ!)i22wQ56A6xAqI zOU;s5))Y;dQ}T0aR#tNhYHC5xoJyy%ik@E>9}vbLTrAY;7LAZL%vQ5zFmolVLT3y)hjjMa&W^~0+6+P8IuA0_T3;VorHE&!k zP1DuXzH!xjI+vT@OXE&=^1C%4HI>Tg5AUH79$Tbb)+tccXl)eDre$o@Yu9fW8&%)j+Bg^el$iMUF-ntm_$c-BT*pt=@pIKImnffF zy~Rh!Kl8&<=h z^|Nn%sq+D-~PmCh=v zc4lxun0}1bb{8X~NcmQ3SG#x(#+N(oTS947S98k)144poDppEW>OwatPg*kd8@!y& zLX&nm#%kGIGYadqQel(A)^quswiu?aC}qc$=DKoZ@ji%FpT25zm32uHbury_v1Kz( zX?ix3oexo_GzM?RTX&2pHKU}ptVq$Zj>3m->y(s~p3`IGAA~_t7s(IEXGxuWnS7Fb z{C2x?*nfBrM0$FJox00JK zA#@Dy=Pn1i%g&JS-j*CFY5pM)!opBVvfV;KT4CP9Ws?;6Zfl zpP~;rPI%BDPdUWA@biJOaCBDWUfj`pcp@C7clFz)>jq0a*zx8D)Aj#oGRi!56?Yn z7Fy+|S(J;GY=>F(H$G=B$V+s)L$fZMWw}8&Rjo-Ak{-QWSZ`f5oB|9?gxpCR8OKkl0#0zm)-KmY_l00ck)1V8`;KmY_l00e&F1m-0xI<-<-lj)Nx z%&C)dprbnvpE{SxYPE+nMN{UK{G6JV)!c%nEu^$lS~io*Dtre)et6ibn5FWmLaowct>#W; z^17m@+3){F^0q+#;ROO900JNY0w4eaAOHd&00JNY0w8d25g3vZbR)n@sfzpm@2!GG zZa@G8KmY_l00ck)1V8`;KmY_lpdSH@|NBuv7zls>2!H?xfB*=900@8p2!H?x+*<_L z{r{8XIf49)yiI;genh@c{*C-Id5e65{4IHde2KhH{+j$5`4jSK@`vR2$nTI>Nt@gx zEm9*TQlJs>0s#;J0T2KI5C8!X009sH0T2Lz-A-UY62-_QKOW%62l(-Rb{xEq9~1mI z!H?tYI55VK`}uJnKkjA6@E(37{21rQ7&}U%{5Zmo!~7U!$IuWzM)+}%9|yuxR1C9& z#10`zibk>izuPM*@(%(a00JNY0w4eaAOHd&00JPe8wjxVe?0$hH!3m00ck)1V8`;KmY_l00edm0gV55OGzW2 zAOHd&00JNY0w4eaAOHd&00O(2KrHgV0ujG1kYn+0Mc*3yYV6wR#|Fm+o(cc{$lK9h z8Cn{7pY*2q_0W0oHQ}4WLjh7F|26p0KoCxUDFj~r$e~DLZcco4zG<$Mt>W`$b**NY z#Y(Aalxu6j^Mj`^EG=GIk}oYj^2Cxn6}&tppO`8Yr{q$#X{}jx`P}(S^0~`TJRv`M zVfpOhg)8#oOIPH@%a_hCpQDb>E}gq{Ql2sk%~GvuG)on0N-mmBi@obcr3ST?=dD81 zV2+#C^(Ob|2G^~P`jmXttQXeJ`Uy=@lHRy9u)1a$6|2!O*QkHnfg_7LDb%RcYSU=m z*sypCUFl5rg{4QAE-alpy>zjgS8oI-yGi*2%XHPh4>Vmp(vKX*I06 z-+Kp)HwClRTsN5Mj{Q35R?Ftvl-#J8ritQIm)uMI1tAPtKHmrK3)M(K1X?Wh#tQhL*%1U}wD{LO&0jI7kpZmbl zhiEZ(a<@~@Q}X$9TSa4(Tzw30vi&RjB8eB~#C9W)nYDVYwb8IsrsDZ2%SAq=r^Z3e7}={%jcG#y1XR!9QAdx{^#w0GZT9xMzD^bBjrH^O3*Aia zz%jRV)Nj+1B}a}6Pu|UssrI$;p+q~nDBhZPYYd0N58f*8pbAt}HDzX2GP)LMzron| zZPSv2_T7%cRlxf^Z)%>3#-j&)1e&D3ZwM#aFU!ld!M4@*7-9PiVAH^XM_Uqejw|cXul@ z?RMQvvE1*#y&GYN4Td(BiQWd&&P_xT1^StLB~bXmI@GO)W(dp z!ycc(iV=+9t;ostp8b);d0KDPfOK9;?k8-T2S4p3cJ4qTE7cZCWW5buGQ3xIIWu)IE;oKHmd+Y~6;tMKYdD^*CTzvjb_B z?5U&4TOS&YBo-INR~~d*C@-*Jscl|UVtra_Ul&dt?`;=beVgZ=cGjAhsguSwA3QxA zdNuWYV7mNwmPYc)&|qSAR=mQ&HeM{3nwHV9UTj&_f)zNGdXMw8ugQmT5V8|hsb zbn>^XaN5L9ej8E3VSn)Zisv)1RjW*mowCCd#w!m*uN0$@Z}sy0AX$TUdAJw+-(-DQrzc zTTNv%R?1#{aqZ&%X2lEIy}GFv&uWEh%_>14S=zuZ9*-n$%!;>&yAb0>=}DkaD_Shq zhN7A)dNyskYcOuCK}U3-#TVCThpu;=6Iv!6ar8cc>FEet7MJw|1(MUqq#ansr z!i_g(9}NTkqVHpIN5${z+q6_{)%`Ei64lblKDQm^d1PvygZ23Qo zH;?aw-8ExbXZSMdYWvHKl9Y+nQ4=1uJK*1f?C=Mw++4Sie|s{A?~U z5MwGyzq022pU*oy%eMv-XJ4J$ma zW+ahJinnIGt=;*$>eM;s?3f>$|01j7oiDjBST|hBDbII|0K<+w6>Ik1o$CGi+Gh>+ z!CxOtoKJ36$Nb_9)qj4x9sadQVtGbvPxD6Ts{w{X&(QT^x~LYizN<(4&CYkNC#K!j zd^Wr;`qj|yJAhNi9N^kSyy^C54g!5?oE1$Ny@-n<@3oTlmD-zR-m z@r*(tmr1MnlrQ`^62Z`Vu1V8`;KmY_l00ck)1V8`;KmY{pJpyd}A0qS_06(P% zyg&d1KmY_l00ck)1V8`;KmY_l00izG0t1pLMnYjpj7IG7|Jwrj8F~BO$slq90w4ea zAOHd&00JNY0w4eaAOHd&KnaAUs2Gx@Xf#GoLh;`eNQxft0s#;J0T2KI5C8!X009sH z0T2LzUl4(|8X6WBFSo}picwKhG-Xc7Ki%`Is9LMnSoQ}TK`omNtosb*H^%++je zE|pVqb2)9rnp-WHX*H$k+0<(KSv8|5dM1;~XHO|JP1HkZ$8xw%!7J|`)i z$rV>krK35M&(T;)E|an~Q^_T&qFORrMa!gmdZ4lI<|7{>JCb z1$n93tebV&EX$2jvsBwzUfB45a^$N5`6u${L{~iBk{EP9|;v4Y`@tN39 zVqc5>LF{VmTWcA;|q32q4;MpO^ z_t|zGcCK9qogE_YrFKVW2RJ#`h=a3FXZKKxxonuLavo>dys=?*l{sND3!Ep^SbZ?; z$P1V|5`7Wkb_7RfAcCXS8zC8^5oXh2COBZtM0-sqN2%%I8Hp=r&kF5IW+(Cpy?S(3 za-?>yI~uvgux;UJ>sUCV*y#gNYH@H{Vq!X%nF{v)&=9?LBpKqV^QVG$ljmns5qf=k zF65-&wQvO3HiNc}qo8Z!NZ{v_1Jvg5Oo(ad8ZZ(3WF|ZyL}zs_!hLLTSBV=mD^bJw ztfR&aXQqxMH*hw@jU8ETI5T#%xq;InHBPEbpBu<5nMgN;-TyD_c?Y{IBn1K>00JNY z0w4eaAOHd&00JNY0wC~?6R>~(|As)mN8TWR{EjD&1VI1|bm-xH{6UeTn3(qh#A!wNOEkfuM8$fR?*j6MFh_x~e5 z?(vNP5C8!X009sH0T2KI5C8!X009sHf%htbVY>gHAWD)XL}TP7Ary}b@p$}8@oVqZ z3_&*tfB*=900@8p2!H?xfB*=9!0i!epA?7AF3#)LqR`oKk8QZe_S<9I?X7xt+uM9+ zy`F9O`fRi}wNhG>i)PcDIw=>*wN~+@OgHAM*66MbbT7M;aN!f?&G&A1k3DCPzLWBlHCMLGdX)y) z`JwqLT1~TDwu1USYxA^lUcS8oy6e`% Date: Mon, 21 Sep 2020 13:57:38 -0500 Subject: [PATCH 02/11] Fix serialization --- archivebox/core/models.py | 10 +++++++++- archivebox/index/__init__.py | 20 ++++++++++++-------- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/archivebox/core/models.py b/archivebox/core/models.py index b7719b2e..8ba0bb70 100644 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -50,7 +50,8 @@ class Snapshot(models.Model): args = args or self.keys return { key: getattr(self, key) - for key in args + if key != 'tags' else self.get_tags_str() + for key in args } def as_link(self) -> Link: @@ -59,6 +60,13 @@ class Snapshot(models.Model): def as_link_with_details(self) -> Link: from ..index import load_link_details return load_link_details(self.as_link()) + + def get_tags_str(self) -> str: + tags = ','.join( + tag.name + for tag in self.tags.all() + ) if self.tags.all() else '' + return tags @cached_property def bookmarked(self): diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py index f93a4ab8..d588242e 100644 --- a/archivebox/index/__init__.py +++ b/archivebox/index/__init__.py @@ -86,16 +86,20 @@ def merge_links(a: Link, b: Link) -> Link: ) # all unique, truthy tags - tags_a = [] - if a.tags: - tags_a = a.tags.all() - tags_b = [] - if b.tags: - tags_b = b.tags.all() + #tags_a = [] + #if a.tags: + # tags_a = a.tags.all() + #tags_b = [] + #if b.tags: + # tags_b = b.tags.all() + #tags_set = ( + # set(tag.name.strip() for tag in tags_a) + # | set(tag.name.strip() for tag in tags_b) + #) tags_set = ( - set(tag.name.strip() for tag in tags_a) - | set(tag.name.strip() for tag in tags_b) + set(tag.strip() for tag in (a.tags or '').split(',')) + | set(tag.strip() for tag in (b.tags or '').split(',')) ) tags = ','.join(tags_set) or None From 45775c607c61416a3e6c06a4dc79740626e333d2 Mon Sep 17 00:00:00 2001 From: Angel Rey Date: Mon, 21 Sep 2020 14:29:45 -0500 Subject: [PATCH 03/11] Fixed empty tags --- archivebox/index/sql.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/archivebox/index/sql.py b/archivebox/index/sql.py index bd3664da..360a7309 100644 --- a/archivebox/index/sql.py +++ b/archivebox/index/sql.py @@ -72,7 +72,9 @@ def write_sql_link_details(link: Link, out_dir: Path=OUTPUT_DIR) -> None: tag_list = list(tag_set) or [] for tag in tag_list: - snap.tags.add(tag) + # TODO check empty tags + if snap.tags: + snap.tags.add(tag) snap.save() From e06d3f91287f5c060279b2dc4367943c5ea39e54 Mon Sep 17 00:00:00 2001 From: Angel Rey Date: Mon, 21 Sep 2020 14:54:11 -0500 Subject: [PATCH 04/11] Fixed Link schema --- archivebox/index/schema.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/archivebox/index/schema.py b/archivebox/index/schema.py index 7ed44e74..7508890d 100644 --- a/archivebox/index/schema.py +++ b/archivebox/index/schema.py @@ -157,8 +157,7 @@ class Link: assert isinstance(self.url, str) and '://' in self.url assert self.updated is None or isinstance(self.updated, datetime) assert self.title is None or (isinstance(self.title, str) and self.title) - #for tag in self.tags.all(): - # assert tag is None or isinstance(tag, TaggedItem) + assert self.tags is None or isinstance(self.tags, str) assert isinstance(self.sources, list) assert all(isinstance(source, str) and source for source in self.sources) assert isinstance(self.history, dict) From 533ae7413c7be613acd33a8e8034d30d4f99f460 Mon Sep 17 00:00:00 2001 From: Angel Rey Date: Tue, 22 Sep 2020 11:07:28 -0500 Subject: [PATCH 05/11] Removed comments --- archivebox/index/__init__.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py index d588242e..06832dbc 100644 --- a/archivebox/index/__init__.py +++ b/archivebox/index/__init__.py @@ -86,17 +86,6 @@ def merge_links(a: Link, b: Link) -> Link: ) # all unique, truthy tags - #tags_a = [] - #if a.tags: - # tags_a = a.tags.all() - #tags_b = [] - #if b.tags: - # tags_b = b.tags.all() - - #tags_set = ( - # set(tag.name.strip() for tag in tags_a) - # | set(tag.name.strip() for tag in tags_b) - #) tags_set = ( set(tag.strip() for tag in (a.tags or '').split(',')) | set(tag.strip() for tag in (b.tags or '').split(',')) From bf09c6b40aad8f4cbdd6b32f15b881a01930daf0 Mon Sep 17 00:00:00 2001 From: Angel Rey Date: Tue, 22 Sep 2020 13:58:01 -0500 Subject: [PATCH 06/11] Fixed docker test --- .github/workflows/test.yml | 13 +++++++------ docker-compose.yml | 2 +- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 78faa3ae..346cf620 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -3,6 +3,7 @@ on: [push] env: MAX_LINE_LENGTH: 110 + DOCKER_IMAGE: archivebox-ci jobs: lint: @@ -118,12 +119,12 @@ jobs: - name: Build image run: | - docker build . -t archivebox + docker build . -t "$DOCKER_IMAGE" - name: Init data dir run: | mkdir data - docker run -v "$PWD"/data:/data archivebox init + docker run -v "$PWD"/data:/data "$DOCKER_IMAGE" init - name: Run test server run: | @@ -132,16 +133,16 @@ jobs: - name: Add link run: | - docker run -v "$PWD"/data:/data --network host archivebox add http://www.test-nginx-1.local + docker run -v "$PWD"/data:/data --network host "$DOCKER_IMAGE" add http://www.test-nginx-1.local - name: Add stdin link run: | - echo "http://www.test-nginx-2.local" | docker run -i -v "$PWD"/data:/data archivebox add + echo "http://www.test-nginx-2.local" | docker run -i -v "$PWD"/data:/data "$DOCKER_IMAGE" add - name: List links run: | - docker run -v "$PWD"/data:/data archivebox list | grep -q "www.test-nginx-1.local" || { echo "The site 1 isn't in the list"; exit 1; } - docker run -v "$PWD"/data:/data archivebox list | grep -q "www.test-nginx-2.local" || { echo "The site 2 isn't in the list"; exit 1; } + docker run -v "$PWD"/data:/data "$DOCKER_IMAGE" list | grep -q "www.test-nginx-1.local" || { echo "The site 1 isn't in the list"; exit 1; } + docker run -v "$PWD"/data:/data "$DOCKER_IMAGE" list | grep -q "www.test-nginx-2.local" || { echo "The site 2 isn't in the list"; exit 1; } - name: Start docker-compose stack run: | diff --git a/docker-compose.yml b/docker-compose.yml index a209e959..f9a75748 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -12,7 +12,7 @@ version: '3.7' services: archivebox: # build: . - image: nikisweeting/archivebox:latest + image: ${DOCKER_IMAGE:-nikisweeting/archivebox:latest} command: server 0.0.0.0:8000 stdin_open: true tty: true From 4581ea956f665a188a8bcd07048eba738104bf2e Mon Sep 17 00:00:00 2001 From: Angel Rey Date: Thu, 24 Sep 2020 15:11:17 -0500 Subject: [PATCH 07/11] Fixed empty tags --- archivebox/core/admin.py | 2 +- archivebox/index/sql.py | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/archivebox/core/admin.py b/archivebox/core/admin.py index a35d589b..14f24c19 100644 --- a/archivebox/core/admin.py +++ b/archivebox/core/admin.py @@ -70,7 +70,7 @@ class SnapshotAdmin(admin.ModelAdmin): return super().get_queryset(request).prefetch_related('tags') def tag_list(self, obj): - return u", ".join(o.name for o in obj.tags.all()) + return ', '.join(obj.tags.values_list('name', flat=True)) def id_str(self, obj): return format_html( diff --git a/archivebox/index/sql.py b/archivebox/index/sql.py index 360a7309..844ebbf4 100644 --- a/archivebox/index/sql.py +++ b/archivebox/index/sql.py @@ -40,7 +40,8 @@ def write_link_to_sql_index(link: Link): while Snapshot.objects.filter(timestamp=info["timestamp"]).exists(): info["timestamp"] = str(float(info["timestamp"]) + 1.0) - return Snapshot.objects.update_or_create(url=link.url, defaults=info)[0] + Snapshot.objects.update_or_create(url=link.url, defaults=info) + return Snapshot.objects.get(url=link.url) @enforce_types @@ -72,9 +73,7 @@ def write_sql_link_details(link: Link, out_dir: Path=OUTPUT_DIR) -> None: tag_list = list(tag_set) or [] for tag in tag_list: - # TODO check empty tags - if snap.tags: - snap.tags.add(tag) + snap.tags.add(tag) snap.save() From 62f3d648d4686304dc15218f8ca284f230239b3c Mon Sep 17 00:00:00 2001 From: Cristian Date: Wed, 7 Oct 2020 09:46:10 -0500 Subject: [PATCH 08/11] fix: reverse_func functional --- archivebox/core/migrations/0006_auto_20200915_2006.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/archivebox/core/migrations/0006_auto_20200915_2006.py b/archivebox/core/migrations/0006_auto_20200915_2006.py index 59bb111e..e6b9c66a 100644 --- a/archivebox/core/migrations/0006_auto_20200915_2006.py +++ b/archivebox/core/migrations/0006_auto_20200915_2006.py @@ -43,10 +43,11 @@ def reverse_func(apps, schema_editor): db_alias = schema_editor.connection.alias snapshots = SnapshotModel.objects.all() for snapshot in snapshots: - for tag in tags: - tagged_items = TaggedItemModel.objects.filter( - object_id=snapshot.id, - ).delete() + tags = TaggedItemModel.objects.filter( + object_id=snapshot.id, + ) + snapshot.tags_old = ",".join([tag.tag.name for tag in tags]) + snapshot.save() class Migration(migrations.Migration): From b9e5b781a7be88b7f4dd81546fcd239d1cd69a25 Mon Sep 17 00:00:00 2001 From: Cristian Date: Wed, 7 Oct 2020 09:59:49 -0500 Subject: [PATCH 09/11] fix: Avoid creating empty tag on migration --- archivebox/core/migrations/0006_auto_20200915_2006.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/archivebox/core/migrations/0006_auto_20200915_2006.py b/archivebox/core/migrations/0006_auto_20200915_2006.py index e6b9c66a..efb3d1d4 100644 --- a/archivebox/core/migrations/0006_auto_20200915_2006.py +++ b/archivebox/core/migrations/0006_auto_20200915_2006.py @@ -23,9 +23,9 @@ def forwards_func(apps, schema_editor): tag_set = ( set(tag.strip() for tag in (snapshot.tags_old or '').split(',')) ) - tag_list = list(tag_set) or [] - - for tag in tag_list: + tag_set.discard("") + + for tag in tag_set: new_tag, created = TagModel.objects.get_or_create(name=tag, slug=slugify(tag)) TaggedItemModel.objects.get_or_create( content_type_id=ct[0].id, From 10384a8a6fbfe84d19f3f629a09e36ad4643d25b Mon Sep 17 00:00:00 2001 From: Cristian Date: Wed, 7 Oct 2020 10:15:56 -0500 Subject: [PATCH 10/11] style: Improve look of tags in admin list --- archivebox/core/admin.py | 4 ++-- archivebox/themes/default/static/admin.css | 8 ++++++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/archivebox/core/admin.py b/archivebox/core/admin.py index 14f24c19..b28d6e52 100644 --- a/archivebox/core/admin.py +++ b/archivebox/core/admin.py @@ -81,9 +81,9 @@ class SnapshotAdmin(admin.ModelAdmin): def title_str(self, obj): canon = obj.as_link().canonical_outputs() tags = ''.join( - format_html(' {} ', tag) + format_html(' {} ', tag.id, tag) for tag in obj.tags.all() - ) if obj.tags.all() else '' + ) return format_html( '' '' diff --git a/archivebox/themes/default/static/admin.css b/archivebox/themes/default/static/admin.css index b2b58d64..932f380b 100644 --- a/archivebox/themes/default/static/admin.css +++ b/archivebox/themes/default/static/admin.css @@ -222,3 +222,11 @@ body.model-snapshot.change-list #content .object-tools { 0% { transform: rotate(0deg); } 100% { transform: rotate(360deg); } } + +.tags > a > .tag { + border: 1px solid; + border-radius: 10px; + background-color: #f3f3f3; + padding: 3px; +} + From 62c78e1d1096f85869528a31dcdfd28a87b4a1b9 Mon Sep 17 00:00:00 2001 From: Cristian Date: Mon, 12 Oct 2020 13:47:03 -0500 Subject: [PATCH 11/11] refactor: Remove django-taggit and replace it with a local tags setup --- archivebox.egg-info/requires.txt | 1 - archivebox/core/admin.py | 30 ++++- archivebox/core/forms.py | 42 +++++++ .../migrations/0006_auto_20200915_2006.py | 90 -------------- .../migrations/0006_auto_20201012_1520.py | 70 +++++++++++ archivebox/core/models.py | 53 ++++++-- archivebox/core/settings.py | 1 - archivebox/core/utils_taggit.py | 113 ++++++++++++++++++ archivebox/index/sql.py | 12 +- setup.py | 1 - tests/test_init.py | 12 +- 11 files changed, 313 insertions(+), 112 deletions(-) delete mode 100644 archivebox/core/migrations/0006_auto_20200915_2006.py create mode 100644 archivebox/core/migrations/0006_auto_20201012_1520.py create mode 100644 archivebox/core/utils_taggit.py diff --git a/archivebox.egg-info/requires.txt b/archivebox.egg-info/requires.txt index ca279875..71dc253d 100644 --- a/archivebox.egg-info/requires.txt +++ b/archivebox.egg-info/requires.txt @@ -4,7 +4,6 @@ mypy-extensions==0.4.3 base32-crockford==0.3.0 django==3.0.8 django-extensions==3.0.3 -django-taggit==1.3.0 dateparser ipython youtube-dl diff --git a/archivebox/core/admin.py b/archivebox/core/admin.py index b28d6e52..55c68e16 100644 --- a/archivebox/core/admin.py +++ b/archivebox/core/admin.py @@ -9,9 +9,10 @@ from django.utils.html import format_html from django.utils.safestring import mark_safe from django.shortcuts import render, redirect from django.contrib.auth import get_user_model +from django import forms from core.models import Snapshot -from core.forms import AddLinkForm +from core.forms import AddLinkForm, TagField from core.utils import get_icons from util import htmldecode, urldecode, ansi_to_html @@ -55,6 +56,32 @@ def delete_snapshots(modeladmin, request, queryset): delete_snapshots.short_description = "Delete" +class SnapshotAdminForm(forms.ModelForm): + tags = TagField(required=False) + + class Meta: + model = Snapshot + fields = "__all__" + + def save(self, commit=True): + # Based on: https://stackoverflow.com/a/49933068/3509554 + + # Get the unsave instance + instance = forms.ModelForm.save(self, False) + tags = self.cleaned_data.pop("tags") + + #update save_m2m + def new_save_m2m(): + instance.save_tags(tags) + + # Do we need to save all changes now? + self.save_m2m = new_save_m2m + if commit: + instance.save() + + return instance + + class SnapshotAdmin(admin.ModelAdmin): list_display = ('added', 'title_str', 'url_str', 'files', 'size') sort_fields = ('title_str', 'url_str', 'added') @@ -65,6 +92,7 @@ class SnapshotAdmin(admin.ModelAdmin): ordering = ['-added'] actions = [delete_snapshots, overwrite_snapshots, update_snapshots, update_titles, verify_snapshots] actions_template = 'admin/actions_as_select.html' + form = SnapshotAdminForm def get_queryset(self, request): return super().get_queryset(request).prefetch_related('tags') diff --git a/archivebox/core/forms.py b/archivebox/core/forms.py index f641298a..8f48929b 100644 --- a/archivebox/core/forms.py +++ b/archivebox/core/forms.py @@ -3,6 +3,7 @@ __package__ = 'archivebox.core' from django import forms from ..util import URL_REGEX +from .utils_taggit import edit_string_for_tags, parse_tags CHOICES = ( ('0', 'depth = 0 (archive just these URLs)'), @@ -12,3 +13,44 @@ CHOICES = ( class AddLinkForm(forms.Form): url = forms.RegexField(label="URLs (one per line)", regex=URL_REGEX, min_length='6', strip=True, widget=forms.Textarea, required=True) depth = forms.ChoiceField(label="Archive depth", choices=CHOICES, widget=forms.RadioSelect, initial='0') + + +class TagWidgetMixin: + def format_value(self, value): + if value is not None and not isinstance(value, str): + value = edit_string_for_tags(value) + return super().format_value(value) + +class TagWidget(TagWidgetMixin, forms.TextInput): + pass + +class TagField(forms.CharField): + widget = TagWidget + + def clean(self, value): + value = super().clean(value) + try: + return parse_tags(value) + except ValueError: + raise forms.ValidationError( + "Please provide a comma-separated list of tags." + ) + + def has_changed(self, initial_value, data_value): + # Always return False if the field is disabled since self.bound_data + # always uses the initial value in this case. + if self.disabled: + return False + + try: + data_value = self.clean(data_value) + except forms.ValidationError: + pass + + if initial_value is None: + initial_value = [] + + initial_value = [tag.name for tag in initial_value] + initial_value.sort() + + return initial_value != data_value diff --git a/archivebox/core/migrations/0006_auto_20200915_2006.py b/archivebox/core/migrations/0006_auto_20200915_2006.py deleted file mode 100644 index efb3d1d4..00000000 --- a/archivebox/core/migrations/0006_auto_20200915_2006.py +++ /dev/null @@ -1,90 +0,0 @@ -# Generated by Django 3.0.8 on 2020-09-15 20:06 - -from django.db import migrations, models -from django.contrib.contenttypes.models import ContentType -from django.utils.text import slugify -import django.db.models.deletion -import taggit.managers - -def forwards_func(apps, schema_editor): - SnapshotModel = apps.get_model("core", "Snapshot") - TaggedItemModel = apps.get_model("core", "TaggedItem") - TagModel = apps.get_model("taggit", "Tag") - contents = ContentType.objects.all() - try: - ct = ContentType.objects.filter(app_label="core", model="snapshot") - except model.DoesNotExist: # Be explicit about exceptions - ct = None - - db_alias = schema_editor.connection.alias - snapshots = SnapshotModel.objects.all() - for snapshot in snapshots: - tags = snapshot.tags - tag_set = ( - set(tag.strip() for tag in (snapshot.tags_old or '').split(',')) - ) - tag_set.discard("") - - for tag in tag_set: - new_tag, created = TagModel.objects.get_or_create(name=tag, slug=slugify(tag)) - TaggedItemModel.objects.get_or_create( - content_type_id=ct[0].id, - object_id=snapshot.id, - tag=new_tag - ) - - -def reverse_func(apps, schema_editor): - SnapshotModel = apps.get_model("core", "Snapshot") - TaggedItemModel = apps.get_model("core", "TaggedItem") - TagModel = apps.get_model("taggit", "Tag") - ct = ContentType.objects.get(app_label="core", model="snapshot") - - db_alias = schema_editor.connection.alias - snapshots = SnapshotModel.objects.all() - for snapshot in snapshots: - tags = TaggedItemModel.objects.filter( - object_id=snapshot.id, - ) - snapshot.tags_old = ",".join([tag.tag.name for tag in tags]) - snapshot.save() - - -class Migration(migrations.Migration): - - dependencies = [ - ('contenttypes', '0002_remove_content_type_name'), - ('taggit', '0003_taggeditem_add_unique_index'), - ('core', '0005_auto_20200728_0326'), - ] - - operations = [ - migrations.RenameField( - model_name='snapshot', - old_name='tags', - new_name='tags_old', - ), - migrations.CreateModel( - name='TaggedItem', - fields=[ - ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), - ('object_id', models.UUIDField(db_index=True, verbose_name='object ID')), - ('content_type', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='core_taggeditem_tagged_items', to='contenttypes.ContentType', verbose_name='content type')), - ('tag', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='core_taggeditem_items', to='taggit.Tag')), - ], - options={ - 'verbose_name': 'Tag', - 'verbose_name_plural': 'Tags', - }, - ), - migrations.AddField( - model_name='snapshot', - name='tags', - field=taggit.managers.TaggableManager(help_text='A comma-separated list of tags.', through='core.TaggedItem', to='taggit.Tag', verbose_name='Tags'), - ), - migrations.RunPython(forwards_func, reverse_func), - migrations.RemoveField( - model_name='snapshot', - name='tags_old', - ), - ] diff --git a/archivebox/core/migrations/0006_auto_20201012_1520.py b/archivebox/core/migrations/0006_auto_20201012_1520.py new file mode 100644 index 00000000..694c9908 --- /dev/null +++ b/archivebox/core/migrations/0006_auto_20201012_1520.py @@ -0,0 +1,70 @@ +# Generated by Django 3.0.8 on 2020-10-12 15:20 + +from django.db import migrations, models +from django.utils.text import slugify + +def forwards_func(apps, schema_editor): + SnapshotModel = apps.get_model("core", "Snapshot") + TagModel = apps.get_model("core", "Tag") + + db_alias = schema_editor.connection.alias + snapshots = SnapshotModel.objects.all() + for snapshot in snapshots: + tags = snapshot.tags + tag_set = ( + set(tag.strip() for tag in (snapshot.tags_old or '').split(',')) + ) + tag_set.discard("") + + for tag in tag_set: + to_add, _ = TagModel.objects.get_or_create(name=tag, slug=slugify(tag)) + snapshot.tags.add(to_add) + + +def reverse_func(apps, schema_editor): + SnapshotModel = apps.get_model("core", "Snapshot") + TagModel = apps.get_model("core", "Tag") + + db_alias = schema_editor.connection.alias + snapshots = SnapshotModel.objects.all() + for snapshot in snapshots: + tags = snapshot.tags.values_list("name", flat=True) + snapshot.tags_old = ",".join([tag for tag in tags]) + snapshot.save() + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0005_auto_20200728_0326'), + ] + + operations = [ + migrations.RenameField( + model_name='snapshot', + old_name='tags', + new_name='tags_old', + ), + migrations.CreateModel( + name='Tag', + fields=[ + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('name', models.CharField(max_length=100, unique=True, verbose_name='name')), + ('slug', models.SlugField(max_length=100, unique=True, verbose_name='slug')), + ], + options={ + 'verbose_name': 'Tag', + 'verbose_name_plural': 'Tags', + }, + ), + migrations.AddField( + model_name='snapshot', + name='tags', + field=models.ManyToManyField(to='core.Tag'), + ), + migrations.RunPython(forwards_func, reverse_func), + migrations.RemoveField( + model_name='snapshot', + name='tags_old', + ), + ] diff --git a/archivebox/core/models.py b/archivebox/core/models.py index 8ba0bb70..7d0c799f 100644 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -2,22 +2,55 @@ __package__ = 'archivebox.core' import uuid -from django.db import models +from django.db import models, transaction from django.utils.functional import cached_property - -from taggit.managers import TaggableManager -from taggit.models import GenericUUIDTaggedItemBase, TaggedItemBase +from django.utils.text import slugify from ..util import parse_date from ..index.schema import Link +class Tag(models.Model): + """ + Based on django-taggit model + """ + name = models.CharField(verbose_name="name", unique=True, blank=False, max_length=100) + slug = models.SlugField(verbose_name="slug", unique=True, max_length=100) -class TaggedItem(GenericUUIDTaggedItemBase, TaggedItemBase): class Meta: verbose_name = "Tag" verbose_name_plural = "Tags" + def __str__(self): + return self.name + + def slugify(self, tag, i=None): + slug = slugify(tag) + if i is not None: + slug += "_%d" % i + return slug + + def save(self, *args, **kwargs): + if self._state.adding and not self.slug: + self.slug = self.slugify(self.name) + + with transaction.atomic(): + slugs = set( + type(self) + ._default_manager.filter(slug__startswith=self.slug) + .values_list("slug", flat=True) + ) + + i = None + while True: + slug = self.slugify(self.name, i) + if slug not in slugs: + self.slug = slug + return super().save(*args, **kwargs) + i = 1 if i is None else i+1 + else: + return super().save(*args, **kwargs) + class Snapshot(models.Model): id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False) @@ -25,11 +58,10 @@ class Snapshot(models.Model): timestamp = models.CharField(max_length=32, unique=True, db_index=True) title = models.CharField(max_length=128, null=True, blank=True, db_index=True) - tags = TaggableManager(through=TaggedItem) added = models.DateTimeField(auto_now_add=True, db_index=True) updated = models.DateTimeField(null=True, blank=True, db_index=True) - # bookmarked = models.DateTimeField() + tags = models.ManyToManyField(Tag) keys = ('url', 'timestamp', 'title', 'tags', 'updated') @@ -113,3 +145,10 @@ class Snapshot(models.Model): and self.history['title'][-1].output.strip()): return self.history['title'][-1].output.strip() return None + + def save_tags(self, tags=[]): + tags_id = [] + for tag in tags: + tags_id.append(Tag.objects.get_or_create(name=tag)[0].id) + self.tags.clear() + self.tags.add(*tags_id) diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py index 6ae2b6af..14b3b369 100644 --- a/archivebox/core/settings.py +++ b/archivebox/core/settings.py @@ -31,7 +31,6 @@ INSTALLED_APPS = [ 'core', 'django_extensions', - 'taggit', ] diff --git a/archivebox/core/utils_taggit.py b/archivebox/core/utils_taggit.py new file mode 100644 index 00000000..5a2d511d --- /dev/null +++ b/archivebox/core/utils_taggit.py @@ -0,0 +1,113 @@ +# Taken from https://github.com/jazzband/django-taggit/blob/3b56adb637ab95aca5036c37a358402c825a367c/taggit/utils.py + +def parse_tags(tagstring): + """ + Parses tag input, with multiple word input being activated and + delineated by commas and double quotes. Quotes take precedence, so + they may contain commas. + + Returns a sorted list of unique tag names. + + Ported from Jonathan Buchanan's `django-tagging + `_ + """ + if not tagstring: + return [] + + # Special case - if there are no commas or double quotes in the + # input, we don't *do* a recall... I mean, we know we only need to + # split on spaces. + if "," not in tagstring and '"' not in tagstring: + words = list(set(split_strip(tagstring, " "))) + words.sort() + return words + + words = [] + buffer = [] + # Defer splitting of non-quoted sections until we know if there are + # any unquoted commas. + to_be_split = [] + saw_loose_comma = False + open_quote = False + i = iter(tagstring) + try: + while True: + c = next(i) + if c == '"': + if buffer: + to_be_split.append("".join(buffer)) + buffer = [] + # Find the matching quote + open_quote = True + c = next(i) + while c != '"': + buffer.append(c) + c = next(i) + if buffer: + word = "".join(buffer).strip() + if word: + words.append(word) + buffer = [] + open_quote = False + else: + if not saw_loose_comma and c == ",": + saw_loose_comma = True + buffer.append(c) + except StopIteration: + # If we were parsing an open quote which was never closed treat + # the buffer as unquoted. + if buffer: + if open_quote and "," in buffer: + saw_loose_comma = True + to_be_split.append("".join(buffer)) + if to_be_split: + if saw_loose_comma: + delimiter = "," + else: + delimiter = " " + for chunk in to_be_split: + words.extend(split_strip(chunk, delimiter)) + words = list(set(words)) + words.sort() + return words + + +def split_strip(string, delimiter=","): + """ + Splits ``string`` on ``delimiter``, stripping each resulting string + and returning a list of non-empty strings. + + Ported from Jonathan Buchanan's `django-tagging + `_ + """ + if not string: + return [] + + words = [w.strip() for w in string.split(delimiter)] + return [w for w in words if w] + + +def edit_string_for_tags(tags): + """ + Given list of ``Tag`` instances, creates a string representation of + the list suitable for editing by the user, such that submitting the + given string representation back without changing it will give the + same list of tags. + + Tag names which contain commas will be double quoted. + + If any tag name which isn't being quoted contains whitespace, the + resulting string of tag names will be comma-delimited, otherwise + it will be space-delimited. + + Ported from Jonathan Buchanan's `django-tagging + `_ + """ + names = [] + for tag in tags: + name = tag.name + if "," in name or " " in name: + names.append('"%s"' % name) + else: + names.append(name) + return ", ".join(sorted(names)) diff --git a/archivebox/index/sql.py b/archivebox/index/sql.py index 844ebbf4..aa7c8817 100644 --- a/archivebox/index/sql.py +++ b/archivebox/index/sql.py @@ -34,14 +34,19 @@ def remove_from_sql_main_index(snapshots: QuerySet, out_dir: Path=OUTPUT_DIR) -> def write_link_to_sql_index(link: Link): from core.models import Snapshot info = {k: v for k, v in link._asdict().items() if k in Snapshot.keys} + tags = info.pop("tags") + if tags is None: + tags = [] + try: info["timestamp"] = Snapshot.objects.get(url=link.url).timestamp except Snapshot.DoesNotExist: while Snapshot.objects.filter(timestamp=info["timestamp"]).exists(): info["timestamp"] = str(float(info["timestamp"]) + 1.0) - Snapshot.objects.update_or_create(url=link.url, defaults=info) - return Snapshot.objects.get(url=link.url) + snapshot, _ = Snapshot.objects.update_or_create(url=link.url, defaults=info) + snapshot.save_tags(tags) + return snapshot @enforce_types @@ -72,9 +77,8 @@ def write_sql_link_details(link: Link, out_dir: Path=OUTPUT_DIR) -> None: ) tag_list = list(tag_set) or [] - for tag in tag_list: - snap.tags.add(tag) snap.save() + snap.save_tags(tag_list) diff --git a/setup.py b/setup.py index 0272f565..db83e9bf 100755 --- a/setup.py +++ b/setup.py @@ -80,7 +80,6 @@ setuptools.setup( "base32-crockford==0.3.0", "django==3.0.8", "django-extensions==3.0.3", - "django-taggit==1.3.0", "dateparser", "ipython", diff --git a/tests/test_init.py b/tests/test_init.py index 72caa6d0..ae07e5da 100644 --- a/tests/test_init.py +++ b/tests/test_init.py @@ -157,18 +157,16 @@ def test_tags_migration(tmp_path, disable_extractors_dict): conn.row_factory = sqlite3.Row c = conn.cursor() c.execute(""" - SELECT snapshot.id snapshot, tags.name tag - FROM core_snapshot snapshot, core_taggeditem snapshot_tagged, taggit_tag tags - WHERE - snapshot.id = snapshot_tagged.object_id - AND tags.id = snapshot_tagged.tag_id + SELECT core_snapshot.id, core_tag.name from core_snapshot + JOIN core_snapshot_tags on core_snapshot_tags.snapshot_id=core_snapshot.id + JOIN core_tag on core_tag.id=core_snapshot_tags.tag_id """) tags = c.fetchall() conn.commit() conn.close() for tag in tags: - snapshot_id = tag['snapshot'] - tag_name = tag['tag'] + snapshot_id = tag["id"] + tag_name = tag["name"] # Check each tag migrated is in the previous field assert tag_name in snapshots_dict[snapshot_id] \ No newline at end of file