remove submodule

2025-05-23 19:47:06 -04:00 · 2016-06-26 10:14:44 +02:00 · 2016-06-26 10:14:44 +02:00 · 9bd71281b9
commit 9bd71281b9
parent a2484fdb5f
22 changed files with 17798 additions and 1 deletions
--- a/bstrlib/buniutil.c
+++ b/bstrlib/buniutil.c
@ -0,0 +1,274 @@
+/*
+ * This source file is part of the bstring string library.  This code was
+ * written by Paul Hsieh in 2002-2015, and is covered by the BSD open source
+ * license and the GPL. Refer to the accompanying documentation for details
+ * on usage and license.
+ */
+
+/*
+ * buniutil.c
+ *
+ * This file is not necessarily part of the core bstring library itself, but
+ * is just an implementation of basic utf8 processing for bstrlib.  Note that
+ * this module is dependent upon bstrlib.c and utf8util.c
+ */
+
+#include "bstrlib.h"
+#include "buniutil.h"
+
+#define UNICODE__CODE_POINT__REPLACEMENT_CHARACTER (0xFFFDL) /* U+FFFD: fallback used when a caller's errCh is unusable */
+
+/*  int buIsUTF8Content (const_bstring bu)
+ *
+ *  Walk bu one code point at a time with a utf8Iterator and report whether
+ *  every step produced a positive code point.  Returns 1 when the whole
+ *  string was consumed that way (a string of length 0 qualifies), and 0
+ *  when bdata (bu) is NULL or any step yields a value <= 0.
+ */
+int buIsUTF8Content (const_bstring bu) {
+struct utf8Iterator it;
+
+	if (NULL == bdata (bu)) return 0;
+
+	/* -1 is handed to the iterator as its error value, so (presumably) a
+	   malformed sequence comes back negative; note that a result of
+	   exactly 0 is rejected by the same test. */
+	utf8IteratorInit (&it, bu->data, bu->slen);
+	while (it.next < it.slen) {
+		if (utf8IteratorGetNextCodePoint (&it, -1) <= 0) return 0;
+	}
+	return 1;
+}
+
+/*  int buGetBlkUTF16 (cpUcs2* ucs2, int len, cpUcs4 errCh, const_bstring bu,
+ *                     int pos)
+ *
+ *  Transcode the UTF8 string bu, starting after its first pos code points,
+ *  into UTF16 words stored in ucs2.  At most len words are stored, and any
+ *  part of ucs2 left over is filled with zeros.  Code points of 0x10000 and
+ *  above are stored as a surrogate pair; if only one word of room remains, a
+ *  single U+FFFD is stored in its place.  errCh is handed to the UTF8
+ *  iterator as the substitute for unparsable input; an errCh that is not a
+ *  legal code point is replaced by U+FFFD first.
+ *
+ *  Returns the number of 16-bit words stored, not counting the zero fill.
+ *  Returns BSTR_ERR if ucs2 is NULL, len <= 0, bdata (bu) is NULL or
+ *  pos < 0.  BSTR_ERR is also the result when the last value obtained from
+ *  the iterator is negative, or when nothing was decoded at all (for
+ *  example because pos is not smaller than the number of code points in bu).
+ */
+int buGetBlkUTF16 (/* @out */ cpUcs2* ucs2, int len, cpUcs4 errCh, const_bstring bu, int pos) {
+struct utf8Iterator iter;
+cpUcs4 cp;
+int start, skipped, written;
+
+	if (NULL == ucs2 || 0 >= len || NULL == bdata (bu) || 0 > pos) return BSTR_ERR;
+	if (!isLegalUnicodeCodePoint (errCh)) errCh = UNICODE__CODE_POINT__REPLACEMENT_CHARACTER;
+
+	/* Locate the byte offset of code point number pos by counting every
+	   byte that is not a 10xxxxxx continuation byte. */
+	skipped = 0;
+	for (start = 0; start < bu->slen; start++) {
+		if (0x80 == (0xC0 & bu->data[start])) continue;
+		if (skipped >= pos) break;
+		skipped++;
+	}
+
+	utf8IteratorInit (&iter, bu->data + start, bu->slen - start);
+
+	cp = BSTR_ERR; /* Stays BSTR_ERR if the loop below never decodes. */
+	written = 0;
+	while (0 < len && iter.next < iter.slen) {
+		cp = utf8IteratorGetNextCodePoint (&iter, errCh);
+		if (0 > cp) break;
+
+		if (cp >= 0x10000 && len >= 2) {
+			/* Supplementary code point: split into a surrogate pair. */
+			long y = cp - 0x10000;
+			ucs2[0] = (cpUcs2) (0xD800 | (y >> 10));
+			ucs2[1] = (cpUcs2) (0xDC00 | (y & 0x03FF));
+			ucs2 += 2;
+			len -= 2;
+			written += 2;
+		} else {
+			/* One word: either a code point below 0x10000, or U+FFFD
+			   standing in for a pair that no longer fits. */
+			if (cp < 0x10000) {
+				*ucs2 = (cpUcs2) cp;
+			} else {
+				*ucs2 = UNICODE__CODE_POINT__REPLACEMENT_CHARACTER;
+			}
+			ucs2++;
+			len--;
+			written++;
+		}
+	}
+
+	/* Zero whatever remains of the caller's buffer. */
+	for (; 0 < len; len--) {
+		*ucs2++ = 0;
+	}
+
+	utf8IteratorUninit (&iter);
+	if (0 > cp) return BSTR_ERR;
+	return written;
+}
+
+/*
+
+Unicode                   UTF-8
+-------                   -----
+U-00000000 - U-0000007F:  0xxxxxxx
+U-00000080 - U-000007FF:  110xxxxx 10xxxxxx
+U-00000800 - U-0000FFFF:  1110xxxx 10xxxxxx 10xxxxxx
+U-00010000 - U-001FFFFF:  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+
+U-00200000 - U-03FFFFFF:  111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+U-04000000 - U-7FFFFFFF:  1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+
+UTF-32: U-000000 - U-10FFFF
+
+*/
+
+/*  int buAppendBlkUcs4 (bstring b, const cpUcs4* bu, int len, cpUcs4 errCh)
+ *
+ *  Append the len UCS4 code points in bu to b, encoded as UTF8 (1 to 4
+ *  bytes each).  A code point rejected by isLegalUnicodeCodePoint is
+ *  replaced by errCh; if errCh is itself not legal, the first such code
+ *  point aborts the conversion instead.
+ *
+ *  Returns BSTR_OK on success.  Returns BSTR_ERR if b or bu is NULL, len is
+ *  negative, b has a negative length, an append fails, or an illegal code
+ *  point is met with no usable errCh.  On failure b is put back to the
+ *  length it had on entry and, whenever anything had been appended, is
+ *  '\0' terminated again at that length.
+ */
+int buAppendBlkUcs4 (bstring b, const cpUcs4* bu, int len, cpUcs4 errCh) {
+int i, oldSlen;
+
+	if (NULL == bu || NULL == b || 0 > len || 0 > (oldSlen = blengthe (b, -1))) return BSTR_ERR;
+
+	/* ~0 marks "no replacement available": halt on the first bad input. */
+	if (!isLegalUnicodeCodePoint (errCh)) errCh = ~0;
+
+	for (i=0; i < len; i++) {
+		unsigned char c[4];
+		int rc;
+		cpUcs4 v = bu[i];
+
+		if (!isLegalUnicodeCodePoint (v)) {
+			if (~0 == errCh) goto Rollback;
+			v = errCh;
+		}
+
+		/* See the UTF-8 layout table above for the bit patterns.  Only the
+		   1 to 4 byte forms are produced; every value reaching this point
+		   has passed isLegalUnicodeCodePoint. */
+		if (v < 0x80) {
+			rc = bconchar (b, (char) v);
+		} else if (v < 0x800) {
+			c[0] = (unsigned char) ( (v >>  6)         + 0xc0);
+			c[1] = (unsigned char) ((        v & 0x3f) + 0x80);
+			rc = bcatblk (b, c, 2);
+		} else if (v < 0x10000) {
+			c[0] = (unsigned char) ( (v >> 12)         + 0xe0);
+			c[1] = (unsigned char) (((v >>  6) & 0x3f) + 0x80);
+			c[2] = (unsigned char) ((        v & 0x3f) + 0x80);
+			rc = bcatblk (b, c, 3);
+		} else {
+			c[0] = (unsigned char) ( (v >> 18)         + 0xf0);
+			c[1] = (unsigned char) (((v >> 12) & 0x3f) + 0x80);
+			c[2] = (unsigned char) (((v >>  6) & 0x3f) + 0x80);
+			c[3] = (unsigned char) ((        v & 0x3f) + 0x80);
+			rc = bcatblk (b, c, 4);
+		}
+		if (BSTR_OK != rc) goto Rollback;
+	}
+	return BSTR_OK;
+
+Rollback:
+	/* Undo any partial append.  The terminator is only rewritten when the
+	   length actually grew, which proves the buffer is writable and holds
+	   more than oldSlen bytes; a string whose very first append failed is
+	   left completely untouched. */
+	if (b->slen != oldSlen) {
+		b->slen = oldSlen;
+		b->data[oldSlen] = (unsigned char) '\0';
+	}
+	return BSTR_ERR;
+}
+
+/* Byte-swap the 16-bit value cs when mode is non-zero, else pass it through. */
+#define endSwap(cs,mode) ((mode) ? ((((cs) & 0xFF) << 8) | (((cs) >> 8) & 0xFF)) : (cs))
+/* Number of decoded code points gathered before each flush to buAppendBlkUcs4. */
+#define TEMP_UCS4_BUFFER_SIZE (64)
+
+/*  int buAppendBlkUTF16 (bstring bu, const cpUcs2* utf16, int len,
+ *                        cpUcs2* bom, cpUcs4 errCh)
+ *
+ *  Append the len UTF16 words in utf16 to bu, converted to UTF8.
+ *
+ *  Byte order: if bom is non-NULL and *bom is 0xFFFE the words are byte
+ *  swapped, and if *bom is 0xFEFF they are used as is.  Otherwise the first
+ *  word is inspected: 0xFFFE selects swapping and 0xFEFF selects no
+ *  swapping, and in both cases the word is stored to *bom (when bom is
+ *  non-NULL) and left out of the output.  With no BOM found, the words are
+ *  used as is.
+ *
+ *  Errors: a surrogate word that is not part of a well formed high/low pair
+ *  is replaced by errCh and conversion resumes with the following word.  If
+ *  errCh is not a legal code point, the first such error aborts instead.
+ *
+ *  Returns BSTR_OK on success (including len == 0).  Returns BSTR_ERR if
+ *  bdata (bu) or utf16 is NULL, len is negative, an unrecoverable encoding
+ *  error is met, or appending to bu fails.  On failure bu is put back to
+ *  the length it had on entry and, whenever anything had been appended, is
+ *  '\0' terminated again at that length.
+ */
+int buAppendBlkUTF16 (bstring bu, const cpUcs2* utf16, int len, cpUcs2* bom, cpUcs4 errCh) {
+cpUcs4 buff[TEMP_UCS4_BUFFER_SIZE];
+int cc, i, sm, oldSlen;
+
+	if (NULL == bdata(bu) || NULL == utf16 || len < 0) return BSTR_ERR;
+	if (!isLegalUnicodeCodePoint (errCh)) errCh = ~0;
+	if (len == 0) return BSTR_OK;
+
+	oldSlen = bu->slen;
+	i = 0;
+
+	/* Check for BOM character and select endianness.  Also remove the
+	   BOM from the stream, since there is no need for it in a UTF-8 encoding. */
+	if (bom && (cpUcs2) 0xFFFE == *bom) {
+		sm = 8;
+	} else if (bom && (cpUcs2) 0xFEFF == *bom) {
+		sm = 0;
+	} else if (utf16[i] == (cpUcs2) 0xFFFE) {
+		if (bom) *bom = utf16[i];
+		sm = 8;
+		i++;
+	} else if (utf16[i] == (cpUcs2) 0xFEFF) {
+		if (bom) *bom = utf16[i];
+		sm = 0;
+		i++;
+	} else {
+		sm = 0; /* Assume local endianness. */
+	}
+
+	cc = 0;
+	for (;i < len; i++) {
+		cpUcs4 v = endSwap (utf16[i], sm);
+
+		if ((v | 0x7FF) == 0xDFFF) { /* v is in 0xD800..0xDFFF: a surrogate */
+			cpUcs4 lo = 0;
+			int paired = 0;
+
+			/* A pair needs a high surrogate here (below 0xDC00) AND a
+			   following word that exists and lies in 0xDC00..0xDFFF.
+			   The next word is only peeked at, so it is never read past
+			   the end of utf16 and is not swallowed when it turns out
+			   not to be a low surrogate. */
+			if (v < 0xDC00 && i + 1 < len) {
+				lo = endSwap (utf16[i + 1], sm);
+				paired = ((lo | 0x3FF) == 0xDFFF);
+			}
+
+			if (paired) {
+				i++;
+				v = ((v - 0xD800) << 10) + (lo - 0xDC00) + 0x10000;
+			} else {
+				if (~0 == errCh) goto ErrReturn;
+				v = errCh;
+			}
+		}
+
+		buff[cc] = v;
+		cc++;
+		if (cc >= TEMP_UCS4_BUFFER_SIZE) {
+			if (0 > buAppendBlkUcs4 (bu, buff, cc, errCh)) goto ErrReturn;
+			cc = 0;
+		}
+	}
+	if (cc > 0 && 0 > buAppendBlkUcs4 (bu, buff, cc, errCh)) goto ErrReturn;
+
+	return BSTR_OK;
+
+ErrReturn:
+	/* Discard everything appended by earlier flushes.  The terminator is
+	   only rewritten when the length actually grew, which proves the
+	   buffer is writable and holds more than oldSlen bytes. */
+	if (bu->slen != oldSlen) {
+		bu->slen = oldSlen;
+		bu->data[oldSlen] = (unsigned char) '\0';
+	}
+	return BSTR_ERR;
+}