ECI: Update ECIs to AIM ITS/04-023:2022, adding UTF-16BE (was UCS-2BE),

  UTF-16LE, GBK, separate GB18030, UTF-32BE, UTF-32LE;
  add examples to tests for DATAMATRIX, HANXIN, QRCODE
HANXIN: Remove alternating filler in function information;
  GB 18030 now ECI 32 (previously used ECI 29);
  fix gate-posts on codeword limits;
  use new ZXing-C++ HanXin detector (diagnostics2 branch) for tests;
  check against ISO/IEC 20830:2021 (no substantive changes)
backend_tcl: update ECIs; NOTE: changed names "unicode" -> "utf-16be",
  "euc-cn" -> "gb2312"
GRIDMATRIX/HANXIN/QRCODE/RMQR: warn if auto-conversion (i.e. no ECI given)
  occurs to the respective specialized char sets (GB 2312/GB 18030/Shift JIS)
2025-05-24 20:14:28 -04:00 · 2022-04-10 11:12:18 +01:00 · 2022-04-10 11:12:18 +01:00 · 624d40021e
commit 624d40021e
parent 69876619dd
38 changed files with 5761 additions and 835 deletions
--- a/backend/eci.c
+++ b/backend/eci.c
@ -1,7 +1,7 @@
 /*  eci.c - Extended Channel Interpretations

    libzint - the open source barcode library
-    Copyright (C) 2009 - 2021 Robin Stuart <rstuart114@gmail.com>
+    Copyright (C) 2009-2022 Robin Stuart <rstuart114@gmail.com>

    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions
@ -28,7 +28,6 @@
    OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
    SUCH DAMAGE.
 */
-/* vim: set ts=4 sw=4 et : */

 #ifdef _MSC_VER
 #include <malloc.h>
@ -40,6 +39,7 @@
 #include "big5.h"
 #include "gb2312.h"
 #include "ksx1001.h"
+#include "gb18030.h"

 /* ECI 20 Shift JIS */
 static int sjis_wctomb(unsigned char *r, const unsigned int wc) {
@ -126,6 +126,47 @@ static int euc_kr_wctomb(unsigned char *r, const unsigned int wc) {
    return 0;
 }

+/* ECI 31 GBK Chinese
+   Encodes Unicode codepoint `wc` into `r`: a single byte if `wc` is below 0x80, otherwise 2 bytes
+   taken from the value `c` passed to `gbk_wctomb_zint()`, most significant byte first.
+   Returns the number of bytes written (1 or 2), or 0 if `gbk_wctomb_zint()` returns zero */
+static int gbk_wctomb(unsigned char *r, const unsigned int wc) {
+    unsigned int c;
+
+    if (wc < 0x80) { /* Below 0x80 is stored unchanged as one byte */
+        *r = (unsigned char) wc;
+        return 1;
+    }
+    if (gbk_wctomb_zint(&c, wc)) { /* Non-zero return: split `c` into 2 bytes */
+        r[0] = (unsigned char) (c >> 8); /* High byte first */
+        r[1] = (unsigned char) (c & 0xff);
+        return 2;
+    }
+    return 0; /* Nothing written */
+}
+
+/* ECI 32 GB 18030 Chinese
+   Encodes Unicode codepoint `wc` into `r`: a single byte if `wc` is below 0x80, otherwise 2 or 4 bytes
+   depending on the value returned by `gb18030_wctomb_zint()`. `c1` (and `c2` for 4 bytes) are each
+   written as 2 bytes, most significant byte first.
+   Returns the number of bytes written (1, 2 or 4), or 0 if `gb18030_wctomb_zint()` returns anything
+   other than 2 or 4 */
+static int gb18030_wctomb(unsigned char *r, const unsigned int wc) {
+    unsigned int c1, c2;
+    int ret;
+
+    if (wc < 0x80) { /* Below 0x80 is stored unchanged as one byte */
+        *r = (unsigned char) wc;
+        return 1;
+    }
+    ret = gb18030_wctomb_zint(&c1, &c2, wc);
+    if (ret == 2) { /* 2-byte form: only `c1` is used */
+        r[0] = (unsigned char) (c1 >> 8);
+        r[1] = (unsigned char) (c1 & 0xff);
+        return 2;
+    }
+    if (ret == 4) { /* 4-byte form: `c1` supplies bytes 0-1, `c2` bytes 2-3 */
+        r[0] = (unsigned char) (c1 >> 8);
+        r[1] = (unsigned char) (c1 & 0xff);
+        r[2] = (unsigned char) (c2 >> 8);
+        r[3] = (unsigned char) (c2 & 0xff);
+        return 4;
+    }
+    return 0; /* Nothing written */
+}
+
 /* Helper to count the number of chars in a string within a range */
 static int chr_range_cnt(const unsigned char string[], const int length, const unsigned char c1,
            const unsigned char c2) {
@ -149,8 +190,8 @@ static int chr_range_cnt(const unsigned char string[], const int length, const u

 /* Is ECI convertible from UTF-8? */
 INTERNAL int is_eci_convertible(const int eci) {
-    if (eci == 26 || (eci > 30 && eci != 170)) { /* Exclude ECI 170 - ASCII Invariant */
-        /* UTF-8 (26) or 8-bit binary data (899) or undefined (> 30 and < 899) or not character set (> 899) */
+    if (eci == 26 || (eci > 35 && eci != 170)) { /* Exclude ECI 170 - ASCII Invariant */
+        /* UTF-8 (26) or 8-bit binary data (899) or undefined (> 35 and < 899) or not character set (> 899) */
        return 0;
    }
    return 1;
@ -162,16 +203,21 @@ INTERNAL int get_eci_length(const int eci, const unsigned char source[], int len
        /* Only ASCII backslash (reverse solidus) exceeds UTF-8 length */
        length += chr_cnt(source, length, '\\');

-    } else if (eci == 25) { /* UCS-2BE */
+    } else if (eci == 25 || eci == 33) { /* UTF-16 */
        /* All ASCII chars take 2 bytes */
        length += chr_range_cnt(source, length, 0, 0x7F);
+        /* Surrogate pairs are 4 UTF-8 bytes long so fit */

-    } else if (eci == 29) { /* GB 2312 (and GB 18030 if Han Xin) */
-        /* Not needed for GB 2312 but allow for GB 18030 4 byters */
+    } else if (eci == 32) { /* GB 18030 */
+        /* Allow for GB 18030 4 byters */
        length *= 2;
+
+    } else if (eci == 34 || eci == 35) { /* UTF-32 */
+        /* Quadruple-up ASCII and double-up non-ASCII */
+        length += chr_range_cnt(source, length, 0, 0x7F) * 2 + length;
    }

-    /* Big5 and EUC-KR fit in UTF-8 length */
+    /* Big5, GB 2312, EUC-KR and GBK fit in UTF-8 length */

    return length;
 }
@ -180,14 +226,15 @@ INTERNAL int get_eci_length(const int eci, const unsigned char source[], int len
 INTERNAL int utf8_to_eci(const int eci, const unsigned char source[], unsigned char dest[], int *p_length) {

    typedef int (*eci_func_t)(unsigned char *r, const unsigned int wc);
-    static const eci_func_t eci_funcs[31] = {
-                     NULL,              NULL,              NULL,              NULL,  iso8859_2_wctosb,
-         iso8859_3_wctosb,  iso8859_4_wctosb,  iso8859_5_wctosb,  iso8859_6_wctosb,  iso8859_7_wctosb,
-         iso8859_8_wctosb,  iso8859_9_wctosb, iso8859_10_wctosb, iso8859_11_wctosb,              NULL,
-        iso8859_13_wctosb, iso8859_14_wctosb, iso8859_15_wctosb, iso8859_16_wctosb,              NULL,
-              sjis_wctomb,     cp1250_wctosb,     cp1251_wctosb,     cp1252_wctosb,     cp1256_wctosb,
-            ucs2be_wctomb,              NULL,      ascii_wctosb,       big5_wctomb,     gb2312_wctomb,
-            euc_kr_wctomb,
+    static const eci_func_t eci_funcs[36] = {
+                     NULL,              NULL,              NULL,              NULL,  iso8859_2_wctosb, /*0-4*/
+         iso8859_3_wctosb,  iso8859_4_wctosb,  iso8859_5_wctosb,  iso8859_6_wctosb,  iso8859_7_wctosb, /*5-9*/
+         iso8859_8_wctosb,  iso8859_9_wctosb, iso8859_10_wctosb, iso8859_11_wctosb,              NULL, /*10-14*/
+        iso8859_13_wctosb, iso8859_14_wctosb, iso8859_15_wctosb, iso8859_16_wctosb,              NULL, /*15-19*/
+              sjis_wctomb,     cp1250_wctosb,     cp1251_wctosb,     cp1252_wctosb,     cp1256_wctosb, /*20-24*/
+           utf16be_wctomb,              NULL,      ascii_wctosb,       big5_wctomb,     gb2312_wctomb, /*25-29*/
+            euc_kr_wctomb,        gbk_wctomb,    gb18030_wctomb,    utf16le_wctomb,    utf32be_wctomb, /*30-34*/
+           utf32le_wctomb,
    };
    eci_func_t eci_func;
    unsigned int codepoint, state;
@ -277,3 +324,5 @@ INTERNAL int get_best_eci(const unsigned char source[], int length) {

    return 26; // If all of these fail, use Unicode!
 }
+
+/* vim: set ts=4 sw=4 et : */