ECI: Update ECIs to AIM ITS/04-023:2022, adding UTF-16BE (was UCS-2BE),

UTF-16LE, GBK, separate GB18030, UTF-32BE, UTF-32LE
  add examples to tests for DATAMATRIX, HANXIN, QRCODE
HANXIN: Remove alternating filler in function information;
  GB 18030 now ECI 32 (previously used ECI 29);
  fix gate-posts on codeword limits
  use new ZXing-C++ HanXin detector (diagnostics2 branch) for tests
  check against ISO/IEC 20830:2021 (no substantive changes)
backend_tcl: update ECIs; NOTE: changed names "unicode" -> "utf-16be",
  "euc-cn" -> "gb2312"
GRIDMATRIX/HANXIN/QRCODE/RMQR: warn if auto-conversion (i.e. no ECI given)
  occurs to the respective specialized char sets (GB 2312/GB 18030/Shift JIS)
This commit is contained in:
gitlost 2022-04-10 11:12:18 +01:00
parent 69876619dd
commit 624d40021e
38 changed files with 5761 additions and 835 deletions

View file

@ -1,7 +1,7 @@
/* eci.c - Extended Channel Interpretations
libzint - the open source barcode library
Copyright (C) 2009 - 2021 Robin Stuart <rstuart114@gmail.com>
Copyright (C) 2009-2022 Robin Stuart <rstuart114@gmail.com>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
@ -28,7 +28,6 @@
OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
SUCH DAMAGE.
*/
/* vim: set ts=4 sw=4 et : */
#ifdef _MSC_VER
#include <malloc.h>
@ -40,6 +39,7 @@
#include "big5.h"
#include "gb2312.h"
#include "ksx1001.h"
#include "gb18030.h"
/* ECI 20 Shift JIS */
static int sjis_wctomb(unsigned char *r, const unsigned int wc) {
@ -126,6 +126,47 @@ static int euc_kr_wctomb(unsigned char *r, const unsigned int wc) {
return 0;
}
/* ECI 31 GBK Chinese.
   Encodes codepoint `wc` into `r`, returning the number of bytes written:
   1 for codepoints below 0x80, 2 when `gbk_wctomb_zint()` returns non-zero,
   and 0 (nothing written) when it returns zero */
static int gbk_wctomb(unsigned char *r, const unsigned int wc) {
    unsigned int gbk;

    /* Codepoints below 0x80 are stored unchanged in a single byte */
    if (wc <= 0x7F) {
        r[0] = (unsigned char) wc;
        return 1;
    }

    if (!gbk_wctomb_zint(&gbk, wc)) {
        return 0;
    }

    /* Emit the 16-bit value high byte first */
    r[0] = (unsigned char) (gbk >> 8);
    r[1] = (unsigned char) (gbk & 0xff);
    return 2;
}
/* ECI 32 GB 18030 Chinese.
   Encodes codepoint `wc` into `r`, returning the number of bytes written:
   1 for codepoints below 0x80, otherwise 2 or 4 according to the value
   returned by `gb18030_wctomb_zint()`; any other return value from it
   yields 0 with nothing written */
static int gb18030_wctomb(unsigned char *r, const unsigned int wc) {
    unsigned int lead, trail;
    int len;

    /* Codepoints below 0x80 are stored unchanged in a single byte */
    if (wc <= 0x7F) {
        r[0] = (unsigned char) wc;
        return 1;
    }

    len = gb18030_wctomb_zint(&lead, &trail, wc);
    if (len != 2 && len != 4) {
        return 0;
    }

    /* First 16-bit value, high byte first (common to both lengths) */
    r[0] = (unsigned char) (lead >> 8);
    r[1] = (unsigned char) (lead & 0xff);

    if (len == 4) {
        /* 4-byte form: second 16-bit value follows, high byte first */
        r[2] = (unsigned char) (trail >> 8);
        r[3] = (unsigned char) (trail & 0xff);
    }
    return len;
}
/* Helper to count the number of chars in a string within a range */
static int chr_range_cnt(const unsigned char string[], const int length, const unsigned char c1,
const unsigned char c2) {
@ -149,8 +190,8 @@ static int chr_range_cnt(const unsigned char string[], const int length, const u
/* Is ECI convertible from UTF-8? */
INTERNAL int is_eci_convertible(const int eci) {
if (eci == 26 || (eci > 30 && eci != 170)) { /* Exclude ECI 170 - ASCII Invariant */
/* UTF-8 (26) or 8-bit binary data (899) or undefined (> 30 and < 899) or not character set (> 899) */
if (eci == 26 || (eci > 35 && eci != 170)) { /* Exclude ECI 170 - ASCII Invariant */
/* UTF-8 (26) or 8-bit binary data (899) or undefined (> 35 and < 899) or not character set (> 899) */
return 0;
}
return 1;
@ -162,16 +203,21 @@ INTERNAL int get_eci_length(const int eci, const unsigned char source[], int len
/* Only ASCII backslash (reverse solidus) exceeds UTF-8 length */
length += chr_cnt(source, length, '\\');
} else if (eci == 25) { /* UCS-2BE */
} else if (eci == 25 || eci == 33) { /* UTF-16 */
/* All ASCII chars take 2 bytes */
length += chr_range_cnt(source, length, 0, 0x7F);
/* Surrogate pairs are 4 UTF-8 bytes long so fit */
} else if (eci == 29) { /* GB 2312 (and GB 18030 if Han Xin) */
/* Not needed for GB 2312 but allow for GB 18030 4 byters */
} else if (eci == 32) { /* GB 18030 */
/* Allow for GB 18030 4 byters */
length *= 2;
} else if (eci == 34 || eci == 35) { /* UTF-32 */
/* Quadruple-up ASCII and double-up non-ASCII */
length += chr_range_cnt(source, length, 0, 0x7F) * 2 + length;
}
/* Big5 and EUC-KR fit in UTF-8 length */
/* Big5, GB 2312, EUC-KR and GBK fit in UTF-8 length */
return length;
}
@ -180,14 +226,15 @@ INTERNAL int get_eci_length(const int eci, const unsigned char source[], int len
INTERNAL int utf8_to_eci(const int eci, const unsigned char source[], unsigned char dest[], int *p_length) {
typedef int (*eci_func_t)(unsigned char *r, const unsigned int wc);
static const eci_func_t eci_funcs[31] = {
NULL, NULL, NULL, NULL, iso8859_2_wctosb,
iso8859_3_wctosb, iso8859_4_wctosb, iso8859_5_wctosb, iso8859_6_wctosb, iso8859_7_wctosb,
iso8859_8_wctosb, iso8859_9_wctosb, iso8859_10_wctosb, iso8859_11_wctosb, NULL,
iso8859_13_wctosb, iso8859_14_wctosb, iso8859_15_wctosb, iso8859_16_wctosb, NULL,
sjis_wctomb, cp1250_wctosb, cp1251_wctosb, cp1252_wctosb, cp1256_wctosb,
ucs2be_wctomb, NULL, ascii_wctosb, big5_wctomb, gb2312_wctomb,
euc_kr_wctomb,
static const eci_func_t eci_funcs[36] = {
NULL, NULL, NULL, NULL, iso8859_2_wctosb, /*0-4*/
iso8859_3_wctosb, iso8859_4_wctosb, iso8859_5_wctosb, iso8859_6_wctosb, iso8859_7_wctosb, /*5-9*/
iso8859_8_wctosb, iso8859_9_wctosb, iso8859_10_wctosb, iso8859_11_wctosb, NULL, /*10-14*/
iso8859_13_wctosb, iso8859_14_wctosb, iso8859_15_wctosb, iso8859_16_wctosb, NULL, /*15-19*/
sjis_wctomb, cp1250_wctosb, cp1251_wctosb, cp1252_wctosb, cp1256_wctosb, /*20-24*/
utf16be_wctomb, NULL, ascii_wctosb, big5_wctomb, gb2312_wctomb, /*25-29*/
euc_kr_wctomb, gbk_wctomb, gb18030_wctomb, utf16le_wctomb, utf32be_wctomb, /*30-34*/
utf32le_wctomb,
};
eci_func_t eci_func;
unsigned int codepoint, state;
@ -277,3 +324,5 @@ INTERNAL int get_best_eci(const unsigned char source[], int length) {
return 26; // If all of these fail, use Unicode!
}
/* vim: set ts=4 sw=4 et : */