mirror of
https://git.code.sf.net/p/zint/code
synced 2025-05-27 21:44:13 -04:00
eci: Add support for all ECIs (Big5, Korean, UCS-2BE)
This commit is contained in:
parent
9795049322
commit
7fe930b4dc
53 changed files with 51324 additions and 907 deletions
413
backend/eci.c
413
backend/eci.c
|
@ -1,7 +1,7 @@
|
|||
/* eci.c - Extended Channel Interpretations
|
||||
|
||||
libzint - the open source barcode library
|
||||
Copyright (C) 2009 - 2020 Robin Stuart <rstuart114@gmail.com>
|
||||
Copyright (C) 2009 - 2021 Robin Stuart <rstuart114@gmail.com>
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
|
@ -30,240 +30,241 @@
|
|||
*/
|
||||
/* vim: set ts=4 sw=4 et : */
|
||||
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "eci.h"
|
||||
#include "common.h"
|
||||
#ifdef _MSC_VER
|
||||
#include <malloc.h>
|
||||
#endif
|
||||
#include "common.h"
|
||||
#include "eci.h"
|
||||
#include "eci_sb.h"
|
||||
#include "sjis.h"
|
||||
#include "big5.h"
|
||||
#include "gb2312.h"
|
||||
#include "ksx1001.h"
|
||||
|
||||
/* Convert Unicode to other character encodings */
|
||||
INTERNAL int utf_to_eci(const int eci, const unsigned char source[], unsigned char dest[], int *length) {
|
||||
int in_posn;
|
||||
int out_posn;
|
||||
int ext;
|
||||
int done;
|
||||
|
||||
if (eci == 26 || eci == 899) {
|
||||
/* Unicode or 8-bit binary data, do not process - just copy data across */
|
||||
memcpy(dest, source, *length);
|
||||
dest[*length] = '\0';
|
||||
/* ECI 20 Shift JIS */
|
||||
static int sjis_wctomb(unsigned char *r, const unsigned int wc) {
|
||||
int ret;
|
||||
unsigned int c;
|
||||
|
||||
ret = sjis_wctomb_zint(&c, wc);
|
||||
if (ret == 0) {
|
||||
return 0;
|
||||
}
|
||||
if (ret == 2) {
|
||||
r[0] = (unsigned char) (c >> 8);
|
||||
r[1] = (unsigned char) (c & 0xff);
|
||||
} else {
|
||||
*r = (unsigned char) c;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* ECI 27 ASCII (ISO/IEC 646:1991 IRV (US)) */
|
||||
static int ascii_wctosb(unsigned char *r, const unsigned int wc) {
|
||||
if (wc < 0x80) {
|
||||
*r = (unsigned char) wc;
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* ECI 170 ASCII subset (ISO/IEC 646:1991 Invariant, excludes chars that historically had national variants) */
|
||||
static int ascii_invariant_wctosb(unsigned char *r, const unsigned int wc) {
|
||||
if (wc == 0x7f || (wc <= 'z' && wc != '#' && wc != '$' && wc != '@' && (wc <= 'Z' || wc == '_' || wc >= 'a'))) {
|
||||
*r = (unsigned char) wc;
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* ECI 28 Big5 Chinese (Taiwan) */
|
||||
static int big5_wctomb(unsigned char *r, const unsigned int wc) {
|
||||
unsigned int c;
|
||||
|
||||
if (wc < 0x80) {
|
||||
*r = (unsigned char) wc;
|
||||
return 1;
|
||||
}
|
||||
if (big5_wctomb_zint(&c, wc)) {
|
||||
r[0] = (unsigned char) (c >> 8);
|
||||
r[1] = (unsigned char) (c & 0xff);
|
||||
return 2;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* ECI 29 GB 2312 Chinese (PRC) */
|
||||
static int gb2312_wctomb(unsigned char *r, const unsigned int wc) {
|
||||
unsigned int c;
|
||||
|
||||
if (wc < 0x80) {
|
||||
*r = (unsigned char) wc;
|
||||
return 1;
|
||||
}
|
||||
if (gb2312_wctomb_zint(&c, wc)) {
|
||||
r[0] = (unsigned char) (c >> 8);
|
||||
r[1] = (unsigned char) (c & 0xff);
|
||||
return 2;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* ECI 30 KS X 1001 (KS C 5601) Korean */
|
||||
static int ksx1001_wctomb(unsigned char *r, const unsigned int wc) {
|
||||
unsigned int c;
|
||||
|
||||
if (wc < 0x80) {
|
||||
*r = (unsigned char) wc;
|
||||
return 1;
|
||||
}
|
||||
if (ksx1001_wctomb_zint(&c, wc)) {
|
||||
r[0] = (unsigned char) (c >> 8);
|
||||
r[1] = (unsigned char) (c & 0xff);
|
||||
return 2;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Helper to count the number of chars in a string within a range */
|
||||
static int chr_range_cnt(const unsigned char string[], const int length, const unsigned char c1,
|
||||
const unsigned char c2) {
|
||||
int count = 0;
|
||||
int i;
|
||||
if (c1) {
|
||||
for (i = 0; i < length; i++) {
|
||||
if (string[i] >= c1 && string[i] <= c2) {
|
||||
count++;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (i = 0; i < length; i++) {
|
||||
if (string[i] <= c2) {
|
||||
count++;
|
||||
}
|
||||
}
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
||||
/* Is ECI convertible from UTF-8? */
|
||||
INTERNAL int is_eci_convertible(const int eci) {
|
||||
if (eci == 26 || (eci > 30 && eci != 170)) { /* Exclude ECI 170 - ASCII Invariant */
|
||||
/* UTF-8 (26) or 8-bit binary data (899) or undefined (> 30 and < 899) or not character set (> 899) */
|
||||
return 0;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* Calculate length required to convert UTF-8 to (double-byte) encoding */
|
||||
INTERNAL int get_eci_length(const int eci, const unsigned char source[], int length) {
|
||||
if (eci == 20) { /* Shift JIS */
|
||||
/* Only ASCII backslash (reverse solidus) exceeds UTF-8 length */
|
||||
length += chr_cnt(source, length, '\\');
|
||||
|
||||
} else if (eci == 25) { /* UCS-2BE */
|
||||
/* All ASCII chars take 2 bytes */
|
||||
length += chr_range_cnt(source, length, 0, 0x7F);
|
||||
|
||||
} else if (eci == 29) { /* GB 2312 (and GB 18030 if Han Xin) */
|
||||
/* Not needed for GB 2312 but allow for GB 18030 4 byters */
|
||||
length *= 2;
|
||||
}
|
||||
|
||||
/* Big5 and KS X 1001 fit in UTF-8 length */
|
||||
|
||||
return length;
|
||||
}
|
||||
|
||||
/* Convert UTF-8 Unicode to other character encodings */
|
||||
INTERNAL int utf8_to_eci(const int eci, const unsigned char source[], unsigned char dest[], int *p_length) {
|
||||
|
||||
typedef int (*eci_func_t)(unsigned char *r, const unsigned int wc);
|
||||
static const eci_func_t eci_funcs[31] = {
|
||||
NULL, NULL, NULL, NULL, iso8859_2_wctosb,
|
||||
iso8859_3_wctosb, iso8859_4_wctosb, iso8859_5_wctosb, iso8859_6_wctosb, iso8859_7_wctosb,
|
||||
iso8859_8_wctosb, iso8859_9_wctosb, iso8859_10_wctosb, iso8859_11_wctosb, NULL,
|
||||
iso8859_13_wctosb, iso8859_14_wctosb, iso8859_15_wctosb, iso8859_16_wctosb, NULL,
|
||||
sjis_wctomb, cp1250_wctosb, cp1251_wctosb, cp1252_wctosb, cp1256_wctosb,
|
||||
ucs2be_wctomb, NULL, ascii_wctosb, big5_wctomb, gb2312_wctomb,
|
||||
ksx1001_wctomb,
|
||||
};
|
||||
eci_func_t eci_func;
|
||||
unsigned int codepoint, state;
|
||||
int in_posn;
|
||||
int out_posn;
|
||||
int length = *p_length;
|
||||
|
||||
in_posn = 0;
|
||||
out_posn = 0;
|
||||
do {
|
||||
/* Single byte (ASCII) character */
|
||||
int bytelen = 1;
|
||||
int glyph = (int) source[in_posn];
|
||||
|
||||
if ((source[in_posn] >= 0x80) && (source[in_posn] < 0xc0)) {
|
||||
/* Something has gone wrong, abort */
|
||||
/* Special case ISO/IEC 8859-1 */
|
||||
if (eci == 0 || eci == 3) { /* Default ECI 0 to ISO/IEC 8859-1 */
|
||||
state = 0;
|
||||
while (in_posn < length) {
|
||||
do {
|
||||
decode_utf8(&state, &codepoint, source[in_posn++]);
|
||||
} while (in_posn < length && state != 0 && state != 12);
|
||||
if (state != 0) {
|
||||
return ZINT_ERROR_INVALID_DATA;
|
||||
}
|
||||
if (codepoint >= 0x80 && (codepoint < 0x00a0 || codepoint >= 0x0100)) {
|
||||
return ZINT_ERROR_INVALID_DATA;
|
||||
}
|
||||
dest[out_posn++] = (unsigned char) codepoint;
|
||||
}
|
||||
dest[out_posn] = '\0';
|
||||
*p_length = out_posn;
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (eci == 170) { /* ASCII Invariant (archaic subset) */
|
||||
eci_func = ascii_invariant_wctosb;
|
||||
} else {
|
||||
eci_func = eci_funcs[eci];
|
||||
if (eci_func == NULL) {
|
||||
return ZINT_ERROR_INVALID_DATA;
|
||||
}
|
||||
}
|
||||
|
||||
if ((source[in_posn] >= 0xc0) && (source[in_posn] < 0xe0)) {
|
||||
/* Two-byte character */
|
||||
bytelen = 2;
|
||||
glyph = (source[in_posn] & 0x1f) << 6;
|
||||
|
||||
if (*length < (in_posn + 2)) {
|
||||
return ZINT_ERROR_INVALID_DATA;
|
||||
}
|
||||
|
||||
if (source[in_posn + 1] > 0xc0) {
|
||||
return ZINT_ERROR_INVALID_DATA;
|
||||
}
|
||||
|
||||
glyph += (source[in_posn + 1] & 0x3f);
|
||||
}
|
||||
|
||||
if ((source[in_posn] >= 0xe0) && (source[in_posn] < 0xf0)) {
|
||||
/* Three-byte character */
|
||||
bytelen = 3;
|
||||
glyph = (source[in_posn] & 0x0f) << 12;
|
||||
|
||||
if (*length < (in_posn + 2)) {
|
||||
return ZINT_ERROR_INVALID_DATA;
|
||||
}
|
||||
|
||||
if (*length < (in_posn + 3)) {
|
||||
return ZINT_ERROR_INVALID_DATA;
|
||||
}
|
||||
|
||||
if (source[in_posn + 1] > 0xc0) {
|
||||
return ZINT_ERROR_INVALID_DATA;
|
||||
}
|
||||
|
||||
if (source[in_posn + 2] > 0xc0) {
|
||||
return ZINT_ERROR_INVALID_DATA;
|
||||
}
|
||||
|
||||
glyph += (source[in_posn + 1] & 0x3f) << 6;
|
||||
glyph += (source[in_posn + 2] & 0x3f);
|
||||
}
|
||||
|
||||
if (source[in_posn] >= 0xf0 || glyph > 0x2122) {
|
||||
/* Not in any ISO 8859 or Windows page */
|
||||
state = 0;
|
||||
while (in_posn < length) {
|
||||
int incr;
|
||||
do {
|
||||
decode_utf8(&state, &codepoint, source[in_posn++]);
|
||||
} while (in_posn < length && state != 0 && state != 12);
|
||||
if (state != 0) {
|
||||
return ZINT_ERROR_INVALID_DATA;
|
||||
}
|
||||
|
||||
if (glyph < 128) {
|
||||
dest[out_posn] = glyph;
|
||||
} else {
|
||||
done = 0;
|
||||
for (ext = 0; ext < 128; ext++) {
|
||||
switch (eci) {
|
||||
case 3: // Latin-1
|
||||
if (glyph == iso_8859_1[ext]) {
|
||||
dest[out_posn] = ext + 128;
|
||||
done = 1;
|
||||
}
|
||||
break;
|
||||
case 4: // Latin-2
|
||||
if (glyph == iso_8859_2[ext]) {
|
||||
dest[out_posn] = ext + 128;
|
||||
done = 1;
|
||||
}
|
||||
break;
|
||||
case 5: // Latin-3
|
||||
if (glyph == iso_8859_3[ext]) {
|
||||
dest[out_posn] = ext + 128;
|
||||
done = 1;
|
||||
}
|
||||
break;
|
||||
case 6: // Latin-4
|
||||
if (glyph == iso_8859_4[ext]) {
|
||||
dest[out_posn] = ext + 128;
|
||||
done = 1;
|
||||
}
|
||||
break;
|
||||
case 7: // Latin/Cyrillic
|
||||
if (glyph == iso_8859_5[ext]) {
|
||||
dest[out_posn] = ext + 128;
|
||||
done = 1;
|
||||
}
|
||||
break;
|
||||
case 8: // Latin/Arabic
|
||||
if (glyph == iso_8859_6[ext]) {
|
||||
dest[out_posn] = ext + 128;
|
||||
done = 1;
|
||||
}
|
||||
break;
|
||||
case 9: // Latin/Greek
|
||||
if (glyph == iso_8859_7[ext]) {
|
||||
dest[out_posn] = ext + 128;
|
||||
done = 1;
|
||||
}
|
||||
break;
|
||||
case 10: // Latin/Hebrew
|
||||
if (glyph == iso_8859_8[ext]) {
|
||||
dest[out_posn] = ext + 128;
|
||||
done = 1;
|
||||
}
|
||||
break;
|
||||
case 11: // Latin-5
|
||||
if (glyph == iso_8859_9[ext]) {
|
||||
dest[out_posn] = ext + 128;
|
||||
done = 1;
|
||||
}
|
||||
break;
|
||||
case 12: // Latin-6
|
||||
if (glyph == iso_8859_10[ext]) {
|
||||
dest[out_posn] = ext + 128;
|
||||
done = 1;
|
||||
}
|
||||
break;
|
||||
case 13: // Latin/Thai
|
||||
if (glyph == iso_8859_11[ext]) {
|
||||
dest[out_posn] = ext + 128;
|
||||
done = 1;
|
||||
}
|
||||
break;
|
||||
case 15: // Latin-7
|
||||
if (glyph == iso_8859_13[ext]) {
|
||||
dest[out_posn] = ext + 128;
|
||||
done = 1;
|
||||
}
|
||||
break;
|
||||
case 16: // Latin-8
|
||||
if (glyph == iso_8859_14[ext]) {
|
||||
dest[out_posn] = ext + 128;
|
||||
done = 1;
|
||||
}
|
||||
break;
|
||||
case 17: // Latin-9
|
||||
if (glyph == iso_8859_15[ext]) {
|
||||
dest[out_posn] = ext + 128;
|
||||
done = 1;
|
||||
}
|
||||
break;
|
||||
case 18: // Latin-10
|
||||
if (glyph == iso_8859_16[ext]) {
|
||||
dest[out_posn] = ext + 128;
|
||||
done = 1;
|
||||
}
|
||||
break;
|
||||
case 21: // Windows-1250
|
||||
if (glyph == windows_1250[ext]) {
|
||||
dest[out_posn] = ext + 128;
|
||||
done = 1;
|
||||
}
|
||||
break;
|
||||
case 22: // Windows-1251
|
||||
if (glyph == windows_1251[ext]) {
|
||||
dest[out_posn] = ext + 128;
|
||||
done = 1;
|
||||
}
|
||||
break;
|
||||
case 23: // Windows-1252
|
||||
if (glyph == windows_1252[ext]) {
|
||||
dest[out_posn] = ext + 128;
|
||||
done = 1;
|
||||
}
|
||||
break;
|
||||
case 24: // Windows-1256
|
||||
if (glyph == windows_1256[ext]) {
|
||||
dest[out_posn] = ext + 128;
|
||||
done = 1;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
if (done) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!(done)) {
|
||||
return ZINT_ERROR_INVALID_DATA;
|
||||
}
|
||||
incr = (*eci_func)(dest + out_posn, codepoint);
|
||||
if (incr == 0) {
|
||||
return ZINT_ERROR_INVALID_DATA;
|
||||
}
|
||||
|
||||
in_posn += bytelen;
|
||||
out_posn++;
|
||||
} while (in_posn < *length);
|
||||
out_posn += incr;
|
||||
}
|
||||
dest[out_posn] = '\0';
|
||||
*length = out_posn;
|
||||
*p_length = out_posn;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Find the lowest ECI mode which will encode a given set of Unicode text */
|
||||
INTERNAL int get_best_eci(unsigned char source[], int length) {
|
||||
/* Find the lowest single-byte ECI mode which will encode a given set of Unicode text */
|
||||
INTERNAL int get_best_eci(const unsigned char source[], int length) {
|
||||
int eci = 3;
|
||||
|
||||
/* Note: attempting single-byte conversions only, so get_eci_length() unnecessary */
|
||||
#ifndef _MSC_VER
|
||||
unsigned char local_source[length + 1];
|
||||
#else
|
||||
unsigned char *local_source = (unsigned char*) _alloca(length + 1);
|
||||
unsigned char *local_source = (unsigned char *) _alloca(length + 1);
|
||||
#endif
|
||||
|
||||
do {
|
||||
if (utf_to_eci(eci, source, local_source, &length) == 0) {
|
||||
if (eci == 14) { /* Reserved */
|
||||
eci = 15;
|
||||
} else if (eci == 19) { /* Reserved */
|
||||
eci = 21; /* Skip 20 Shift JIS */
|
||||
}
|
||||
if (utf8_to_eci(eci, source, local_source, &length) == 0) {
|
||||
return eci;
|
||||
}
|
||||
eci++;
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue