app/tools/halibut/charset/euc.c


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272

/*
 * euc.c - routines to handle the various EUC multibyte encodings.
 */

#ifndef ENUM_CHARSETS

#include "charset.h"
#include "internal.h"

/*
 * Per-encoding parameters consumed by read_euc() and write_euc().
 *
 * nchars holds, for each of the three multibyte code sets, the
 * number of GR bytes that make up one character (any SS2/SS3
 * introducer is not counted). to_ucs is handed the packed decoder
 * state built by read_euc() once a character is complete. from_ucs
 * returns a packed value with the code set number in the top nibble
 * and the character's bytes in the low 24 bits, or 0 when the
 * character cannot be encoded.
 */
struct euc {
    int nchars[3];                     /* [0] GR, [1] SS2+GR, [2] SS3+GR */
    long (*to_ucs)(unsigned long packed);
    unsigned long (*from_ucs)(long codepoint);
};

/*
 * Feed one input byte to the EUC decoder.
 *
 * charset->data must point at the struct euc describing the variant
 * being decoded. Decoded characters (or ERROR) are delivered through
 * emit(emitctx, ...); partially accumulated characters live in
 * state->s0 between calls.
 */
static void read_euc(charset_spec const *charset, long int input_chr,
                     charset_state *state,
                     void (*emit)(void *ctx, long int output), void *emitctx)
{
    struct euc const *params = (struct euc const *)charset->data;

    /* Bytes A1-FE are the only ones that can start or continue a
     * multibyte character. */
    int is_gr = (input_chr >= 0xA1 && input_chr != 0xFF);

    /*
     * Layout of state->s0 while a character is in progress:
     *
     *   bits 31:28  code set: 1 = plain GR, 2 = after SS2,
     *               3 = after SS3. Zero means "idle".
     *   bits 27:24  number of bytes accumulated so far.
     *   bits 23:0   the accumulated bytes themselves, most recent
     *               byte lowest.
     *
     * So a three-byte GR character A1 A2 A3 would take the state
     * through 0x110000A1, 0x1200A1A2, 0x13A1A2A3 and back to 0.
     */

    if (state->s0 != 0) {
        /*
         * Mid-character: with or without an SS2/SS3 introducer the
         * next byte has to be a GR byte. Anything else means the
         * character was cut short, so report ERROR, drop back to
         * idle, and let the idle-state code below handle this byte
         * afresh.
         */
        if (is_gr) {
            state->s0 = (((state->s0 & 0xFF000000) + 0x01000000) |
                         ((state->s0 & 0x0000FFFF) << 8) | input_chr);
        } else {
            emit(emitctx, ERROR);
            state->s0 = 0;
        }
    }

    if (state->s0 == 0) {
        /*
         * Idle: classify the byte. The tests below are mutually
         * exclusive, so their order does not matter.
         */
        if (is_gr) {                   /* first byte of a code set 1 char */
            state->s0 = 0x11000000 | input_chr;
        } else if (input_chr < 0x80) { /* plain ASCII passes through */
            emit(emitctx, input_chr);
        } else if (input_chr == 0x8E) {/* SS2 selects code set 2 */
            state->s0 = 0x20000000;
        } else if (input_chr == 0x8F) {/* SS3 selects code set 3 */
            state->s0 = 0x30000000;
        } else {                       /* other 80-A0 bytes, and FF */
            emit(emitctx, ERROR);
        }
    }

    /*
     * If the byte count has reached the length this code set needs,
     * translate the packed state and return to idle.
     */
    if (state->s0 != 0) {
        unsigned have = (state->s0 & 0x0F000000) >> 24;
        unsigned need = (unsigned)params->nchars[(state->s0 >> 28)-1];

        if (have >= need) {
            emit(emitctx, params->to_ucs(state->s0));
            state->s0 = 0;
        }
    }
}

/*
 * All EUCs are stateless multi-byte encodings (in the sense that
 * just after any character has been completed, the state is always
 * the same); hence when writing them, there is no need to use the
 * charset_state.
 */

/*
 * Encode one character into the EUC variant described by
 * charset->data, sending the resulting bytes to emit(emitctx, ...).
 *
 * Returns TRUE on success (including the input_chr == -1 call, for
 * which there is nothing to flush) and FALSE if the variant's
 * from_ucs hook reports the character as unrepresentable. The state
 * argument is never touched.
 */
static int write_euc(charset_spec const *charset, long int input_chr,
                     charset_state *state,
                     void (*emit)(void *ctx, long int output), void *emitctx)
{
    struct euc const *params = (struct euc const *)charset->data;
    unsigned long packed;
    int codeset, nbytes, shift;

    UNUSEDARG(state);

    /* Nothing is ever pending, so the -1 call is a no-op. */
    if (input_chr == -1)
        return TRUE;

    /* Values below 0x80 go out unchanged in every EUC variant. */
    if (input_chr < 0x80) {
        emit(emitctx, input_chr);
        return TRUE;
    }

    packed = params->from_ucs(input_chr);
    if (packed == 0)
        return FALSE;

    /* Split the packed result into code set number and raw bytes. */
    codeset = packed >> 28;
    nbytes = params->nchars[codeset-1];
    packed &= 0xFFFFFF;

    /* Code sets 2 and 3 are introduced by 0x8E (SS2) / 0x8F (SS3). */
    if (codeset > 1)
        emit(emitctx, 0x8C + codeset);

    /* Emit the character's bytes, most significant first. */
    for (shift = 8 * (nbytes - 1); shift >= 0; shift -= 8)
        emit(emitctx, (packed >> shift) & 0xFF);

    return TRUE;
}

/*
 * EUC-CN encodes GB2312 only.
 */
static long int euc_cn_to_ucs(unsigned long state)
{
    switch (state >> 28) {
      case 1: return gb2312_to_unicode(((state >> 8) & 0xFF) - 0xA1,
				       ((state     ) & 0xFF) - 0xA1);
      default: return ERROR;
    }
}
/*
 * Look up ucs via unicode_to_gb2312 and pack the result for
 * write_euc: code set 1 in the top nibble, row and column bytes
 * (each offset by A1) in the low 16 bits. Returns 0 if the lookup
 * fails.
 */
static unsigned long euc_cn_from_ucs(long int ucs)
{
    int row, col;

    if (!unicode_to_gb2312(ucs, &row, &col))
        return 0;

    return 0x10000000 | ((row + 0xA1) << 8) | (col + 0xA1);
}
/*
 * EUC-CN parameters: code set 1 characters are two bytes long; the
 * SS2 and SS3 code sets are given length zero, and euc_cn_to_ucs
 * returns ERROR for them.
 */
static const struct euc euc_cn = {
    {2,0,0}, euc_cn_to_ucs, euc_cn_from_ucs
};
/*
 * Charset descriptor for CS_EUC_CN: the shared EUC reader and
 * writer, with euc_cn supplied as the data they cast back to
 * struct euc.
 */
const charset_spec charset_CS_EUC_CN = {
    CS_EUC_CN, read_euc, write_euc, &euc_cn
};

/*
 * EUC-KR encodes KS X 1001 only.
 */
static long int euc_kr_to_ucs(unsigned long state)
{
    switch (state >> 28) {
      case 1: return ksx1001_to_unicode(((state >> 8) & 0xFF) - 0xA1,
				       ((state     ) & 0xFF) - 0xA1);
      default: return ERROR;
    }
}
/*
 * Look up ucs via unicode_to_ksx1001 and pack the result for
 * write_euc: code set 1 in the top nibble, row and column bytes
 * (each offset by A1) in the low 16 bits. Returns 0 if the lookup
 * fails.
 */
static unsigned long euc_kr_from_ucs(long int ucs)
{
    int row, col;

    if (!unicode_to_ksx1001(ucs, &row, &col))
        return 0;

    return 0x10000000 | ((row + 0xA1) << 8) | (col + 0xA1);
}
/*
 * EUC-KR parameters: code set 1 characters are two bytes long; the
 * SS2 and SS3 code sets are given length zero, and euc_kr_to_ucs
 * returns ERROR for them.
 */
static const struct euc euc_kr = {
    {2,0,0}, euc_kr_to_ucs, euc_kr_from_ucs
};
/*
 * Charset descriptor for CS_EUC_KR: the shared EUC reader and
 * writer, with euc_kr supplied as the data they cast back to
 * struct euc.
 */
const charset_spec charset_CS_EUC_KR = {
    CS_EUC_KR, read_euc, write_euc, &euc_kr
};

/*
 * EUC-JP encodes several character sets.
 */
/*
 * Translate a completed EUC-JP decoder state.
 *
 *   code set 1: two bytes, looked up with jisx0208_to_unicode;
 *   code set 2: one byte from the top half of JIS X 0201, where
 *               A1-DF map linearly onto FF61-FF9F and every other
 *               value is invalid;
 *   code set 3: two bytes, looked up with jisx0212_to_unicode.
 *
 * Anything else yields ERROR.
 */
static long int euc_jp_to_ucs(unsigned long state)
{
    unsigned long codeset = state >> 28;

    if (codeset == 2) {
        int kana = state & 0xFF;

        if (kana < 0xA1 || kana > 0xDF)
            return ERROR;
        return kana + (0xFF61 - 0xA1);
    }

    if (codeset == 1 || codeset == 3) {
        /* Both two-byte sets share the same row/column packing. */
        int row = ((state >> 8) & 0xFF) - 0xA1;
        int col = (state & 0xFF) - 0xA1;

        if (codeset == 1)
            return jisx0208_to_unicode(row, col);
        return jisx0212_to_unicode(row, col);
    }

    return ERROR;
}
/*
 * Pack ucs for write_euc, trying the three EUC-JP code sets in
 * turn: FF61-FF9F go to code set 2 as a single byte A1-DF, then
 * unicode_to_jisx0208 is tried for code set 1, then
 * unicode_to_jisx0212 for code set 3. Returns 0 if none applies.
 */
static unsigned long euc_jp_from_ucs(long int ucs)
{
    int row, col;

    if (ucs >= 0xFF61 && ucs <= 0xFF9F)
        return 0x20000000 | (ucs - 0xFF61 + 0xA1);

    if (unicode_to_jisx0208(ucs, &row, &col))
        return 0x10000000 | ((row + 0xA1) << 8) | (col + 0xA1);

    if (unicode_to_jisx0212(ucs, &row, &col))
        return 0x30000000 | ((row + 0xA1) << 8) | (col + 0xA1);

    return 0;
}
/*
 * EUC-JP parameters: two bytes per character in code set 1, one
 * byte after SS2, two bytes after SS3.
 */
static const struct euc euc_jp = {
    {2,1,2}, euc_jp_to_ucs, euc_jp_from_ucs
};
/*
 * Charset descriptor for CS_EUC_JP: the shared EUC reader and
 * writer, with euc_jp supplied as the data they cast back to
 * struct euc.
 */
const charset_spec charset_CS_EUC_JP = {
    CS_EUC_JP, read_euc, write_euc, &euc_jp
};

/*
 * EUC-TW encodes CNS 11643 (all planes).
 */
/*
 * Translate a completed EUC-TW decoder state.
 *
 *   code set 1: two bytes (row, column), looked up in plane index 0;
 *   code set 2: three bytes after SS2 (plane, row, column), each
 *               offset by A1. Plane indices of 7 or more are
 *               rejected.
 *
 * Anything else yields ERROR.
 */
static long int euc_tw_to_ucs(unsigned long state)
{
    int plane;
    switch (state >> 28) {
      case 1: return cns11643_to_unicode(0, ((state >> 8) & 0xFF) - 0xA1,
                                            ((state     ) & 0xFF) - 0xA1);
      case 2:
        /*
         * The plane byte is the first of the three accumulated
         * bytes, so it sits in bits 23:16 of the state (matching
         * the << 16 used by euc_tw_from_ucs). Bits 15:8 hold the
         * row, not the plane.
         */
        plane = ((state >> 16) & 0xFF) - 0xA1;
        if (plane >= 7) return ERROR;
        return cns11643_to_unicode(plane, ((state >> 8) & 0xFF) - 0xA1,
                                          ((state     ) & 0xFF) - 0xA1);
      default: return ERROR;
    }
}
/*
 * Look up ucs via unicode_to_cns11643 and pack the result for
 * write_euc. Plane index 0 is written as a two-byte code set 1
 * character; any other plane becomes a three-byte code set 2
 * character whose leading byte is the plane index offset by A1.
 * Returns 0 if the lookup fails.
 */
static unsigned long euc_tw_from_ucs(long int ucs)
{
    int plane, row, col;

    if (!unicode_to_cns11643(ucs, &plane, &row, &col))
        return 0;

    if (plane == 0)
        return 0x10000000 | ((row + 0xA1) << 8) | (col + 0xA1);

    return 0x20000000 |
        ((plane + 0xA1) << 16) | ((row + 0xA1) << 8) | (col + 0xA1);
}
/*
 * EUC-TW parameters: two bytes per character in code set 1, three
 * bytes (plane, row, column) after SS2; the SS3 code set is given
 * length zero and euc_tw_to_ucs returns ERROR for it.
 */
static const struct euc euc_tw = {
    {2,3,0}, euc_tw_to_ucs, euc_tw_from_ucs
};
/*
 * Charset descriptor for CS_EUC_TW: the shared EUC reader and
 * writer, with euc_tw supplied as the data they cast back to
 * struct euc.
 */
const charset_spec charset_CS_EUC_TW = {
    CS_EUC_TW, read_euc, write_euc, &euc_tw
};

#else /* ENUM_CHARSETS */

/*
 * With ENUM_CHARSETS defined, none of the code above is compiled and
 * this file expands only to the list of charset identifiers it
 * implements. ENUM_CHARSET is not defined in this file; presumably
 * the includer supplies it -- confirm against the including source.
 */
ENUM_CHARSET(CS_EUC_CN)
ENUM_CHARSET(CS_EUC_KR)
ENUM_CHARSET(CS_EUC_JP)
ENUM_CHARSET(CS_EUC_TW)

#endif /* ENUM_CHARSETS */