1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
|
/*
* internal.h - internal header stuff for the charset library.
*/
#ifndef charset_internal_h
#define charset_internal_h
/*
 * Element count of the array `x'. The argument must be a genuine
 * array: given a pointer, the result is just the pointer size divided
 * by the element size, which is not a length of anything.
 */
#define lenof(x) (sizeof(x) / sizeof((x)[0]))
/*
 * Value used in place of a Unicode character to signal an error.
 * 0xFFFF is not a valid Unicode value.
 */
#define ERROR 0xffffL
/*
 * Boolean constants. Any definition already in force is dropped
 * first, so these are the values seen by code that follows.
 */
#undef FALSE
#define FALSE 0
#undef TRUE
#define TRUE 1
/* Short names for the structure types, usable without the `struct' keyword. */
typedef struct sbcs_data sbcs_data;
typedef struct charset_spec charset_spec;
/*
 * Description of one character set: a numeric identifier, a pair of
 * conversion functions, and a data pointer for those functions.
 */
struct charset_spec {
    /* Numeric identifier of the character set. */
    int charset;

    /*
     * Decoder. Reads input in this character set and produces
     * Unicode characters by calling `emit', which expects Unicode
     * values. Any encoding error in the input must be reported by
     * sending ERROR to `emit'.
     */
    void (*read)(const charset_spec *charset, long input_chr,
                 charset_state *state,
                 void (*emit)(void *ctx, long output), void *emitctx);

    /*
     * Encoder. Reads Unicode characters and produces output in this
     * character set by calling `emit', which expects byte values.
     *
     * Returns TRUE when the conversion succeeds. If the input
     * character cannot be represented, returns FALSE, and does so
     * _before_ making any call to `emit'.
     *
     * An `input_chr' of -1 is a request to put the encoding state
     * back to whatever default is required at the end of a piece of
     * encoded text.
     */
    int (*write)(const charset_spec *charset, long input_chr,
                 charset_state *state,
                 void (*emit)(void *ctx, long output), void *emitctx);

    /* Data consulted by the read and write functions. */
    const void *data;
};
/*
 * Layout of the `data' pointer as used by the SBCS read and write
 * functions, and therefore the layout of every SBCS definition.
 */
struct sbcs_data {
    /*
     * Forward table: the Unicode code point for each SBCS byte
     * position. An entry of ERROR means the byte value is undefined
     * in this SBCS, so meeting it in input is an error.
     */
    unsigned long sbcs2ucs[256];

    /*
     * Reverse table: the valid SBCS byte values, ordered by their
     * Unicode translations. To convert a Unicode value U back to
     * the SBCS, binary-search this table using the forward table as
     * the key: at position X, compare sbcs2ucs[ucs2sbcs[X]] against
     * U and go left, go right, or stop on equality.
     */
    unsigned char ucs2sbcs[256];

    /*
     * Length of the reverse table. It has to be stored explicitly
     * because an SBCS may define fewer than 256 valid byte values.
     */
    int nvalid;
};
/*
* Prototypes for internal library functions.
*/
charset_spec const *charset_find_spec(int charset);
/*
 * SBCS conversion functions. Their signatures match the `read' and
 * `write' members of struct charset_spec.
 */
void read_sbcs(const charset_spec *charset, long input_chr,
               charset_state *state,
               void (*emit)(void *ctx, long output), void *emitctx);
int write_sbcs(const charset_spec *charset, long input_chr,
               charset_state *state,
               void (*emit)(void *ctx, long output), void *emitctx);

/*
 * Single-character conversions working directly on an SBCS table.
 * NOTE(review): the failure return values are not visible from this
 * header -- check the definitions before relying on them.
 */
long sbcs_to_unicode(struct sbcs_data const *sd, long input_chr);
long sbcs_from_unicode(struct sbcs_data const *sd, long input_chr);
/*
 * UTF-8 conversion functions. Their signatures match the `read' and
 * `write' members of struct charset_spec.
 */
void read_utf8(const charset_spec *charset, long input_chr,
               charset_state *state,
               void (*emit)(void *ctx, long output), void *emitctx);
int write_utf8(const charset_spec *charset, long input_chr,
               charset_state *state,
               void (*emit)(void *ctx, long output), void *emitctx);
/*
 * Table lookups for the multibyte character sets. Each set has one
 * function mapping a table position to a Unicode value and one
 * mapping a Unicode value back, writing the position through the
 * pointer arguments.
 *
 * NOTE(review): `r', `c' and `p' presumably mean row, column and
 * plane, and the int results presumably report success or failure --
 * confirm against the definitions.
 */

/* Big5 */
long big5_to_unicode(int r, int c);
int unicode_to_big5(long unicode, int *r, int *c);

/* CNS 11643 (takes the extra `p' argument) */
long cns11643_to_unicode(int p, int r, int c);
int unicode_to_cns11643(long unicode, int *p, int *r, int *c);

/* CP949 */
long cp949_to_unicode(int r, int c);
int unicode_to_cp949(long unicode, int *r, int *c);

/* KS X 1001 */
long ksx1001_to_unicode(int r, int c);
int unicode_to_ksx1001(long unicode, int *r, int *c);

/* GB 2312 */
long gb2312_to_unicode(int r, int c);
int unicode_to_gb2312(long unicode, int *r, int *c);

/* JIS X 0208 */
long jisx0208_to_unicode(int r, int c);
int unicode_to_jisx0208(long unicode, int *r, int *c);

/* JIS X 0212 */
long jisx0212_to_unicode(int r, int c);
int unicode_to_jisx0212(long unicode, int *r, int *c);
/*
 * Placate compiler warnings about unused parameters, of which we
 * expect to have some in this library.
 *
 * The argument is evaluated and discarded via a cast to void. Unlike
 * a self-assignment, this is accepted for const-qualified and
 * array-typed arguments, never writes to the object, and does not
 * provoke self-assignment warnings.
 */
#define UNUSEDARG(x) ( (void)(x) )
#endif /* charset_internal_h */
|