diff options
Diffstat (limited to 'app/tools/halibut/charset/internal.h')
-rw-r--r-- | app/tools/halibut/charset/internal.h | 126 |
1 files changed, 126 insertions, 0 deletions
diff --git a/app/tools/halibut/charset/internal.h b/app/tools/halibut/charset/internal.h new file mode 100644 index 0000000..5d215a5 --- /dev/null +++ b/app/tools/halibut/charset/internal.h @@ -0,0 +1,126 @@ +/* + * internal.h - internal header stuff for the charset library. + */ + +#ifndef charset_internal_h +#define charset_internal_h + +/* This invariably comes in handy */ +#define lenof(x) ( sizeof((x)) / sizeof(*(x)) ) + +/* This is an invalid Unicode value used to indicate an error. */ +#define ERROR 0xFFFFL /* Unicode value representing error */ + +#undef TRUE +#define TRUE 1 +#undef FALSE +#define FALSE 0 + +typedef struct charset_spec charset_spec; +typedef struct sbcs_data sbcs_data; + +struct charset_spec { + int charset; /* numeric identifier */ + + /* + * A function to read the character set and output Unicode + * characters. The `emit' function expects to get Unicode chars + * passed to it; it should be sent ERROR for any encoding error + * on the input. + */ + void (*read)(charset_spec const *charset, long int input_chr, + charset_state *state, + void (*emit)(void *ctx, long int output), void *emitctx); + /* + * A function to read Unicode characters and output in this + * character set. The `emit' function expects to get byte + * values passed to it. + * + * A non-representable input character should cause a FALSE + * return, _before_ `emit' is called. Successful conversion + * causes a TRUE return. + * + * If `input_chr' is -1, this function must revert the encoding + * state to any default required at the end of a piece of + * encoded text. + */ + int (*write)(charset_spec const *charset, long int input_chr, + charset_state *state, + void (*emit)(void *ctx, long int output), void *emitctx); + void const *data; +}; + +/* + * This is the format of `data' used by the SBCS read and write + * functions; so it's the format used in all SBCS definitions. + */ +struct sbcs_data { + /* + * This is a simple mapping table converting each SBCS position + * to a Unicode code point. Some positions may contain ERROR, + * indicating that that byte value is not defined in the SBCS + * in question and its occurrence in input is an error. + */ + unsigned long sbcs2ucs[256]; + + /* + * This lookup table is used to convert Unicode back to the + * SBCS. It consists of the valid byte values in the SBCS, + * sorted in order of their Unicode translation. So given a + * Unicode value U, you can do a binary search on this table + * using the above table as a lookup: when testing the Xth + * position in this table, you branch according to whether + * sbcs2ucs[ucs2sbcs[X]] is less than, greater than, or equal + * to U. + * + * Note that since there may be fewer than 256 valid byte + * values in a particular SBCS, we must supply the length of + * this table as well as the contents. + */ + unsigned char ucs2sbcs[256]; + int nvalid; +}; + +/* + * Prototypes for internal library functions. + */ +charset_spec const *charset_find_spec(int charset); +void read_sbcs(charset_spec const *charset, long int input_chr, + charset_state *state, + void (*emit)(void *ctx, long int output), void *emitctx); +int write_sbcs(charset_spec const *charset, long int input_chr, + charset_state *state, + void (*emit)(void *ctx, long int output), void *emitctx); +long int sbcs_to_unicode(const struct sbcs_data *sd, long int input_chr); +long int sbcs_from_unicode(const struct sbcs_data *sd, long int input_chr); + +void read_utf8(charset_spec const *charset, long int input_chr, + charset_state *state, + void (*emit)(void *ctx, long int output), void *emitctx); +int write_utf8(charset_spec const *charset, long int input_chr, + charset_state *state, + void (*emit)(void *ctx, long int output), + void *emitctx); + +long int big5_to_unicode(int r, int c); +int unicode_to_big5(long int unicode, int *r, int *c); +long int cns11643_to_unicode(int p, int r, int c); +int unicode_to_cns11643(long int unicode, int *p, int *r, int *c); +long int cp949_to_unicode(int r, int c); +int unicode_to_cp949(long int unicode, int *r, int *c); +long int ksx1001_to_unicode(int r, int c); +int unicode_to_ksx1001(long int unicode, int *r, int *c); +long int gb2312_to_unicode(int r, int c); +int unicode_to_gb2312(long int unicode, int *r, int *c); +long int jisx0208_to_unicode(int r, int c); +int unicode_to_jisx0208(long int unicode, int *r, int *c); +long int jisx0212_to_unicode(int r, int c); +int unicode_to_jisx0212(long int unicode, int *r, int *c); + +/* + * Placate compiler warning about unused parameters, of which we + * expect to have some in this library. + */ +#define UNUSEDARG(x) ( (x) = (x) ) + +#endif /* charset_internal_h */ |