summary refs log tree commit diff
path: root/app/tools/halibut/charset/utf16.c
diff options
context:
space:
mode:
Diffstat (limited to 'app/tools/halibut/charset/utf16.c')
-rw-r--r-- app/tools/halibut/charset/utf16.c 217
1 files changed, 217 insertions, 0 deletions
diff --git a/app/tools/halibut/charset/utf16.c b/app/tools/halibut/charset/utf16.c
new file mode 100644
index 0000000..a1af88b
--- /dev/null
+++ b/app/tools/halibut/charset/utf16.c
@@ -0,0 +1,217 @@
+/*
+ * utf16.c - routines to handle UTF-16 (RFC 2781).
+ */
+
+#ifndef ENUM_CHARSETS
+
+#include "charset.h"
+#include "internal.h"
+
+struct utf16 {
+ int s0; /* initial value of state->s0: the byte orders this charset
+ * permits, as bit 17 (0x20000) for big-endian and/or
+ * bit 16 (0x10000) for little-endian */
+};
+
+static void read_utf16(charset_spec const *charset, long int input_chr,
+ charset_state *state,
+ void (*emit)(void *ctx, long int output),
+ void *emitctx)
+{
+ struct utf16 const *utf = (struct utf16 *)charset->data;
+ long int hw;
+
+ /*
+ * Decode one byte of UTF-16 input. Each call consumes a single
+ * input byte (input_chr); once two bytes have been gathered
+ * into a halfword, at most one value is passed to
+ * emit(emitctx, ...): either a decoded code point or ERROR.
+ * Calls that merely buffer a byte, swallow a leading BOM or
+ * store a high surrogate emit nothing. charset->data must point
+ * to a struct utf16 supplying the initial endianness bits.
+ * Assumes input_chr is a byte value (0..0xFF); this is not
+ * checked here.
+ *
+ * State variable s1 handles the combining of bytes into
+ * transport-endianness halfwords. It contains:
+ *
+ * - 0 if we're between halfwords
+ * - 0x100 plus the first byte if we're in mid-halfword
+ *
+ * State variable s0 handles everything from there upwards. It
+ * contains:
+ *
+ * - Bottom 16 bits are set to a surrogate value if we've just
+ * seen one.
+ * - Next two bits (17:16) indicate possible endiannesses. Bit
+ * 17 is set if we might be BE; bit 16 if we might be LE. If
+ * they're both zero, it has to be because this is right at
+ * the start, so the first thing we do is set them to the
+ * correct initial state.
+ * - The bit after that (18) is 1 iff we have already seen at
+ * least one halfword (meaning we should pass any further
+ * BOMs straight through).
+ */
+
+ /*
+ * Set up s0 if this is the start. A fresh state is recognised
+ * by s0 being entirely zero; the struct utf16 instances defined
+ * in this file all have at least one endianness bit set, so s0
+ * is nonzero from here on.
+ */
+ if (state->s0 == 0)
+ state->s0 = utf->s0;
+
+ /*
+ * Accumulate a transport-endianness halfword. The first byte is
+ * stored with 0x100 ORed in so that s1 is nonzero even when
+ * that byte is zero. The halfword is then assembled with the
+ * first byte as the high octet; it is swapped further down if
+ * the stream turns out to be little-endian.
+ */
+ if (state->s1 == 0) {
+ state->s1 = 0x100 | input_chr;
+ return;
+ }
+ hw = ((state->s1 & 0xFF) << 8) + input_chr;
+ state->s1 = 0;
+
+ /*
+ * Process BOM and determine byte order. This runs for the first
+ * halfword only: bit 18 (0x40000) is set immediately so the
+ * block is skipped on every later halfword. hw has not been
+ * byte-swapped yet, so a little-endian BOM (bytes FF FE)
+ * appears here as 0xFFFE.
+ */
+ if (!(state->s0 & 0x40000)) {
+ state->s0 |= 0x40000;
+ if (hw == 0xFEFF && (state->s0 & 0x20000)) {
+ /*
+ * Text starts with a big-endian BOM, and big-
+ * endianness is a possibility. So clear the
+ * little-endian bit (the BOM confirms our endianness),
+ * and return without emitting the BOM in Unicode.
+ */
+ state->s0 &= ~0x10000;
+ return;
+ } else if (hw == 0xFFFE && (state->s0 & 0x10000)) {
+ /*
+ * Text starts with a little-endian BOM, and little-
+ * endianness is a possibility. So clear the big-endian
+ * bit (the BOM confirms our endianness), and return
+ * without emitting the BOM in Unicode.
+ */
+ state->s0 &= ~0x20000;
+ return;
+ } else {
+ /*
+ * Text does not begin with a BOM. RFC 2781 states that
+ * in this case we must assume big-endianness if we
+ * haven't been told otherwise by the content type.
+ *
+ * This branch is also reached when the first halfword
+ * is a BOM for a byte order this charset does not
+ * permit; it is then processed below like any other
+ * halfword.
+ */
+ if ((state->s0 & 0x30000) == 0x30000)
+ state->s0 &= ~0x10000; /* clear LE bit */
+ }
+ }
+
+ /*
+ * Byte-swap transport-endianness halfword if necessary. We may
+ * now test individual endianness bits, since we can be sure
+ * exactly one is set.
+ */
+ if (state->s0 & 0x10000)
+ hw = ((hw >> 8) | (hw << 8)) & 0xFFFF;
+
+ /*
+ * Now that the endianness issue has been dealt with, what
+ * reaches this point should be a stream of halfwords in
+ * sensible numeric form. So now we process surrogates.
+ */
+ if (state->s0 & 0xFFFF) {
+ /*
+ * We have already seen a high surrogate, so we expect a
+ * low surrogate. Whinge if we didn't get it.
+ *
+ * On success the low ten bits of each surrogate are joined
+ * into a 20-bit value and offset by 0x10000. Either way the
+ * stored surrogate is cleared afterwards.
+ *
+ * NOTE(review): in the error case the current halfword is
+ * discarded rather than reprocessed, so an ordinary
+ * character following an unpaired high surrogate is lost.
+ * Confirm this is intended.
+ */
+ if (hw < 0xDC00 || hw >= 0xE000) {
+ emit(emitctx, ERROR);
+ } else {
+ hw &= 0x3FF;
+ hw |= (state->s0 & 0x3FF) << 10;
+ emit(emitctx, hw + 0x10000);
+ }
+ state->s0 &= 0xFFFF0000;
+ } else {
+ /*
+ * Any low surrogate is an error.
+ */
+ if (hw >= 0xDC00 && hw < 0xE000) {
+ emit(emitctx, ERROR);
+ return;
+ }
+
+ /*
+ * Any high surrogate is simply stored until we see the
+ * next halfword. It goes in the bottom 16 bits of s0, which
+ * are known to be zero on this branch.
+ */
+ if (hw >= 0xD800 && hw < 0xDC00) {
+ state->s0 |= hw;
+ return;
+ }
+
+ /*
+ * Anything else we simply output.
+ */
+ emit(emitctx, hw);
+ }
+}
+
+/*
+ * Repeated code in write_utf16 abstracted out for sanity.
+ *
+ * Sends the low 16 bits of hw to emit(emitctx, ...) as two separate
+ * byte values. s0 is an endianness mask laid out like the s0 field
+ * of struct utf16: if bit 17 (0x20000) is set the high byte goes
+ * first, otherwise the low byte goes first.
+ */
+static void emithl(void (*emit)(void *ctx, long int output), void *emitctx,
+ unsigned long s0, long int hw)
+{
+ int h = (hw >> 8) & 0xFF, l = hw & 0xFF;
+
+ if (s0 & 0x20000) {
+ /* Big-endian takes priority over little, if both are allowed. */
+ emit(emitctx, h);
+ emit(emitctx, l);
+ } else {
+ emit(emitctx, l);
+ emit(emitctx, h);
+ }
+}
+
+static int write_utf16(charset_spec const *charset, long int input_chr,
+ charset_state *state,
+ void (*emit)(void *ctx, long int output),
+ void *emitctx)
+{
+ struct utf16 const *utf = (struct utf16 *)charset->data;
+
+ /*
+ * Encode one Unicode code point (input_chr) as UTF-16, passing
+ * the resulting bytes one at a time to emit(emitctx, ...) in
+ * the byte order selected by utf->s0. Returns TRUE on success;
+ * returns FALSE, with nothing emitted and the state untouched,
+ * if input_chr cannot be represented. A negative input_chr
+ * returns TRUE without emitting anything (presumably the
+ * caller's end-of-data request; confirm against charset.h).
+ *
+ * state->s0 == 0 means we have not output anything yet (and so
+ * must output a BOM before we do anything else). state->s0 ==
+ * 1 means we are off and running.
+ *
+ * NOTE(review): the BOM is written for every charset_spec in
+ * this file that uses this function, including the two with a
+ * fixed byte order. Confirm that is intended; RFC 2781 has
+ * rules about BOMs in text labelled UTF-16BE or UTF-16LE.
+ */
+
+ if (input_chr < 0)
+ return TRUE; /* no cleanup required */
+
+ if ((input_chr >= 0xD800 && input_chr < 0xE000) ||
+ input_chr >= 0x110000) {
+ /*
+ * We can't output surrogates, or anything above 0x10FFFF.
+ */
+ return FALSE;
+ }
+
+ if (!state->s0) {
+ state->s0 = 1;
+ emithl(emit, emitctx, utf->s0, 0xFEFF);
+ }
+
+ if (input_chr < 0x10000) {
+ emithl(emit, emitctx, utf->s0, input_chr);
+ } else {
+ input_chr -= 0x10000;
+ /*
+ * now input_chr is between 0 and 0xFFFFF inclusive; its top
+ * ten bits go into the high surrogate and its bottom ten
+ * into the low surrogate, emitted in that order
+ */
+ emithl(emit, emitctx, utf->s0, 0xD800 | ((input_chr >> 10) & 0x3FF));
+ emithl(emit, emitctx, utf->s0, 0xDC00 | (input_chr & 0x3FF));
+ }
+ return TRUE;
+}
+
+static const struct utf16 utf16_bigendian = { 0x20000 }; /* bit 17 only: BE */
+static const struct utf16 utf16_littleendian = { 0x10000 }; /* bit 16 only: LE */
+static const struct utf16 utf16_variable_endianness = { 0x30000 }; /* both bits: either order */
+
+const charset_spec charset_CS_UTF16BE = { /* presumably { id, reader, writer, data }; charset_spec is declared outside this file */
+ CS_UTF16BE, read_utf16, write_utf16, &utf16_bigendian
+};
+const charset_spec charset_CS_UTF16LE = {
+ CS_UTF16LE, read_utf16, write_utf16, &utf16_littleendian
+};
+const charset_spec charset_CS_UTF16 = { /* reader settles the byte order from the first halfword; writer uses big-endian */
+ CS_UTF16, read_utf16, write_utf16, &utf16_variable_endianness
+};
+
+#else /* ENUM_CHARSETS */
+
+ENUM_CHARSET(CS_UTF16) /* byte order taken from a leading BOM, else big-endian */
+ENUM_CHARSET(CS_UTF16BE) /* big-endian only */
+ENUM_CHARSET(CS_UTF16LE) /* little-endian only */
+
+#endif /* ENUM_CHARSETS */