summaryrefslogtreecommitdiff
path: root/app/tools/halibut/charset/utf7.c
diff options
context:
space:
mode:
Diffstat (limited to 'app/tools/halibut/charset/utf7.c')
-rw-r--r--app/tools/halibut/charset/utf7.c295
1 files changed, 295 insertions, 0 deletions
diff --git a/app/tools/halibut/charset/utf7.c b/app/tools/halibut/charset/utf7.c
new file mode 100644
index 0000000..588aa47
--- /dev/null
+++ b/app/tools/halibut/charset/utf7.c
@@ -0,0 +1,295 @@
+/*
+ * utf7.c - routines to handle UTF-7 (RFC 1642 / RFC 2152).
+ */
+
+#ifndef ENUM_CHARSETS
+
+#include "charset.h"
+#include "internal.h"
+
+/*
+ * This array is generated by a piece of Perl:
+
+perl -e 'for $i (0..32) { $a[$i] |= 2; } $a[32] |= 1;' \
+ -e 'for $i ("a".."z","A".."Z","0".."9","'\''","(",' \
+ -e ' ")",",","-",".","/",":","?") { $a[ord $i] |= 1; }' \
+ -e 'for $i ("!","\"","#","\$","%","&","*",";","<","=",">","\@",' \
+ -e ' "[","]","^","_","`","{","|","}") { $a[ord $i] |= 2; }' \
+ -e 'for $i ("a".."z","A".."Z","0".."9","+","/") { $a[ord $i] |= 4; }' \
+ -e 'for $i (0..127) { printf "%s%d,%s", $i%32?"":" ", $a[$i],' \
+ -e ' ($i+1)%32?"":"\n"; }'
+
+ */
+static const unsigned char utf7_ascii_properties[128] = {
+ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+ 3,2,2,2,2,2,2,1,1,1,2,4,1,1,1,5,5,5,5,5,5,5,5,5,5,5,1,2,2,2,2,1,
+ 2,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,2,0,2,2,2,
+ 2,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,2,2,2,0,0,
+};
+#define SET_D(c) ((c) >= 0 && (c) < 0x80 && (utf7_ascii_properties[(c)] & 1))
+#define SET_O(c) ((c) >= 0 && (c) < 0x80 && (utf7_ascii_properties[(c)] & 2))
+#define SET_B(c) ((c) >= 0 && (c) < 0x80 && (utf7_ascii_properties[(c)] & 4))
+
+#define base64_value(c) ( (c) >= 'A' && (c) <= 'Z' ? (c) - 'A' : \
+ (c) >= 'a' && (c) <= 'z' ? (c) - 'a' + 26 : \
+ (c) >= '0' && (c) <= '9' ? (c) - '0' + 52 : \
+ (c) == '+' ? 62 : 63 )
+
+static const char *const base64_chars =
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+
+static void read_utf7(charset_spec const *charset, long int input_chr,
+ charset_state *state,
+ void (*emit)(void *ctx, long int output), void *emitctx)
+{
+ long int hw;
+
+ UNUSEDARG(charset);
+
+ /*
+ * state->s0 is used to handle the conversion of the UTF-7
+ * transport format into a stream of halfwords. Its layout is:
+ *
+ * - In normal ASCII mode, it is zero.
+ *
+ * - Otherwise, it holds a leading 1 followed by all the bits
+ * so far accumulated in base64 digits.
+ *
+ * - Special case: when we have only just seen the initial `+'
+ * which enters base64 mode, it is set to 2 rather than 1
+ * (this is an otherwise unused value since base64 always
+ * accumulates an even number of bits at a time), so that
+ * the special sequence `+-' can be made to encode `+'
+ * easily.
+ *
+ * state->s1 is used to handle the conversion of those
+ * halfwords into Unicode values. It contains a high surrogate
+ * value if we've just seen one, and 0 otherwise.
+ */
+
+ if (!state->s0) {
+ if (input_chr == '+')
+ state->s0 = 2;
+ else
+ emit(emitctx, input_chr);
+ return;
+ } else {
+ if (!SET_B(input_chr)) {
+ /*
+ * base64 mode ends here. Emit the character we have,
+ * unless it's a minus in which case we should swallow
+ * it.
+ */
+ if (input_chr != '-')
+ emit(emitctx, input_chr);
+ else if (state->s0 == 2)
+ emit(emitctx, '+'); /* special case */
+ state->s0 = 0;
+ return;
+ }
+
+ /*
+ * Now we have a base64 character, so add it to our state,
+ * first correcting the special case value of s0.
+ */
+ if (state->s0 == 2)
+ state->s0 = 1;
+ state->s0 = (state->s0 << 6) | base64_value(input_chr);
+ }
+
+ /*
+ * If we don't have a whole halfword at this point, bale out.
+ */
+ if (!(state->s0 & 0xFFFF0000))
+ return;
+
+ /*
+ * Otherwise, extract the halfword. There are three
+ * possibilities for where the top set bit might be.
+ */
+ if (state->s0 & 0x00100000) {
+ hw = (state->s0 >> 4) & 0xFFFF;
+ state->s0 = (state->s0 & 0xF) | 0x10;
+ } else if (state->s0 & 0x00040000) {
+ hw = (state->s0 >> 2) & 0xFFFF;
+ state->s0 = (state->s0 & 3) | 4;
+ } else {
+ hw = state->s0 & 0xFFFF;
+ state->s0 = 1;
+ }
+
+ /*
+ * Now what reaches this point should be a stream of halfwords
+ * in sensible numeric form. So now we process surrogates.
+ */
+ if (state->s1) {
+ /*
+ * We have already seen a high surrogate, so we expect a
+ * low surrogate. Whinge if we didn't get it.
+ */
+ if (hw < 0xDC00 || hw >= 0xE000) {
+ emit(emitctx, ERROR);
+ } else {
+ hw &= 0x3FF;
+ hw |= (state->s1 & 0x3FF) << 10;
+ emit(emitctx, hw + 0x10000);
+ }
+ state->s1 = 0;
+ } else {
+ /*
+ * Any low surrogate is an error.
+ */
+ if (hw >= 0xDC00 && hw < 0xE000) {
+ emit(emitctx, ERROR);
+ return;
+ }
+
+ /*
+ * Any high surrogate is simply stored until we see the
+ * next halfword.
+ */
+ if (hw >= 0xD800 && hw < 0xDC00) {
+ state->s1 = hw;
+ return;
+ }
+
+ /*
+ * Anything else we simply output.
+ */
+ emit(emitctx, hw);
+ }
+}
+
+/*
+ * For writing UTF-7, we supply two charset definitions, one of
+ * which will directly encode Set O characters and the other of
+ * which will cautiously base64 them.
+ */
+static int write_utf7(charset_spec const *charset, long int input_chr,
+ charset_state *state,
+ void (*emit)(void *ctx, long int output),
+ void *emitctx)
+{
+ unsigned long hws[2];
+ int nhws;
+ int i;
+
+ /*
+ * For writing: state->s0 contains accumulated base64 data with
+ * a 1 in front, and state->s1 indicates how many bits of it we
+ * have.
+ */
+
+ if ((input_chr >= 0xD800 && input_chr < 0xE000) ||
+ input_chr >= 0x110000) {
+ /*
+ * We can't output surrogates, or anything above 0x10FFFF.
+ */
+ return FALSE;
+ }
+
+ /*
+ * Look for characters which we output in ASCII mode. A special
+ * case here is +, which can be encoded as the empty base64
+ * escape sequence `+-': if we're _already_ in ASCII mode we do
+ * that, but if we're in base64 mode at the point we see the +
+ * then we simply stay in base64 mode and output it as a
+ * halfword. (Switching back would cost three bytes, whereas
+ * staying in base64 costs only 2 2/3.)
+ */
+ if (input_chr == -1 || SET_D(input_chr) ||
+ (charset->charset == CS_UTF7 && SET_O(input_chr)) ||
+ (!state->s0 && input_chr == '+')) {
+ if (state->s0) {
+ /*
+ * These characters are output in ASCII mode, so flush any
+ * lingering base64 data.
+ */
+ state->s0 <<= 6 - state->s1;
+ emit(emitctx, base64_chars[state->s0 & 0x3F]);
+ /*
+ * I'm going to arbitrarily decide to always use the
+ * terminating minus sign. It's easier than figuring out
+ * whether to do so or not, and looks prettier besides.
+ */
+ emit(emitctx, '-');
+ state->s0 = state->s1 = 0;
+ }
+
+ /*
+ * Now output the character.
+ */
+ if (input_chr != -1) /* special case: just reset state */
+ emit(emitctx, input_chr);
+ if (input_chr == '+')
+ emit(emitctx, '-'); /* +- encodes + */
+ return TRUE;
+ }
+
+ /*
+ * Now we know we have a character that needs to be output as
+ * either one base64-encoded halfword or two. So first figure
+ * out how many...
+ */
+ if (input_chr < 0x10000) {
+ nhws = 1;
+ hws[0] = input_chr;
+ } else {
+ input_chr -= 0x10000;
+ if (input_chr >= 0x100000) {
+ /* Anything above 0x10FFFF is outside UTF-7 range. */
+ return FALSE;
+ }
+
+ nhws = 2;
+ hws[0] = 0xD800 | ((input_chr >> 10) & 0x3FF);
+ hws[1] = 0xDC00 | (input_chr & 0x3FF);
+ }
+
+ /*
+ * ... switch into base64 mode if required ...
+ */
+ if (!state->s0) {
+ emit(emitctx, '+');
+ state->s0 = 1;
+ state->s1 = 0;
+ }
+
+ /*
+ * ... and do the base64 output.
+ */
+ for (i = 0; i < nhws; i++) {
+ state->s0 = (state->s0 << 16) | hws[i];
+ state->s1 += 16;
+
+ while (state->s1 >= 6) {
+ /*
+ * The top set bit must be in position 16, 18 or 20.
+ */
+ unsigned long out, topbit;
+
+ out = (state->s0 >> (state->s1 - 6)) & 0x3F;
+ state->s1 -= 6;
+ topbit = 1 << state->s1;
+ state->s0 = (state->s0 & (topbit-1)) | topbit;
+
+ emit(emitctx, base64_chars[out]);
+ }
+ }
+ return TRUE;
+}
+
+const charset_spec charset_CS_UTF7 = {
+ CS_UTF7, read_utf7, write_utf7, NULL
+};
+
+const charset_spec charset_CS_UTF7_CONSERVATIVE = {
+ CS_UTF7_CONSERVATIVE, read_utf7, write_utf7, NULL
+};
+
+#else /* ENUM_CHARSETS */
+
+ENUM_CHARSET(CS_UTF7)
+ENUM_CHARSET(CS_UTF7_CONSERVATIVE)
+
+#endif /* ENUM_CHARSETS */