summary | refs | log | tree | commit | diff
path: root/lib/uniname/uniname.c
diff options
context:
space:
mode:
Diffstat (limited to 'lib/uniname/uniname.c')
-rw-r--r-- lib/uniname/uniname.c 278
1 files changed, 85 insertions, 193 deletions
diff --git a/lib/uniname/uniname.c b/lib/uniname/uniname.c
index 2191f09..e4b81cc 100644
--- a/lib/uniname/uniname.c
+++ b/lib/uniname/uniname.c
@@ -1,5 +1,5 @@
/* Association between Unicode characters and their names.
- Copyright (C) 2000-2002, 2005-2007, 2009-2015 Free Software Foundation, Inc.
+ Copyright (C) 2000-2002, 2005-2007, 2009-2010 Free Software Foundation, Inc.
This program is free software: you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published
@@ -45,11 +45,10 @@
#define UNICODE_CHARNAME_WORD_CJK 417
#define UNICODE_CHARNAME_WORD_COMPATIBILITY 6107
static const uint16_t unicode_names[68940] = ...;
- static const struct { uint16_t index; uint32_t name:24; } unicode_name_to_index[16626] = ...;
- static const struct { uint16_t index; uint32_t name:24; } unicode_index_to_name[16626] = ...;
+ static const struct { uint16_t code; uint32_t name:24; } unicode_name_to_code[16626] = ...;
+ static const struct { uint16_t code; uint32_t name:24; } unicode_code_to_name[16626] = ...;
#define UNICODE_CHARNAME_MAX_LENGTH 83
#define UNICODE_CHARNAME_MAX_WORDS 13
- static const struct { uint32_t index; uint32_t gap; uint16_t length; } unicode_ranges[401] = ...;
*/
/* Returns the word with a given index. */
@@ -128,82 +127,6 @@ unicode_name_word_lookup (const char *word, unsigned int length)
return -1;
}
-#define UNINAME_INVALID_INDEX UINT16_MAX
-
-/* Looks up the internal index of a Unicode character. */
-static uint16_t
-unicode_code_to_index (ucs4_t c)
-{
- /* Binary search in unicode_ranges. */
- unsigned int i1 = 0;
- unsigned int i2 = SIZEOF (unicode_ranges);
-
- for (;;)
- {
- unsigned int i = (i1 + i2) >> 1;
- ucs4_t start_code =
- unicode_ranges[i].index + unicode_ranges[i].gap;
- ucs4_t end_code =
- start_code + unicode_ranges[i].length - 1;
-
- if (start_code <= c && c <= end_code)
- return c - unicode_ranges[i].gap;
-
- if (end_code < c)
- {
- if (i1 == i)
- break;
- /* Note here: i1 < i < i2. */
- i1 = i;
- }
- else if (c < start_code)
- {
- if (i2 == i)
- break;
- /* Note here: i1 <= i < i2. */
- i2 = i;
- }
- }
- return UNINAME_INVALID_INDEX;
-}
-
-/* Looks up the codepoint of a Unicode character, from the given
- internal index. */
-static ucs4_t
-unicode_index_to_code (uint16_t index)
-{
- /* Binary search in unicode_ranges. */
- unsigned int i1 = 0;
- unsigned int i2 = SIZEOF (unicode_ranges);
-
- for (;;)
- {
- unsigned int i = (i1 + i2) >> 1;
- uint16_t start_index = unicode_ranges[i].index;
- uint16_t end_index = start_index + unicode_ranges[i].length - 1;
-
- if (start_index <= index && index <= end_index)
- return index + unicode_ranges[i].gap;
-
- if (end_index < index)
- {
- if (i1 == i)
- break;
- /* Note here: i1 < i < i2. */
- i1 = i;
- }
- else if (index < start_index)
- {
- if (i2 == i)
- break;
- /* Note here: i1 <= i < i2. */
- i2 = i;
- }
- }
- return UNINAME_INVALID;
-}
-
-
/* Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book,
sections 3.11 and 4.4. */
static const char jamo_initial_short_name[19][3] =
@@ -278,59 +201,80 @@ unicode_character_name (ucs4_t c, char *buf)
*ptr = '\0';
return buf;
}
- else if ((c >= 0xFE00 && c <= 0xFE0F) || (c >= 0xE0100 && c <= 0xE01EF))
- {
- /* Special case for variation selectors. Keeps the tables
- small. */
-
- /* buf needs to have at least 19 + 3 bytes here. */
- sprintf (buf, "VARIATION SELECTOR-%d",
- c <= 0xFE0F ? c - 0xFE00 + 1 : c - 0xE0100 + 17);
- return buf;
- }
else
{
- uint16_t index = unicode_code_to_index (c);
- const uint16_t *words = NULL;
+ const uint16_t *words;
- if (index != UNINAME_INVALID_INDEX)
+ /* Transform the code so that it fits in 16 bits. */
+ switch (c >> 12)
{
- /* Binary search in unicode_code_to_name. */
- unsigned int i1 = 0;
- unsigned int i2 = SIZEOF (unicode_index_to_name);
- for (;;)
- {
- unsigned int i = (i1 + i2) >> 1;
- if (unicode_index_to_name[i].index == index)
- {
- words = &unicode_names[unicode_index_to_name[i].name];
- break;
- }
- else if (unicode_index_to_name[i].index < index)
- {
- if (i1 == i)
- {
- words = NULL;
- break;
- }
- /* Note here: i1 < i < i2. */
- i1 = i;
- }
- else if (unicode_index_to_name[i].index > index)
- {
- if (i2 == i)
- {
- words = NULL;
- break;
- }
- /* Note here: i1 <= i < i2. */
- i2 = i;
- }
- }
+ case 0x00: case 0x01: case 0x02: case 0x03: case 0x04:
+ break;
+ case 0x0A:
+ c -= 0x05000;
+ break;
+ case 0x0F:
+ c -= 0x09000;
+ break;
+ case 0x10:
+ c -= 0x09000;
+ break;
+ case 0x12:
+ c -= 0x0A000;
+ break;
+ case 0x1D:
+ c -= 0x14000;
+ break;
+ case 0x1F:
+ c -= 0x15000;
+ break;
+ case 0x2F:
+ c -= 0x24000;
+ break;
+ case 0xE0:
+ c -= 0xD4000;
+ break;
+ default:
+ return NULL;
}
+
+ {
+ /* Binary search in unicode_code_to_name. */
+ unsigned int i1 = 0;
+ unsigned int i2 = SIZEOF (unicode_code_to_name);
+ for (;;)
+ {
+ unsigned int i = (i1 + i2) >> 1;
+ if (unicode_code_to_name[i].code == c)
+ {
+ words = &unicode_names[unicode_code_to_name[i].name];
+ break;
+ }
+ else if (unicode_code_to_name[i].code < c)
+ {
+ if (i1 == i)
+ {
+ words = NULL;
+ break;
+ }
+ /* Note here: i1 < i < i2. */
+ i1 = i;
+ }
+ else if (unicode_code_to_name[i].code > c)
+ {
+ if (i2 == i)
+ {
+ words = NULL;
+ break;
+ }
+ /* Note here: i1 <= i < i2. */
+ i2 = i;
+ }
+ }
+ }
if (words != NULL)
{
- /* Found it in unicode_index_to_name. Now concatenate the words. */
+ /* Found it in unicode_code_to_name. Now concatenate the words. */
/* buf needs to have at least UNICODE_CHARNAME_MAX_LENGTH bytes. */
char *ptr = buf;
for (;;)
@@ -375,37 +319,6 @@ unicode_name_character (const char *name)
if (false)
filled_buf:
{
- {
- /* Special case for variation selector aliases. Keeps the
- tables small. */
- const char *p1 = buf;
- if (ptr >= buf + 3 && *p1++ == 'V')
- {
- if (*p1++ == 'S')
- {
- if (*p1 != '0')
- {
- unsigned int c = 0;
- for (;;)
- {
- if (*p1 >= '0' && *p1 <= '9')
- c += (*p1 - '0');
- p1++;
- if (p1 == ptr)
- {
- if (c >= 1 && c <= 16)
- return c - 1 + 0xFE00;
- else if (c >= 17 && c <= 256)
- return c - 17 + 0xE0100;
- else
- break;
- }
- c = c * 10;
- }
- }
- }
- }
- }
/* Convert the constituents to uint16_t words. */
uint16_t words[UNICODE_CHARNAME_MAX_WORDS];
uint16_t *wordptr = words;
@@ -537,38 +450,6 @@ unicode_name_character (const char *name)
}
}
}
- /* Special case for variation selectors. Keeps the
- tables small. */
- if (wordptr == &words[1]
- && words[0] == UNICODE_CHARNAME_WORD_VARIATION
- && p1 + 10 <= ptr
- && p1 + 12 >= ptr
- && memcmp (p1, "SELECTOR-", 9) == 0)
- {
- const char *p2 = p1 + 9;
-
- if (*p2 != '0')
- {
- unsigned int c = 0;
-
- for (;;)
- {
- if (*p2 >= '0' && *p2 <= '9')
- c += (*p2 - '0');
- p2++;
- if (p2 == ptr)
- {
- if (c >= 1 && c <= 16)
- return c - 1 + 0xFE00;
- else if (c >= 17 && c <= 256)
- return c - 17 + 0xE0100;
- else
- break;
- }
- c = c * 10;
- }
- }
- }
}
}
if (false)
@@ -582,15 +463,15 @@ unicode_name_character (const char *name)
for (; --i >= 0; )
words[i] = 2 * words[i] + 1;
}
- /* Binary search in unicode_name_to_index. */
+ /* Binary search in unicode_name_to_code. */
{
unsigned int i1 = 0;
- unsigned int i2 = SIZEOF (unicode_name_to_index);
+ unsigned int i2 = SIZEOF (unicode_name_to_code);
for (;;)
{
unsigned int i = (i1 + i2) >> 1;
const uint16_t *w = words;
- const uint16_t *p = &unicode_names[unicode_name_to_index[i].name];
+ const uint16_t *p = &unicode_names[unicode_name_to_code[i].name];
unsigned int n = words_length;
for (;;)
{
@@ -612,7 +493,18 @@ unicode_name_character (const char *name)
}
p++; w++; n--;
if (n == 0)
- return unicode_index_to_code (unicode_name_to_index[i].index);
+ {
+ unsigned int c = unicode_name_to_code[i].code;
+
+ /* Undo the transformation to 16-bit space. */
+ static const unsigned int offset[13] =
+ {
+ 0x00000, 0x00000, 0x00000, 0x00000, 0x00000,
+ 0x05000, 0x09000, 0x09000, 0x0A000, 0x14000,
+ 0x15000, 0x24000, 0xD4000
+ };
+ return c + offset[c >> 12];
+ }
}
}
}