/* Titlecase mapping for UTF-8/UTF-16/UTF-32 substrings (locale dependent). Copyright (C) 2009-2022 Free Software Foundation, Inc. Written by Bruno Haible , 2009. This file is free software. It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+". You can redistribute it and/or modify it under either - the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 3, or (at your option) any later version, or - the terms of the GNU General Public License as published by the Free Software Foundation; either version 2, or (at your option) any later version, or - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+". This file is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License and the GNU General Public License for more details. You should have received a copy of the GNU Lesser General Public License and of the GNU General Public License along with this program. If not, see . */ /* Quoting the Unicode standard, section "Default Case Algorithms": Find the word boundaries in X according to Unicode Standard Annex #29, “Text Boundaries.” For each word boundary, find the first cased character F following the word boundary. If F exists, map F to Titlecase_Mapping(F); then map all characters C between F and the following word boundary to Lowercase_Mapping(C). */ UNIT * FUNC (const UNIT *s, size_t n, casing_prefix_context_t prefix_context, casing_suffix_context_t suffix_context, const char *iso639_language, uninorm_t nf, UNIT *resultbuf, size_t *lengthp) { /* The result being accumulated. */ UNIT *result; size_t length; size_t allocated; /* An array containing the word break positions. */ char *wordbreaks; /* Initialize the accumulator. */ if (nf != NULL || resultbuf == NULL) { result = NULL; allocated = 0; } else { result = resultbuf; allocated = *lengthp; } length = 0; /* Initialize the word breaks array. */ if (n > 0) { wordbreaks = (char *) malloc (n); if (wordbreaks == NULL) { errno = ENOMEM; goto fail2; } U_WORDBREAKS (s, n, wordbreaks); } else wordbreaks = NULL; { const UNIT *s_end = s + n; const char *wp = wordbreaks; /* When considering the string as segmented by word boundaries: For each such segment: - In the first part, we are searching for the first cased character. In this state, in_word_first_part = true, and no conversion takes place. - In the second part, we are converting every character: the first among these characters to title case, the other ones to lower case. In this state, in_word_first_part = false. */ bool in_word_first_part = true; /* Helper for evaluating the FINAL_SIGMA condition: Last character that was not case-ignorable. */ ucs4_t last_char_except_ignorable = prefix_context.last_char_except_ignorable; /* Helper for evaluating the AFTER_SOFT_DOTTED and AFTER_I conditions: Last character that was of combining class 230 ("Above") or 0. */ ucs4_t last_char_normal_or_above = prefix_context.last_char_normal_or_above; while (s < s_end) { /* Fetch the next character. */ ucs4_t uc; int count = U_MBTOUC_UNSAFE (&uc, s, s_end - s); ucs4_t (*single_character_map) (ucs4_t); size_t offset_in_rule; /* offset in 'struct special_casing_rule' */ ucs4_t mapped_uc[3]; unsigned int mapped_count; if (*wp) /* Crossing a word boundary. */ in_word_first_part = true; /* Determine single_character_map, offset_in_rule. There are three possibilities: - uc should not be converted. - uc should be titlecased. - uc should be lowercased. */ if (in_word_first_part) { if (uc_is_cased (uc)) { /* uc is to be titlecased. */ single_character_map = uc_totitle; offset_in_rule = offsetof (struct special_casing_rule, title[0]); in_word_first_part = false; } else { /* uc is not converted. */ single_character_map = NULL; offset_in_rule = 0; } } else { /* uc is to be lowercased. */ single_character_map = uc_tolower; offset_in_rule = offsetof (struct special_casing_rule, lower[0]); } /* Actually map uc. */ if (single_character_map == NULL) { mapped_uc[0] = uc; mapped_count = 1; goto found_mapping; } if (uc < 0x10000) { /* Look first in the special-casing table. */ char code[3]; code[0] = (uc >> 8) & 0xff; code[1] = uc & 0xff; for (code[2] = 0; ; code[2]++) { const struct special_casing_rule *rule = gl_unicase_special_lookup (code, 3); if (rule == NULL) break; /* Test if the condition applies. */ /* Does the language apply? */ if (rule->language[0] == '\0' || (iso639_language != NULL && iso639_language[0] == rule->language[0] && iso639_language[1] == rule->language[1])) { /* Does the context apply? */ int context = rule->context; bool applies; if (context < 0) context = - context; switch (context) { case SCC_ALWAYS: applies = true; break; case SCC_FINAL_SIGMA: /* "Before" condition: preceded by a sequence consisting of a cased letter and a case-ignorable sequence. "After" condition: not followed by a sequence consisting of a case-ignorable sequence and then a cased letter. */ /* Test the "before" condition. */ applies = uc_is_cased (last_char_except_ignorable); /* Test the "after" condition. */ if (applies) { const UNIT *s2 = s + count; for (;;) { if (s2 < s_end) { ucs4_t uc2; int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2); /* Our uc_is_case_ignorable function is known to return false for all cased characters. So we can call uc_is_case_ignorable first. */ if (!uc_is_case_ignorable (uc2)) { applies = ! uc_is_cased (uc2); break; } s2 += count2; } else { applies = ! uc_is_cased (suffix_context.first_char_except_ignorable); break; } } } break; case SCC_AFTER_SOFT_DOTTED: /* "Before" condition: There is a Soft_Dotted character before it, with no intervening character of combining class 0 or 230 (Above). */ /* Test the "before" condition. */ applies = uc_is_property_soft_dotted (last_char_normal_or_above); break; case SCC_MORE_ABOVE: /* "After" condition: followed by a character of combining class 230 (Above) with no intervening character of combining class 0 or 230 (Above). */ /* Test the "after" condition. */ { const UNIT *s2 = s + count; applies = false; for (;;) { if (s2 < s_end) { ucs4_t uc2; int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2); int ccc = uc_combining_class (uc2); if (ccc == UC_CCC_A) { applies = true; break; } if (ccc == UC_CCC_NR) break; s2 += count2; } else { applies = ((suffix_context.bits & SCC_MORE_ABOVE_MASK) != 0); break; } } } break; case SCC_BEFORE_DOT: /* "After" condition: followed by COMBINING DOT ABOVE (U+0307). Any sequence of characters with a combining class that is neither 0 nor 230 may intervene between the current character and the combining dot above. */ /* Test the "after" condition. */ { const UNIT *s2 = s + count; applies = false; for (;;) { if (s2 < s_end) { ucs4_t uc2; int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2); if (uc2 == 0x0307) /* COMBINING DOT ABOVE */ { applies = true; break; } { int ccc = uc_combining_class (uc2); if (ccc == UC_CCC_A || ccc == UC_CCC_NR) break; } s2 += count2; } else { applies = ((suffix_context.bits & SCC_BEFORE_DOT_MASK) != 0); break; } } } break; case SCC_AFTER_I: /* "Before" condition: There is an uppercase I before it, and there is no intervening character of combining class 0 or 230 (Above). */ /* Test the "before" condition. */ applies = (last_char_normal_or_above == 'I'); break; default: abort (); } if (rule->context < 0) applies = !applies; if (applies) { /* The rule applies. Look up the mapping (0 to 3 characters). */ const unsigned short *mapped_in_rule = (const unsigned short *)((const char *)rule + offset_in_rule); if (mapped_in_rule[0] == 0) mapped_count = 0; else { mapped_uc[0] = mapped_in_rule[0]; if (mapped_in_rule[1] == 0) mapped_count = 1; else { mapped_uc[1] = mapped_in_rule[1]; if (mapped_in_rule[2] == 0) mapped_count = 2; else { mapped_uc[2] = mapped_in_rule[2]; mapped_count = 3; } } } goto found_mapping; } } /* Optimization: Save a hash table lookup in the next round. */ if (!rule->has_next) break; } } /* No special-cased mapping. So use the locale and context independent mapping. */ mapped_uc[0] = single_character_map (uc); mapped_count = 1; found_mapping: /* Found the mapping: uc maps to mapped_uc[0..mapped_count-1]. */ { unsigned int i; for (i = 0; i < mapped_count; i++) { ucs4_t muc = mapped_uc[i]; /* Append muc to the result accumulator. */ if (length < allocated) { int ret = U_UCTOMB (result + length, muc, allocated - length); if (ret == -1) { errno = EINVAL; goto fail1; } if (ret >= 0) { length += ret; goto done_appending; } } { size_t old_allocated = allocated; size_t new_allocated = 2 * old_allocated; if (new_allocated < 64) new_allocated = 64; if (new_allocated < old_allocated) /* integer overflow? */ abort (); { UNIT *larger_result; if (result == NULL) { larger_result = (UNIT *) malloc (new_allocated * sizeof (UNIT)); if (larger_result == NULL) { errno = ENOMEM; goto fail1; } } else if (result == resultbuf) { larger_result = (UNIT *) malloc (new_allocated * sizeof (UNIT)); if (larger_result == NULL) { errno = ENOMEM; goto fail1; } U_CPY (larger_result, resultbuf, length); } else { larger_result = (UNIT *) realloc (result, new_allocated * sizeof (UNIT)); if (larger_result == NULL) { errno = ENOMEM; goto fail1; } } result = larger_result; allocated = new_allocated; { int ret = U_UCTOMB (result + length, muc, allocated - length); if (ret == -1) { errno = EINVAL; goto fail1; } if (ret < 0) abort (); length += ret; goto done_appending; } } } done_appending: ; } } if (!uc_is_case_ignorable (uc)) last_char_except_ignorable = uc; { int ccc = uc_combining_class (uc); if (ccc == UC_CCC_A || ccc == UC_CCC_NR) last_char_normal_or_above = uc; } s += count; wp += count; } } free (wordbreaks); if (nf != NULL) { /* Finally, normalize the result. */ UNIT *normalized_result; normalized_result = U_NORMALIZE (nf, result, length, resultbuf, lengthp); if (normalized_result == NULL) goto fail2; free (result); return normalized_result; } if (length == 0) { if (result == NULL) { /* Return a non-NULL value. NULL means error. */ result = (UNIT *) malloc (1); if (result == NULL) { errno = ENOMEM; goto fail2; } } } else if (result != resultbuf && length < allocated) { /* Shrink the allocated memory if possible. */ UNIT *memory; memory = (UNIT *) realloc (result, length * sizeof (UNIT)); if (memory != NULL) result = memory; } *lengthp = length; return result; fail1: { int saved_errno = errno; free (wordbreaks); errno = saved_errno; } fail2: if (result != resultbuf) { int saved_errno = errno; free (result); errno = saved_errno; } return NULL; } /* * Local Variables: * coding: utf-8 * End: */