diff options
Diffstat (limited to 'lib/uniwbrk/u-wordbreaks.h')
-rw-r--r-- | lib/uniwbrk/u-wordbreaks.h | 85 |
1 file changed, 57 insertions, 28 deletions
diff --git a/lib/uniwbrk/u-wordbreaks.h b/lib/uniwbrk/u-wordbreaks.h index b0fd301..cdeab0b 100644 --- a/lib/uniwbrk/u-wordbreaks.h +++ b/lib/uniwbrk/u-wordbreaks.h @@ -1,5 +1,5 @@ /* Word breaks in UTF-8/UTF-16/UTF-32 strings. - Copyright (C) 2009-2010 Free Software Foundation, Inc. + Copyright (C) 2009-2015 Free Software Foundation, Inc. Written by Bruno Haible <bruno@clisp.org>, 2009. This program is free software: you can redistribute it and/or modify it @@ -55,16 +55,12 @@ FUNC (const UNIT *s, size_t n, char *p) if (last_char_prop == WBP_CR && prop == WBP_LF) /* *p = 0 */; /* Break before and after newlines. */ - else if (last_char_prop >= WBP_NEWLINE - /* same as: - last_char_prop == WBP_CR - || last_char_prop == WBP_LF - || last_char_prop == WBP_NEWLINE */ - || prop >= WBP_NEWLINE - /* same as: - prop == WBP_CR - || prop == WBP_LF - || prop == WBP_NEWLINE */) + else if ((last_char_prop == WBP_CR + || last_char_prop == WBP_LF + || last_char_prop == WBP_NEWLINE) + || (prop == WBP_CR + || prop == WBP_LF + || prop == WBP_NEWLINE)) *p = 1; /* Ignore Format and Extend characters. 
*/ else if (!(prop == WBP_EXTEND || prop == WBP_FORMAT)) @@ -73,38 +69,66 @@ FUNC (const UNIT *s, size_t n, char *p) secondlast last current - ALetter (MidLetter | MidNumLet) × ALetter (WB7) - ALetter × (MidLetter | MidNumLet) ALetter (WB6) - Numeric (MidNum | MidNumLet) × Numeric (WB11) - Numeric × (MidNum | MidNumLet) Numeric (WB12) - ALetter × ALetter (WB5) - ALetter × Numeric (WB9) - Numeric × ALetter (WB10) + (ALetter | HL) (MidLetter | MidNumLet | SQ) × (ALetter | HL) (WB7) + (ALetter | HL) × (MidLetter | MidNumLet | SQ) (ALetter | HL) (WB6) + Numeric (MidNum | MidNumLet | SQ) × Numeric (WB11) + Numeric × (MidNum | MidNumLet | SQ) Numeric (WB12) + HL × DQ HL (WB7b) + HL DQ × HL (WB7c) + (ALetter | HL) × (ALetter | HL) (WB5) + (ALetter | HL) × Numeric (WB9) + Numeric × (ALetter | HL) (WB10) Numeric × Numeric (WB8) + HL × SQ (WB7a) Katakana × Katakana (WB13) - (ALetter | Numeric | Katakana) × ExtendNumLet (WB13a) + (ALetter | HL | Numeric | Katakana) × ExtendNumLet (WB13a) ExtendNumLet × ExtendNumLet (WB13a) - ExtendNumLet × (ALetter | Numeric | Katakana) (WB13b) + ExtendNumLet × (ALetter | HL | Numeric | Katakana) (WB13b) + Regional_Indicator × Regional_Indicator (WB13c) */ /* No break across certain punctuation. Also, disable word breaks that were recognized earlier (due to lookahead of only one complex character). 
*/ - if ((prop == WBP_ALETTER + if (((prop == WBP_ALETTER + || prop == WBP_HL) && (last_compchar_prop == WBP_MIDLETTER - || last_compchar_prop == WBP_MIDNUMLET) - && secondlast_compchar_prop == WBP_ALETTER) + || last_compchar_prop == WBP_MIDNUMLET + || last_compchar_prop == WBP_SQ) + && (secondlast_compchar_prop == WBP_ALETTER + || secondlast_compchar_prop == WBP_HL)) || (prop == WBP_NUMERIC && (last_compchar_prop == WBP_MIDNUM - || last_compchar_prop == WBP_MIDNUMLET) - && secondlast_compchar_prop == WBP_NUMERIC)) + || last_compchar_prop == WBP_MIDNUMLET + || last_compchar_prop == WBP_SQ) + && secondlast_compchar_prop == WBP_NUMERIC) + || (prop == WBP_HL + && last_compchar_prop == WBP_DQ + && secondlast_compchar_prop == WBP_HL)) { *last_compchar_ptr = 0; /* *p = 0; */ } + /* Break after Format and Extend characters. */ + else if (last_compchar_prop == WBP_EXTEND + || last_compchar_prop == WBP_FORMAT) + *p = 1; else { + /* Normalize property value to table index, + skipping 5 properties: WBP_EXTEND, + WBP_FORMAT, WBP_NEWLINE, WBP_CR, and + WBP_LF. */ + int last_compchar_prop_index = last_compchar_prop; + int prop_index = prop; + + if (last_compchar_prop_index >= WBP_EXTEND) + last_compchar_prop_index -= 5; + + if (prop_index >= WBP_EXTEND) + prop_index -= 5; + /* Perform a single table lookup. */ - if (uniwbrk_table[last_compchar_prop][prop]) + if (uniwbrk_table[last_compchar_prop_index][prop_index]) *p = 1; /* else *p = 0; */ } @@ -112,8 +136,13 @@ FUNC (const UNIT *s, size_t n, char *p) } last_char_prop = prop; - /* Ignore Format and Extend characters, except at the start of the string. */ - if (last_compchar_prop < 0 || !(prop == WBP_EXTEND || prop == WBP_FORMAT)) + /* Ignore Format and Extend characters, except at the start + of the line. 
*/ + if (last_compchar_prop < 0 + || last_compchar_prop == WBP_CR + || last_compchar_prop == WBP_LF + || last_compchar_prop == WBP_NEWLINE + || !(prop == WBP_EXTEND || prop == WBP_FORMAT)) { secondlast_compchar_prop = last_compchar_prop; last_compchar_prop = prop; |