summary | refs | log | tree | commit | diff
path: root/lib/uniwbrk/u-wordbreaks.h
diff options
context:
space:
mode:
Diffstat (limited to 'lib/uniwbrk/u-wordbreaks.h')
-rw-r--r-- lib/uniwbrk/u-wordbreaks.h | 59
1 files changed, 35 insertions, 24 deletions
diff --git a/lib/uniwbrk/u-wordbreaks.h b/lib/uniwbrk/u-wordbreaks.h
index 0d881c7..e8eb01a 100644
--- a/lib/uniwbrk/u-wordbreaks.h
+++ b/lib/uniwbrk/u-wordbreaks.h
@@ -1,28 +1,30 @@
/* Word breaks in UTF-8/UTF-16/UTF-32 strings. -*- coding: utf-8 -*-
- Copyright (C) 2009-2018 Free Software Foundation, Inc.
+ Copyright (C) 2009-2022 Free Software Foundation, Inc.
Written by Bruno Haible <bruno@clisp.org>, 2009.
- This program is free software: you can redistribute it and/or
- modify it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
- or
-
- * the GNU General Public License as published by the Free
- Software Foundation; either version 2 of the License, or (at your
- option) any later version.
-
- or both in parallel, as here.
- This program is distributed in the hope that it will be useful,
+ This file is free software.
+ It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+".
+ You can redistribute it and/or modify it under either
+ - the terms of the GNU Lesser General Public License as published
+ by the Free Software Foundation; either version 3, or (at your
+ option) any later version, or
+ - the terms of the GNU General Public License as published by the
+ Free Software Foundation; either version 2, or (at your option)
+ any later version, or
+ - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+".
+
+ This file is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
+ Lesser General Public License and the GNU General Public License
+ for more details.
- You should have received a copy of the GNU Lesser General Public License
- along with this program. If not, see <https://www.gnu.org/licenses/>. */
+ You should have received a copy of the GNU Lesser General Public
+ License and of the GNU General Public License along with this
+ program. If not, see <https://www.gnu.org/licenses/>. */
+
+/* This file implements section 4 "Word Boundaries"
+ of Unicode Standard Annex #29 <https://www.unicode.org/reports/tr29/>. */
void
FUNC (const UNIT *s, size_t n, char *p)
@@ -48,6 +50,8 @@ FUNC (const UNIT *s, size_t n, char *p)
-1 at the very beginning of the string. */
int secondlast_compchar_prop = -1;
+ /* Number of consecutive regional indicator (RI) characters seen
+ immediately before the current point. */
size_t ri_count = 0;
/* Don't break inside multibyte characters. */
@@ -74,11 +78,18 @@ FUNC (const UNIT *s, size_t n, char *p)
|| prop == WBP_NEWLINE))
*p = 1;
/* No break within emoji zwj sequence (WB3c). */
- else if (last_char_prop == WBP_ZWJ &&
- (prop == WBP_GAZ || prop == WBP_EBG))
+ else if (last_char_prop == WBP_ZWJ
+ && uc_is_property_extended_pictographic (uc))
+ /* *p = 0 */;
+ /* Keep horizontal whitespace together (WB3d). */
+ else if (last_char_prop == WBP_WSS && prop == WBP_WSS)
/* *p = 0 */;
- /* Ignore Format and Extend characters. */
- else if (!(prop == WBP_EXTEND || prop == WBP_FORMAT || prop == WBP_ZWJ))
+ /* Ignore Format and Extend characters (WB4). */
+ else if (prop == WBP_EXTEND
+ || prop == WBP_FORMAT
+ || prop == WBP_ZWJ)
+ /* *p = 0 */;
+ else
{
/* No break in these situations (see UAX #29):
@@ -147,7 +158,7 @@ FUNC (const UNIT *s, size_t n, char *p)
last_char_prop = prop;
/* Ignore Format and Extend characters, except at the
- start of the line. */
+ start of the line (WB4). */
if (last_compchar_prop < 0
|| last_compchar_prop == WBP_CR
|| last_compchar_prop == WBP_LF