summary | refs | log | tree | commit | diff
path: root/lib/unilbrk/u8-possible-linebreaks.c
diff options
context:
space:
mode:
Diffstat (limited to 'lib/unilbrk/u8-possible-linebreaks.c')
-rw-r--r-- lib/unilbrk/u8-possible-linebreaks.c 314
1 files changed, 189 insertions, 125 deletions
diff --git a/lib/unilbrk/u8-possible-linebreaks.c b/lib/unilbrk/u8-possible-linebreaks.c
index 57fe491..923028e 100644
--- a/lib/unilbrk/u8-possible-linebreaks.c
+++ b/lib/unilbrk/u8-possible-linebreaks.c
@@ -1,33 +1,33 @@
/* Line breaking of UTF-8 strings.
- Copyright (C) 2001-2003, 2006-2018 Free Software Foundation, Inc.
+ Copyright (C) 2001-2003, 2006-2022 Free Software Foundation, Inc.
Written by Bruno Haible <bruno@clisp.org>, 2001.
- This program is free software: you can redistribute it and/or
- modify it under the terms of either:
+ This file is free software.
+ It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+".
+ You can redistribute it and/or modify it under either
+ - the terms of the GNU Lesser General Public License as published
+ by the Free Software Foundation; either version 3, or (at your
+ option) any later version, or
+ - the terms of the GNU General Public License as published by the
+ Free Software Foundation; either version 2, or (at your option)
+ any later version, or
+ - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+".
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
- or
-
- * the GNU General Public License as published by the Free
- Software Foundation; either version 2 of the License, or (at your
- option) any later version.
-
- or both in parallel, as here.
- This program is distributed in the hope that it will be useful,
+ This file is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
+ Lesser General Public License and the GNU General Public License
+ for more details.
- You should have received a copy of the GNU Lesser General Public License
- along with this program. If not, see <https://www.gnu.org/licenses/>. */
+ You should have received a copy of the GNU Lesser General Public
+ License and of the GNU General Public License along with this
+ program. If not, see <https://www.gnu.org/licenses/>. */
#include <config.h>
/* Specification. */
#include "unilbrk.h"
+#include "unilbrk/internal.h"
#include <stdlib.h>
#include <string.h>
@@ -36,142 +36,202 @@
#include "uniwidth/cjk.h"
#include "unistr.h"
+/* This file implements
+ Unicode Standard Annex #14 <https://www.unicode.org/reports/tr14/>. */
+
void
-u8_possible_linebreaks (const uint8_t *s, size_t n, const char *encoding, char *p)
+u8_possible_linebreaks_loop (const uint8_t *s, size_t n, const char *encoding,
+ int cr, char *p)
{
- int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL);
- const uint8_t *s_end = s + n;
- int last_prop = LBP_BK; /* line break property of last non-space character */
- char *seen_space = NULL; /* Was a space seen after the last non-space character? */
- char *seen_space2 = NULL; /* At least two spaces after the last non-space? */
+ if (n > 0)
+ {
+ int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID1 : LBP_AL);
+ const uint8_t *s_end = s + n;
+ int prev_prop = LBP_BK; /* line break property of last character */
+ int last_prop = LBP_BK; /* line break property of last non-space character */
+ char *seen_space = NULL; /* Was a space seen after the last non-space character? */
- /* Don't break inside multibyte characters. */
- memset (p, UC_BREAK_PROHIBITED, n);
+ /* Don't break inside multibyte characters. */
+ memset (p, UC_BREAK_PROHIBITED, n);
- while (s < s_end)
- {
- ucs4_t uc;
- int count = u8_mbtouc_unsafe (&uc, s, s_end - s);
- int prop = unilbrkprop_lookup (uc);
+ /* Number of consecutive regional indicator (RI) characters seen
+ immediately before the current point. */
+ size_t ri_count = 0;
- if (prop == LBP_BK)
+ do
{
- /* Mandatory break. */
- *p = UC_BREAK_MANDATORY;
- last_prop = LBP_BK;
- seen_space = NULL;
- seen_space2 = NULL;
- }
- else
- {
- char *q;
-
- /* Resolve property values whose behaviour is not fixed. */
- switch (prop)
- {
- case LBP_AI:
- /* Resolve ambiguous. */
- prop = LBP_AI_REPLACEMENT;
- break;
- case LBP_CB:
- /* This is arbitrary. */
- prop = LBP_ID;
- break;
- case LBP_SA:
- /* We don't handle complex scripts yet.
- Treat LBP_SA like LBP_XX. */
- case LBP_XX:
- /* This is arbitrary. */
- prop = LBP_AL;
- break;
- }
+ ucs4_t uc;
+ int count = u8_mbtouc_unsafe (&uc, s, s_end - s);
+ int prop = unilbrkprop_lookup (uc);
- /* Deal with spaces and combining characters. */
- q = p;
- if (prop == LBP_SP)
+ if (prop == LBP_BK || prop == LBP_LF || prop == LBP_CR)
{
- /* Don't break just before a space. */
- *p = UC_BREAK_PROHIBITED;
- seen_space2 = seen_space;
- seen_space = p;
- }
- else if (prop == LBP_ZW)
- {
- /* Don't break just before a zero-width space. */
- *p = UC_BREAK_PROHIBITED;
- last_prop = LBP_ZW;
+ /* (LB4,LB5,LB6) Mandatory break. */
+ *p = UC_BREAK_MANDATORY;
+ /* cr is either LBP_CR or -1. In the first case, recognize
+ a CR-LF sequence. */
+ if (prev_prop == cr && prop == LBP_LF)
+ p[-1] = UC_BREAK_CR_BEFORE_LF;
+ prev_prop = prop;
+ last_prop = LBP_BK;
seen_space = NULL;
- seen_space2 = NULL;
}
- else if (prop == LBP_CM)
+ else
{
- /* Don't break just before a combining character, except immediately after a
- zero-width space. */
- if (last_prop == LBP_ZW)
+ /* Resolve property values whose behaviour is not fixed. */
+ switch (prop)
{
- /* Break after zero-width space. */
- *p = UC_BREAK_POSSIBLE;
- /* A combining character turns a preceding space into LBP_ID. */
- last_prop = LBP_ID;
+ case LBP_AI:
+ /* Resolve ambiguous. */
+ prop = LBP_AI_REPLACEMENT;
+ break;
+ case LBP_CB:
+ /* This is arbitrary. */
+ prop = LBP_ID1;
+ break;
+ case LBP_SA:
+ /* We don't handle complex scripts yet.
+ Treat LBP_SA like LBP_XX. */
+ case LBP_XX:
+ /* This is arbitrary. */
+ prop = LBP_AL;
+ break;
}
- else
+
+ /* Deal with spaces and combining characters. */
+ if (prop == LBP_SP)
{
+ /* (LB7) Don't break just before a space. */
*p = UC_BREAK_PROHIBITED;
- /* A combining character turns a preceding space into LBP_ID. */
- if (seen_space != NULL)
- {
- q = seen_space;
- seen_space = seen_space2;
- prop = LBP_ID;
- goto lookup_via_table;
- }
+ seen_space = p;
}
- }
- else
- {
- lookup_via_table:
- /* prop must be usable as an index for table 7.3 of UTR #14. */
- if (!(prop >= 0 && prop < sizeof (unilbrk_table) / sizeof (unilbrk_table[0])))
- abort ();
-
- if (last_prop == LBP_BK)
+ else if (prop == LBP_ZW)
{
- /* Don't break at the beginning of a line. */
- *q = UC_BREAK_PROHIBITED;
+ /* (LB7) Don't break just before a zero-width space. */
+ *p = UC_BREAK_PROHIBITED;
+ last_prop = LBP_ZW;
+ seen_space = NULL;
}
- else if (last_prop == LBP_ZW)
+ else if (prop == LBP_CM || prop == LBP_ZWJ)
{
- /* Break after zero-width space. */
- *q = UC_BREAK_POSSIBLE;
+ /* (LB9) Don't break just before a combining character or
+ zero-width joiner, except immediately after a mandatory
+ break character, space, or zero-width space. */
+ if (last_prop == LBP_BK)
+ {
+ /* (LB4,LB5,LB6) Don't break at the beginning of a line. */
+ *p = UC_BREAK_PROHIBITED;
+ /* (LB10) Treat CM or ZWJ as AL. */
+ last_prop = LBP_AL;
+ seen_space = NULL;
+ }
+ else if (last_prop == LBP_ZW || seen_space != NULL)
+ {
+ /* (LB8) Break after zero-width space. */
+ /* (LB18) Break after spaces.
+ We do *not* implement the "legacy support for space
+ character as base for combining marks" because now the
+ NBSP CM sequence is recommended instead of SP CM. */
+ *p = UC_BREAK_POSSIBLE;
+ /* (LB10) Treat CM or ZWJ as AL. */
+ last_prop = LBP_AL;
+ seen_space = NULL;
+ }
+ else
+ {
+ /* Treat X CM as if it were X. */
+ *p = UC_BREAK_PROHIBITED;
+ }
}
else
{
- switch (unilbrk_table [last_prop] [prop])
+ /* prop must be usable as an index for table 7.3 of UTR #14. */
+ if (!(prop >= 0 && prop < sizeof (unilbrk_table) / sizeof (unilbrk_table[0])))
+ abort ();
+
+ if (last_prop == LBP_BK)
+ {
+ /* (LB4,LB5,LB6) Don't break at the beginning of a line. */
+ *p = UC_BREAK_PROHIBITED;
+ }
+ else if (last_prop == LBP_ZW)
+ {
+ /* (LB8) Break after zero-width space. */
+ *p = UC_BREAK_POSSIBLE;
+ }
+ else if (prev_prop == LBP_ZWJ)
{
- case D:
- *q = UC_BREAK_POSSIBLE;
- break;
- case I:
- *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
- break;
- case P:
- *q = UC_BREAK_PROHIBITED;
- break;
- default:
- abort ();
+ /* (LB8a) Don't break right after a zero-width joiner. */
+ *p = UC_BREAK_PROHIBITED;
}
+ else if (last_prop == LBP_RI && prop == LBP_RI)
+ {
+ /* (LB30a) Break between two regional indicator symbols
+ if and only if there are an even number of regional
+ indicators preceding the position of the break. */
+ *p = (seen_space != NULL || (ri_count % 2) == 0
+ ? UC_BREAK_POSSIBLE
+ : UC_BREAK_PROHIBITED);
+ }
+ else if (prev_prop == LBP_HL_BA)
+ {
+ /* (LB21a) Don't break after Hebrew + Hyphen/Break-After. */
+ *p = UC_BREAK_PROHIBITED;
+ }
+ else
+ {
+ switch (unilbrk_table [last_prop] [prop])
+ {
+ case D:
+ *p = UC_BREAK_POSSIBLE;
+ break;
+ case I:
+ *p = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
+ break;
+ case P:
+ *p = UC_BREAK_PROHIBITED;
+ break;
+ default:
+ abort ();
+ }
+ }
+ last_prop = prop;
+ seen_space = NULL;
}
- last_prop = prop;
- seen_space = NULL;
- seen_space2 = NULL;
+
+ prev_prop = (prev_prop == LBP_HL && (prop == LBP_HY || prop == LBP_BA)
+ ? LBP_HL_BA
+ : prop);
}
- }
- s += count;
- p += count;
+ if (prop == LBP_RI)
+ ri_count++;
+ else
+ ri_count = 0;
+
+ s += count;
+ p += count;
+ }
+ while (s < s_end);
}
}
+#undef u8_possible_linebreaks
+/* Original entry point, kept under its existing name and signature.
+   It forwards S, N, ENCODING and P unchanged to
+   u8_possible_linebreaks_loop and passes -1 as the CR argument.  The
+   loop only writes UC_BREAK_CR_BEFORE_LF when the previous character's
+   property equals that argument, so with -1 a CR-LF pair is not given
+   the special marker (this assumes -1 is never a valid line break
+   property value -- confirm against unilbrk/internal.h).  */
+void
+u8_possible_linebreaks (const uint8_t *s, size_t n, const char *encoding,
+ char *p)
+{
+ u8_possible_linebreaks_loop (s, n, encoding, -1, p);
+}
+/* Second-generation entry point with the same parameters as
+   u8_possible_linebreaks.  It forwards S, N, ENCODING and P unchanged
+   to u8_possible_linebreaks_loop and passes LBP_CR as the CR argument,
+   which enables the loop's CR-LF recognition: when an LF directly
+   follows a CR, the entry for the CR (p[-1] in the loop) is set to
+   UC_BREAK_CR_BEFORE_LF while the LF itself is still marked
+   UC_BREAK_MANDATORY.  */
+void
+u8_possible_linebreaks_v2 (const uint8_t *s, size_t n, const char *encoding,
+ char *p)
+{
+ u8_possible_linebreaks_loop (s, n, encoding, LBP_CR, p);
+}
+
#ifdef TEST
@@ -237,7 +297,7 @@ main (int argc, char * argv[])
char *breaks = malloc (length);
int i;
- u8_possible_linebreaks ((uint8_t *) input, length, "UTF-8", breaks);
+ u8_possible_linebreaks_v2 ((uint8_t *) input, length, "UTF-8", breaks);
for (i = 0; i < length; i++)
{
@@ -251,6 +311,10 @@ main (int argc, char * argv[])
/* U+21B2 (or U+21B5) in UTF-8 encoding */
putc (0xe2, stdout); putc (0x86, stdout); putc (0xb2, stdout);
break;
+ case UC_BREAK_CR_BEFORE_LF:
+ /* U+21E4 in UTF-8 encoding */
+ putc (0xe2, stdout); putc (0x87, stdout); putc (0xa4, stdout);
+ break;
case UC_BREAK_PROHIBITED:
break;
default: