From a76fa337cc657dbe669ffb8dbdac606d4d6616f1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=B6rg=20Frings-F=C3=BCrst?=
Date: Wed, 31 Aug 2016 03:42:05 +0200
Subject: Imported Upstream version 6.1.0

---
 src/utf8.c | 36 ++++++++++++++++++++++++++++++++----
 1 file changed, 32 insertions(+), 4 deletions(-)

(limited to 'src/utf8.c')

diff --git a/src/utf8.c b/src/utf8.c
index b78e7eb..219b7ea 100644
--- a/src/utf8.c
+++ b/src/utf8.c
@@ -29,7 +29,7 @@
 
 #include "regenc.h"
 
-#define USE_INVALID_CODE_SCHEME
+//#define USE_INVALID_CODE_SCHEME
 
 #ifdef USE_INVALID_CODE_SCHEME
 /* virtual codepoint values for invalid encoding byte 0xfe and 0xff */
@@ -39,6 +39,7 @@
 #endif
 
 #define utf8_islead(c)     ((UChar )((c) & 0xc0) != 0x80)
+#define utf8_istail(c)     ((UChar )((c) & 0xc0) == 0x80)
 
 static const int EncLen_UTF8[] = {
   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
@@ -65,6 +66,30 @@ mbc_enc_len(const UChar* p)
   return EncLen_UTF8[*p];
 }
 
+static int
+is_valid_mbc_string(const UChar* p, const UChar* end)
+{
+  int i, len;
+
+  while (p < end) {
+    if (! utf8_islead(*p))
+      return FALSE;
+
+    len = mbc_enc_len(p++);
+    if (len > 1) {
+      for (i = 1; i < len; i++) {
+        if (p == end)
+          return FALSE;
+
+        if (! utf8_istail(*p++))
+          return FALSE;
+      }
+    }
+  }
+
+  return TRUE;
+}
+
 static int
 is_mbc_newline(const UChar* p, const UChar* end)
 {
@@ -91,12 +116,14 @@ is_mbc_newline(const UChar* p, const UChar* end)
 }
 
 static OnigCodePoint
-mbc_to_code(const UChar* p, const UChar* end ARG_UNUSED)
+mbc_to_code(const UChar* p, const UChar* end)
 {
   int c, len;
   OnigCodePoint n;
 
-  len = enclen(ONIG_ENCODING_UTF8, p);
+  len = mbc_enc_len(p);
+  if (len > end - p) len = end - p;
+
   c = *p++;
   if (len > 1) {
     len--;
@@ -303,5 +330,6 @@ OnigEncodingType OnigEncodingUTF8 = {
   left_adjust_char_head,
   onigenc_always_true_is_allowed_reverse_match,
   NULL, /* init */
-  NULL  /* is_initialized */
+  NULL, /* is_initialized */
+  is_valid_mbc_string
 };
-- 
cgit v1.2.3