summary | refs | log | tree | commit | diff
path: root/src/utf8.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/utf8.c')
-rw-r--r-- src/utf8.c 33
1 files changed, 24 insertions, 9 deletions
diff --git a/src/utf8.c b/src/utf8.c
index 4d1f9ec..70c1503 100644
--- a/src/utf8.c
+++ b/src/utf8.c
@@ -2,7 +2,7 @@
utf8.c - Oniguruma (regular expression library)
**********************************************************************/
/*-
- * Copyright (c) 2002-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
+ * Copyright (c) 2002-2019 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -29,6 +29,9 @@
#include "regenc.h"
+/* U+0000 - U+10FFFF */
+#define USE_RFC3629_RANGE
+
/* #define USE_INVALID_CODE_SCHEME */
#ifdef USE_INVALID_CODE_SCHEME
@@ -57,7 +60,11 @@ static const int EncLen_UTF8[] = {
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+#ifdef USE_RFC3629_RANGE
4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
+#else
+ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1
+#endif
};
static int
@@ -78,11 +85,11 @@ is_valid_mbc_string(const UChar* p, const UChar* end)
len = mbc_enc_len(p++);
if (len > 1) {
for (i = 1; i < len; i++) {
- if (p == end)
- return FALSE;
+ if (p == end)
+ return FALSE;
- if (! utf8_istail(*p++))
- return FALSE;
+ if (! utf8_istail(*p++))
+ return FALSE;
}
}
}
@@ -153,8 +160,10 @@ code_to_mbclen(OnigCodePoint code)
else if ((code & 0xfffff800) == 0) return 2;
else if ((code & 0xffff0000) == 0) return 3;
else if ((code & 0xffe00000) == 0) return 4;
+#ifndef USE_RFC3629_RANGE
else if ((code & 0xfc000000) == 0) return 5;
else if ((code & 0x80000000) == 0) return 6;
+#endif
#ifdef USE_INVALID_CODE_SCHEME
else if (code == INVALID_CODE_FE) return 1;
else if (code == INVALID_CODE_FF) return 1;
@@ -188,6 +197,7 @@ code_to_mbc(OnigCodePoint code, UChar *buf)
*p++ = UTF8_TRAILS(code, 12);
*p++ = UTF8_TRAILS(code, 6);
}
+#ifndef USE_RFC3629_RANGE
else if ((code & 0xfc000000) == 0) {
*p++ = (UChar )(((code>>24) & 0x03) | 0xf8);
*p++ = UTF8_TRAILS(code, 18);
@@ -201,6 +211,7 @@ code_to_mbc(OnigCodePoint code, UChar *buf)
*p++ = UTF8_TRAILS(code, 12);
*p++ = UTF8_TRAILS(code, 6);
}
+#endif
#ifdef USE_INVALID_CODE_SCHEME
else if (code == INVALID_CODE_FE) {
*p = 0xfe;
@@ -222,7 +233,7 @@ code_to_mbc(OnigCodePoint code, UChar *buf)
static int
mbc_case_fold(OnigCaseFoldType flag, const UChar** pp,
- const UChar* end, UChar* fold)
+ const UChar* end, UChar* fold)
{
const UChar* p = *pp;
@@ -244,13 +255,13 @@ mbc_case_fold(OnigCaseFoldType flag, const UChar** pp,
}
else {
return onigenc_unicode_mbc_case_fold(ONIG_ENCODING_UTF8, flag,
- pp, end, fold);
+ pp, end, fold);
}
}
static int
get_ctype_code_range(OnigCtype ctype, OnigCodePoint *sb_out,
- const OnigCodePoint* ranges[])
+ const OnigCodePoint* ranges[])
{
*sb_out = 0x80;
return onigenc_unicode_ctype_code_range(ctype, ranges);
@@ -274,13 +285,17 @@ get_case_fold_codes_by_str(OnigCaseFoldType flag,
const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem items[])
{
return onigenc_unicode_get_case_fold_codes_by_str(ONIG_ENCODING_UTF8,
- flag, p, end, items);
+ flag, p, end, items);
}
OnigEncodingType OnigEncodingUTF8 = {
mbc_enc_len,
"UTF-8", /* name */
+#ifdef USE_RFC3629_RANGE
4, /* max enc length */
+#else
+ 6,
+#endif
1, /* min enc length */
onigenc_is_mbc_newline_0x0a,
mbc_to_code,