summary | refs | log | tree | commit | diff
path: root/src/unicode.c
diff options
context:
space:
mode:
author Jörg Frings-Fürst <debian@jff.email> 2020-04-20 20:34:10 +0200
committer Jörg Frings-Fürst <debian@jff.email> 2020-04-20 20:34:10 +0200
commit f3d6e46ce3762b6f51a166119d3982fd3715507a (patch)
tree 0935fb6da7f1d9728b42ddf08395a0e977e1c228 /src/unicode.c
parent 043fff5b6f2461aeccb1c62cb771826cfe301832 (diff)
parent 73c6133c32cddae59813cbadf655cb50a3a7356a (diff)
Merge branch 'feature/upstream' into develop
Diffstat (limited to 'src/unicode.c')
-rw-r--r-- src/unicode.c 253
1 files changed, 159 insertions, 94 deletions
diff --git a/src/unicode.c b/src/unicode.c
index 474436a..080da74 100644
--- a/src/unicode.c
+++ b/src/unicode.c
@@ -278,9 +278,12 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc,
OnigCaseFoldType flag, const OnigUChar* p, const OnigUChar* end,
OnigCaseFoldCodeItem items[])
{
- int n, m, i, j, k, len;
- OnigCodePoint code, codes[3];
- const struct ByUnfoldKey* buk;
+ int n, m, i, j, k, len, lens[3];
+ int index;
+ int fn, ncs[3];
+ OnigCodePoint cs[3][4];
+ OnigCodePoint code, codes[3], orig_codes[3];
+ const struct ByUnfoldKey* buk1;
n = 0;
@@ -316,38 +319,161 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc,
}
#endif
- buk = onigenc_unicode_unfold_key(code);
- if (buk != 0) {
- if (buk->fold_len == 1) {
+ orig_codes[0] = code;
+ lens[0] = len;
+ p += len;
+
+ buk1 = onigenc_unicode_unfold_key(orig_codes[0]);
+ if (buk1 != 0 && buk1->fold_len == 1) {
+ codes[0] = *FOLDS1_FOLD(buk1->index);
+ }
+ else
+ codes[0] = orig_codes[0];
+
+ if ((flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) == 0)
+ goto fold1;
+
+ if (p < end) {
+ const struct ByUnfoldKey* buk;
+
+ code = ONIGENC_MBC_TO_CODE(enc, p, end);
+ orig_codes[1] = code;
+ len = enclen(enc, p);
+ lens[1] = lens[0] + len;
+ buk = onigenc_unicode_unfold_key(orig_codes[1]);
+ if (buk != 0 && buk->fold_len == 1) {
+ codes[1] = *FOLDS1_FOLD(buk->index);
+ }
+ else
+ codes[1] = orig_codes[1];
+
+ p += len;
+ if (p < end) {
+ code = ONIGENC_MBC_TO_CODE(enc, p, end);
+ orig_codes[2] = code;
+ len = enclen(enc, p);
+ lens[2] = lens[1] + len;
+ buk = onigenc_unicode_unfold_key(orig_codes[2]);
+ if (buk != 0 && buk->fold_len == 1) {
+ codes[2] = *FOLDS1_FOLD(buk->index);
+ }
+ else
+ codes[2] = orig_codes[2];
+
+ index = onigenc_unicode_fold3_key(codes);
+ if (index >= 0) {
+ m = FOLDS3_UNFOLDS_NUM(index);
+ for (i = 0; i < m; i++) {
+ items[n].byte_len = lens[2];
+ items[n].code_len = 1;
+ items[n].code[0] = FOLDS3_UNFOLDS(index)[i];
+ n++;
+ }
+
+ for (fn = 0; fn < 3; fn++) {
+ int sindex;
+ cs[fn][0] = FOLDS3_FOLD(index)[fn];
+ ncs[fn] = 1;
+ sindex = onigenc_unicode_fold1_key(&cs[fn][0]);
+ if (sindex >= 0) {
+ int m = FOLDS1_UNFOLDS_NUM(sindex);
+ for (i = 0; i < m; i++) {
+ cs[fn][i+1] = FOLDS1_UNFOLDS(sindex)[i];
+ }
+ ncs[fn] += m;
+ }
+ }
+
+ for (i = 0; i < ncs[0]; i++) {
+ for (j = 0; j < ncs[1]; j++) {
+ for (k = 0; k < ncs[2]; k++) {
+ items[n].byte_len = lens[2];
+ items[n].code_len = 3;
+ items[n].code[0] = cs[0][i];
+ items[n].code[1] = cs[1][j];
+ items[n].code[2] = cs[2][k];
+ if (items[n].code[0] == orig_codes[0] &&
+ items[n].code[1] == orig_codes[1] &&
+ items[n].code[2] == orig_codes[2])
+ continue;
+ n++;
+ }
+ }
+ }
+
+ return n;
+ }
+ }
+
+ index = onigenc_unicode_fold2_key(codes);
+ if (index >= 0) {
+ m = FOLDS2_UNFOLDS_NUM(index);
+ for (i = 0; i < m; i++) {
+ items[n].byte_len = lens[1];
+ items[n].code_len = 1;
+ items[n].code[0] = FOLDS2_UNFOLDS(index)[i];
+ n++;
+ }
+
+ for (fn = 0; fn < 2; fn++) {
+ int sindex;
+ cs[fn][0] = FOLDS2_FOLD(index)[fn];
+ ncs[fn] = 1;
+ sindex = onigenc_unicode_fold1_key(&cs[fn][0]);
+ if (sindex >= 0) {
+ int m = FOLDS1_UNFOLDS_NUM(sindex);
+ for (i = 0; i < m; i++) {
+ cs[fn][i+1] = FOLDS1_UNFOLDS(sindex)[i];
+ }
+ ncs[fn] += m;
+ }
+ }
+
+ for (i = 0; i < ncs[0]; i++) {
+ for (j = 0; j < ncs[1]; j++) {
+ items[n].byte_len = lens[1];
+ items[n].code_len = 2;
+ items[n].code[0] = cs[0][i];
+ items[n].code[1] = cs[1][j];
+ if (items[n].code[0] == orig_codes[0] &&
+ items[n].code[1] == orig_codes[1])
+ continue;
+ n++;
+ }
+ }
+
+ return n;
+ }
+ }
+
+ fold1:
+ if (buk1 != 0) {
+ if (buk1->fold_len == 1) {
int un;
- items[0].byte_len = len;
+ items[0].byte_len = lens[0];
items[0].code_len = 1;
- items[0].code[0] = *FOLDS1_FOLD(buk->index);
+ items[0].code[0] = *FOLDS1_FOLD(buk1->index);
n++;
- un = FOLDS1_UNFOLDS_NUM(buk->index);
+ un = FOLDS1_UNFOLDS_NUM(buk1->index);
for (i = 0; i < un; i++) {
- OnigCodePoint unfold = FOLDS1_UNFOLDS(buk->index)[i];
- if (unfold != code) {
- items[n].byte_len = len;
+ OnigCodePoint unfold = FOLDS1_UNFOLDS(buk1->index)[i];
+ if (unfold != orig_codes[0]) {
+ items[n].byte_len = lens[0];
items[n].code_len = 1;
items[n].code[0] = unfold;
n++;
}
}
- code = items[0].code[0]; /* for multi-code to unfold search. */
}
else if ((flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) {
- OnigCodePoint cs[3][4];
- int fn, ncs[3];
-
- if (buk->fold_len == 2) {
- m = FOLDS2_UNFOLDS_NUM(buk->index);
+ if (buk1->fold_len == 2) {
+ m = FOLDS2_UNFOLDS_NUM(buk1->index);
for (i = 0; i < m; i++) {
- OnigCodePoint unfold = FOLDS2_UNFOLDS(buk->index)[i];
- if (unfold == code) continue;
+ OnigCodePoint unfold = FOLDS2_UNFOLDS(buk1->index)[i];
+ if (unfold == orig_codes[0]) continue;
- items[n].byte_len = len;
+ items[n].byte_len = lens[0];
items[n].code_len = 1;
items[n].code[0] = unfold;
n++;
@@ -355,7 +481,7 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc,
for (fn = 0; fn < 2; fn++) {
int index;
- cs[fn][0] = FOLDS2_FOLD(buk->index)[fn];
+ cs[fn][0] = FOLDS2_FOLD(buk1->index)[fn];
ncs[fn] = 1;
index = onigenc_unicode_fold1_key(&cs[fn][0]);
if (index >= 0) {
@@ -369,7 +495,7 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc,
for (i = 0; i < ncs[0]; i++) {
for (j = 0; j < ncs[1]; j++) {
- items[n].byte_len = len;
+ items[n].byte_len = lens[0];
items[n].code_len = 2;
items[n].code[0] = cs[0][i];
items[n].code[1] = cs[1][j];
@@ -378,12 +504,12 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc,
}
}
else { /* fold_len == 3 */
- m = FOLDS3_UNFOLDS_NUM(buk->index);
+ m = FOLDS3_UNFOLDS_NUM(buk1->index);
for (i = 0; i < m; i++) {
- OnigCodePoint unfold = FOLDS3_UNFOLDS(buk->index)[i];
- if (unfold == code) continue;
+ OnigCodePoint unfold = FOLDS3_UNFOLDS(buk1->index)[i];
+ if (unfold == orig_codes[0]) continue;
- items[n].byte_len = len;
+ items[n].byte_len = lens[0];
items[n].code_len = 1;
items[n].code[0] = unfold;
n++;
@@ -391,7 +517,7 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc,
for (fn = 0; fn < 3; fn++) {
int index;
- cs[fn][0] = FOLDS3_FOLD(buk->index)[fn];
+ cs[fn][0] = FOLDS3_FOLD(buk1->index)[fn];
ncs[fn] = 1;
index = onigenc_unicode_fold1_key(&cs[fn][0]);
if (index >= 0) {
@@ -406,7 +532,7 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc,
for (i = 0; i < ncs[0]; i++) {
for (j = 0; j < ncs[1]; j++) {
for (k = 0; k < ncs[2]; k++) {
- items[n].byte_len = len;
+ items[n].byte_len = lens[0];
items[n].code_len = 3;
items[n].code[0] = cs[0][i];
items[n].code[1] = cs[1][j];
@@ -416,17 +542,14 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc,
}
}
}
-
- /* multi char folded code is not head of another folded multi char */
- return n;
}
}
else {
- int index = onigenc_unicode_fold1_key(&code);
+ int index = onigenc_unicode_fold1_key(orig_codes);
if (index >= 0) {
int m = FOLDS1_UNFOLDS_NUM(index);
for (i = 0; i < m; i++) {
- items[n].byte_len = len;
+ items[n].byte_len = lens[0];
items[n].code_len = 1;
items[n].code[0] = FOLDS1_UNFOLDS(index)[i];
n++;
@@ -434,64 +557,6 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc,
}
}
- if ((flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) == 0)
- return n;
-
- p += len;
- if (p < end) {
- int clen;
- int index;
-
- codes[0] = code;
- code = ONIGENC_MBC_TO_CODE(enc, p, end);
-
- buk = onigenc_unicode_unfold_key(code);
- if (buk != 0 && buk->fold_len == 1) {
- codes[1] = *FOLDS1_FOLD(buk->index);
- }
- else
- codes[1] = code;
-
- clen = enclen(enc, p);
- len += clen;
-
- index = onigenc_unicode_fold2_key(codes);
- if (index >= 0) {
- m = FOLDS2_UNFOLDS_NUM(index);
- for (i = 0; i < m; i++) {
- items[n].byte_len = len;
- items[n].code_len = 1;
- items[n].code[0] = FOLDS2_UNFOLDS(index)[i];
- n++;
- }
- }
-
- p += clen;
- if (p < end) {
- code = ONIGENC_MBC_TO_CODE(enc, p, end);
- buk = onigenc_unicode_unfold_key(code);
- if (buk != 0 && buk->fold_len == 1) {
- codes[2] = *FOLDS1_FOLD(buk->index);
- }
- else
- codes[2] = code;
-
- clen = enclen(enc, p);
- len += clen;
-
- index = onigenc_unicode_fold3_key(codes);
- if (index >= 0) {
- m = FOLDS3_UNFOLDS_NUM(index);
- for (i = 0; i < m; i++) {
- items[n].byte_len = len;
- items[n].code_len = 1;
- items[n].code[0] = FOLDS3_UNFOLDS(index)[i];
- n++;
- }
- }
- }
- }
-
return n;
}
@@ -930,7 +995,7 @@ onigenc_egcb_is_break_position(OnigEncoding enc, UChar* p, UChar* prev,
#ifdef USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER
if (! ONIGENC_IS_UNICODE_ENCODING(enc)) {
- return from != 0x000d || to != 0x000a;
+ return from != 0x000d || to != NEWLINE_CODE;
}
btype = unicode_egcb_is_break_2code(from, to);
@@ -973,7 +1038,7 @@ onigenc_egcb_is_break_position(OnigEncoding enc, UChar* p, UChar* prev,
return 1;
#else
- return from != 0x000d || to != 0x000a;
+ return from != 0x000d || to != NEWLINE_CODE;
#endif /* USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER */
}