summary refs log tree commit diff
path: root/src
diff options
context:
space:
mode:
author Jörg Frings-Fürst <debian@jff.email> 2021-04-26 17:40:21 +0200
committer Jörg Frings-Fürst <debian@jff.email> 2021-04-26 17:40:21 +0200
commit d3a83c35311ec631a46b59b66c38ef8d3a2a629a (patch)
tree 28fc4dd524fa29f712020b61e565ab47b1fefd8e /src
parent 77a04959299aa252579a98655e626d1b8f5f9f34 (diff)
parent 98f7065a3f7b386564840bb5b24b94f9335b2e97 (diff)
Update upstream source from tag 'upstream/6.9.7.1'
Update to upstream version '6.9.7.1' with Debian dir c2c92e088b7e91033d7f5bee51ac7827148eaf4b
Diffstat (limited to 'src')
-rw-r--r-- src/Makefile.windows 39
-rw-r--r-- src/cp1251.c 10
-rw-r--r-- src/gb18030.c 46
-rw-r--r-- src/iso8859_1.c 96
-rw-r--r-- src/iso8859_10.c 8
-rw-r--r-- src/iso8859_13.c 8
-rw-r--r-- src/iso8859_14.c 8
-rw-r--r-- src/iso8859_15.c 8
-rw-r--r-- src/iso8859_16.c 8
-rw-r--r-- src/iso8859_2.c 8
-rw-r--r-- src/iso8859_3.c 8
-rw-r--r-- src/iso8859_4.c 8
-rw-r--r-- src/iso8859_5.c 10
-rw-r--r-- src/iso8859_7.c 10
-rw-r--r-- src/iso8859_9.c 8
-rw-r--r-- src/koi8.c 10
-rw-r--r-- src/koi8_r.c 8
-rw-r--r-- src/oniguruma.h 32
-rw-r--r-- src/regcomp.c 1105
-rw-r--r-- src/regenc.c 19
-rw-r--r-- src/regenc.h 16
-rw-r--r-- src/regerror.c 6
-rw-r--r-- src/regexec.c 417
-rw-r--r-- src/regint.h 93
-rw-r--r-- src/regparse.c 408
-rw-r--r-- src/regparse.h 25
-rw-r--r-- src/regposix.c 4
-rw-r--r-- src/regsyntax.c 31
-rw-r--r-- src/unicode.c 95
29 files changed, 1658 insertions, 894 deletions
diff --git a/src/Makefile.windows b/src/Makefile.windows
index 11d6fd8..b637772 100644
--- a/src/Makefile.windows
+++ b/src/Makefile.windows
@@ -2,8 +2,9 @@
product_name = oniguruma
-TEST_DIR = $(ONIG_DIR)/../test
-WIN_DIR = $(ONIG_DIR)/../windows
+TEST_DIR = $(ONIG_DIR)/../test
+SAMPLE_DIR = $(ONIG_DIR)/../sample
+WIN_DIR = $(ONIG_DIR)/../windows
CPPFLAGS =
CFLAGS = -O2 -nologo /W3
@@ -15,6 +16,8 @@ ARDLL = cl
ARDLL_FLAGS = -nologo -LD $(LINKFLAGS) -dll
LINKFLAGS = -link -incremental:no -pdb:none
+SAMPLE_CFLAGS = $(CFLAGS) /I$(ONIG_DIR)
+
INSTALL = install -c
CP = copy
CC = cl
@@ -89,11 +92,6 @@ makeargs = $(MFLAGS) CPPFLAGS='$(CPPFLAGS)' CFLAGS='$(CFLAGS)' CC='$(CC)'
# targets
default: all
-setup:
- $(CP) ..\win32\config.h config.h
- $(CP) ..\win32\testc.c testc.c
-
-
all: $(libname) $(dllname)
$(libname): $(libobjs) $(encobjs)
@@ -155,7 +153,7 @@ $(BUILD_DIR)/unicode_fold1_key.obj: $(ONIG_DIR)/unicode_fold1_key.c $(ONIG_DIR)/
$(BUILD_DIR)/unicode_fold2_key.obj: $(ONIG_DIR)/unicode_fold2_key.c $(ONIG_DIR)/regenc.h $(BUILD_DIR)/config.h
$(BUILD_DIR)/unicode_fold3_key.obj: $(ONIG_DIR)/unicode_fold3_key.c $(ONIG_DIR)/regenc.h $(BUILD_DIR)/config.h
-all-test: test_syntax test_regset test_utf8 testc testp testu
+all-test: test_syntax test_regset test_utf8 test_options test_back testc testp testu
test_syntax: $(TEST_DIR)/test_syntax.c $(libname)
$(CC) -nologo /Fe:$@ /I. /I$(ONIG_DIR) /DONIG_EXTERN=extern /utf-8 $(TEST_DIR)/test_syntax.c $(libname)
@@ -166,6 +164,12 @@ test_regset: $(TEST_DIR)/test_regset.c $(libname)
test_utf8: $(TEST_DIR)/test_utf8.c $(libname)
$(CC) -nologo /Fe:$@ /I. /I$(ONIG_DIR) /DONIG_EXTERN=extern /utf-8 $(TEST_DIR)/test_utf8.c $(libname)
+test_options: $(TEST_DIR)/test_options.c $(libname)
+ $(CC) -nologo /Fe:$@ /I. /I$(ONIG_DIR) /DONIG_EXTERN=extern /utf-8 $(TEST_DIR)/test_options.c $(libname)
+
+test_back: $(TEST_DIR)/test_back.c $(libname)
+ $(CC) -nologo /Fe:$@ /I. /I$(ONIG_DIR) /DONIG_EXTERN=extern /utf-8 $(TEST_DIR)/test_back.c $(libname)
+
testc: $(WIN_DIR)/testc.c $(libname)
$(CC) -nologo /Fe:$@ /I. /I$(ONIG_DIR) /DONIG_EXTERN=extern $(WIN_DIR)/testc.c $(libname)
@@ -176,14 +180,17 @@ testu: $(TEST_DIR)/testu.c $(libname)
$(CC) -nologo /Fe:$@ /I. /I$(ONIG_DIR) /DONIG_EXTERN=extern $(TEST_DIR)/testu.c $(libname)
clean:
- del $(BUILD_DIR)\*.obj $(BUILD_DIR)\*.lib $(BUILD_DIR)\*.exp $(BUILD_DIR)\*.dll $(BUILD_DIR)\test_regset.exe $(BUILD_DIR)\test_syntax.exe $(BUILD_DIR)\test_utf8.exe $(BUILD_DIR)\testp.exe $(BUILD_DIR)\testc.exe $(BUILD_DIR)\testu.exe
+ del $(BUILD_DIR)\*.obj $(BUILD_DIR)\*.lib $(BUILD_DIR)\*.exp $(BUILD_DIR)\*.dll $(BUILD_DIR)\test_regset.exe $(BUILD_DIR)\test_syntax.exe $(BUILD_DIR)\test_utf8.exe $(BUILD_DIR)\test_options.exe $(BUILD_DIR)\test_back.exe $(BUILD_DIR)\testp.exe $(BUILD_DIR)\testc.exe $(BUILD_DIR)\testu.exe
samples: all
- $(CC) $(CFLAGS) -I. /Fe:simple $(ONIG_DIR)\sample\simple.c $(dlllib)
- $(CC) $(CFLAGS) -I. /Fe:posix $(ONIG_DIR)\sample\posix.c $(dlllib)
- $(CC) $(CFLAGS) -I. /Fe:names $(ONIG_DIR)\sample\names.c $(dlllib)
- $(CC) $(CFLAGS) -I. /Fe:listcap $(ONIG_DIR)\sample\listcap.c $(dlllib)
- $(CC) $(CFLAGS) -I. /Fe:sql $(ONIG_DIR)\sample\sql.c $(dlllib)
- $(CC) $(CFLAGS) -I. /Fe:encode $(ONIG_DIR)\sample\encode.c $(dlllib)
- $(CC) $(CFLAGS) -I. /Fe:syntax $(ONIG_DIR)\sample\syntax.c $(dlllib)
+ $(CC) $(SAMPLE_CFLAGS) /Fe:simple $(SAMPLE_DIR)\simple.c $(dlllib)
+ $(CC) $(SAMPLE_CFLAGS) /Fe:posix $(SAMPLE_DIR)\posix.c $(dlllib)
+ $(CC) $(SAMPLE_CFLAGS) /Fe:names $(SAMPLE_DIR)\names.c $(dlllib)
+ $(CC) $(SAMPLE_CFLAGS) /Fe:listcap $(SAMPLE_DIR)\listcap.c $(dlllib)
+ $(CC) $(SAMPLE_CFLAGS) /Fe:sql $(SAMPLE_DIR)\sql.c $(dlllib)
+ $(CC) $(SAMPLE_CFLAGS) /Fe:encode $(SAMPLE_DIR)\encode.c $(dlllib)
+ $(CC) $(SAMPLE_CFLAGS) /Fe:syntax $(SAMPLE_DIR)\syntax.c $(dlllib)
+ $(CC) $(SAMPLE_CFLAGS) /Fe:count $(SAMPLE_DIR)\count.c $(dlllib)
+ $(CC) $(SAMPLE_CFLAGS) /Fe:regset $(SAMPLE_DIR)\regset.c $(dlllib)
+ $(CC) $(SAMPLE_CFLAGS) /Fe:callback_each_match $(SAMPLE_DIR)\callback_each_match.c $(dlllib)
diff --git a/src/cp1251.c b/src/cp1251.c
index fa20780..36b36f6 100644
--- a/src/cp1251.c
+++ b/src/cp1251.c
@@ -2,7 +2,7 @@
cp1251.c - Oniguruma (regular expression library)
**********************************************************************/
/*-
- * Copyright (c) 2006-2019 Byte <byte AT mail DOT kna DOT ru>
+ * Copyright (c) 2006-2020 Byte <byte AT mail DOT kna DOT ru>
* K.Kosako
* All rights reserved.
*
@@ -105,12 +105,16 @@ static const unsigned short EncCP1251_CtypeTable[256] = {
};
static int
-cp1251_mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED,
+cp1251_mbc_case_fold(OnigCaseFoldType flag,
const UChar** pp, const UChar* end ARG_UNUSED, UChar* lower)
{
const UChar* p = *pp;
- *lower = ENC_CP1251_TO_LOWER_CASE(*p);
+ if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p))
+ *lower = ENC_CP1251_TO_LOWER_CASE(*p);
+ else
+ *lower = *p;
+
(*pp)++;
return 1;
}
diff --git a/src/gb18030.c b/src/gb18030.c
index 7409d3e..1da19b4 100644
--- a/src/gb18030.c
+++ b/src/gb18030.c
@@ -30,9 +30,11 @@
#include "regenc.h"
-#if 1
+/* #define DEBUG_GB18030 */
-#define DEBUG_GB18030(arg)
+#ifndef DEBUG_GB18030
+
+#define DEBUG_OUT(arg)
#else
@@ -43,7 +45,7 @@
/* for printf() */
#include "regint.h"
-#define DEBUG_GB18030(arg) printf arg
+#define DEBUG_OUT(arg) printf arg
#endif
@@ -177,8 +179,8 @@ gb18030_is_code_ctype(OnigCodePoint code, unsigned int ctype)
}
enum state {
- S_START,
- S_one_C2,
+ S_START = 0,
+ S_one_C2 = 1,
S_one_C4,
S_one_CM,
@@ -210,15 +212,43 @@ enum state {
S_odd_CM_even_C4CM,
};
+#ifdef DEBUG_GB18030
+static char* StateNames[] = {
+ "S_START",
+ "S_one_C2",
+ "S_one_C4",
+ "S_one_CM",
+ "S_odd_CM_one_CX",
+ "S_even_CM_one_CX",
+ "S_one_CMC4",
+ "S_odd_CMC4",
+ "S_one_C4_odd_CMC4",
+ "S_even_CMC4",
+ "S_one_C4_even_CMC4",
+ "S_odd_CM_odd_CMC4",
+ "S_even_CM_odd_CMC4",
+ "S_odd_CM_even_CMC4",
+ "S_even_CM_even_CMC4",
+ "S_odd_C4CM",
+ "S_one_CM_odd_C4CM",
+ "S_even_C4CM",
+ "S_one_CM_even_C4CM",
+ "S_even_CM_odd_C4CM",
+ "S_odd_CM_odd_C4CM",
+ "S_even_CM_even_C4CM",
+ "S_odd_CM_even_C4CM"
+};
+#endif
+
static UChar*
gb18030_left_adjust_char_head(const UChar* start, const UChar* s)
{
const UChar *p;
enum state state = S_START;
- DEBUG_GB18030(("----------------\n"));
+ DEBUG_OUT(("----------------\n"));
for (p = s; p >= start; p--) {
- DEBUG_GB18030(("state %d --(%02x)-->\n", state, *p));
+ DEBUG_OUT(("%5d: state %-19s (0x%02x)->\n", (int )(p - start), StateNames[state], *p));
switch (state) {
case S_START:
switch (GB18030_MAP[*p]) {
@@ -499,7 +529,7 @@ gb18030_left_adjust_char_head(const UChar* start, const UChar* s)
}
}
- DEBUG_GB18030(("state %d\n", state));
+ DEBUG_OUT(("state %-19s\n", StateNames[state]));
switch (state) {
case S_START: return (UChar *)(s - 0);
case S_one_C2: return (UChar *)(s - 0);
diff --git a/src/iso8859_1.c b/src/iso8859_1.c
index d75509e..2013e75 100644
--- a/src/iso8859_1.c
+++ b/src/iso8859_1.c
@@ -2,7 +2,7 @@
iso8859_1.c - Oniguruma (regular expression library)
**********************************************************************/
/*-
- * Copyright (c) 2002-2019 K.Kosako
+ * Copyright (c) 2002-2020 K.Kosako
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -114,7 +114,7 @@ apply_all_case_fold(OnigCaseFoldType flag,
}
static int
-get_case_fold_codes_by_str(OnigCaseFoldType flag ARG_UNUSED,
+get_case_fold_codes_by_str(OnigCaseFoldType flag,
const OnigUChar* p, const OnigUChar* end,
OnigCaseFoldCodeItem items[])
{
@@ -123,7 +123,8 @@ get_case_fold_codes_by_str(OnigCaseFoldType flag ARG_UNUSED,
if (0x41 <= *p && *p <= 0x5a) {
if (*p == LARGE_S && end > p + 1
- && (*(p+1) == LARGE_S || *(p+1) == SMALL_S)) { /* SS */
+ && (*(p+1) == LARGE_S || *(p+1) == SMALL_S)
+ && CASE_FOLD_IS_NOT_ASCII_ONLY(flag)) { /* SS */
ss_combination:
items[0].byte_len = 2;
items[0].code_len = 1;
@@ -152,7 +153,8 @@ get_case_fold_codes_by_str(OnigCaseFoldType flag ARG_UNUSED,
}
else if (0x61 <= *p && *p <= 0x7a) {
if (*p == SMALL_S && end > p + 1
- && (*(p+1) == SMALL_S || *(p+1) == LARGE_S)) { /* ss */
+ && (*(p+1) == SMALL_S || *(p+1) == LARGE_S)
+ && CASE_FOLD_IS_NOT_ASCII_ONLY(flag)) { /* ss */
goto ss_combination;
}
@@ -161,56 +163,58 @@ get_case_fold_codes_by_str(OnigCaseFoldType flag ARG_UNUSED,
items[0].code[0] = (OnigCodePoint )(*p - 0x20);
return 1;
}
- else if (0xc0 <= *p && *p <= 0xcf) {
- items[0].byte_len = 1;
- items[0].code_len = 1;
- items[0].code[0] = (OnigCodePoint )(*p + 0x20);
- return 1;
- }
- else if (0xd0 <= *p && *p <= 0xdf) {
- if (*p == 0xdf) {
+ else if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag)) {
+ if (0xc0 <= *p && *p <= 0xcf) {
items[0].byte_len = 1;
- items[0].code_len = 2;
- items[0].code[0] = (OnigCodePoint )'s';
- items[0].code[1] = (OnigCodePoint )'s';
+ items[0].code_len = 1;
+ items[0].code[0] = (OnigCodePoint )(*p + 0x20);
+ return 1;
+ }
+ else if (0xd0 <= *p && *p <= 0xdf) {
+ if (*p == 0xdf) {
+ items[0].byte_len = 1;
+ items[0].code_len = 2;
+ items[0].code[0] = (OnigCodePoint )'s';
+ items[0].code[1] = (OnigCodePoint )'s';
- items[1].byte_len = 1;
- items[1].code_len = 2;
- items[1].code[0] = (OnigCodePoint )'S';
- items[1].code[1] = (OnigCodePoint )'S';
+ items[1].byte_len = 1;
+ items[1].code_len = 2;
+ items[1].code[0] = (OnigCodePoint )'S';
+ items[1].code[1] = (OnigCodePoint )'S';
- items[2].byte_len = 1;
- items[2].code_len = 2;
- items[2].code[0] = (OnigCodePoint )'s';
- items[2].code[1] = (OnigCodePoint )'S';
+ items[2].byte_len = 1;
+ items[2].code_len = 2;
+ items[2].code[0] = (OnigCodePoint )'s';
+ items[2].code[1] = (OnigCodePoint )'S';
- items[3].byte_len = 1;
- items[3].code_len = 2;
- items[3].code[0] = (OnigCodePoint )'S';
- items[3].code[1] = (OnigCodePoint )'s';
+ items[3].byte_len = 1;
+ items[3].code_len = 2;
+ items[3].code[0] = (OnigCodePoint )'S';
+ items[3].code[1] = (OnigCodePoint )'s';
- return 4;
- }
- else if (*p != 0xd7) {
- items[0].byte_len = 1;
- items[0].code_len = 1;
- items[0].code[0] = (OnigCodePoint )(*p + 0x20);
- return 1;
+ return 4;
+ }
+ else if (*p != 0xd7) {
+ items[0].byte_len = 1;
+ items[0].code_len = 1;
+ items[0].code[0] = (OnigCodePoint )(*p + 0x20);
+ return 1;
+ }
}
- }
- else if (0xe0 <= *p && *p <= 0xef) {
- items[0].byte_len = 1;
- items[0].code_len = 1;
- items[0].code[0] = (OnigCodePoint )(*p - 0x20);
- return 1;
- }
- else if (0xf0 <= *p && *p <= 0xfe) {
- if (*p != 0xf7) {
+ else if (0xe0 <= *p && *p <= 0xef) {
items[0].byte_len = 1;
items[0].code_len = 1;
items[0].code[0] = (OnigCodePoint )(*p - 0x20);
return 1;
}
+ else if (0xf0 <= *p && *p <= 0xfe) {
+ if (*p != 0xf7) {
+ items[0].byte_len = 1;
+ items[0].code_len = 1;
+ items[0].code[0] = (OnigCodePoint )(*p - 0x20);
+ return 1;
+ }
+ }
}
return 0;
@@ -229,7 +233,11 @@ mbc_case_fold(OnigCaseFoldType flag, const UChar** pp,
return 2;
}
- *lower = ONIGENC_ISO_8859_1_TO_LOWER_CASE(*p);
+ if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p))
+ *lower = ONIGENC_ISO_8859_1_TO_LOWER_CASE(*p);
+ else
+ *lower = *p;
+
(*pp)++;
return 1;
}
diff --git a/src/iso8859_10.c b/src/iso8859_10.c
index e98cffb..e4bf599 100644
--- a/src/iso8859_10.c
+++ b/src/iso8859_10.c
@@ -2,7 +2,7 @@
iso8859_10.c - Oniguruma (regular expression library)
**********************************************************************/
/*-
- * Copyright (c) 2002-2019 K.Kosako
+ * Copyright (c) 2002-2020 K.Kosako
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -116,7 +116,11 @@ mbc_case_fold(OnigCaseFoldType flag,
return 2;
}
- *lower = ENC_ISO_8859_10_TO_LOWER_CASE(*p);
+ if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p))
+ *lower = ENC_ISO_8859_10_TO_LOWER_CASE(*p);
+ else
+ *lower = *p;
+
(*pp)++;
return 1;
}
diff --git a/src/iso8859_13.c b/src/iso8859_13.c
index 2bd460f..dbf747f 100644
--- a/src/iso8859_13.c
+++ b/src/iso8859_13.c
@@ -2,7 +2,7 @@
iso8859_13.c - Oniguruma (regular expression library)
**********************************************************************/
/*-
- * Copyright (c) 2002-2019 K.Kosako
+ * Copyright (c) 2002-2020 K.Kosako
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -116,7 +116,11 @@ mbc_case_fold(OnigCaseFoldType flag,
return 2;
}
- *lower = ENC_ISO_8859_13_TO_LOWER_CASE(*p);
+ if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p))
+ *lower = ENC_ISO_8859_13_TO_LOWER_CASE(*p);
+ else
+ *lower = *p;
+
(*pp)++;
return 1;
}
diff --git a/src/iso8859_14.c b/src/iso8859_14.c
index 5030b55..a6d6b71 100644
--- a/src/iso8859_14.c
+++ b/src/iso8859_14.c
@@ -2,7 +2,7 @@
iso8859_14.c - Oniguruma (regular expression library)
**********************************************************************/
/*-
- * Copyright (c) 2002-2019 K.Kosako
+ * Copyright (c) 2002-2020 K.Kosako
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -116,7 +116,11 @@ mbc_case_fold(OnigCaseFoldType flag,
return 2;
}
- *lower = ENC_ISO_8859_14_TO_LOWER_CASE(*p);
+ if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p))
+ *lower = ENC_ISO_8859_14_TO_LOWER_CASE(*p);
+ else
+ *lower = *p;
+
(*pp)++;
return 1; /* return byte length of converted char to lower */
}
diff --git a/src/iso8859_15.c b/src/iso8859_15.c
index f32c3de..0bb6b12 100644
--- a/src/iso8859_15.c
+++ b/src/iso8859_15.c
@@ -2,7 +2,7 @@
iso8859_15.c - Oniguruma (regular expression library)
**********************************************************************/
/*-
- * Copyright (c) 2002-2019 K.Kosako
+ * Copyright (c) 2002-2020 K.Kosako
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -116,7 +116,11 @@ mbc_case_fold(OnigCaseFoldType flag,
return 2;
}
- *lower = ENC_ISO_8859_15_TO_LOWER_CASE(*p);
+ if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p))
+ *lower = ENC_ISO_8859_15_TO_LOWER_CASE(*p);
+ else
+ *lower = *p;
+
(*pp)++;
return 1; /* return byte length of converted char to lower */
}
diff --git a/src/iso8859_16.c b/src/iso8859_16.c
index 22a653a..bfd0a5b 100644
--- a/src/iso8859_16.c
+++ b/src/iso8859_16.c
@@ -2,7 +2,7 @@
iso8859_16.c - Oniguruma (regular expression library)
**********************************************************************/
/*-
- * Copyright (c) 2002-2019 K.Kosako
+ * Copyright (c) 2002-2020 K.Kosako
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -116,7 +116,11 @@ mbc_case_fold(OnigCaseFoldType flag,
return 2;
}
- *lower = ENC_ISO_8859_16_TO_LOWER_CASE(*p);
+ if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p))
+ *lower = ENC_ISO_8859_16_TO_LOWER_CASE(*p);
+ else
+ *lower = *p;
+
(*pp)++;
return 1; /* return byte length of converted char to lower */
}
diff --git a/src/iso8859_2.c b/src/iso8859_2.c
index dc3d0a1..d08140e 100644
--- a/src/iso8859_2.c
+++ b/src/iso8859_2.c
@@ -2,7 +2,7 @@
iso8859_2.c - Oniguruma (regular expression library)
**********************************************************************/
/*-
- * Copyright (c) 2002-2019 K.Kosako
+ * Copyright (c) 2002-2020 K.Kosako
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -116,7 +116,11 @@ mbc_case_fold(OnigCaseFoldType flag,
return 2;
}
- *lower = ENC_ISO_8859_2_TO_LOWER_CASE(*p);
+ if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p))
+ *lower = ENC_ISO_8859_2_TO_LOWER_CASE(*p);
+ else
+ *lower = *p;
+
(*pp)++;
return 1; /* return byte length of converted char to lower */
}
diff --git a/src/iso8859_3.c b/src/iso8859_3.c
index 49dc6b2..69b96fd 100644
--- a/src/iso8859_3.c
+++ b/src/iso8859_3.c
@@ -2,7 +2,7 @@
iso8859_3.c - Oniguruma (regular expression library)
**********************************************************************/
/*-
- * Copyright (c) 2002-2019 K.Kosako
+ * Copyright (c) 2002-2020 K.Kosako
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -116,7 +116,11 @@ mbc_case_fold(OnigCaseFoldType flag, const UChar** pp,
return 2;
}
- *lower = ENC_ISO_8859_3_TO_LOWER_CASE(*p);
+ if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p))
+ *lower = ENC_ISO_8859_3_TO_LOWER_CASE(*p);
+ else
+ *lower = *p;
+
(*pp)++;
return 1;
}
diff --git a/src/iso8859_4.c b/src/iso8859_4.c
index f3f6ba9..949b7a1 100644
--- a/src/iso8859_4.c
+++ b/src/iso8859_4.c
@@ -2,7 +2,7 @@
iso8859_4.c - Oniguruma (regular expression library)
**********************************************************************/
/*-
- * Copyright (c) 2002-2019 K.Kosako
+ * Copyright (c) 2002-2020 K.Kosako
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -116,7 +116,11 @@ mbc_case_fold(OnigCaseFoldType flag,
return 2;
}
- *lower = ENC_ISO_8859_4_TO_LOWER_CASE(*p);
+ if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p))
+ *lower = ENC_ISO_8859_4_TO_LOWER_CASE(*p);
+ else
+ *lower = *p;
+
(*pp)++;
return 1; /* return byte length of converted char to lower */
}
diff --git a/src/iso8859_5.c b/src/iso8859_5.c
index a5f587c..9e5d418 100644
--- a/src/iso8859_5.c
+++ b/src/iso8859_5.c
@@ -2,7 +2,7 @@
iso8859_5.c - Oniguruma (regular expression library)
**********************************************************************/
/*-
- * Copyright (c) 2002-2019 K.Kosako
+ * Copyright (c) 2002-2020 K.Kosako
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -104,12 +104,16 @@ static const unsigned short EncISO_8859_5_CtypeTable[256] = {
};
static int
-mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED,
+mbc_case_fold(OnigCaseFoldType flag,
const UChar** pp, const UChar* end ARG_UNUSED, UChar* lower)
{
const UChar* p = *pp;
- *lower = ENC_ISO_8859_5_TO_LOWER_CASE(*p);
+ if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p))
+ *lower = ENC_ISO_8859_5_TO_LOWER_CASE(*p);
+ else
+ *lower = *p;
+
(*pp)++;
return 1;
}
diff --git a/src/iso8859_7.c b/src/iso8859_7.c
index 018efac..07b1360 100644
--- a/src/iso8859_7.c
+++ b/src/iso8859_7.c
@@ -2,7 +2,7 @@
iso8859_7.c - Oniguruma (regular expression library)
**********************************************************************/
/*-
- * Copyright (c) 2002-2019 K.Kosako
+ * Copyright (c) 2002-2020 K.Kosako
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -104,12 +104,16 @@ static const unsigned short EncISO_8859_7_CtypeTable[256] = {
};
static int
-mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED,
+mbc_case_fold(OnigCaseFoldType flag,
const UChar** pp, const UChar* end ARG_UNUSED, UChar* lower)
{
const UChar* p = *pp;
- *lower = ENC_ISO_8859_7_TO_LOWER_CASE(*p);
+ if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p))
+ *lower = ENC_ISO_8859_7_TO_LOWER_CASE(*p);
+ else
+ *lower = *p;
+
(*pp)++;
return 1;
}
diff --git a/src/iso8859_9.c b/src/iso8859_9.c
index 1f9bdea..6f205e5 100644
--- a/src/iso8859_9.c
+++ b/src/iso8859_9.c
@@ -2,7 +2,7 @@
iso8859_9.c - Oniguruma (regular expression library)
**********************************************************************/
/*-
- * Copyright (c) 2002-2019 K.Kosako
+ * Copyright (c) 2002-2020 K.Kosako
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -116,7 +116,11 @@ mbc_case_fold(OnigCaseFoldType flag,
return 2;
}
- *lower = ENC_ISO_8859_9_TO_LOWER_CASE(*p);
+ if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p))
+ *lower = ENC_ISO_8859_9_TO_LOWER_CASE(*p);
+ else
+ *lower = *p;
+
(*pp)++;
return 1;
}
diff --git a/src/koi8.c b/src/koi8.c
index 37023c6..90a04f9 100644
--- a/src/koi8.c
+++ b/src/koi8.c
@@ -2,7 +2,7 @@
koi8.c - Oniguruma (regular expression library)
**********************************************************************/
/*-
- * Copyright (c) 2002-2019 K.Kosako
+ * Copyright (c) 2002-2020 K.Kosako
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -105,12 +105,16 @@ static const unsigned short EncKOI8_CtypeTable[256] = {
static int
-koi8_mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED,
+koi8_mbc_case_fold(OnigCaseFoldType flag,
const UChar** pp, const UChar* end ARG_UNUSED, UChar* lower)
{
const UChar* p = *pp;
- *lower = ENC_KOI8_TO_LOWER_CASE(*p);
+ if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p))
+ *lower = ENC_KOI8_TO_LOWER_CASE(*p);
+ else
+ *lower = *p;
+
(*pp)++;
return 1;
}
diff --git a/src/koi8_r.c b/src/koi8_r.c
index c77302f..31cc870 100644
--- a/src/koi8_r.c
+++ b/src/koi8_r.c
@@ -2,7 +2,7 @@
koi8_r.c - Oniguruma (regular expression library)
**********************************************************************/
/*-
- * Copyright (c) 2002-2019 K.Kosako
+ * Copyright (c) 2002-2020 K.Kosako
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -109,7 +109,11 @@ koi8_r_mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED,
{
const UChar* p = *pp;
- *lower = ENC_KOI8_R_TO_LOWER_CASE(*p);
+ if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p))
+ *lower = ENC_KOI8_R_TO_LOWER_CASE(*p);
+ else
+ *lower = *p;
+
(*pp)++;
return 1;
}
diff --git a/src/oniguruma.h b/src/oniguruma.h
index d983fc9..a7b9d8f 100644
--- a/src/oniguruma.h
+++ b/src/oniguruma.h
@@ -4,7 +4,7 @@
oniguruma.h - Oniguruma (regular expression library)
**********************************************************************/
/*-
- * Copyright (c) 2002-2020 K.Kosako
+ * Copyright (c) 2002-2021 K.Kosako
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -36,9 +36,9 @@ extern "C" {
#define ONIGURUMA
#define ONIGURUMA_VERSION_MAJOR 6
#define ONIGURUMA_VERSION_MINOR 9
-#define ONIGURUMA_VERSION_TEENY 6
+#define ONIGURUMA_VERSION_TEENY 7
-#define ONIGURUMA_VERSION_INT 60906
+#define ONIGURUMA_VERSION_INT 60907
#ifndef P_
#if defined(__STDC__) || defined(_WIN32)
@@ -91,6 +91,7 @@ typedef unsigned int OnigCaseFoldType; /* case fold flag */
ONIG_EXTERN OnigCaseFoldType OnigDefaultCaseFoldFlag;
+#define ONIGENC_CASE_FOLD_ASCII_ONLY (1)
/* #define ONIGENC_CASE_FOLD_HIRAGANA_KATAKANA (1<<1) */
/* #define ONIGENC_CASE_FOLD_KATAKANA_WIDTH (1<<2) */
#define ONIGENC_CASE_FOLD_TURKISH_AZERI (1<<20)
@@ -387,9 +388,9 @@ typedef unsigned int OnigOptionType;
#define ONIG_OPTION_NOTEOL (ONIG_OPTION_NOTBOL << 1)
#define ONIG_OPTION_POSIX_REGION (ONIG_OPTION_NOTEOL << 1)
#define ONIG_OPTION_CHECK_VALIDITY_OF_STRING (ONIG_OPTION_POSIX_REGION << 1)
-/* #define ONIG_OPTION_CRLF_AS_LINE_SEPARATOR (ONIG_OPTION_CHECK_VALIDITY_OF_STRING << 1) */
/* options (compile time) */
-#define ONIG_OPTION_WORD_IS_ASCII (ONIG_OPTION_CHECK_VALIDITY_OF_STRING << 4)
+#define ONIG_OPTION_IGNORECASE_IS_ASCII (ONIG_OPTION_CHECK_VALIDITY_OF_STRING << 3)
+#define ONIG_OPTION_WORD_IS_ASCII (ONIG_OPTION_IGNORECASE_IS_ASCII << 1)
#define ONIG_OPTION_DIGIT_IS_ASCII (ONIG_OPTION_WORD_IS_ASCII << 1)
#define ONIG_OPTION_SPACE_IS_ASCII (ONIG_OPTION_DIGIT_IS_ASCII << 1)
#define ONIG_OPTION_POSIX_IS_ASCII (ONIG_OPTION_SPACE_IS_ASCII << 1)
@@ -399,8 +400,9 @@ typedef unsigned int OnigOptionType;
#define ONIG_OPTION_NOT_BEGIN_STRING (ONIG_OPTION_TEXT_SEGMENT_WORD << 1)
#define ONIG_OPTION_NOT_END_STRING (ONIG_OPTION_NOT_BEGIN_STRING << 1)
#define ONIG_OPTION_NOT_BEGIN_POSITION (ONIG_OPTION_NOT_END_STRING << 1)
+#define ONIG_OPTION_CALLBACK_EACH_MATCH (ONIG_OPTION_NOT_BEGIN_POSITION << 1)
-#define ONIG_OPTION_MAXBIT ONIG_OPTION_NOT_BEGIN_POSITION
+#define ONIG_OPTION_MAXBIT ONIG_OPTION_CALLBACK_EACH_MATCH
#define ONIG_OPTION_ON(options,regopt) ((options) |= (regopt))
#define ONIG_OPTION_OFF(options,regopt) ((options) &= ~(regopt))
@@ -425,6 +427,7 @@ ONIG_EXTERN OnigSyntaxType OnigSyntaxJava;
ONIG_EXTERN OnigSyntaxType OnigSyntaxPerl;
ONIG_EXTERN OnigSyntaxType OnigSyntaxPerl_NG;
ONIG_EXTERN OnigSyntaxType OnigSyntaxRuby;
+ONIG_EXTERN OnigSyntaxType OnigSyntaxPython;
ONIG_EXTERN OnigSyntaxType OnigSyntaxOniguruma;
/* predefined syntaxes (see regsyntax.c) */
@@ -438,6 +441,7 @@ ONIG_EXTERN OnigSyntaxType OnigSyntaxOniguruma;
#define ONIG_SYNTAX_PERL (&OnigSyntaxPerl)
#define ONIG_SYNTAX_PERL_NG (&OnigSyntaxPerl_NG)
#define ONIG_SYNTAX_RUBY (&OnigSyntaxRuby)
+#define ONIG_SYNTAX_PYTHON (&OnigSyntaxPython)
#define ONIG_SYNTAX_ONIGURUMA (&OnigSyntaxOniguruma)
/* default syntax */
@@ -510,6 +514,7 @@ ONIG_EXTERN OnigSyntaxType* OnigDefaultSyntax;
#define ONIG_SYN_OP2_QMARK_BRACE_CALLOUT_CONTENTS (1U<<28) /* (?{...}) (?{{...}}) */
#define ONIG_SYN_OP2_ASTERISK_CALLOUT_NAME (1U<<29) /* (*name) (*name{a,..}) */
#define ONIG_SYN_OP2_OPTION_ONIGURUMA (1U<<30) /* (?imxWDSPy) */
+#define ONIG_SYN_OP2_QMARK_CAPITAL_P_NAME (1U<<31) /* (?P<name>...) (?P=name) */
/* syntax (behavior) */
#define ONIG_SYN_CONTEXT_INDEP_ANCHORS (1U<<31) /* not implemented */
@@ -525,6 +530,7 @@ ONIG_EXTERN OnigSyntaxType* OnigDefaultSyntax;
#define ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY (1U<<9) /* a{n}?=(?:a{n})? */
#define ONIG_SYN_ISOLATED_OPTION_CONTINUE_BRANCH (1U<<10) /* ..(?i)...|... */
#define ONIG_SYN_VARIABLE_LEN_LOOK_BEHIND (1U<<11) /* (?<=a+|..) */
+#define ONIG_SYN_PYTHON (1U<<12) /* \UHHHHHHHH */
/* syntax (behavior) in char class [...] */
#define ONIG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC (1U<<20) /* [^...] */
@@ -548,8 +554,10 @@ ONIG_EXTERN OnigSyntaxType* OnigDefaultSyntax;
/* error codes */
#define ONIG_IS_PATTERN_ERROR(ecode) ((ecode) <= -100 && (ecode) > -1000)
+
/* normal return */
#define ONIG_NORMAL 0
+#define ONIG_VALUE_IS_NOT_SET 1
#define ONIG_MISMATCH -1
#define ONIG_NO_SUPPORT_CONFIG -2
#define ONIG_ABORT -3
@@ -607,6 +615,7 @@ ONIG_EXTERN OnigSyntaxType* OnigDefaultSyntax;
#define ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED -209
#define ONIGERR_TOO_MANY_CAPTURES -210
#define ONIGERR_TOO_LONG_WIDE_CHAR_VALUE -212
+#define ONIGERR_UNDEFINED_OPERATOR -213
#define ONIGERR_EMPTY_GROUP_NAME -214
#define ONIGERR_INVALID_GROUP_NAME -215
#define ONIGERR_INVALID_CHAR_IN_GROUP_NAME -216
@@ -633,6 +642,7 @@ ONIG_EXTERN OnigSyntaxType* OnigDefaultSyntax;
#define ONIGERR_INVALID_COMBINATION_OF_OPTIONS -403
#define ONIGERR_TOO_MANY_USER_DEFINED_OBJECTS -404
#define ONIGERR_TOO_LONG_PROPERTY_NAME -405
+#define ONIGERR_VERY_INEFFICIENT_PATTERN -406
#define ONIGERR_LIBRARY_IS_NOT_INITIALIZED -500
/* errors related to thread */
@@ -717,6 +727,8 @@ typedef struct {
OnigCaseFoldType case_fold_flag;
} OnigCompileInfo;
+typedef int (*OnigCallbackEachMatchFunc)(const OnigUChar* str, const OnigUChar* end, const OnigUChar* match_start, OnigRegion* region, void* user_data);
+
/* types for callout */
typedef enum {
@@ -940,6 +952,12 @@ const char* onig_version P_((void));
ONIG_EXTERN
const char* onig_copyright P_((void));
+/* for callback each match */
+ONIG_EXTERN
+OnigCallbackEachMatchFunc onig_get_callback_each_match P_((void));
+ONIG_EXTERN
+int onig_set_callback_each_match P_((OnigCallbackEachMatchFunc f));
+
/* for OnigMatchParam */
ONIG_EXTERN
OnigMatchParam* onig_new_match_param P_((void));
@@ -981,6 +999,8 @@ ONIG_EXTERN
int onig_get_callout_data_by_tag P_((OnigRegex reg, OnigMatchParam* mp, const OnigUChar* tag, const OnigUChar* tag_end, int slot, OnigType* type, OnigValue* val));
ONIG_EXTERN
int onig_set_callout_data_by_tag P_((OnigRegex reg, OnigMatchParam* mp, const OnigUChar* tag, const OnigUChar* tag_end, int slot, OnigType type, OnigValue* val));
+ONIG_EXTERN
+int onig_get_callout_data_by_tag_dont_clear_old P_((regex_t* reg, OnigMatchParam* mp, const OnigUChar* tag, const OnigUChar* tag_end, int slot, OnigType* type, OnigValue* val));
/* used in callout functions */
ONIG_EXTERN
diff --git a/src/regcomp.c b/src/regcomp.c
index dd2b328..d80551d 100644
--- a/src/regcomp.c
+++ b/src/regcomp.c
@@ -2,7 +2,7 @@
regcomp.c - Oniguruma (regular expression library)
**********************************************************************/
/*-
- * Copyright (c) 2002-2020 K.Kosako
+ * Copyright (c) 2002-2021 K.Kosako
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -31,6 +31,9 @@
#define OPS_INIT_SIZE 8
+#define NODE_IS_REAL_IGNORECASE(node) \
+ (NODE_IS_IGNORECASE(node) && !NODE_STRING_IS_CRUDE(node))
+
typedef struct {
OnigLen min;
OnigLen max;
@@ -44,7 +47,7 @@ typedef struct {
OnigCaseFoldType OnigDefaultCaseFoldFlag = ONIGENC_CASE_FOLD_MIN;
-static OnigLen node_min_byte_len(Node* node, ScanEnv* env);
+static OnigLen node_min_byte_len(Node* node, ParseEnv* env);
#if 0
typedef struct {
@@ -129,27 +132,22 @@ ops_init(regex_t* reg, int init_alloc_size)
Operation* p;
size_t size;
- if (init_alloc_size > 0) {
- size = sizeof(Operation) * init_alloc_size;
- p = (Operation* )xrealloc(reg->ops, size);
- CHECK_NULL_RETURN_MEMERR(p);
- reg->ops = p;
+ if (init_alloc_size <= 0)
+ return ONIGERR_PARSER_BUG;
+
+ size = sizeof(Operation) * init_alloc_size;
+ p = (Operation* )xrealloc(reg->ops, size);
+ CHECK_NULL_RETURN_MEMERR(p);
+ reg->ops = p;
#ifdef USE_DIRECT_THREADED_CODE
- {
- enum OpCode* cp;
- size = sizeof(enum OpCode) * init_alloc_size;
- cp = (enum OpCode* )xrealloc(reg->ocs, size);
- CHECK_NULL_RETURN_MEMERR(cp);
- reg->ocs = cp;
- }
-#endif
+ {
+ enum OpCode* cp;
+ size = sizeof(enum OpCode) * init_alloc_size;
+ cp = (enum OpCode* )xrealloc(reg->ocs, size);
+ CHECK_NULL_RETURN_MEMERR(cp);
+ reg->ocs = cp;
}
- else {
- reg->ops = (Operation* )0;
-#ifdef USE_DIRECT_THREADED_CODE
- reg->ocs = (enum OpCode* )0;
#endif
- }
reg->ops_curr = 0; /* !!! not yet done ops_new() */
reg->ops_alloc = init_alloc_size;
@@ -159,19 +157,16 @@ ops_init(regex_t* reg, int init_alloc_size)
}
static int
-ops_expand(regex_t* reg, int n)
+ops_resize(regex_t* reg, int n)
{
-#define MIN_OPS_EXPAND_SIZE 4
-
#ifdef USE_DIRECT_THREADED_CODE
enum OpCode* cp;
#endif
Operation* p;
size_t size;
- if (n <= 0) n = MIN_OPS_EXPAND_SIZE;
-
- n += reg->ops_alloc;
+ if (n == reg->ops_alloc) return ONIG_NORMAL;
+ if (n <= 0) return ONIGERR_PARSER_BUG;
size = sizeof(Operation) * n;
p = (Operation* )xrealloc(reg->ops, size);
@@ -197,10 +192,8 @@ ops_expand(regex_t* reg, int n)
static int
ops_new(regex_t* reg)
{
- int r;
-
if (reg->ops_used >= reg->ops_alloc) {
- r = ops_expand(reg, reg->ops_alloc);
+ int r = ops_resize(reg, reg->ops_alloc << 1);
if (r != ONIG_NORMAL) return r;
}
@@ -669,6 +662,8 @@ mmcl_alt_merge(MinMaxCharLen* to, MinMaxCharLen* alt)
if (to->max < alt->max) to->max = alt->max;
}
+#ifndef ONIG_DONT_OPTIMIZE
+
static int
mml_is_equal(MinMaxLen* a, MinMaxLen* b)
{
@@ -709,9 +704,11 @@ mml_alt_merge(MinMaxLen* to, MinMaxLen* alt)
if (to->max < alt->max) to->max = alt->max;
}
+#endif
+
/* fixed size pattern node only */
static int
-node_char_len1(Node* node, regex_t* reg, MinMaxCharLen* ci, ScanEnv* env,
+node_char_len1(Node* node, regex_t* reg, MinMaxCharLen* ci, ParseEnv* env,
int level)
{
MinMaxCharLen tci;
@@ -768,7 +765,8 @@ node_char_len1(Node* node, regex_t* reg, MinMaxCharLen* ci, ScanEnv* env,
StrNode* sn = STR_(node);
UChar *s = sn->s;
- if (NODE_IS_IGNORECASE(node) && ! NODE_STRING_IS_CRUDE(node)) {
+ if (NODE_IS_REAL_IGNORECASE(node) &&
+ CASE_FOLD_IS_NOT_ASCII_ONLY(env->case_fold_flag)) {
/* Such a case is possible.
ex. /(?i)(?<=\1)(a)/
Backref node refer to capture group, but it doesn't tune yet.
@@ -917,7 +915,7 @@ node_char_len1(Node* node, regex_t* reg, MinMaxCharLen* ci, ScanEnv* env,
{
int i;
int* backs;
- MemEnv* mem_env = SCANENV_MEMENV(env);
+ MemEnv* mem_env = PARSEENV_MEMENV(env);
BackRefNode* br = BACKREF_(node);
backs = BACKREFS_P(br);
@@ -943,7 +941,7 @@ node_char_len1(Node* node, regex_t* reg, MinMaxCharLen* ci, ScanEnv* env,
}
static int
-node_char_len(Node* node, regex_t* reg, MinMaxCharLen* ci, ScanEnv* env)
+node_char_len(Node* node, regex_t* reg, MinMaxCharLen* ci, ParseEnv* env)
{
return node_char_len1(node, reg, ci, env, 0);
}
@@ -967,7 +965,7 @@ add_op(regex_t* reg, int opcode)
}
static int compile_length_tree(Node* node, regex_t* reg);
-static int compile_tree(Node* node, regex_t* reg, ScanEnv* env);
+static int compile_tree(Node* node, regex_t* reg, ParseEnv* env);
#define IS_NEED_STR_LEN_OP(op) \
@@ -1035,7 +1033,7 @@ is_strict_real_node(Node* node)
}
static int
-compile_quant_body_with_empty_check(QuantNode* qn, regex_t* reg, ScanEnv* env)
+compile_quant_body_with_empty_check(QuantNode* qn, regex_t* reg, ParseEnv* env)
{
int r;
int saved_num_empty_check;
@@ -1060,14 +1058,20 @@ compile_quant_body_with_empty_check(QuantNode* qn, regex_t* reg, ScanEnv* env)
if (emptiness == BODY_MAY_BE_EMPTY)
r = add_op(reg, OP_EMPTY_CHECK_END);
else if (emptiness == BODY_MAY_BE_EMPTY_MEM) {
- if (NODE_IS_EMPTY_STATUS_CHECK(qn) != 0)
+ if (NODE_IS_EMPTY_STATUS_CHECK(qn) != 0 && qn->empty_status_mem != 0) {
r = add_op(reg, OP_EMPTY_CHECK_END_MEMST);
+ if (r != 0) return r;
+ COP(reg)->empty_check_end.empty_status_mem = qn->empty_status_mem;
+ }
else
r = add_op(reg, OP_EMPTY_CHECK_END);
}
#ifdef USE_CALL
- else if (emptiness == BODY_MAY_BE_EMPTY_REC)
+ else if (emptiness == BODY_MAY_BE_EMPTY_REC) {
r = add_op(reg, OP_EMPTY_CHECK_END_MEMST_PUSH);
+ if (r != 0) return r;
+ COP(reg)->empty_check_end.empty_status_mem = qn->empty_status_mem;
+ }
#endif
if (r != 0) return r;
@@ -1078,7 +1082,7 @@ compile_quant_body_with_empty_check(QuantNode* qn, regex_t* reg, ScanEnv* env)
#ifdef USE_CALL
static int
-compile_call(CallNode* node, regex_t* reg, ScanEnv* env)
+compile_call(CallNode* node, regex_t* reg, ParseEnv* env)
{
int r;
int offset;
@@ -1098,7 +1102,7 @@ compile_call(CallNode* node, regex_t* reg, ScanEnv* env)
#endif
static int
-compile_tree_n_times(Node* node, int n, regex_t* reg, ScanEnv* env)
+compile_tree_n_times(Node* node, int n, regex_t* reg, ParseEnv* env)
{
int i, r;
@@ -1356,7 +1360,7 @@ entry_repeat_range(regex_t* reg, int id, int lower, int upper, int ops_index)
static int
compile_range_repeat_node(QuantNode* qn, int target_len, int emptiness,
- regex_t* reg, ScanEnv* env)
+ regex_t* reg, ParseEnv* env)
{
int r;
int num_repeat = reg->num_repeat++;
@@ -1469,7 +1473,7 @@ compile_length_quantifier_node(QuantNode* qn, regex_t* reg)
}
static int
-compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env)
+compile_quantifier_node(QuantNode* qn, regex_t* reg, ParseEnv* env)
{
int i, r, mod_tlen;
int infinite = IS_INFINITE_REPEAT(qn->upper);
@@ -1649,7 +1653,7 @@ compile_length_option_node(BagNode* node, regex_t* reg)
}
static int
-compile_option_node(BagNode* node, regex_t* reg, ScanEnv* env)
+compile_option_node(BagNode* node, regex_t* reg, ParseEnv* env)
{
int r;
@@ -1765,7 +1769,7 @@ compile_length_bag_node(BagNode* node, regex_t* reg)
}
static int
-compile_bag_memory_node(BagNode* node, regex_t* reg, ScanEnv* env)
+compile_bag_memory_node(BagNode* node, regex_t* reg, ParseEnv* env)
{
int r;
@@ -1845,7 +1849,7 @@ compile_bag_memory_node(BagNode* node, regex_t* reg, ScanEnv* env)
}
static int
-compile_bag_node(BagNode* node, regex_t* reg, ScanEnv* env)
+compile_bag_node(BagNode* node, regex_t* reg, ParseEnv* env)
{
int r, len;
@@ -2036,7 +2040,7 @@ compile_length_anchor_node(AnchorNode* node, regex_t* reg)
}
static int
-compile_anchor_look_behind_node(AnchorNode* node, regex_t* reg, ScanEnv* env)
+compile_anchor_look_behind_node(AnchorNode* node, regex_t* reg, ParseEnv* env)
{
int r;
@@ -2150,7 +2154,7 @@ compile_anchor_look_behind_node(AnchorNode* node, regex_t* reg, ScanEnv* env)
static int
compile_anchor_look_behind_not_node(AnchorNode* node, regex_t* reg,
- ScanEnv* env)
+ ParseEnv* env)
{
int r;
int len;
@@ -2279,7 +2283,7 @@ compile_anchor_look_behind_not_node(AnchorNode* node, regex_t* reg,
}
static int
-compile_anchor_node(AnchorNode* node, regex_t* reg, ScanEnv* env)
+compile_anchor_node(AnchorNode* node, regex_t* reg, ParseEnv* env)
{
int r, len;
enum OpCode op;
@@ -2573,7 +2577,7 @@ compile_length_tree(Node* node, regex_t* reg)
}
static int
-compile_tree(Node* node, regex_t* reg, ScanEnv* env)
+compile_tree(Node* node, regex_t* reg, ParseEnv* env)
{
int n, len, pos, r = 0;
@@ -2983,7 +2987,7 @@ numbered_ref_check(Node* node)
}
static int
-disable_noname_group_capture(Node** root, regex_t* reg, ScanEnv* env)
+disable_noname_group_capture(Node** root, regex_t* reg, ParseEnv* env)
{
int r, i, pos, counter;
MemStatusType loc;
@@ -3003,7 +3007,7 @@ disable_noname_group_capture(Node** root, regex_t* reg, ScanEnv* env)
for (i = 1, pos = 1; i <= env->num_mem; i++) {
if (map[i].new_val > 0) {
- SCANENV_MEMENV(env)[pos] = SCANENV_MEMENV(env)[i];
+ PARSEENV_MEMENV(env)[pos] = PARSEENV_MEMENV(env)[i];
pos++;
}
}
@@ -3285,8 +3289,7 @@ get_tree_head_literal(Node* node, int exact, regex_t* reg)
if (sn->end <= sn->s)
break;
- if (exact == 0 ||
- ! NODE_IS_IGNORECASE(node) || NODE_STRING_IS_CRUDE(node)) {
+ if (exact == 0 || !NODE_IS_REAL_IGNORECASE(node)) {
n = node;
}
}
@@ -3381,7 +3384,7 @@ get_tree_tail_literal(Node* node, Node** rnode, regex_t* reg)
break;
}
- if (NODE_IS_IGNORECASE(node) && ! NODE_STRING_IS_CRUDE(node)) {
+ if (NODE_IS_REAL_IGNORECASE(node)) {
r = GET_VALUE_NONE;
break;
}
@@ -3601,7 +3604,7 @@ check_node_in_look_behind(Node* node, int not, int* used)
}
static OnigLen
-node_min_byte_len(Node* node, ScanEnv* env)
+node_min_byte_len(Node* node, ParseEnv* env)
{
OnigLen len;
OnigLen tmin;
@@ -3612,7 +3615,7 @@ node_min_byte_len(Node* node, ScanEnv* env)
if (! NODE_IS_CHECKER(node)) {
int i;
int* backs;
- MemEnv* mem_env = SCANENV_MEMENV(env);
+ MemEnv* mem_env = PARSEENV_MEMENV(env);
BackRefNode* br = BACKREF_(node);
if (NODE_IS_RECURSION(node)) break;
@@ -3629,10 +3632,8 @@ node_min_byte_len(Node* node, ScanEnv* env)
case NODE_CALL:
{
Node* t = NODE_BODY(node);
- if (NODE_IS_RECURSION(node)) {
- if (NODE_IS_FIXED_MIN(t))
- len = BAG_(t)->min_len;
- }
+ if (NODE_IS_FIXED_MIN(t))
+ len = BAG_(t)->min_len;
else
len = node_min_byte_len(t, env);
}
@@ -3742,143 +3743,8 @@ node_min_byte_len(Node* node, ScanEnv* env)
return len;
}
-static OnigLen
-node_max_byte_len(Node* node, ScanEnv* env)
-{
- OnigLen len;
- OnigLen tmax;
-
- len = 0;
- switch (NODE_TYPE(node)) {
- case NODE_LIST:
- do {
- tmax = node_max_byte_len(NODE_CAR(node), env);
- len = distance_add(len, tmax);
- } while (IS_NOT_NULL(node = NODE_CDR(node)));
- break;
-
- case NODE_ALT:
- do {
- tmax = node_max_byte_len(NODE_CAR(node), env);
- if (len < tmax) len = tmax;
- } while (IS_NOT_NULL(node = NODE_CDR(node)));
- break;
-
- case NODE_STRING:
- {
- StrNode* sn = STR_(node);
- len = (OnigLen )(sn->end - sn->s);
- }
- break;
-
- case NODE_CTYPE:
- case NODE_CCLASS:
- len = ONIGENC_MBC_MAXLEN_DIST(env->enc);
- break;
-
- case NODE_BACKREF:
- if (! NODE_IS_CHECKER(node)) {
- int i;
- int* backs;
- MemEnv* mem_env = SCANENV_MEMENV(env);
- BackRefNode* br = BACKREF_(node);
- if (NODE_IS_RECURSION(node)) {
-#ifdef USE_BACKREF_WITH_LEVEL
- if (NODE_IS_NEST_LEVEL(node)) {
- len = INFINITE_LEN;
- }
-#endif
- break;
- }
- backs = BACKREFS_P(br);
- for (i = 0; i < br->back_num; i++) {
- tmax = node_max_byte_len(mem_env[backs[i]].mem_node, env);
- if (len < tmax) len = tmax;
- }
- }
- break;
-
-#ifdef USE_CALL
- case NODE_CALL:
- if (! NODE_IS_RECURSION(node))
- len = node_max_byte_len(NODE_BODY(node), env);
- else
- len = INFINITE_LEN;
- break;
-#endif
-
- case NODE_QUANT:
- {
- QuantNode* qn = QUANT_(node);
-
- if (qn->upper != 0) {
- len = node_max_byte_len(NODE_BODY(node), env);
- if (len != 0) {
- if (! IS_INFINITE_REPEAT(qn->upper))
- len = distance_multiply(len, qn->upper);
- else
- len = INFINITE_LEN;
- }
- }
- }
- break;
-
- case NODE_BAG:
- {
- BagNode* en = BAG_(node);
- switch (en->type) {
- case BAG_MEMORY:
- if (NODE_IS_FIXED_MAX(node))
- len = en->max_len;
- else {
- if (NODE_IS_MARK1(node))
- len = INFINITE_LEN;
- else {
- NODE_STATUS_ADD(node, MARK1);
- len = node_max_byte_len(NODE_BODY(node), env);
- NODE_STATUS_REMOVE(node, MARK1);
-
- en->max_len = len;
- NODE_STATUS_ADD(node, FIXED_MAX);
- }
- }
- break;
-
- case BAG_OPTION:
- case BAG_STOP_BACKTRACK:
- len = node_max_byte_len(NODE_BODY(node), env);
- break;
- case BAG_IF_ELSE:
- {
- OnigLen tlen, elen;
-
- len = node_max_byte_len(NODE_BODY(node), env);
- if (IS_NOT_NULL(en->te.Then)) {
- tlen = node_max_byte_len(en->te.Then, env);
- len = distance_add(len, tlen);
- }
- if (IS_NOT_NULL(en->te.Else))
- elen = node_max_byte_len(en->te.Else, env);
- else elen = 0;
-
- if (elen > len) len = elen;
- }
- break;
- }
- }
- break;
-
- case NODE_ANCHOR:
- case NODE_GIMMICK:
- default:
- break;
- }
-
- return len;
-}
-
static int
-check_backrefs(Node* node, ScanEnv* env)
+check_backrefs(Node* node, ParseEnv* env)
{
int r;
@@ -3923,7 +3789,7 @@ check_backrefs(Node* node, ScanEnv* env)
int i;
BackRefNode* br = BACKREF_(node);
int* backs = BACKREFS_P(br);
- MemEnv* mem_env = SCANENV_MEMENV(env);
+ MemEnv* mem_env = PARSEENV_MEMENV(env);
for (i = 0; i < br->back_num; i++) {
if (backs[i] > env->num_mem)
@@ -3944,7 +3810,7 @@ check_backrefs(Node* node, ScanEnv* env)
}
static int
-set_empty_repeat_node_trav(Node* node, Node* empty, ScanEnv* env)
+set_empty_repeat_node_trav(Node* node, Node* empty, ParseEnv* env)
{
int r;
@@ -3998,7 +3864,7 @@ set_empty_repeat_node_trav(Node* node, Node* empty, ScanEnv* env)
if (en->type == BAG_MEMORY) {
if (NODE_IS_BACKREF(node)) {
if (IS_NOT_NULL(empty))
- SCANENV_MEMENV(env)[en->m.regnum].empty_repeat_node = empty;
+ PARSEENV_MEMENV(env)[en->m.regnum].empty_repeat_node = empty;
}
}
else if (en->type == BAG_IF_ELSE) {
@@ -4034,7 +3900,7 @@ is_ancestor_node(Node* node, Node* me)
}
static void
-set_empty_status_check_trav(Node* node, ScanEnv* env)
+set_empty_status_check_trav(Node* node, ParseEnv* env)
{
switch (NODE_TYPE(node)) {
case NODE_LIST:
@@ -4078,14 +3944,14 @@ set_empty_status_check_trav(Node* node, ScanEnv* env)
{
int i;
int* backs;
- MemEnv* mem_env = SCANENV_MEMENV(env);
+ MemEnv* mem_env = PARSEENV_MEMENV(env);
BackRefNode* br = BACKREF_(node);
backs = BACKREFS_P(br);
for (i = 0; i < br->back_num; i++) {
Node* ernode = mem_env[backs[i]].empty_repeat_node;
if (IS_NOT_NULL(ernode)) {
if (! is_ancestor_node(ernode, node)) {
- MEM_STATUS_LIMIT_ON(env->reg->empty_status_mem, backs[i]);
+ MEM_STATUS_LIMIT_ON(QUANT_(ernode)->empty_status_mem, backs[i]);
NODE_STATUS_ADD(ernode, EMPTY_STATUS_CHECK);
NODE_STATUS_ADD(mem_env[backs[i]].mem_node, EMPTY_STATUS_CHECK);
}
@@ -4150,7 +4016,7 @@ set_parent_node_trav(Node* node, Node* parent)
#define RECURSION_INFINITE (1<<2)
static int
-infinite_recursive_call_check(Node* node, ScanEnv* env, int head)
+infinite_recursive_call_check(Node* node, ParseEnv* env, int head)
{
int ret;
int r = 0;
@@ -4191,6 +4057,8 @@ infinite_recursive_call_check(Node* node, ScanEnv* env, int head)
break;
case NODE_QUANT:
+ if (QUANT_(node)->upper == 0) break;
+
r = infinite_recursive_call_check(NODE_BODY(node), env, head);
if (r < 0) return r;
if ((r & RECURSION_MUST) != 0) {
@@ -4265,7 +4133,7 @@ infinite_recursive_call_check(Node* node, ScanEnv* env, int head)
}
static int
-infinite_recursive_call_check_trav(Node* node, ScanEnv* env)
+infinite_recursive_call_check_trav(Node* node, ParseEnv* env)
{
int r;
@@ -4403,7 +4271,7 @@ recursive_call_check(Node* node)
#define FOUND_CALLED_NODE 1
static int
-recursive_call_check_trav(Node* node, ScanEnv* env, int state)
+recursive_call_check_trav(Node* node, ParseEnv* env, int state)
{
int r = 0;
@@ -4443,19 +4311,21 @@ recursive_call_check_trav(Node* node, ScanEnv* env, int state)
BagNode* en = BAG_(node);
if (en->type == BAG_MEMORY) {
- if (NODE_IS_CALLED(node) || (state & IN_RECURSION) != 0) {
+ if (NODE_IS_CALLED(node)) {
+ r = FOUND_CALLED_NODE;
+ goto check_recursion;
+ }
+ else if ((state & IN_RECURSION) != 0) {
+ check_recursion:
if (! NODE_IS_RECURSION(node)) {
NODE_STATUS_ADD(node, MARK1);
- r = recursive_call_check(NODE_BODY(node));
- if (r != 0) {
+ ret = recursive_call_check(NODE_BODY(node));
+ if (ret != 0) {
NODE_STATUS_ADD(node, RECURSION);
MEM_STATUS_ON(env->backtrack_mem, en->m.regnum);
}
NODE_STATUS_REMOVE(node, MARK1);
}
-
- if (NODE_IS_CALLED(node))
- r = FOUND_CALLED_NODE;
}
}
@@ -4616,8 +4486,9 @@ reduce_string_list(Node* node, OnigEncoding enc)
#define IN_VAR_REPEAT (1<<3)
#define IN_ZERO_REPEAT (1<<4)
#define IN_MULTI_ENTRY (1<<5)
-#define IN_LOOK_BEHIND (1<<6)
-
+#define IN_PREC_READ (1<<6)
+#define IN_LOOK_BEHIND (1<<7)
+#define IN_PEEK (1<<8)
/* divide different length alternatives in look-behind.
(?<=A|B) ==> (?<=A)|(?<=B)
@@ -4706,7 +4577,7 @@ list_reduce_in_look_behind(Node* node)
}
static int
-alt_reduce_in_look_behind(Node* node, regex_t* reg, ScanEnv* env)
+alt_reduce_in_look_behind(Node* node, regex_t* reg, ParseEnv* env)
{
int r;
@@ -4725,10 +4596,10 @@ alt_reduce_in_look_behind(Node* node, regex_t* reg, ScanEnv* env)
return r;
}
-static int tune_tree(Node* node, regex_t* reg, int state, ScanEnv* env);
+static int tune_tree(Node* node, regex_t* reg, int state, ParseEnv* env);
static int
-tune_look_behind(Node* node, regex_t* reg, int state, ScanEnv* env)
+tune_look_behind(Node* node, regex_t* reg, int state, ParseEnv* env)
{
int r;
int state1;
@@ -5183,7 +5054,7 @@ unravel_case_fold_string(Node* node, regex_t* reg, int state)
return r;
}
-#ifdef USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT
+#ifdef USE_RIGID_CHECK_CAPTURES_IN_EMPTY_REPEAT
static enum BodyEmptyType
quantifiers_memory_node_info(Node* node)
{
@@ -5265,7 +5136,7 @@ quantifiers_memory_node_info(Node* node)
return r;
}
-#endif /* USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT */
+#endif /* USE_RIGID_CHECK_CAPTURES_IN_EMPTY_REPEAT */
#ifdef USE_CALL
@@ -5274,9 +5145,9 @@ quantifiers_memory_node_info(Node* node)
__inline
#endif
static int
-check_call_reference(CallNode* cn, ScanEnv* env, int state)
+check_call_reference(CallNode* cn, ParseEnv* env, int state)
{
- MemEnv* mem_env = SCANENV_MEMENV(env);
+ MemEnv* mem_env = PARSEENV_MEMENV(env);
if (cn->by_number != 0) {
int gnum = cn->called_gnum;
@@ -5393,7 +5264,7 @@ tune_call2_call(Node* node)
}
static int
-tune_call(Node* node, ScanEnv* env, int state)
+tune_call(Node* node, ParseEnv* env, int state)
{
int r;
@@ -5539,6 +5410,8 @@ tune_called_state_call(Node* node, int state)
state |= IN_REAL_REPEAT;
if (qn->lower != qn->upper)
state |= IN_VAR_REPEAT;
+ if ((state & IN_PEEK) != 0)
+ NODE_STATUS_ADD(node, INPEEK);
tune_called_state_call(NODE_QUANT_BODY(qn), state);
}
@@ -5551,10 +5424,12 @@ tune_called_state_call(Node* node, int state)
switch (an->type) {
case ANCR_PREC_READ_NOT:
case ANCR_LOOK_BEHIND_NOT:
- state |= IN_NOT;
- /* fall */
+ state |= (IN_NOT | IN_PEEK);
+ tune_called_state_call(NODE_ANCHOR_BODY(an), state);
+ break;
case ANCR_PREC_READ:
case ANCR_LOOK_BEHIND:
+ state |= IN_PEEK;
tune_called_state_call(NODE_ANCHOR_BODY(an), state);
break;
default:
@@ -5597,6 +5472,11 @@ tune_called_state_call(Node* node, int state)
break;
case NODE_CALL:
+ if ((state & IN_PEEK) != 0)
+ NODE_STATUS_ADD(node, INPEEK);
+ if ((state & IN_REAL_REPEAT) != 0)
+ NODE_STATUS_ADD(node, IN_REAL_REPEAT);
+
tune_called_state_call(NODE_BODY(node), state);
break;
@@ -5620,6 +5500,11 @@ tune_called_state(Node* node, int state)
#ifdef USE_CALL
case NODE_CALL:
+ if ((state & IN_PEEK) != 0)
+ NODE_STATUS_ADD(node, INPEEK);
+ if ((state & IN_REAL_REPEAT) != 0)
+ NODE_STATUS_ADD(node, IN_REAL_REPEAT);
+
tune_called_state_call(node, state);
break;
#endif
@@ -5659,6 +5544,8 @@ tune_called_state(Node* node, int state)
state |= IN_REAL_REPEAT;
if (qn->lower != qn->upper)
state |= IN_VAR_REPEAT;
+ if ((state & IN_PEEK) != 0)
+ NODE_STATUS_ADD(node, INPEEK);
tune_called_state(NODE_QUANT_BODY(qn), state);
}
@@ -5671,10 +5558,12 @@ tune_called_state(Node* node, int state)
switch (an->type) {
case ANCR_PREC_READ_NOT:
case ANCR_LOOK_BEHIND_NOT:
- state |= IN_NOT;
- /* fall */
+ state |= (IN_NOT | IN_PEEK);
+ tune_called_state(NODE_ANCHOR_BODY(an), state);
+ break;
case ANCR_PREC_READ:
case ANCR_LOOK_BEHIND:
+ state |= IN_PEEK;
tune_called_state(NODE_ANCHOR_BODY(an), state);
break;
default:
@@ -5700,17 +5589,18 @@ tune_called_state(Node* node, int state)
__inline
#endif
static int
-tune_anchor(Node* node, regex_t* reg, int state, ScanEnv* env)
+tune_anchor(Node* node, regex_t* reg, int state, ParseEnv* env)
{
int r;
AnchorNode* an = ANCHOR_(node);
switch (an->type) {
case ANCR_PREC_READ:
- r = tune_tree(NODE_ANCHOR_BODY(an), reg, state, env);
+ r = tune_tree(NODE_ANCHOR_BODY(an), reg, (state | IN_PREC_READ), env);
break;
case ANCR_PREC_READ_NOT:
- r = tune_tree(NODE_ANCHOR_BODY(an), reg, (state | IN_NOT), env);
+ r = tune_tree(NODE_ANCHOR_BODY(an), reg, (state | IN_PREC_READ | IN_NOT),
+ env);
break;
case ANCR_LOOK_BEHIND:
@@ -5730,7 +5620,7 @@ tune_anchor(Node* node, regex_t* reg, int state, ScanEnv* env)
__inline
#endif
static int
-tune_quant(Node* node, regex_t* reg, int state, ScanEnv* env)
+tune_quant(Node* node, regex_t* reg, int state, ParseEnv* env)
{
int r;
QuantNode* qn = QUANT_(node);
@@ -5746,7 +5636,7 @@ tune_quant(Node* node, regex_t* reg, int state, ScanEnv* env)
if (IS_INFINITE_REPEAT(qn->upper) || qn->upper >= 1) {
OnigLen d = node_min_byte_len(body, env);
if (d == 0) {
-#ifdef USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT
+#ifdef USE_RIGID_CHECK_CAPTURES_IN_EMPTY_REPEAT
qn->emptiness = quantifiers_memory_node_info(body);
#else
qn->emptiness = BODY_MAY_BE_EMPTY;
@@ -5807,7 +5697,7 @@ tune_quant(Node* node, regex_t* reg, int state, ScanEnv* env)
6. expand repeated string.
*/
static int
-tune_tree(Node* node, regex_t* reg, int state, ScanEnv* env)
+tune_tree(Node* node, regex_t* reg, int state, ParseEnv* env)
{
int r = 0;
@@ -5832,7 +5722,7 @@ tune_tree(Node* node, regex_t* reg, int state, ScanEnv* env)
break;
case NODE_STRING:
- if (NODE_IS_IGNORECASE(node) && ! NODE_STRING_IS_CRUDE(node)) {
+ if (NODE_IS_REAL_IGNORECASE(node)) {
r = unravel_case_fold_string(node, reg, state);
}
break;
@@ -5918,6 +5808,9 @@ tune_tree(Node* node, regex_t* reg, int state, ScanEnv* env)
break;
case NODE_QUANT:
+ if ((state & (IN_PREC_READ | IN_LOOK_BEHIND)) != 0)
+ NODE_STATUS_ADD(node, INPEEK);
+
r = tune_quant(node, reg, state, env);
break;
@@ -5938,6 +5831,7 @@ tune_tree(Node* node, regex_t* reg, int state, ScanEnv* env)
return r;
}
+#ifndef ONIG_DONT_OPTIMIZE
static int
set_sunday_quick_search_or_bmh_skip_table(regex_t* reg, int case_expand,
UChar* s, UChar* end,
@@ -6007,6 +5901,7 @@ set_sunday_quick_search_or_bmh_skip_table(regex_t* reg, int case_expand,
return 0;
}
+#endif
#define OPT_EXACT_MAXLEN 24
@@ -6019,7 +5914,7 @@ typedef struct {
MinMaxLen mm;
OnigEncoding enc;
OnigCaseFoldType case_fold_flag;
- ScanEnv* scan_env;
+ ParseEnv* scan_env;
} OptEnv;
typedef struct {
@@ -6052,6 +5947,8 @@ typedef struct {
} OptNode;
+#ifndef ONIG_DONT_OPTIMIZE
+
static int
map_position_value(OnigEncoding enc, int i)
{
@@ -6540,6 +6437,140 @@ alt_merge_node_opt_info(OptNode* to, OptNode* add, OptEnv* env)
mml_alt_merge(&to->len, &add->len);
}
+static OnigLen
+node_max_byte_len(Node* node, ParseEnv* env)
+{
+ OnigLen len;
+ OnigLen tmax;
+
+ len = 0;
+ switch (NODE_TYPE(node)) {
+ case NODE_LIST:
+ do {
+ tmax = node_max_byte_len(NODE_CAR(node), env);
+ len = distance_add(len, tmax);
+ } while (IS_NOT_NULL(node = NODE_CDR(node)));
+ break;
+
+ case NODE_ALT:
+ do {
+ tmax = node_max_byte_len(NODE_CAR(node), env);
+ if (len < tmax) len = tmax;
+ } while (IS_NOT_NULL(node = NODE_CDR(node)));
+ break;
+
+ case NODE_STRING:
+ {
+ StrNode* sn = STR_(node);
+ len = (OnigLen )(sn->end - sn->s);
+ }
+ break;
+
+ case NODE_CTYPE:
+ case NODE_CCLASS:
+ len = ONIGENC_MBC_MAXLEN_DIST(env->enc);
+ break;
+
+ case NODE_BACKREF:
+ if (! NODE_IS_CHECKER(node)) {
+ int i;
+ int* backs;
+ MemEnv* mem_env = PARSEENV_MEMENV(env);
+ BackRefNode* br = BACKREF_(node);
+ if (NODE_IS_RECURSION(node)) {
+#ifdef USE_BACKREF_WITH_LEVEL
+ if (NODE_IS_NEST_LEVEL(node)) {
+ len = INFINITE_LEN;
+ }
+#endif
+ break;
+ }
+ backs = BACKREFS_P(br);
+ for (i = 0; i < br->back_num; i++) {
+ tmax = node_max_byte_len(mem_env[backs[i]].mem_node, env);
+ if (len < tmax) len = tmax;
+ }
+ }
+ break;
+
+#ifdef USE_CALL
+ case NODE_CALL:
+ if (! NODE_IS_RECURSION(node))
+ len = node_max_byte_len(NODE_BODY(node), env);
+ else
+ len = INFINITE_LEN;
+ break;
+#endif
+
+ case NODE_QUANT:
+ {
+ QuantNode* qn = QUANT_(node);
+
+ if (qn->upper != 0) {
+ len = node_max_byte_len(NODE_BODY(node), env);
+ if (len != 0) {
+ if (! IS_INFINITE_REPEAT(qn->upper))
+ len = distance_multiply(len, qn->upper);
+ else
+ len = INFINITE_LEN;
+ }
+ }
+ }
+ break;
+
+ case NODE_BAG:
+ {
+ BagNode* en = BAG_(node);
+ switch (en->type) {
+ case BAG_MEMORY:
+ if (NODE_IS_FIXED_MAX(node))
+ len = en->max_len;
+ else {
+ if (NODE_IS_MARK1(node))
+ len = INFINITE_LEN;
+ else {
+ NODE_STATUS_ADD(node, MARK1);
+ len = node_max_byte_len(NODE_BODY(node), env);
+ NODE_STATUS_REMOVE(node, MARK1);
+
+ en->max_len = len;
+ NODE_STATUS_ADD(node, FIXED_MAX);
+ }
+ }
+ break;
+
+ case BAG_OPTION:
+ case BAG_STOP_BACKTRACK:
+ len = node_max_byte_len(NODE_BODY(node), env);
+ break;
+ case BAG_IF_ELSE:
+ {
+ OnigLen tlen, elen;
+
+ len = node_max_byte_len(NODE_BODY(node), env);
+ if (IS_NOT_NULL(en->te.Then)) {
+ tlen = node_max_byte_len(en->te.Then, env);
+ len = distance_add(len, tlen);
+ }
+ if (IS_NOT_NULL(en->te.Else))
+ elen = node_max_byte_len(en->te.Else, env);
+ else elen = 0;
+
+ if (elen > len) len = elen;
+ }
+ break;
+ }
+ }
+ break;
+
+ case NODE_ANCHOR:
+ case NODE_GIMMICK:
+ default:
+ break;
+ }
+
+ return len;
+}
#define MAX_NODE_OPT_INFO_REF_COUNT 5
@@ -6822,22 +6853,22 @@ optimize_nodes(Node* node, OptNode* opt, OptEnv* env)
{
OptEnv nenv;
- copy_opt_env(&nenv, env);
- r = optimize_nodes(NODE_BAG_BODY(en), &xo, &nenv);
- if (r == 0) {
- mml_add(&nenv.mm, &xo.len);
- concat_left_node_opt_info(enc, opt, &xo);
- if (IS_NOT_NULL(en->te.Then)) {
- r = optimize_nodes(en->te.Then, &xo, &nenv);
- if (r == 0) {
- concat_left_node_opt_info(enc, opt, &xo);
+ if (IS_NOT_NULL(en->te.Else)) {
+ copy_opt_env(&nenv, env);
+ r = optimize_nodes(NODE_BAG_BODY(en), &xo, &nenv);
+ if (r == 0) {
+ mml_add(&nenv.mm, &xo.len);
+ concat_left_node_opt_info(enc, opt, &xo);
+ if (IS_NOT_NULL(en->te.Then)) {
+ r = optimize_nodes(en->te.Then, &xo, &nenv);
+ if (r == 0) {
+ concat_left_node_opt_info(enc, opt, &xo);
+ }
}
- }
- if (IS_NOT_NULL(en->te.Else)) {
- r = optimize_nodes(en->te.Else, &xo, env);
- if (r == 0)
- alt_merge_node_opt_info(opt, &xo, env);
+ r = optimize_nodes(en->te.Else, &xo, env);
+ if (r == 0)
+ alt_merge_node_opt_info(opt, &xo, env);
}
}
}
@@ -6930,7 +6961,7 @@ static void print_optimize_info(FILE* f, regex_t* reg);
#endif
static int
-set_optimize_info_from_tree(Node* node, regex_t* reg, ScanEnv* scan_env)
+set_optimize_info_from_tree(Node* node, regex_t* reg, ParseEnv* scan_env)
{
int r;
OptNode opt;
@@ -6985,6 +7016,7 @@ set_optimize_info_from_tree(Node* node, regex_t* reg, ScanEnv* scan_env)
#endif
return r;
}
+#endif /* ONIG_DONT_OPTIMIZE */
static void
clear_optimize_info(regex_t* reg)
@@ -7031,14 +7063,43 @@ static void print_enc_string(FILE* fp, OnigEncoding enc,
s++;
}
}
+}
- fprintf(fp, "/\n");
+static void
+print_options(FILE* fp, OnigOptionType o)
+{
+ if ((o & ONIG_OPTION_IGNORECASE) != 0) fprintf(fp, " IGNORECASE");
+ if ((o & ONIG_OPTION_EXTEND) != 0) fprintf(fp, " EXTEND");
+ if ((o & ONIG_OPTION_MULTILINE) != 0) fprintf(fp, " MULTILINE");
+ if ((o & ONIG_OPTION_SINGLELINE) != 0) fprintf(fp, " SINGLELINE");
+ if ((o & ONIG_OPTION_FIND_LONGEST) != 0) fprintf(fp, " FIND_LONGEST");
+ if ((o & ONIG_OPTION_FIND_NOT_EMPTY) != 0) fprintf(fp, " FIND_NOT_EMPTY");
+ if ((o & ONIG_OPTION_NEGATE_SINGLELINE) != 0) fprintf(fp, " NEGATE_SINGLELINE");
+ if ((o & ONIG_OPTION_DONT_CAPTURE_GROUP) != 0) fprintf(fp, " DONT_CAPTURE_GROUP");
+ if ((o & ONIG_OPTION_CAPTURE_GROUP) != 0) fprintf(fp, " CAPTURE_GROUP");
+ if ((o & ONIG_OPTION_NOTBOL) != 0) fprintf(fp, " NOTBOL");
+ if ((o & ONIG_OPTION_NOTEOL) != 0) fprintf(fp, " NOTEOL");
+ if ((o & ONIG_OPTION_POSIX_REGION) != 0) fprintf(fp, " POSIX_REGION");
+ if ((o & ONIG_OPTION_CHECK_VALIDITY_OF_STRING) != 0) fprintf(fp, " CHECK_VALIDITY_OF_STRING");
+ if ((o & ONIG_OPTION_IGNORECASE_IS_ASCII) != 0) fprintf(fp, " IGNORECASE_IS_ASCII");
+ if ((o & ONIG_OPTION_WORD_IS_ASCII) != 0) fprintf(fp, " WORD_IS_ASCII");
+ if ((o & ONIG_OPTION_DIGIT_IS_ASCII) != 0) fprintf(fp, " DIGIT_IS_ASCII");
+ if ((o & ONIG_OPTION_SPACE_IS_ASCII) != 0) fprintf(fp, " SPACE_IS_ASCII");
+ if ((o & ONIG_OPTION_POSIX_IS_ASCII) != 0) fprintf(fp, " POSIX_IS_ASCII");
+ if ((o & ONIG_OPTION_TEXT_SEGMENT_EXTENDED_GRAPHEME_CLUSTER) != 0) fprintf(fp, " TEXT_SEGMENT_EXTENDED_GRAPHEME_CLUSTER");
+ if ((o & ONIG_OPTION_TEXT_SEGMENT_WORD) != 0) fprintf(fp, " TEXT_SEGMENT_WORD");
+ if ((o & ONIG_OPTION_NOT_BEGIN_STRING) != 0) fprintf(fp, " NOT_BIGIN_STRING");
+ if ((o & ONIG_OPTION_NOT_END_STRING) != 0) fprintf(fp, " NOT_END_STRING");
+ if ((o & ONIG_OPTION_NOT_BEGIN_POSITION) != 0) fprintf(fp, " NOT_BEGIN_POSITION");
+ if ((o & ONIG_OPTION_CALLBACK_EACH_MATCH) != 0) fprintf(fp, " CALLBACK_EACH_MATCH");
}
#endif /* ONIG_DEBUG */
#if defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH)
+#ifndef ONIG_DONT_OPTIMIZE
+
static void
print_distance_range(FILE* f, OnigLen a, OnigLen b)
{
@@ -7161,7 +7222,8 @@ print_optimize_info(FILE* f, regex_t* reg)
}
}
}
-#endif
+#endif /* ONIG_DONT_OPTIMIZE */
+#endif /* defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH) */
extern RegexExt*
@@ -7259,93 +7321,150 @@ static void print_tree P_((FILE* f, Node* node));
extern int onig_init_for_match_at(regex_t* reg);
-extern int
-onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end,
- OnigErrorInfo* einfo)
-{
- int r;
- Node* root;
- ScanEnv scan_env;
+static int parse_and_tune(regex_t* reg, const UChar* pattern,
+ const UChar* pattern_end, ParseEnv *scan_env, Node** rroot,
+ OnigErrorInfo* einfo
#ifdef USE_CALL
- UnsetAddrList uslist = {0};
+ , UnsetAddrList* uslist
#endif
+)
+{
+ int r;
+ Node* root;
- root = 0;
+ root = NULL_NODE;
if (IS_NOT_NULL(einfo)) {
einfo->enc = reg->enc;
einfo->par = (UChar* )NULL;
}
-#ifdef ONIG_DEBUG
- fprintf(DBGFP, "\nPATTERN: /");
- print_enc_string(DBGFP, reg->enc, pattern, pattern_end);
-#endif
-
- if (reg->ops_alloc == 0) {
- r = ops_init(reg, OPS_INIT_SIZE);
- if (r != 0) goto end;
- }
- else
- reg->ops_used = 0;
-
- r = onig_parse_tree(&root, pattern, pattern_end, reg, &scan_env);
+ r = onig_parse_tree(&root, pattern, pattern_end, reg, scan_env);
if (r != 0) goto err;
r = reduce_string_list(root, reg->enc);
if (r != 0) goto err;
/* mixed use named group and no-named group */
- if (scan_env.num_named > 0 &&
- IS_SYNTAX_BV(scan_env.syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
+ if (scan_env->num_named > 0 &&
+ IS_SYNTAX_BV(scan_env->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
! OPTON_CAPTURE_GROUP(reg->options)) {
- if (scan_env.num_named != scan_env.num_mem)
- r = disable_noname_group_capture(&root, reg, &scan_env);
+ if (scan_env->num_named != scan_env->num_mem)
+ r = disable_noname_group_capture(&root, reg, scan_env);
else
r = numbered_ref_check(root);
if (r != 0) goto err;
}
- r = check_backrefs(root, &scan_env);
+ r = check_backrefs(root, scan_env);
if (r != 0) goto err;
#ifdef USE_CALL
- if (scan_env.num_call > 0) {
- r = unset_addr_list_init(&uslist, scan_env.num_call);
+ if (scan_env->num_call > 0) {
+ r = unset_addr_list_init(uslist, scan_env->num_call);
if (r != 0) goto err;
- scan_env.unset_addr_list = &uslist;
- r = tune_call(root, &scan_env, 0);
+ scan_env->unset_addr_list = uslist;
+ r = tune_call(root, scan_env, 0);
if (r != 0) goto err_unset;
r = tune_call2(root);
if (r != 0) goto err_unset;
- r = recursive_call_check_trav(root, &scan_env, 0);
+ r = recursive_call_check_trav(root, scan_env, 0);
if (r < 0) goto err_unset;
- r = infinite_recursive_call_check_trav(root, &scan_env);
+ r = infinite_recursive_call_check_trav(root, scan_env);
if (r != 0) goto err_unset;
tune_called_state(root, 0);
}
- reg->num_call = scan_env.num_call;
+ reg->num_call = scan_env->num_call;
#endif
#ifdef ONIG_DEBUG_PARSE
- fprintf(DBGFP, "MAX PARSE DEPTH: %d\n", scan_env.max_parse_depth);
- fprintf(DBGFP, "TREE (parsed)\n");
- print_tree(DBGFP, root);
- fprintf(DBGFP, "\n");
+ fprintf(DBGFP, "MAX PARSE DEPTH: %d\n", scan_env->max_parse_depth);
#endif
- r = tune_tree(root, reg, 0, &scan_env);
- if (r != 0) goto err_unset;
+ r = tune_tree(root, reg, 0, scan_env);
+ if (r != 0) {
+#ifdef ONIG_DEBUG_PARSE
+ fprintf(DBGFP, "TREE (error in tune)\n");
+ print_tree(DBGFP, root);
+ fprintf(DBGFP, "\n");
+#endif
+ goto err_unset;
+ }
- if (scan_env.backref_num != 0) {
+ if (scan_env->backref_num != 0) {
set_parent_node_trav(root, NULL_NODE);
- r = set_empty_repeat_node_trav(root, NULL_NODE, &scan_env);
+ r = set_empty_repeat_node_trav(root, NULL_NODE, scan_env);
if (r != 0) goto err_unset;
- set_empty_status_check_trav(root, &scan_env);
+ set_empty_status_check_trav(root, scan_env);
}
+ *rroot = root;
+ return r;
+
+ err_unset:
+#ifdef USE_CALL
+ if (scan_env->num_call > 0) {
+ unset_addr_list_end(uslist);
+ }
+#endif
+ err:
+ if (IS_NOT_NULL(scan_env->error)) {
+ if (IS_NOT_NULL(einfo)) {
+ einfo->par = scan_env->error;
+ einfo->par_end = scan_env->error_end;
+ }
+ }
+
+ onig_node_free(root);
+ if (IS_NOT_NULL(scan_env->mem_env_dynamic))
+ xfree(scan_env->mem_env_dynamic);
+
+ *rroot = NULL_NODE;
+ return r;
+}
+
+extern int
+onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end,
+ OnigErrorInfo* einfo)
+{
+ int r;
+ Node* root;
+ ParseEnv scan_env;
+#ifdef USE_CALL
+ UnsetAddrList uslist = {0};
+#endif
+
+#ifdef ONIG_DEBUG
+ fprintf(DBGFP, "\nPATTERN: /");
+ print_enc_string(DBGFP, reg->enc, pattern, pattern_end);
+ fprintf(DBGFP, "/\n");
+ fprintf(DBGFP, "OPTIONS:");
+ print_options(DBGFP, reg->options);
+ fprintf(DBGFP, "\n");
+#endif
+
+ if (reg->ops_alloc == 0) {
+ r = ops_init(reg, OPS_INIT_SIZE);
+ if (r != 0) {
+ if (IS_NOT_NULL(einfo)) {
+ einfo->enc = reg->enc;
+ einfo->par = (UChar* )NULL;
+ }
+ return r;
+ }
+ }
+ else
+ reg->ops_used = 0;
+
+ r = parse_and_tune(reg, pattern, pattern_end, &scan_env, &root, einfo
+#ifdef USE_CALL
+ , &uslist
+#endif
+ );
+ if (r != 0) return r;
+
#ifdef ONIG_DEBUG_PARSE
fprintf(DBGFP, "TREE (after tune)\n");
print_tree(DBGFP, root);
@@ -7377,7 +7496,14 @@ onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end,
clear_optimize_info(reg);
#ifndef ONIG_DONT_OPTIMIZE
r = set_optimize_info_from_tree(root, reg, &scan_env);
- if (r != 0) goto err_unset;
+ if (r != 0) {
+#ifdef USE_CALL
+ if (scan_env.num_call > 0) {
+ unset_addr_list_end(&uslist);
+ }
+#endif
+ goto err;
+ }
#endif
if (IS_NOT_NULL(scan_env.mem_env_dynamic)) {
@@ -7407,6 +7533,9 @@ onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end,
}
#endif
+ r = ops_resize(reg, reg->ops_used);
+ if (r != ONIG_NORMAL) goto err;
+
set_addr_in_repeat_range(reg);
if ((reg->push_mem_end != 0)
@@ -7449,15 +7578,8 @@ onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end,
onig_init_for_match_at(reg);
#endif
- end:
return r;
- err_unset:
-#ifdef USE_CALL
- if (scan_env.num_call > 0) {
- unset_addr_list_end(&uslist);
- }
-#endif
err:
if (IS_NOT_NULL(scan_env.error)) {
if (IS_NOT_NULL(einfo)) {
@@ -7513,6 +7635,12 @@ onig_reg_init(regex_t* reg, OnigOptionType option, OnigCaseFoldType case_fold_fl
else
option |= syntax->options;
+ if ((option & ONIG_OPTION_IGNORECASE_IS_ASCII) != 0) {
+ case_fold_flag &= ~(INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR |
+ ONIGENC_CASE_FOLD_TURKISH_AZERI);
+ case_fold_flag |= ONIGENC_CASE_FOLD_ASCII_ONLY;
+ }
+
(reg)->enc = enc;
(reg)->options = option;
(reg)->syntax = syntax;
@@ -7703,15 +7831,145 @@ onig_is_code_in_cc(OnigEncoding enc, OnigCodePoint code, CClassNode* cc)
return onig_is_code_in_cc_len(len, code, cc);
}
+
+#define MANY_REPEAT_OF_ANYCHAR 20 /* threshold on a quantifier's finite upper bound, compared against qn->upper by the slow-pattern checks in this file */
+
+typedef enum {
+ MJ_NO = 0, /* a non-anychar element was found (char class, non-empty string, other ctype, ...) */
+ MJ_YES = 1, /* an any-char element was found and nothing contradicted it */
+ MJ_IGNORE = 2, /* element is neutral for the classification (look-around, back-reference, call, empty string, ...) */
+} MJ_RESULT;
+/*
+ * Classify the sub-tree rooted at `node` as "mostly just any-char" or not.
+ *
+ * node:         parse-tree node to inspect (walked recursively).
+ * in_reluctant: flag handed down through the recursion. It may be switched
+ *               to TRUE at a quantifier, but in this function it is only
+ *               forwarded to the recursive calls and never changes the
+ *               value that is returned.
+ *
+ * Returns:
+ *   MJ_YES    - an any-char ctype was found and no element answered MJ_NO
+ *               before it settled the result.
+ *   MJ_NO     - a non-anychar element was found, or the node type is not
+ *               handled by the switch (r keeps its initial value).
+ *   MJ_IGNORE - the sub-tree contains only neutral elements.
+ *
+ * Per node type:
+ *   LIST     - stops at the first MJ_NO; otherwise the last element's
+ *              result, promoted to MJ_YES when that result is MJ_IGNORE
+ *              and an earlier element was MJ_YES.
+ *   ALT      - stops at the first MJ_YES; otherwise the result of the last
+ *              alternative.
+ *   QUANT    - MJ_IGNORE when upper == 0, else the result of the body.
+ *   ANCHOR   - MJ_IGNORE for look-ahead, look-behind and text segment
+ *              boundary anchors; MJ_NO for every other anchor type.
+ *   BAG      - for if-else, the Then branch and (unless Then is MJ_YES) the
+ *              Else branch are examined, not the condition body; for other
+ *              bags the result of the body.
+ *   CTYPE    - MJ_YES only for CTYPE_ANYCHAR.
+ *   STRING   - MJ_IGNORE when empty, else MJ_NO.  CCLASS - MJ_NO.
+ *   CALL (with USE_CALL), BACKREF, GIMMICK - MJ_IGNORE.
+ */
+static MJ_RESULT
+mostly_just_anychar(Node* node, int in_reluctant)
+{
+ MJ_RESULT r;
+
+ r = MJ_NO; /* default for node types / anchor types not handled below */
+ switch (NODE_TYPE(node)) {
+ case NODE_LIST:
+ {
+ int found = FALSE; /* TRUE once any element answered MJ_YES */
+ do {
+ r = mostly_just_anychar(NODE_CAR(node), in_reluctant);
+ if (r == MJ_NO) break; /* one non-anychar element decides the whole list */
+ if (r == MJ_YES) found = TRUE;
+ } while (IS_NOT_NULL(node = NODE_CDR(node)));
+ if (r == MJ_IGNORE) {
+ if (found == TRUE) r = MJ_YES; /* trailing neutral element must not hide an earlier MJ_YES */
+ }
+ }
+ break;
+
+ case NODE_ALT:
+ r = MJ_IGNORE; /* always overwritten: the do-while body below runs at least once */
+ do {
+ r = mostly_just_anychar(NODE_CAR(node), in_reluctant);
+ if (r == MJ_YES) break; /* one any-char alternative is enough */
+ } while (IS_NOT_NULL(node = NODE_CDR(node)));
+ break;
+
+ case NODE_QUANT:
+ {
+ QuantNode* qn = QUANT_(node);
+
+ if (qn->upper == 0)
+ r = MJ_IGNORE; /* upper bound 0: classified as neutral, body is not examined */
+ else {
+ if (in_reluctant == FALSE) {
+ /* greedy quantifier with a finite upper bound of at most
+ MANY_REPEAT_OF_ANYCHAR sets the flag for the whole body */
+ if (qn->greedy != 0 &&
+ (! IS_INFINITE_REPEAT(qn->upper) &&
+ qn->upper <= MANY_REPEAT_OF_ANYCHAR)) {
+ in_reluctant = TRUE;
+ }
+ }
+ r = mostly_just_anychar(NODE_BODY(node), in_reluctant);
+ }
+ }
+ break;
+
+ case NODE_ANCHOR:
+ switch (ANCHOR_(node)->type) {
+ case ANCR_PREC_READ:
+ case ANCR_PREC_READ_NOT:
+ case ANCR_LOOK_BEHIND:
+ case ANCR_LOOK_BEHIND_NOT:
+ case ANCR_TEXT_SEGMENT_BOUNDARY: /* \y */
+ r = MJ_IGNORE;
+ break;
+ default:
+ break; /* other anchors keep the initial MJ_NO */
+ }
+ break;
+
+ case NODE_BAG:
+ {
+ BagNode* en = BAG_(node);
+
+ if (en->type == BAG_IF_ELSE) {
+ /* only the Then/Else branches are classified here; if both are
+ NULL, r keeps its initial MJ_NO */
+ if (IS_NOT_NULL(en->te.Then)) {
+ r = mostly_just_anychar(en->te.Then, in_reluctant);
+ if (r == MJ_YES) break;
+ }
+ if (IS_NOT_NULL(en->te.Else)) {
+ r = mostly_just_anychar(en->te.Else, in_reluctant);
+ }
+ }
+ else {
+ r = mostly_just_anychar(NODE_BODY(node), in_reluctant);
+ }
+ }
+ break;
+
+ case NODE_CTYPE:
+ if (CTYPE_(node)->ctype == CTYPE_ANYCHAR)
+ r = MJ_YES;
+ else
+ r = MJ_NO;
+ break;
+
+ case NODE_STRING:
+ if (NODE_STRING_LEN(node) == 0) {
+ r = MJ_IGNORE;
+ break;
+ }
+ /* fall through: a non-empty string is handled like a char class */
+ case NODE_CCLASS:
+ r = MJ_NO;
+ break;
+
+#ifdef USE_CALL
+ case NODE_CALL:
+ /* ignore call: the called group's body is not followed here */
+#endif
+ case NODE_BACKREF:
+ case NODE_GIMMICK:
+ r = MJ_IGNORE;
+ break;
+
+ default:
+ break;
+ }
+
+ return r;
+}
+
+#define MAX_CALLS_IN_DETECT 10 /* capacity of the calls[] array used to remember already-followed called groups during detection */
+
typedef struct {
int prec_read;
int look_behind;
+ int backref; /* back-references that are not nest-level back-references */
int backref_with_level;
int call;
+ int anychar_reluctant_many; /* quantifiers (infinite or upper > MANY_REPEAT_OF_ANYCHAR) whose body mostly_just_anychar() classified as MJ_YES */
+ int empty_check_nest_level; /* current nesting depth of quantifiers whose emptiness is not BODY_IS_NOT_EMPTY */
+ int max_empty_check_nest_level; /* highest value empty_check_nest_level reached during the walk */
+ int heavy_element; /* count of constructs judged especially costly; weighted by 10 in the final score */
} SlowElementCount;
static int
-node_detect_can_be_slow(Node* node, SlowElementCount* ct)
+detect_can_be_slow(Node* node, SlowElementCount* ct, int ncall, int calls[])
{
int r;
@@ -7720,13 +7978,45 @@ node_detect_can_be_slow(Node* node, SlowElementCount* ct)
case NODE_LIST:
case NODE_ALT:
do {
- r = node_detect_can_be_slow(NODE_CAR(node), ct);
+ r = detect_can_be_slow(NODE_CAR(node), ct, ncall, calls);
if (r != 0) return r;
} while (IS_NOT_NULL(node = NODE_CDR(node)));
break;
case NODE_QUANT:
- r = node_detect_can_be_slow(NODE_BODY(node), ct);
+ {
+ int prev_heavy_element;
+ QuantNode* qn;
+ Node* body;
+
+ qn = QUANT_(node);
+ body = NODE_BODY(node);
+
+ if (qn->emptiness != BODY_IS_NOT_EMPTY) {
+ prev_heavy_element = ct->heavy_element;
+ ct->empty_check_nest_level++;
+ if (ct->empty_check_nest_level > ct->max_empty_check_nest_level)
+ ct->max_empty_check_nest_level = ct->empty_check_nest_level;
+ }
+ else if (IS_INFINITE_REPEAT(qn->upper) ||
+ qn->upper > MANY_REPEAT_OF_ANYCHAR) {
+ MJ_RESULT mr = mostly_just_anychar(body, (qn->greedy == 0));
+ if (mr == MJ_YES)
+ ct->anychar_reluctant_many++;
+ }
+
+ r = detect_can_be_slow(body, ct, ncall, calls);
+
+ if (qn->emptiness != BODY_IS_NOT_EMPTY) {
+ if (NODE_IS_INPEEK(node)) {
+ if (ct->empty_check_nest_level > 2) {
+ if (prev_heavy_element == ct->heavy_element)
+ ct->heavy_element++;
+ }
+ }
+ ct->empty_check_nest_level--;
+ }
+ }
break;
case NODE_ANCHOR:
@@ -7744,23 +8034,23 @@ node_detect_can_be_slow(Node* node, SlowElementCount* ct)
}
if (ANCHOR_HAS_BODY(ANCHOR_(node)))
- r = node_detect_can_be_slow(NODE_BODY(node), ct);
+ r = detect_can_be_slow(NODE_BODY(node), ct, ncall, calls);
break;
case NODE_BAG:
{
BagNode* en = BAG_(node);
- r = node_detect_can_be_slow(NODE_BODY(node), ct);
+ r = detect_can_be_slow(NODE_BODY(node), ct, ncall, calls);
if (r != 0) return r;
if (en->type == BAG_IF_ELSE) {
if (IS_NOT_NULL(en->te.Then)) {
- r = node_detect_can_be_slow(en->te.Then, ct);
+ r = detect_can_be_slow(en->te.Then, ct, ncall, calls);
if (r != 0) return r;
}
if (IS_NOT_NULL(en->te.Else)) {
- r = node_detect_can_be_slow(en->te.Else, ct);
+ r = detect_can_be_slow(en->te.Else, ct, ncall, calls);
if (r != 0) return r;
}
}
@@ -7771,12 +8061,44 @@ node_detect_can_be_slow(Node* node, SlowElementCount* ct)
case NODE_BACKREF:
if (NODE_IS_NEST_LEVEL(node))
ct->backref_with_level++;
+ else
+ ct->backref++;
break;
#endif
#ifdef USE_CALL
case NODE_CALL:
- ct->call++;
+ {
+ int i;
+ int found;
+ int gnum;
+
+ gnum = CALL_(node)->called_gnum;
+ ct->call++;
+
+ if (NODE_IS_RECURSION(node) && NODE_IS_INPEEK(node) &&
+ NODE_IS_IN_REAL_REPEAT(node)) {
+ ct->heavy_element += 10;
+ }
+
+ found = FALSE;
+ for (i = 0; i < ncall; i++) {
+ if (gnum == calls[i]) {
+ found = TRUE;
+ break;
+ }
+ }
+
+ if (! found) {
+ if (ncall + 1 < MAX_CALLS_IN_DETECT) {
+ calls[ncall] = gnum;
+ r = detect_can_be_slow(NODE_BODY(node), ct, ncall + 1, calls);
+ }
+ else {
+ ct->heavy_element++;
+ }
+ }
+ }
break;
#endif
@@ -7795,8 +8117,12 @@ onig_detect_can_be_slow_pattern(const UChar* pattern,
int r;
regex_t* reg;
Node* root;
- ScanEnv scan_env;
+ ParseEnv scan_env;
SlowElementCount count;
+ int calls[MAX_CALLS_IN_DETECT];
+#ifdef USE_CALL
+ UnsetAddrList uslist = {0};
+#endif
reg = (regex_t* )xmalloc(sizeof(regex_t));
if (IS_NULL(reg)) return ONIGERR_MEMORY;
@@ -7807,25 +8133,44 @@ onig_detect_can_be_slow_pattern(const UChar* pattern,
return r;
}
- root = 0;
- r = onig_parse_tree(&root, pattern, pattern_end, reg, &scan_env);
+ r = parse_and_tune(reg, pattern, pattern_end, &scan_env, &root, NULL
+#ifdef USE_CALL
+ , &uslist
+#endif
+ );
+ if (r != 0) goto err;
+
+#ifdef USE_CALL
+ if (scan_env.num_call > 0) {
+ unset_addr_list_end(&uslist);
+ }
+#endif
+
+ count.prec_read = 0;
+ count.look_behind = 0;
+ count.backref = 0;
+ count.backref_with_level = 0;
+ count.call = 0;
+ count.anychar_reluctant_many = 0;
+ count.empty_check_nest_level = 0;
+ count.max_empty_check_nest_level = 0;
+ count.heavy_element = 0;
+
+ r = detect_can_be_slow(root, &count, 0, calls);
if (r == 0) {
- count.prec_read = 0;
- count.look_behind = 0;
- count.backref_with_level = 0;
- count.call = 0;
-
- r = node_detect_can_be_slow(root, &count);
- if (r == 0) {
- int n = count.prec_read + count.look_behind
- + count.backref_with_level + count.call;
- r = n;
- }
+ int n = count.prec_read + count.look_behind
+ + count.backref + count.backref_with_level + count.call
+ + count.anychar_reluctant_many;
+ if (count.heavy_element != 0)
+ n += count.heavy_element * 10;
+
+ r = n;
}
if (IS_NOT_NULL(scan_env.mem_env_dynamic))
xfree(scan_env.mem_env_dynamic);
+ err:
onig_node_free(root);
onig_free(reg);
return r;
@@ -7853,6 +8198,8 @@ Indent(FILE* f, int indent)
static void
print_indent_tree(FILE* f, Node* node, int indent)
{
+ static char* emptiness_name[] = { "", " empty", " empty_mem", " empty_rec" };
+
int i;
NodeType type;
UChar* p;
@@ -8019,69 +8366,83 @@ print_indent_tree(FILE* f, Node* node, int indent)
fprintf(f, "<call:%p>", node);
fprintf(f, " num: %d, name", cn->called_gnum);
p_string(f, cn->name_end - cn->name, cn->name);
+ if (NODE_IS_RECURSION(node)) fprintf(f, ", recursion");
+ if (NODE_IS_INPEEK(node)) fprintf(f, ", in-peek");
+ if (NODE_IS_IN_REAL_REPEAT(node)) fprintf(f, ", in-real-repeat");
}
break;
#endif
case NODE_QUANT:
- fprintf(f, "<quantifier:%p>{%d,%d}%s%s\n", node,
- QUANT_(node)->lower, QUANT_(node)->upper,
- (QUANT_(node)->greedy ? "" : "?"),
- QUANT_(node)->include_referred == 0 ? "" : " referred");
- print_indent_tree(f, NODE_BODY(node), indent + add);
+ {
+ fprintf(f, "<quantifier:%p>{%d,%d}%s%s%s", node,
+ QUANT_(node)->lower, QUANT_(node)->upper,
+ (QUANT_(node)->greedy ? "" : "?"),
+ QUANT_(node)->include_referred == 0 ? "" : " referred",
+ emptiness_name[QUANT_(node)->emptiness]);
+ if (NODE_IS_INPEEK(node)) fprintf(f, ", in-peek");
+ fprintf(f, "\n");
+ print_indent_tree(f, NODE_BODY(node), indent + add);
+ }
break;
case NODE_BAG:
- fprintf(f, "<bag:%p> ", node);
- if (BAG_(node)->type == BAG_IF_ELSE) {
- Node* Then;
- Node* Else;
- BagNode* bn;
-
- bn = BAG_(node);
- fprintf(f, "if-else\n");
- print_indent_tree(f, NODE_BODY(node), indent + add);
+ {
+ BagNode* bn = BAG_(node);
+ fprintf(f, "<bag:%p> ", node);
+ if (bn->type == BAG_IF_ELSE) {
+ Node* Then;
+ Node* Else;
+
+ fprintf(f, "if-else\n");
+ print_indent_tree(f, NODE_BODY(node), indent + add);
+
+ Then = bn->te.Then;
+ Else = bn->te.Else;
+ if (IS_NULL(Then)) {
+ Indent(f, indent + add);
+ fprintf(f, "THEN empty\n");
+ }
+ else
+ print_indent_tree(f, Then, indent + add);
- Then = bn->te.Then;
- Else = bn->te.Else;
- if (IS_NULL(Then)) {
- Indent(f, indent + add);
- fprintf(f, "THEN empty\n");
+ if (IS_NULL(Else)) {
+ Indent(f, indent + add);
+ fprintf(f, "ELSE empty\n");
+ }
+ else
+ print_indent_tree(f, Else, indent + add);
}
- else
- print_indent_tree(f, Then, indent + add);
+ else {
+ switch (bn->type) {
+ case BAG_OPTION:
+ fprintf(f, "option:%d", bn->o.options);
+ break;
+ case BAG_MEMORY:
+ fprintf(f, "memory:%d", bn->m.regnum);
+ if (NODE_IS_CALLED(node)) {
+ fprintf(f, ", called");
+ if (NODE_IS_RECURSION(node))
+ fprintf(f, ", recursion");
+ }
+ else if (NODE_IS_REFERENCED(node))
+ fprintf(f, ", referenced");
- if (IS_NULL(Else)) {
- Indent(f, indent + add);
- fprintf(f, "ELSE empty\n");
+ if (NODE_IS_FIXED_ADDR(node))
+ fprintf(f, ", fixed-addr");
+ if ((bn->m.called_state & IN_PEEK) != 0)
+ fprintf(f, ", in-peek");
+ break;
+ case BAG_STOP_BACKTRACK:
+ fprintf(f, "stop-bt");
+ break;
+ default:
+ break;
+ }
+ fprintf(f, "\n");
+ print_indent_tree(f, NODE_BODY(node), indent + add);
}
- else
- print_indent_tree(f, Else, indent + add);
-
- break;
}
-
- switch (BAG_(node)->type) {
- case BAG_OPTION:
- fprintf(f, "option:%d", BAG_(node)->o.options);
- break;
- case BAG_MEMORY:
- fprintf(f, "memory:%d", BAG_(node)->m.regnum);
- if (NODE_IS_CALLED(node))
- fprintf(f, ", called");
- else if (NODE_IS_REFERENCED(node))
- fprintf(f, ", referenced");
- if (NODE_IS_FIXED_ADDR(node))
- fprintf(f, ", fixed-addr");
- break;
- case BAG_STOP_BACKTRACK:
- fprintf(f, "stop-bt");
- break;
- default:
- break;
- }
- fprintf(f, "\n");
- print_indent_tree(f, NODE_BODY(node), indent + add);
break;
case NODE_GIMMICK:
diff --git a/src/regenc.c b/src/regenc.c
index 27e4549..84afd1e 100644
--- a/src/regenc.c
+++ b/src/regenc.c
@@ -2,7 +2,7 @@
regenc.c - Oniguruma (regular expression library)
**********************************************************************/
/*-
- * Copyright (c) 2002-2019 K.Kosako
+ * Copyright (c) 2002-2020 K.Kosako
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -569,6 +569,9 @@ onigenc_apply_all_case_fold_with_map(int map_size,
r = onigenc_ascii_apply_all_case_fold(flag, f, arg);
if (r != 0) return r;
+ if (CASE_FOLD_IS_ASCII_ONLY(flag))
+ return 0;
+
for (i = 0; i < map_size; i++) {
code = map[i].to;
r = (*f)(map[i].from, &code, 1, arg);
@@ -588,7 +591,7 @@ onigenc_apply_all_case_fold_with_map(int map_size,
extern int
onigenc_get_case_fold_codes_by_str_with_map(int map_size,
const OnigPairCaseFoldCodes map[],
- int ess_tsett_flag, OnigCaseFoldType flag ARG_UNUSED,
+ int ess_tsett_flag, OnigCaseFoldType flag,
const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem items[])
{
int i, j, n;
@@ -596,7 +599,8 @@ onigenc_get_case_fold_codes_by_str_with_map(int map_size,
if (0x41 <= *p && *p <= 0x5a) { /* A - Z */
if (*p == LARGE_S && ess_tsett_flag != 0 && end > p + 1
- && (*(p+1) == LARGE_S || *(p+1) == SMALL_S)) { /* SS */
+ && (*(p+1) == LARGE_S || *(p+1) == SMALL_S) /* SS */
+ && CASE_FOLD_IS_NOT_ASCII_ONLY(flag)) {
ss_combination:
items[0].byte_len = 2;
items[0].code_len = 1;
@@ -625,7 +629,8 @@ onigenc_get_case_fold_codes_by_str_with_map(int map_size,
}
else if (0x61 <= *p && *p <= 0x7a) { /* a - z */
if (*p == SMALL_S && ess_tsett_flag != 0 && end > p + 1
- && (*(p+1) == SMALL_S || *(p+1) == LARGE_S)) {
+ && (*(p+1) == SMALL_S || *(p+1) == LARGE_S)
+ && CASE_FOLD_IS_NOT_ASCII_ONLY(flag)) {
goto ss_combination;
}
@@ -634,7 +639,8 @@ onigenc_get_case_fold_codes_by_str_with_map(int map_size,
items[0].code[0] = (OnigCodePoint )(*p - 0x20);
return 1;
}
- else if (*p == 0xdf && ess_tsett_flag != 0) {
+ else if (*p == 0xdf && ess_tsett_flag != 0
+ && CASE_FOLD_IS_NOT_ASCII_ONLY(flag)) {
items[0].byte_len = 1;
items[0].code_len = 2;
items[0].code[0] = (OnigCodePoint )'s';
@@ -660,6 +666,9 @@ onigenc_get_case_fold_codes_by_str_with_map(int map_size,
else {
int i;
+ if (CASE_FOLD_IS_ASCII_ONLY(flag))
+ return 0;
+
for (i = 0; i < map_size; i++) {
if (*p == map[i].from) {
items[0].byte_len = 1;
diff --git a/src/regenc.h b/src/regenc.h
index d183b97..d0b447d 100644
--- a/src/regenc.h
+++ b/src/regenc.h
@@ -142,6 +142,10 @@ struct PropertyNameCtype {
#define ENC_GET_SKIP_OFFSET(enc) \
(((enc)->flag & ENC_FLAG_SKIP_OFFSET_MASK)>>2)
+#define CASE_FOLD_IS_ASCII_ONLY(flag) /* non-zero when the ONIGENC_CASE_FOLD_ASCII_ONLY bit is set in flag */ \
+ (((flag) & ONIGENC_CASE_FOLD_ASCII_ONLY) != 0)
+#define CASE_FOLD_IS_NOT_ASCII_ONLY(flag) /* exact negation of CASE_FOLD_IS_ASCII_ONLY(flag) */ \
+ (((flag) & ONIGENC_CASE_FOLD_ASCII_ONLY) == 0)
/* for encoding system implementation (internal) */
extern int onigenc_end(void);
@@ -202,12 +206,12 @@ extern int onigenc_wb_is_break_position P_((OnigEncoding enc, UChar* p, UChar* p
#define FOLDS1_UNFOLDS_NUM(i) (OnigUnicodeFolds1[(i)+1])
#define FOLDS2_UNFOLDS_NUM(i) (OnigUnicodeFolds2[(i)+2])
#define FOLDS3_UNFOLDS_NUM(i) (OnigUnicodeFolds3[(i)+3])
-#define FOLDS1_UNFOLDS(i) (OnigUnicodeFolds1 + (i) + 2)
-#define FOLDS2_UNFOLDS(i) (OnigUnicodeFolds2 + (i) + 3)
-#define FOLDS3_UNFOLDS(i) (OnigUnicodeFolds3 + (i) + 4)
-#define FOLDS1_NEXT_INDEX(i) ((i) + 2 + OnigUnicodeFolds1[(i)+1])
-#define FOLDS2_NEXT_INDEX(i) ((i) + 3 + OnigUnicodeFolds2[(i)+2])
-#define FOLDS3_NEXT_INDEX(i) ((i) + 4 + OnigUnicodeFolds3[(i)+3])
+#define FOLDS1_UNFOLDS(i) (FOLDS1_FOLD(i) + 2)
+#define FOLDS2_UNFOLDS(i) (FOLDS2_FOLD(i) + 3)
+#define FOLDS3_UNFOLDS(i) (FOLDS3_FOLD(i) + 4)
+#define FOLDS1_NEXT_INDEX(i) ((i) + 2 + FOLDS1_UNFOLDS_NUM(i))
+#define FOLDS2_NEXT_INDEX(i) ((i) + 3 + FOLDS2_UNFOLDS_NUM(i))
+#define FOLDS3_NEXT_INDEX(i) ((i) + 4 + FOLDS3_UNFOLDS_NUM(i))
#define FOLDS_FOLD_ADDR_BUK(buk, addr) do {\
if ((buk)->fold_len == 1)\
diff --git a/src/regerror.c b/src/regerror.c
index dc1c8b6..18a5bdd 100644
--- a/src/regerror.c
+++ b/src/regerror.c
@@ -2,7 +2,7 @@
regerror.c - Oniguruma (regular expression library)
**********************************************************************/
/*-
- * Copyright (c) 2002-2020 K.Kosako
+ * Copyright (c) 2002-2021 K.Kosako
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -146,6 +146,8 @@ onig_error_code_to_format(int code)
p = "too big wide-char value"; break;
case ONIGERR_TOO_LONG_WIDE_CHAR_VALUE:
p = "too long wide-char value"; break;
+ case ONIGERR_UNDEFINED_OPERATOR:
+ p = "undefined operator"; break;
case ONIGERR_INVALID_CODE_POINT_VALUE:
p = "invalid code point value"; break;
case ONIGERR_EMPTY_GROUP_NAME:
@@ -190,6 +192,8 @@ onig_error_code_to_format(int code)
p = "not supported encoding combination"; break;
case ONIGERR_INVALID_COMBINATION_OF_OPTIONS:
p = "invalid combination of options"; break;
+ case ONIGERR_VERY_INEFFICIENT_PATTERN:
+ p = "very inefficient pattern"; break;
case ONIGERR_LIBRARY_IS_NOT_INITIALIZED:
p = "library is not initialized"; break;
diff --git a/src/regexec.c b/src/regexec.c
index bb6b474..a3cf60a 100644
--- a/src/regexec.c
+++ b/src/regexec.c
@@ -2,7 +2,7 @@
regexec.c - Oniguruma (regular expression library)
**********************************************************************/
/*-
- * Copyright (c) 2002-2020 K.Kosako
+ * Copyright (c) 2002-2021 K.Kosako
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -54,6 +54,13 @@
(MEM_STATUS_AT((reg)->push_mem_end, (idx)) != 0 ? \
STACK_AT(mem_end_stk[idx].i)->u.mem.pstr : mem_end_stk[idx].s)
+#ifdef _MSC_VER /* NOTE(review): presumably added to silence MSVC conversion warnings - confirm against the call sites */
+#define DIST_CAST(d) (size_t )(d) /* MSVC build: the value is cast to size_t */
+#else
+#define DIST_CAST(d) (d) /* other compilers: the value is passed through unchanged */
+#endif
+
+
static int forward_search(regex_t* reg, const UChar* str, const UChar* end, UChar* start, UChar* range, UChar** low, UChar** high);
static int
@@ -76,11 +83,12 @@ struct OnigMatchParamStruct {
unsigned long retry_limit_in_match;
unsigned long retry_limit_in_search;
#endif
+
+ void* callout_user_data; /* used in callback each match */
#ifdef USE_CALLOUT
OnigCalloutFunc progress_callout_of_contents;
OnigCalloutFunc retraction_callout_of_contents;
int match_at_call_counter;
- void* callout_user_data;
CalloutData* callout_data;
int callout_data_alloc_num;
#endif
@@ -143,12 +151,8 @@ onig_set_retraction_callout_of_match_param(OnigMatchParam* param, OnigCalloutFun
extern int
onig_set_callout_user_data_of_match_param(OnigMatchParam* param, void* user_data)
{
-#ifdef USE_CALLOUT
param->callout_user_data = user_data;
return ONIG_NORMAL;
-#else
- return ONIG_NO_SUPPORT_CONFIG;
-#endif
}
@@ -873,6 +877,23 @@ onig_get_capture_tree(OnigRegion* region)
}
#endif /* USE_CAPTURE_HISTORY */
+
+static OnigCallbackEachMatchFunc CallbackEachMatch; /* file-scope hook shared by all matches; static storage, so it is NULL until onig_set_callback_each_match() stores a function */
+/*
+ * Return the currently registered "callback each match" function.
+ *
+ * Returns the value of the file-scope CallbackEachMatch pointer as is;
+ * that is NULL when no callback has been stored.
+ */
+extern OnigCallbackEachMatchFunc
+onig_get_callback_each_match(void)
+{
+ return CallbackEachMatch;
+}
+/*
+ * Register the "callback each match" function.
+ *
+ * f: function to store in the file-scope CallbackEachMatch pointer. The
+ *    value is stored without validation, so passing NULL clears the hook.
+ *
+ * Always returns ONIG_NORMAL. The assignment is a plain store; no locking
+ * is performed in this function.
+ */
+extern int
+onig_set_callback_each_match(OnigCallbackEachMatchFunc f)
+{
+ CallbackEachMatch = f;
+ return ONIG_NORMAL;
+}
+
+
extern void
onig_region_clear(OnigRegion* region)
{
@@ -1238,7 +1259,7 @@ struct OnigCalloutArgsStruct {
#ifdef USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE
#define MATCH_ARG_INIT(msa, reg, arg_option, arg_region, arg_start, mpv) do { \
(msa).stack_p = (void* )0;\
- (msa).options = (arg_option);\
+ (msa).options = (arg_option)|(reg)->options;\
(msa).region = (arg_region);\
(msa).start = (arg_start);\
(msa).match_stack_limit = (mpv)->match_stack_limit;\
@@ -1251,7 +1272,7 @@ struct OnigCalloutArgsStruct {
#else
#define MATCH_ARG_INIT(msa, reg, arg_option, arg_region, arg_start, mpv) do { \
(msa).stack_p = (void* )0;\
- (msa).options = (arg_option);\
+ (msa).options = (arg_option)|(reg)->options;\
(msa).region = (arg_region);\
(msa).start = (arg_start);\
(msa).match_stack_limit = (mpv)->match_stack_limit;\
@@ -1405,6 +1426,7 @@ onig_set_subexp_call_limit_in_search(unsigned long n)
#endif
+
#ifdef USE_CALLOUT
static OnigCalloutFunc DefaultProgressCallout;
static OnigCalloutFunc DefaultRetractionCallout;
@@ -1452,11 +1474,12 @@ onig_initialize_match_param(OnigMatchParam* mp)
mp->retry_limit_in_search = RetryLimitInSearch;
#endif
+ mp->callout_user_data = 0;
+
#ifdef USE_CALLOUT
mp->progress_callout_of_contents = DefaultProgressCallout;
mp->retraction_callout_of_contents = DefaultRetractionCallout;
mp->match_at_call_counter = 0;
- mp->callout_user_data = 0;
mp->callout_data = 0;
mp->callout_data_alloc_num = 0;
#endif
@@ -1532,13 +1555,26 @@ onig_get_callout_data_dont_clear_old(regex_t* reg, OnigMatchParam* mp,
t = d->slot[slot].type;
if (IS_NOT_NULL(type)) *type = t;
if (IS_NOT_NULL(val)) *val = d->slot[slot].val;
- return (t == ONIG_TYPE_VOID ? 1 : ONIG_NORMAL);
+ return (t == ONIG_TYPE_VOID ? ONIG_VALUE_IS_NOT_SET : ONIG_NORMAL);
+}
+/*
+ * Tag-based variant of onig_get_callout_data_dont_clear_old().
+ *
+ * reg:          compiled regex that owns the callout.
+ * mp:           match parameter object, forwarded unchanged.
+ * tag, tag_end: tag name range, resolved to a callout number with
+ *               onig_get_callout_num_by_tag().
+ * slot:         slot index, forwarded unchanged.
+ * type, val:    output locations, forwarded unchanged.
+ *
+ * Returns:
+ *   - the negative value from onig_get_callout_num_by_tag() when the tag
+ *     lookup fails;
+ *   - ONIGERR_INVALID_CALLOUT_TAG_NAME when the lookup yields 0;
+ *   - otherwise whatever onig_get_callout_data_dont_clear_old() returns
+ *     for the resolved callout number.
+ */
+extern int
+onig_get_callout_data_by_tag_dont_clear_old(regex_t* reg,
+ OnigMatchParam* mp, const UChar* tag, const UChar* tag_end, int slot,
+ OnigType* type, OnigValue* val)
+{
+ int num;
+
+ num = onig_get_callout_num_by_tag(reg, tag, tag_end);
+ if (num < 0) return num; /* propagate the lookup error code */
+ if (num == 0) return ONIGERR_INVALID_CALLOUT_TAG_NAME; /* 0 is rejected as not naming a callout */
+
+ return onig_get_callout_data_dont_clear_old(reg, mp, num, slot, type, val);
}
extern int
-onig_get_callout_data_by_callout_args_self_dont_clear_old(OnigCalloutArgs* args,
- int slot, OnigType* type,
- OnigValue* val)
+onig_get_callout_data_by_callout_args_self_dont_clear_old(
+ OnigCalloutArgs* args, int slot, OnigType* type, OnigValue* val)
{
return onig_get_callout_data_dont_clear_old(args->regex, args->msa->mp,
args->num, slot, type, val);
@@ -1563,7 +1599,7 @@ onig_get_callout_data(regex_t* reg, OnigMatchParam* mp,
t = d->slot[slot].type;
if (IS_NOT_NULL(type)) *type = t;
if (IS_NOT_NULL(val)) *val = d->slot[slot].val;
- return (t == ONIG_TYPE_VOID ? 1 : ONIG_NORMAL);
+ return (t == ONIG_TYPE_VOID ? ONIG_VALUE_IS_NOT_SET : ONIG_NORMAL);
}
extern int
@@ -2171,65 +2207,90 @@ stack_double(int* is_alloca, char** arg_alloc_base,
}\
} while (0)
-#ifdef USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT
-#define STACK_EMPTY_CHECK_MEM(isnull, sid, s, reg) do {\
- StackType* k;\
- GET_EMPTY_CHECK_START(sid, k);\
- if (k->u.empty_check.pstr != (s)) {\
+#ifdef USE_RIGID_CHECK_CAPTURES_IN_EMPTY_REPEAT
+#define STACK_EMPTY_CHECK_MEM(isnull, sid, empty_status_mem, s, reg) do {\
+ StackType* klow;\
+ GET_EMPTY_CHECK_START(sid, klow);\
+ if (klow->u.empty_check.pstr != (s)) {\
+ stack_empty_check_mem_not_empty:\
(isnull) = 0;\
}\
else {\
- UChar* endp;\
+ StackType *k, *kk;\
+ MemStatusType ms = (empty_status_mem);\
(isnull) = 1;\
- while (k < stk) {\
- if (k->type == STK_MEM_START &&\
- MEM_STATUS_LIMIT_AT((reg)->empty_status_mem, k->zid)) {\
- STACK_MEM_START_GET_PREV_END_ADDR(k, reg, endp);\
- if (endp == 0) {\
- (isnull) = 0; break;\
- }\
- else if (STACK_AT(k->u.mem.prev_start.i)->u.mem.pstr != endp) {\
- (isnull) = 0; break;\
- }\
- else if (endp != s) {\
- (isnull) = -1; /* empty, but position changed */ \
+ k = stk;\
+ while (k > klow) {\
+ k--;\
+ if (k->type == STK_MEM_END && MEM_STATUS_LIMIT_AT(ms, k->zid)) {\
+ kk = klow;\
+ while (kk < k) {\
+ if (kk->type == STK_MEM_START && kk->zid == k->zid) {\
+ if (kk->u.mem.prev_end.i == INVALID_STACK_INDEX || \
+ ((STACK_AT(kk->u.mem.prev_end.i)->u.mem.pstr != k->u.mem.pstr || STACK_AT(kk->u.mem.prev_start.i)->u.mem.pstr != STACK_AT(k->u.mem.prev_start.i)->u.mem.pstr) && (STACK_AT(k->u.mem.prev_start.i)->u.mem.pstr != k->u.mem.pstr || STACK_AT(kk->u.mem.prev_start.i)->u.mem.pstr != STACK_AT(kk->u.mem.prev_end.i)->u.mem.pstr))) {\
+ goto stack_empty_check_mem_not_empty;\
+ }\
+ else {\
+ ms &= ~((MemStatusType )1 << k->zid);\
+ break;\
+ }\
+ }\
+ kk++;\
}\
+ if (ms == 0) break;\
}\
- k++;\
}\
}\
} while(0)
-#define STACK_EMPTY_CHECK_MEM_REC(isnull,sid,s,reg) do {\
+#define STACK_EMPTY_CHECK_MEM_REC(isnull,sid,empty_status_mem,s,reg) do {\
int level = 0;\
- StackType* k = stk;\
+ StackType* klow = stk;\
while (1) {\
- k--;\
- STACK_BASE_CHECK(k, "STACK_EMPTY_CHECK_MEM_REC");\
- if (k->type == STK_EMPTY_CHECK_START) {\
- if (k->zid == (sid)) {\
+ klow--;\
+ STACK_BASE_CHECK(klow, "STACK_EMPTY_CHECK_MEM_REC");\
+ if (klow->type == STK_EMPTY_CHECK_START) {\
+ if (klow->zid == (sid)) {\
if (level == 0) {\
- if (k->u.empty_check.pstr != (s)) {\
+ if (klow->u.empty_check.pstr != (s)) {\
+ stack_empty_check_mem_rec_not_empty:\
(isnull) = 0;\
break;\
}\
else {\
- UChar* endp;\
+ StackType *k, *kk;\
+ MemStatusType ms;\
(isnull) = 1;\
- while (k < stk) {\
- if (k->type == STK_MEM_START) {\
- if (level == 0 && \
- MEM_STATUS_LIMIT_AT((reg)->empty_status_mem, k->zid) !=0) {\
- STACK_MEM_START_GET_PREV_END_ADDR(k, reg, endp);\
- if (endp == 0) {\
- (isnull) = 0; break;\
- }\
- else if (STACK_AT(k->u.mem.prev_start.i)->u.mem.pstr != endp) { \
- (isnull) = 0; break;\
- }\
- else if (endp != s) {\
- (isnull) = -1; /* empty, but position changed */\
+ if ((empty_status_mem) == 0) break;\
+ ms = (empty_status_mem);\
+ k = stk;\
+ while (k > klow) {\
+ k--;\
+ if (k->type == STK_MEM_END) {\
+ if (level == 0 && MEM_STATUS_LIMIT_AT(ms, k->zid)) {\
+ kk = klow;\
+ kk++;\
+ while (kk < k) {\
+ if (kk->type == STK_MEM_START && kk->zid == k->zid) {\
+ if (kk->u.mem.prev_end.i == INVALID_STACK_INDEX || \
+ ((STACK_AT(kk->u.mem.prev_end.i)->u.mem.pstr != k->u.mem.pstr || STACK_AT(kk->u.mem.prev_start.i)->u.mem.pstr != STACK_AT(k->u.mem.prev_start.i)->u.mem.pstr) && (STACK_AT(k->u.mem.prev_start.i)->u.mem.pstr != k->u.mem.pstr || STACK_AT(kk->u.mem.prev_start.i)->u.mem.pstr != STACK_AT(kk->u.mem.prev_end.i)->u.mem.pstr))) {\
+ goto stack_empty_check_mem_rec_not_empty;\
+ }\
+ else {\
+ ms &= ~((MemStatusType )1 << k->zid);\
+ break;\
+ }\
+ }\
+ else if (kk->type == STK_EMPTY_CHECK_START) {\
+ if (kk->zid == (sid)) level++;\
+ }\
+ else if (kk->type == STK_EMPTY_CHECK_END) {\
+ if (kk->zid == (sid)) level--;\
+ }\
+ kk++;\
}\
+ level = 0;\
+ if (ms == 0) break;\
}\
}\
else if (k->type == STK_EMPTY_CHECK_START) {\
@@ -2238,7 +2299,6 @@ stack_double(int* is_alloca, char** arg_alloc_base,
else if (k->type == STK_EMPTY_CHECK_END) {\
if (k->zid == (sid)) level--;\
}\
- k++;\
}\
break;\
}\
@@ -2248,8 +2308,8 @@ stack_double(int* is_alloca, char** arg_alloc_base,
}\
}\
}\
- else if (k->type == STK_EMPTY_CHECK_END) {\
- if (k->zid == (sid)) level++;\
+ else if (klow->type == STK_EMPTY_CHECK_END) {\
+ if (klow->zid == (sid)) level++;\
}\
}\
} while(0)
@@ -2274,7 +2334,7 @@ stack_double(int* is_alloca, char** arg_alloc_base,
}\
}\
} while(0)
-#endif /* USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT */
+#endif /* USE_RIGID_CHECK_CAPTURES_IN_EMPTY_REPEAT */
#define STACK_GET_REPEAT_COUNT_SEARCH(sid, c) do {\
StackType* k = stk;\
@@ -2888,6 +2948,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
StackType *stkp; /* used as any purpose. */
StkPtrType *mem_start_stk, *mem_end_stk;
UChar* keep;
+ OnigRegion* region;
#ifdef USE_REPEAT_AND_EMPTY_CHECK_LOCAL_VAR
StackIndex *repeat_stk;
@@ -2905,8 +2966,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
unsigned long subexp_call_counters[MAX_SUBEXP_CALL_COUNTERS];
#endif
+ OnigOptionType options;
Operation* p = reg->ops;
- OnigOptionType option = reg->options;
OnigEncoding encode = reg->enc;
OnigCaseFoldType case_fold_flag = reg->case_fold_flag;
@@ -2936,6 +2997,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
}
#endif
+ options = msa->options;
+
#ifdef USE_CALLOUT
msa->mp->match_at_call_counter++;
#endif
@@ -2976,102 +3039,113 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
BYTECODE_INTERPRETER_START {
CASE_OP(END)
n = (int )(s - sstart);
+ if (n == 0 && OPTON_FIND_NOT_EMPTY(options)) {
+ best_len = ONIG_MISMATCH;
+ goto fail; /* for retry */
+ }
+
if (n > best_len) {
- OnigRegion* region;
#ifdef USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE
- if (OPTON_FIND_LONGEST(option)) {
+ if (OPTON_FIND_LONGEST(options)) {
if (n > msa->best_len) {
msa->best_len = n;
msa->best_s = (UChar* )sstart;
- goto set_region;
}
- else
- goto end_best_len;
+ else {
+ if (s >= in_right_range && msa->best_s == sstart) {
+ best_len = msa->best_len; /* end of find */
+ }
+ else {
+ SOP_OUT;
+ goto fail; /* for retry */
+ }
+ }
}
-#endif
+ else {
+ best_len = n;
+ }
+#else
best_len = n;
+#endif
+ }
- set_region:
- region = msa->region;
- if (region) {
- if (keep > s) keep = s;
+ /* set region */
+ region = msa->region;
+ if (region) {
+ if (keep > s) keep = s;
#ifdef USE_POSIX_API
- if (OPTON_POSIX_REGION(msa->options)) {
- posix_regmatch_t* rmt = (posix_regmatch_t* )region;
-
- rmt[0].rm_so = (regoff_t )(keep - str);
- rmt[0].rm_eo = (regoff_t )(s - str);
- for (i = 1; i <= num_mem; i++) {
- if (mem_end_stk[i].i != INVALID_STACK_INDEX) {
- rmt[i].rm_so = (regoff_t )(STACK_MEM_START(reg, i) - str);
- rmt[i].rm_eo = (regoff_t )(STACK_MEM_END(reg, i) - str);
- }
- else {
- rmt[i].rm_so = rmt[i].rm_eo = ONIG_REGION_NOTPOS;
- }
+ if (OPTON_POSIX_REGION(options)) {
+ posix_regmatch_t* rmt = (posix_regmatch_t* )region;
+
+ rmt[0].rm_so = (regoff_t )(keep - str);
+ rmt[0].rm_eo = (regoff_t )(s - str);
+ for (i = 1; i <= num_mem; i++) {
+ if (mem_end_stk[i].i != INVALID_STACK_INDEX) {
+ rmt[i].rm_so = (regoff_t )(STACK_MEM_START(reg, i) - str);
+ rmt[i].rm_eo = (regoff_t )(STACK_MEM_END(reg, i) - str);
+ }
+ else {
+ rmt[i].rm_so = rmt[i].rm_eo = ONIG_REGION_NOTPOS;
}
}
- else {
+ }
+ else {
#endif /* USE_POSIX_API */
- region->beg[0] = (int )(keep - str);
- region->end[0] = (int )(s - str);
- for (i = 1; i <= num_mem; i++) {
- if (mem_end_stk[i].i != INVALID_STACK_INDEX) {
- region->beg[i] = (int )(STACK_MEM_START(reg, i) - str);
- region->end[i] = (int )(STACK_MEM_END(reg, i) - str);
- }
- else {
- region->beg[i] = region->end[i] = ONIG_REGION_NOTPOS;
- }
+ region->beg[0] = (int )(keep - str);
+ region->end[0] = (int )(s - str);
+ for (i = 1; i <= num_mem; i++) {
+ if (mem_end_stk[i].i != INVALID_STACK_INDEX) {
+ region->beg[i] = (int )(STACK_MEM_START(reg, i) - str);
+ region->end[i] = (int )(STACK_MEM_END(reg, i) - str);
+ }
+ else {
+ region->beg[i] = region->end[i] = ONIG_REGION_NOTPOS;
}
+ }
#ifdef USE_CAPTURE_HISTORY
- if (reg->capture_history != 0) {
- int r;
- OnigCaptureTreeNode* node;
+ if (reg->capture_history != 0) {
+ OnigCaptureTreeNode* node;
- if (IS_NULL(region->history_root)) {
- region->history_root = node = history_node_new();
- CHECK_NULL_RETURN_MEMERR(node);
- }
- else {
- node = region->history_root;
- history_tree_clear(node);
- }
+ if (IS_NULL(region->history_root)) {
+ region->history_root = node = history_node_new();
+ CHECK_NULL_RETURN_MEMERR(node);
+ }
+ else {
+ node = region->history_root;
+ history_tree_clear(node);
+ }
- node->group = 0;
- node->beg = (int )(keep - str);
- node->end = (int )(s - str);
+ node->group = 0;
+ node->beg = (int )(keep - str);
+ node->end = (int )(s - str);
- stkp = stk_base;
- r = make_capture_history_tree(region->history_root, &stkp,
- stk, (UChar* )str, reg);
- if (r < 0) MATCH_AT_ERROR_RETURN(r);
- }
+ stkp = stk_base;
+ i = make_capture_history_tree(region->history_root, &stkp,
+ stk, (UChar* )str, reg);
+ if (i < 0) MATCH_AT_ERROR_RETURN(i);
+ }
#endif /* USE_CAPTURE_HISTORY */
#ifdef USE_POSIX_API
- } /* else OPTON_POSIX_REGION() */
+ } /* else OPTON_POSIX_REGION() */
#endif
- } /* if (region) */
- } /* n > best_len */
+ } /* if (region) */
-#ifdef USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE
- end_best_len:
-#endif
SOP_OUT;
- if (OPTON_FIND_CONDITION(option)) {
- if (OPTON_FIND_NOT_EMPTY(option) && s == sstart) {
+ if (OPTON_CALLBACK_EACH_MATCH(options) &&
+ IS_NOT_NULL(CallbackEachMatch)) {
+ i = CallbackEachMatch(str, end, sstart, region,
+ msa->mp->callout_user_data);
+ if (i < 0) MATCH_AT_ERROR_RETURN(i);
+
+#ifdef USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE
+ if (! OPTON_FIND_LONGEST(options))
+#endif
best_len = ONIG_MISMATCH;
- goto fail; /* for retry */
- }
- if (OPTON_FIND_LONGEST(option)) {
- if (s >= in_right_range && msa->best_s == sstart)
- best_len = msa->best_len;
- else
- goto fail; /* for retry */
- }
+
+ goto fail;
}
/* default behavior: return first-matching result. */
@@ -3564,23 +3638,23 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
CASE_OP(BEGIN_BUF)
if (! ON_STR_BEGIN(s)) goto fail;
- if (OPTON_NOTBOL(msa->options)) goto fail;
- if (OPTON_NOT_BEGIN_STRING(msa->options)) goto fail;
+ if (OPTON_NOTBOL(options)) goto fail;
+ if (OPTON_NOT_BEGIN_STRING(options)) goto fail;
INC_OP;
JUMP_OUT;
CASE_OP(END_BUF)
if (! ON_STR_END(s)) goto fail;
- if (OPTON_NOTEOL(msa->options)) goto fail;
- if (OPTON_NOT_END_STRING(msa->options)) goto fail;
+ if (OPTON_NOTEOL(options)) goto fail;
+ if (OPTON_NOT_END_STRING(options)) goto fail;
INC_OP;
JUMP_OUT;
CASE_OP(BEGIN_LINE)
if (ON_STR_BEGIN(s)) {
- if (OPTON_NOTBOL(msa->options)) goto fail;
+ if (OPTON_NOTBOL(options)) goto fail;
INC_OP;
JUMP_OUT;
}
@@ -3599,7 +3673,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
UChar* sprev = (UChar* )onigenc_get_prev_char_head(encode, str, s);
if (IS_EMPTY_STR || !ONIGENC_IS_MBC_NEWLINE(encode, sprev, end)) {
#endif
- if (OPTON_NOTEOL(msa->options)) goto fail;
+ if (OPTON_NOTEOL(options)) goto fail;
INC_OP;
JUMP_OUT;
#ifndef USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE
@@ -3624,8 +3698,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
UChar* sprev = (UChar* )onigenc_get_prev_char_head(encode, str, s);
if (IS_EMPTY_STR || !ONIGENC_IS_MBC_NEWLINE(encode, sprev, end)) {
#endif
- if (OPTON_NOTEOL(msa->options)) goto fail;
- if (OPTON_NOT_END_STRING(msa->options)) goto fail;
+ if (OPTON_NOTEOL(options)) goto fail;
+ if (OPTON_NOT_END_STRING(options)) goto fail;
INC_OP;
JUMP_OUT;
#ifndef USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE
@@ -3634,8 +3708,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
}
else if (ONIGENC_IS_MBC_NEWLINE(encode, s, end) &&
ON_STR_END(s + enclen(encode, s))) {
- if (OPTON_NOTEOL(msa->options)) goto fail;
- if (OPTON_NOT_END_STRING(msa->options)) goto fail;
+ if (OPTON_NOTEOL(options)) goto fail;
+ if (OPTON_NOT_END_STRING(options)) goto fail;
INC_OP;
JUMP_OUT;
}
@@ -3644,8 +3718,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
UChar* ss = s + enclen(encode, s);
ss += enclen(encode, ss);
if (ON_STR_END(ss)) {
- if (OPTON_NOTEOL(msa->options)) goto fail;
- if (OPTON_NOT_END_STRING(msa->options)) goto fail;
+ if (OPTON_NOTEOL(options)) goto fail;
+ if (OPTON_NOT_END_STRING(options)) goto fail;
INC_OP;
JUMP_OUT;
}
@@ -3657,7 +3731,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
switch (p->check_position.type) {
case CHECK_POSITION_SEARCH_START:
if (s != msa->start) goto fail;
- if (OPTON_NOT_BEGIN_POSITION(msa->options)) goto fail;
+ if (OPTON_NOT_BEGIN_POSITION(options)) goto fail;
break;
case CHECK_POSITION_CURRENT_RIGHT_RANGE:
if (s != right_range) goto fail;
@@ -3924,13 +3998,13 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
}
JUMP_OUT;
-#ifdef USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT
+#ifdef USE_RIGID_CHECK_CAPTURES_IN_EMPTY_REPEAT
CASE_OP(EMPTY_CHECK_END_MEMST)
{
int is_empty;
mem = p->empty_check_end.mem; /* mem: null check id */
- STACK_EMPTY_CHECK_MEM(is_empty, mem, s, reg);
+ STACK_EMPTY_CHECK_MEM(is_empty, mem, p->empty_check_end.empty_status_mem, s, reg);
INC_OP;
if (is_empty) {
#ifdef ONIG_DEBUG_MATCH
@@ -3949,8 +4023,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
int is_empty;
mem = p->empty_check_end.mem; /* mem: null check id */
-#ifdef USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT
- STACK_EMPTY_CHECK_MEM_REC(is_empty, mem, s, reg);
+#ifdef USE_RIGID_CHECK_CAPTURES_IN_EMPTY_REPEAT
+ STACK_EMPTY_CHECK_MEM_REC(is_empty, mem, p->empty_check_end.empty_status_mem, s, reg);
#else
STACK_EMPTY_CHECK_REC(is_empty, mem, s);
#endif
@@ -4109,6 +4183,9 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
}
}
+#ifdef ONIG_DEBUG_CALL
+ fprintf(DBGFP, "CALL: id:%d, at:%ld, level:%lu\n", p->call.called_mem, s - str, subexp_call_nest_counter);
+#endif
addr = p->call.addr;
INC_OP; STACK_PUSH_CALL_FRAME(p);
p = reg->ops + addr;
@@ -4425,7 +4502,7 @@ regset_search_body_position_lead(OnigRegSet* set,
sr[i].state = SRS_DEAD;
if (reg->optimize != OPTIMIZE_NONE) {
if (reg->dist_max != INFINITE_LEN) {
- if (end - range > reg->dist_max)
+ if (DIST_CAST(end - range) > reg->dist_max)
sch_range = (UChar* )range + reg->dist_max;
else
sch_range = (UChar* )end;
@@ -4609,7 +4686,7 @@ onig_regset_search_with_param(OnigRegSet* set,
if (set->n == 0)
return ONIG_MISMATCH;
- if (OPTON_POSIX_REGION(option))
+ if (OPTON_POSIX_REGION(option) || OPTON_CALLBACK_EACH_MATCH(option))
return ONIGERR_INVALID_ARGUMENT;
r = 0;
@@ -4884,7 +4961,7 @@ sunday_quick_search_step_forward(regex_t* reg,
const UChar* text_range)
{
const UChar *s, *se, *t, *p, *end;
- const UChar *tail;
+ const UChar *tail, *next;
int skip, tlen1;
int map_offset;
OnigEncoding enc;
@@ -4921,9 +4998,11 @@ sunday_quick_search_step_forward(regex_t* reg,
s += enclen(enc, s);
} while ((s - t) < skip && s < end);
#else
- s += skip;
- if (s < end)
- s = onigenc_get_right_adjust_char_head(enc, text, s);
+ next = s + skip;
+ if (next < end)
+ s = onigenc_get_right_adjust_char_head(enc, s, next);
+ else
+ break;
#endif
}
@@ -5086,7 +5165,7 @@ forward_search(regex_t* reg, const UChar* str, const UChar* end, UChar* start,
p = start;
if (reg->dist_min != 0) {
- if (end - p <= reg->dist_min)
+ if (DIST_CAST(end - p) <= reg->dist_min)
return 0; /* fail */
if (ONIGENC_IS_SINGLEBYTE(reg->enc)) {
@@ -5119,7 +5198,7 @@ forward_search(regex_t* reg, const UChar* str, const UChar* end, UChar* start,
}
if (p && p < range) {
- if (p - start < reg->dist_min) {
+ if (DIST_CAST(p - start) < reg->dist_min) {
retry_gate:
pprev = p;
p += enclen(reg->enc, p);
@@ -5164,7 +5243,7 @@ forward_search(regex_t* reg, const UChar* str, const UChar* end, UChar* start,
}
else {
if (reg->dist_max != INFINITE_LEN) {
- if (p - str < reg->dist_max) {
+ if (DIST_CAST(p - str) < reg->dist_max) {
*low = (UChar* )str;
}
else {
@@ -5175,7 +5254,7 @@ forward_search(regex_t* reg, const UChar* str, const UChar* end, UChar* start,
}
}
/* no needs to adjust *high, *high is used as range check only */
- if (p - str < reg->dist_min)
+ if (DIST_CAST(p - str) < reg->dist_min)
*high = (UChar* )str;
else
*high = p - reg->dist_min;
@@ -5260,13 +5339,13 @@ backward_search(regex_t* reg, const UChar* str, const UChar* end, UChar* s,
}
if (reg->dist_max != INFINITE_LEN) {
- if (p - str < reg->dist_max)
+ if (DIST_CAST(p - str) < reg->dist_max)
*low = (UChar* )str;
else
*low = p - reg->dist_max;
if (reg->dist_min != 0) {
- if (p - str < reg->dist_min)
+ if (DIST_CAST(p - str) < reg->dist_min)
*high = (UChar* )str;
else
*high = p - reg->dist_min;
@@ -5410,13 +5489,13 @@ search_in_range(regex_t* reg, const UChar* str, const UChar* end,
if (range > start) {
if (reg->anc_dist_max != INFINITE_LEN &&
- min_semi_end - start > reg->anc_dist_max) {
+ DIST_CAST(min_semi_end - start) > reg->anc_dist_max) {
start = min_semi_end - reg->anc_dist_max;
if (start < end)
start = onigenc_get_right_adjust_char_head(reg->enc, str, start);
}
- if (max_semi_end - (range - 1) < reg->anc_dist_min) {
- if (max_semi_end - str + 1 < reg->anc_dist_min)
+ if (DIST_CAST(max_semi_end - (range - 1)) < reg->anc_dist_min) {
+ if (DIST_CAST(max_semi_end - str + 1) < reg->anc_dist_min)
goto mismatch_no_msa;
else
range = max_semi_end - reg->anc_dist_min + 1;
@@ -5428,11 +5507,11 @@ search_in_range(regex_t* reg, const UChar* str, const UChar* end,
}
else {
if (reg->anc_dist_max != INFINITE_LEN &&
- min_semi_end - range > reg->anc_dist_max) {
+ DIST_CAST(min_semi_end - range) > reg->anc_dist_max) {
range = min_semi_end - reg->anc_dist_max;
}
- if (max_semi_end - start < reg->anc_dist_min) {
- if (max_semi_end - str < reg->anc_dist_min)
+ if (DIST_CAST(max_semi_end - start) < reg->anc_dist_min) {
+ if (DIST_CAST(max_semi_end - str) < reg->anc_dist_min)
goto mismatch_no_msa;
else {
start = max_semi_end - reg->anc_dist_min;
@@ -5503,7 +5582,7 @@ search_in_range(regex_t* reg, const UChar* str, const UChar* end,
if (reg->dist_max == INFINITE_LEN)
sch_range = (UChar* )end;
else {
- if ((end - range) < reg->dist_max)
+ if (DIST_CAST(end - range) < reg->dist_max)
sch_range = (UChar* )end;
else {
sch_range = (UChar* )range + reg->dist_max;
@@ -5579,14 +5658,14 @@ search_in_range(regex_t* reg, const UChar* str, const UChar* end,
else
adjrange = (UChar* )end;
- if (end - range > reg->dist_min)
+ if (DIST_CAST(end - range) > reg->dist_min)
min_range = range + reg->dist_min;
else
min_range = end;
if (reg->dist_max != INFINITE_LEN) {
do {
- if (end - s > reg->dist_max)
+ if (DIST_CAST(end - s) > reg->dist_max)
sch_start = s + reg->dist_max;
else {
sch_start = onigenc_get_prev_char_head(reg->enc, str, end);
@@ -5887,8 +5966,10 @@ onig_regset_add(OnigRegSet* set, regex_t* reg)
{
OnigRegion* region;
+#ifdef USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE
if (OPTON_FIND_LONGEST(reg->options))
return ONIGERR_INVALID_ARGUMENT;
+#endif
if (set->n != 0 && reg->enc != set->enc)
return ONIGERR_INVALID_ARGUMENT;
@@ -5933,8 +6014,10 @@ onig_regset_replace(OnigRegSet* set, int at, regex_t* reg)
set->n--;
}
else {
+#ifdef USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE
if (OPTON_FIND_LONGEST(reg->options))
return ONIGERR_INVALID_ARGUMENT;
+#endif
if (set->n > 1 && reg->enc != set->enc)
return ONIGERR_INVALID_ARGUMENT;
@@ -6573,7 +6656,7 @@ onig_builtin_monitor(OnigCalloutArgs* args, void* user_data)
tag_len = tag_end - tag_start;
if (tag_len >= sizeof(buf)) tag_len = sizeof(buf) - 1;
- for (i = 0; i < tag_len; i++) buf[i] = tag_start[i];
+ for (i = 0; i < (int )tag_len; i++) buf[i] = tag_start[i];
buf[tag_len] = '\0';
}
diff --git a/src/regint.h b/src/regint.h
index 74a5c61..9856a96 100644
--- a/src/regint.h
+++ b/src/regint.h
@@ -4,7 +4,7 @@
regint.h - Oniguruma (regular expression library)
**********************************************************************/
/*-
- * Copyright (c) 2002-2020 K.Kosako
+ * Copyright (c) 2002-2021 K.Kosako
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -35,6 +35,7 @@
/* #define ONIG_DEBUG_SEARCH */
/* #define ONIG_DEBUG_MATCH */
/* #define ONIG_DEBUG_MATCH_COUNTER */
+/* #define ONIG_DEBUG_CALL */
/* #define ONIG_DONT_OPTIMIZE */
/* for byte-code statistical data. */
@@ -42,7 +43,8 @@
#if defined(ONIG_DEBUG_PARSE) || defined(ONIG_DEBUG_MATCH) || \
defined(ONIG_DEBUG_SEARCH) || defined(ONIG_DEBUG_COMPILE) || \
- defined(ONIG_DEBUG_MATCH_COUNTER) || defined(ONIG_DEBUG_STATISTICS)
+ defined(ONIG_DEBUG_MATCH_COUNTER) || defined(ONIG_DEBUG_CALL) || \
+ defined(ONIG_DEBUG_STATISTICS)
#ifndef ONIG_DEBUG
#define ONIG_DEBUG
#define DBGFP stderr
@@ -61,7 +63,7 @@
#define USE_CALL
#define USE_CALLOUT
#define USE_BACKREF_WITH_LEVEL /* \k<name+n>, \k<name-n> */
-#define USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT /* /(?:()|())*\2/ */
+#define USE_RIGID_CHECK_CAPTURES_IN_EMPTY_REPEAT /* /(?:()|())*\2/ */
#define USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE /* /\n$/ =~ "\n" */
#define USE_WARNING_REDUNDANT_NESTED_REPEAT_OPERATOR
#define USE_RETRY_LIMIT
@@ -388,10 +390,10 @@ typedef unsigned int MemStatusType;
(IS_CODE_DIGIT_ASCII(enc,code) ? DIGITVAL(code) \
: (ONIGENC_IS_CODE_UPPER(enc,code) ? (code) - 'A' + 10 : (code) - 'a' + 10))
+#define OPTON_CALLBACK_EACH_MATCH(option) \
+ ((option) & ONIG_OPTION_CALLBACK_EACH_MATCH)
#define OPTON_FIND_LONGEST(option) ((option) & ONIG_OPTION_FIND_LONGEST)
#define OPTON_FIND_NOT_EMPTY(option) ((option) & ONIG_OPTION_FIND_NOT_EMPTY)
-#define OPTON_FIND_CONDITION(option) ((option) & \
- (ONIG_OPTION_FIND_LONGEST | ONIG_OPTION_FIND_NOT_EMPTY))
#define OPTON_NEGATE_SINGLELINE(option) ((option) & \
ONIG_OPTION_NEGATE_SINGLELINE)
#define OPTON_DONT_CAPTURE_GROUP(option) ((option) & \
@@ -406,8 +408,6 @@ typedef unsigned int MemStatusType;
#define OPTON_NOT_END_STRING(option) ((option) & ONIG_OPTION_NOT_END_STRING)
#define OPTON_NOT_BEGIN_POSITION(option) ((option) & ONIG_OPTION_NOT_BEGIN_POSITION)
-#define DISABLE_CASE_FOLD_MULTI_CHAR(case_fold_flag) \
- ((case_fold_flag) & ~INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR)
#define INFINITE_REPEAT -1
#define IS_INFINITE_REPEAT(n) ((n) == INFINITE_REPEAT)
@@ -437,81 +437,6 @@ typedef Bits* BitSetRef;
#define BITSET_CLEAR_BIT(bs, pos) BS_ROOM(bs,pos) &= ~(BS_BIT(pos))
#define BITSET_INVERT_BIT(bs, pos) BS_ROOM(bs,pos) ^= BS_BIT(pos)
-/* bytes buffer */
-typedef struct _BBuf {
- UChar* p;
- unsigned int used;
- unsigned int alloc;
-} BBuf;
-
-#define BB_INIT(buf,size) bbuf_init((BBuf* )(buf), (size))
-
-#define BB_EXPAND(buf,low) do{\
- do { (buf)->alloc *= 2; } while ((buf)->alloc < (unsigned int )low);\
- (buf)->p = (UChar* )xrealloc((buf)->p, (buf)->alloc);\
- if (IS_NULL((buf)->p)) return(ONIGERR_MEMORY);\
-} while (0)
-
-#define BB_ENSURE_SIZE(buf,size) do{\
- unsigned int new_alloc = (buf)->alloc;\
- while (new_alloc < (unsigned int )(size)) { new_alloc *= 2; }\
- if ((buf)->alloc != new_alloc) {\
- (buf)->p = (UChar* )xrealloc((buf)->p, new_alloc);\
- if (IS_NULL((buf)->p)) return(ONIGERR_MEMORY);\
- (buf)->alloc = new_alloc;\
- }\
-} while (0)
-
-#define BB_WRITE(buf,pos,bytes,n) do{\
- int used = (pos) + (n);\
- if ((buf)->alloc < (unsigned int )used) BB_EXPAND((buf),used);\
- xmemcpy((buf)->p + (pos), (bytes), (n));\
- if ((buf)->used < (unsigned int )used) (buf)->used = used;\
-} while (0)
-
-#define BB_WRITE1(buf,pos,byte) do{\
- int used = (pos) + 1;\
- if ((buf)->alloc < (unsigned int )used) BB_EXPAND((buf),used);\
- (buf)->p[(pos)] = (byte);\
- if ((buf)->used < (unsigned int )used) (buf)->used = used;\
-} while (0)
-
-#define BB_ADD(buf,bytes,n) BB_WRITE((buf),(buf)->used,(bytes),(n))
-#define BB_ADD1(buf,byte) BB_WRITE1((buf),(buf)->used,(byte))
-#define BB_GET_ADD_ADDRESS(buf) ((buf)->p + (buf)->used)
-#define BB_GET_OFFSET_POS(buf) ((buf)->used)
-
-/* from < to */
-#define BB_MOVE_RIGHT(buf,from,to,n) do {\
- if ((unsigned int )((to)+(n)) > (buf)->alloc) BB_EXPAND((buf),(to) + (n));\
- xmemmove((buf)->p + (to), (buf)->p + (from), (n));\
- if ((unsigned int )((to)+(n)) > (buf)->used) (buf)->used = (to) + (n);\
-} while (0)
-
-/* from > to */
-#define BB_MOVE_LEFT(buf,from,to,n) do {\
- xmemmove((buf)->p + (to), (buf)->p + (from), (n));\
-} while (0)
-
-/* from > to */
-#define BB_MOVE_LEFT_REDUCE(buf,from,to) do {\
- xmemmove((buf)->p + (to), (buf)->p + (from), (buf)->used - (from));\
- (buf)->used -= (from - to);\
-} while (0)
-
-#define BB_INSERT(buf,pos,bytes,n) do {\
- if (pos >= (buf)->used) {\
- BB_WRITE(buf,pos,bytes,n);\
- }\
- else {\
- BB_MOVE_RIGHT((buf),(pos),(pos) + (n),((buf)->used - (pos)));\
- xmemcpy((buf)->p + (pos), (bytes), (n));\
- }\
-} while (0)
-
-#define BB_GET_BYTE(buf, pos) (buf)->p[(pos)]
-
-
/* has body */
#define ANCR_PREC_READ (1<<0)
#define ANCR_PREC_READ_NOT (1<<1)
@@ -884,6 +809,7 @@ typedef struct {
} empty_check_start;
struct {
MemNumType mem;
+ MemStatusType empty_status_mem;
} empty_check_end; /* EMPTY_CHECK_END, EMPTY_CHECK_END_MEMST, EMPTY_CHECK_END_MEMST_PUSH */
struct {
RelAddrType addr;
@@ -922,7 +848,7 @@ typedef struct {
} update_var;
struct {
AbsAddrType addr;
-#ifdef ONIG_DEBUG_MATCH_COUNTER
+#if defined(ONIG_DEBUG_MATCH_COUNTER) || defined(ONIG_DEBUG_CALL)
MemNumType called_mem;
#endif
} call;
@@ -977,7 +903,6 @@ struct re_pattern_buffer {
MemStatusType capture_history; /* (?@...) flag (1-31) */
MemStatusType push_mem_start; /* need backtrack flag */
MemStatusType push_mem_end; /* need backtrack flag */
- MemStatusType empty_status_mem;
int stack_pop_level;
int repeat_range_alloc;
RepeatRange* repeat_range;
diff --git a/src/regparse.c b/src/regparse.c
index dd2824b..938a569 100644
--- a/src/regparse.c
+++ b/src/regparse.c
@@ -2,7 +2,7 @@
regparse.c - Oniguruma (regular expression library)
**********************************************************************/
/*-
- * Copyright (c) 2002-2020 K.Kosako
+ * Copyright (c) 2002-2021 K.Kosako
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -159,6 +159,75 @@ OnigSyntaxType OnigSyntaxRuby = {
OnigSyntaxType* OnigDefaultSyntax = ONIG_SYNTAX_ONIGURUMA;
+
+#define BB_INIT(buf,size) bbuf_init((BBuf* )(buf), (size))
+
+#define BB_EXPAND(buf,low) do{\
+ do { (buf)->alloc *= 2; } while ((buf)->alloc < (unsigned int )low);\
+ (buf)->p = (UChar* )xrealloc((buf)->p, (buf)->alloc);\
+ if (IS_NULL((buf)->p)) return(ONIGERR_MEMORY);\
+} while (0)
+
+#define BB_ENSURE_SIZE(buf,size) do{\
+ unsigned int new_alloc = (buf)->alloc;\
+ while (new_alloc < (unsigned int )(size)) { new_alloc *= 2; }\
+ if ((buf)->alloc != new_alloc) {\
+ (buf)->p = (UChar* )xrealloc((buf)->p, new_alloc);\
+ if (IS_NULL((buf)->p)) return(ONIGERR_MEMORY);\
+ (buf)->alloc = new_alloc;\
+ }\
+} while (0)
+
+#define BB_WRITE(buf,pos,bytes,n) do{\
+ int used = (pos) + (n);\
+ if ((buf)->alloc < (unsigned int )used) BB_EXPAND((buf),used);\
+ xmemcpy((buf)->p + (pos), (bytes), (n));\
+ if ((buf)->used < (unsigned int )used) (buf)->used = used;\
+} while (0)
+
+#define BB_WRITE1(buf,pos,byte) do{\
+ int used = (pos) + 1;\
+ if ((buf)->alloc < (unsigned int )used) BB_EXPAND((buf),used);\
+ (buf)->p[(pos)] = (byte);\
+ if ((buf)->used < (unsigned int )used) (buf)->used = used;\
+} while (0)
+
+#define BB_ADD(buf,bytes,n) BB_WRITE((buf),(buf)->used,(bytes),(n))
+#define BB_ADD1(buf,byte) BB_WRITE1((buf),(buf)->used,(byte))
+#define BB_GET_ADD_ADDRESS(buf) ((buf)->p + (buf)->used)
+#define BB_GET_OFFSET_POS(buf) ((buf)->used)
+
+/* from < to */
+#define BB_MOVE_RIGHT(buf,from,to,n) do {\
+ if ((unsigned int )((to)+(n)) > (buf)->alloc) BB_EXPAND((buf),(to) + (n));\
+ xmemmove((buf)->p + (to), (buf)->p + (from), (n));\
+ if ((unsigned int )((to)+(n)) > (buf)->used) (buf)->used = (to) + (n);\
+} while (0)
+
+/* from > to */
+#define BB_MOVE_LEFT(buf,from,to,n) do {\
+ xmemmove((buf)->p + (to), (buf)->p + (from), (n));\
+} while (0)
+
+/* from > to */
+#define BB_MOVE_LEFT_REDUCE(buf,from,to) do {\
+ xmemmove((buf)->p + (to), (buf)->p + (from), (buf)->used - (from));\
+ (buf)->used -= (from - to);\
+} while (0)
+
+#define BB_INSERT(buf,pos,bytes,n) do {\
+ if (pos >= (buf)->used) {\
+ BB_WRITE(buf,pos,bytes,n);\
+ }\
+ else {\
+ BB_MOVE_RIGHT((buf),(pos),(pos) + (n),((buf)->used - (pos)));\
+ xmemcpy((buf)->p + (pos), (bytes), (n));\
+ }\
+} while (0)
+
+#define BB_GET_BYTE(buf, pos) (buf)->p[(pos)]
+
+
typedef enum {
CS_VALUE,
CS_RANGE,
@@ -300,7 +369,7 @@ bbuf_clone(BBuf** rto, BBuf* from)
}
static int
-backref_rel_to_abs(int rel_no, ScanEnv* env)
+backref_rel_to_abs(int rel_no, ParseEnv* env)
{
if (rel_no > 0) {
if (rel_no > ONIG_INT_MAX - env->num_mem)
@@ -981,7 +1050,7 @@ onig_number_of_names(regex_t* reg)
#endif /* else USE_ST_LIBRARY */
static int
-name_add(regex_t* reg, UChar* name, UChar* name_end, int backref, ScanEnv* env)
+name_add(regex_t* reg, UChar* name, UChar* name_end, int backref, ParseEnv* env)
{
int r;
int alloc;
@@ -1115,7 +1184,7 @@ onig_name_to_group_numbers(regex_t* reg, const UChar* name,
}
static int
-name_to_group_numbers(ScanEnv* env, const UChar* name, const UChar* name_end,
+name_to_group_numbers(ParseEnv* env, const UChar* name, const UChar* name_end,
int** nums)
{
regex_t* reg;
@@ -1920,7 +1989,7 @@ callout_tag_table_new(CalloutTagTable** rt)
}
static int
-callout_tag_entry_raw(ScanEnv* env, CalloutTagTable* t, UChar* name,
+callout_tag_entry_raw(ParseEnv* env, CalloutTagTable* t, UChar* name,
UChar* name_end, CalloutTagVal entry_val)
{
int r;
@@ -1963,7 +2032,7 @@ ext_ensure_tag_table(regex_t* reg)
}
static int
-callout_tag_entry(ScanEnv* env, regex_t* reg, UChar* name, UChar* name_end,
+callout_tag_entry(ParseEnv* env, regex_t* reg, UChar* name, UChar* name_end,
CalloutTagVal entry_val)
{
int r;
@@ -1988,10 +2057,10 @@ callout_tag_entry(ScanEnv* env, regex_t* reg, UChar* name, UChar* name_end,
#endif /* USE_CALLOUT */
-#define INIT_SCANENV_MEMENV_ALLOC_SIZE 16
+#define INIT_PARSEENV_MEMENV_ALLOC_SIZE 16
static void
-scan_env_clear(ScanEnv* env)
+scan_env_clear(ParseEnv* env)
{
MEM_STATUS_CLEAR(env->cap_history);
MEM_STATUS_CLEAR(env->backtrack_mem);
@@ -2024,7 +2093,7 @@ scan_env_clear(ScanEnv* env)
}
static int
-scan_env_add_mem_entry(ScanEnv* env)
+scan_env_add_mem_entry(ParseEnv* env)
{
int i, need, alloc;
MemEnv* p;
@@ -2033,10 +2102,10 @@ scan_env_add_mem_entry(ScanEnv* env)
if (need > MaxCaptureNum && MaxCaptureNum != 0)
return ONIGERR_TOO_MANY_CAPTURES;
- if (need >= SCANENV_MEMENV_SIZE) {
+ if (need >= PARSEENV_MEMENV_SIZE) {
if (env->mem_alloc <= need) {
if (IS_NULL(env->mem_env_dynamic)) {
- alloc = INIT_SCANENV_MEMENV_ALLOC_SIZE;
+ alloc = INIT_PARSEENV_MEMENV_ALLOC_SIZE;
p = (MemEnv* )xmalloc(sizeof(MemEnv) * alloc);
CHECK_NULL_RETURN_MEMERR(p);
xmemcpy(p, env->mem_env_static, sizeof(env->mem_env_static));
@@ -2062,10 +2131,10 @@ scan_env_add_mem_entry(ScanEnv* env)
}
static int
-scan_env_set_mem_node(ScanEnv* env, int num, Node* node)
+scan_env_set_mem_node(ParseEnv* env, int num, Node* node)
{
if (env->num_mem >= num)
- SCANENV_MEMENV(env)[num].mem_node = node;
+ PARSEENV_MEMENV(env)[num].mem_node = node;
else
return ONIGERR_PARSER_BUG;
return 0;
@@ -2285,7 +2354,7 @@ node_new_anychar(OnigOptionType options)
}
static int
-node_new_no_newline(Node** node, ScanEnv* env)
+node_new_no_newline(Node** node, ParseEnv* env)
{
Node* n;
@@ -2425,7 +2494,7 @@ node_new_backref(int back_num, int* backrefs, int by_name,
#ifdef USE_BACKREF_WITH_LEVEL
int exist_level, int nest_level,
#endif
- ScanEnv* env)
+ ParseEnv* env)
{
int i;
Node* node;
@@ -2451,7 +2520,7 @@ node_new_backref(int back_num, int* backrefs, int by_name,
for (i = 0; i < back_num; i++) {
if (backrefs[i] <= env->num_mem &&
- IS_NULL(SCANENV_MEMENV(env)[backrefs[i]].mem_node)) {
+ IS_NULL(PARSEENV_MEMENV(env)[backrefs[i]].mem_node)) {
NODE_STATUS_ADD(node, RECURSION); /* /...(\1).../ */
break;
}
@@ -2481,7 +2550,7 @@ node_new_backref_checker(int back_num, int* backrefs, int by_name,
#ifdef USE_BACKREF_WITH_LEVEL
int exist_level, int nest_level,
#endif
- ScanEnv* env)
+ ParseEnv* env)
{
Node* node;
@@ -2527,6 +2596,7 @@ node_new_quantifier(int lower, int upper, int by_number)
QUANT_(node)->head_exact = NULL_NODE;
QUANT_(node)->next_head_exact = NULL_NODE;
QUANT_(node)->include_referred = 0;
+ QUANT_(node)->empty_status_mem = 0;
if (by_number != 0)
NODE_STATUS_ADD(node, BY_NUMBER);
@@ -2640,7 +2710,7 @@ node_set_fail(Node* node)
}
static int
-node_new_fail(Node** node, ScanEnv* env)
+node_new_fail(Node** node, ParseEnv* env)
{
*node = node_new();
CHECK_NULL_RETURN_MEMERR(*node);
@@ -2656,7 +2726,7 @@ onig_node_reset_fail(Node* node)
}
static int
-node_new_save_gimmick(Node** node, enum SaveType save_type, ScanEnv* env)
+node_new_save_gimmick(Node** node, enum SaveType save_type, ParseEnv* env)
{
int id;
@@ -2675,7 +2745,7 @@ node_new_save_gimmick(Node** node, enum SaveType save_type, ScanEnv* env)
static int
node_new_update_var_gimmick(Node** node, enum UpdateVarType update_var_type,
- int id, ScanEnv* env)
+ int id, ParseEnv* env)
{
*node = node_new();
CHECK_NULL_RETURN_MEMERR(*node);
@@ -2689,7 +2759,7 @@ node_new_update_var_gimmick(Node** node, enum UpdateVarType update_var_type,
}
static int
-node_new_keep(Node** node, ScanEnv* env)
+node_new_keep(Node** node, ParseEnv* env)
{
int r;
@@ -2743,7 +2813,7 @@ onig_reg_callout_list_at(regex_t* reg, int num)
}
static int
-reg_callout_list_entry(ScanEnv* env, int* rnum)
+reg_callout_list_entry(ParseEnv* env, int* rnum)
{
#define INIT_CALLOUT_LIST_NUM 3
@@ -2795,7 +2865,7 @@ reg_callout_list_entry(ScanEnv* env, int* rnum)
static int
node_new_callout(Node** node, OnigCalloutOf callout_of, int num, int id,
- ScanEnv* env)
+ ParseEnv* env)
{
*node = node_new();
CHECK_NULL_RETURN_MEMERR(*node);
@@ -2811,7 +2881,7 @@ node_new_callout(Node** node, OnigCalloutOf callout_of, int num, int id,
#endif
static int
-make_text_segment(Node** node, ScanEnv* env)
+make_text_segment(Node** node, ParseEnv* env)
{
int r;
int i;
@@ -2868,7 +2938,7 @@ make_text_segment(Node** node, ScanEnv* env)
static int
make_absent_engine(Node** node, int pre_save_right_id, Node* absent,
Node* step_one, int lower, int upper, int possessive,
- int is_range_cutter, ScanEnv* env)
+ int is_range_cutter, ParseEnv* env)
{
int r;
int i;
@@ -2950,7 +3020,7 @@ make_absent_engine(Node** node, int pre_save_right_id, Node* absent,
static int
make_absent_tail(Node** node1, Node** node2, int pre_save_right_id,
- ScanEnv* env)
+ ParseEnv* env)
{
int r;
int id;
@@ -2998,7 +3068,7 @@ make_absent_tail(Node** node1, Node** node2, int pre_save_right_id,
}
static int
-make_range_clear(Node** node, ScanEnv* env)
+make_range_clear(Node** node, ParseEnv* env)
{
int r;
int id;
@@ -3057,7 +3127,7 @@ make_range_clear(Node** node, ScanEnv* env)
static int
is_simple_one_char_repeat(Node* node, Node** rquant, Node** rbody,
- int* is_possessive, ScanEnv* env)
+ int* is_possessive, ParseEnv* env)
{
Node* quant;
Node* body;
@@ -3123,8 +3193,8 @@ is_simple_one_char_repeat(Node* node, Node** rquant, Node** rbody,
}
static int
-make_absent_tree_for_simple_one_char_repeat(Node** node, Node* absent, Node* quant,
- Node* body, int possessive, ScanEnv* env)
+make_absent_tree_for_simple_one_char_repeat(Node** node,
+ Node* absent, Node* quant, Node* body, int possessive, ParseEnv* env)
{
int r;
int i;
@@ -3171,7 +3241,7 @@ make_absent_tree_for_simple_one_char_repeat(Node** node, Node* absent, Node* qua
static int
make_absent_tree(Node** node, Node* absent, Node* expr, int is_range_cutter,
- ScanEnv* env)
+ ParseEnv* env)
{
int r;
int i;
@@ -3844,7 +3914,7 @@ add_code_range_to_buf(BBuf** pbuf, OnigCodePoint from, OnigCodePoint to)
}
static int
-add_code_range(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to)
+add_code_range(BBuf** pbuf, ParseEnv* env, OnigCodePoint from, OnigCodePoint to)
{
if (from > to) {
if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
@@ -4172,7 +4242,7 @@ or_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc)
}
static OnigCodePoint
-conv_backslash_value(OnigCodePoint c, ScanEnv* env)
+conv_backslash_value(OnigCodePoint c, ParseEnv* env)
{
if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_CONTROL_CHARS)) {
switch (c) {
@@ -4258,10 +4328,10 @@ enum ReduceType {
RQ_ASIS = 0, /* as is */
RQ_DEL = 1, /* delete parent */
RQ_A, /* to '*' */
+ RQ_P, /* to '+' */
RQ_AQ, /* to '*?' */
RQ_QQ, /* to '??' */
RQ_P_QQ, /* to '+)??' */
- RQ_PQ_Q /* to '+?)?' */
};
static enum ReduceType ReduceTypeTable[6][6] = {
@@ -4270,7 +4340,7 @@ static enum ReduceType ReduceTypeTable[6][6] = {
{RQ_A, RQ_A, RQ_DEL, RQ_ASIS, RQ_P_QQ, RQ_DEL}, /* '+' */
{RQ_DEL, RQ_AQ, RQ_AQ, RQ_DEL, RQ_AQ, RQ_AQ}, /* '??' */
{RQ_DEL, RQ_DEL, RQ_DEL, RQ_DEL, RQ_DEL, RQ_DEL}, /* '*?' */
- {RQ_ASIS, RQ_PQ_Q, RQ_DEL, RQ_AQ, RQ_AQ, RQ_DEL} /* '+?' */
+ {RQ_ASIS, RQ_A, RQ_P, RQ_AQ, RQ_AQ, RQ_DEL} /* '+?' */
};
extern int
@@ -4309,6 +4379,11 @@ onig_reduce_nested_quantifier(Node* pnode)
p->lower = 0; p->upper = INFINITE_REPEAT; p->greedy = 1;
goto remove_cnode;
break;
+ case RQ_P:
+ NODE_BODY(pnode) = NODE_BODY(cnode);
+ p->lower = 1; p->upper = INFINITE_REPEAT; p->greedy = 1;
+ goto remove_cnode;
+ break;
case RQ_AQ:
NODE_BODY(pnode) = NODE_BODY(cnode);
p->lower = 0; p->upper = INFINITE_REPEAT; p->greedy = 0;
@@ -4323,10 +4398,6 @@ onig_reduce_nested_quantifier(Node* pnode)
p->lower = 0; p->upper = 1; p->greedy = 0;
c->lower = 1; c->upper = INFINITE_REPEAT; c->greedy = 1;
break;
- case RQ_PQ_Q:
- p->lower = 0; p->upper = 1; p->greedy = 1;
- c->lower = 1; c->upper = INFINITE_REPEAT; c->greedy = 0;
- break;
case RQ_ASIS:
break;
}
@@ -4340,7 +4411,7 @@ onig_reduce_nested_quantifier(Node* pnode)
}
static int
-node_new_general_newline(Node** node, ScanEnv* env)
+node_new_general_newline(Node** node, ParseEnv* env)
{
int r;
int dlen, alen;
@@ -4472,7 +4543,7 @@ ptoken_init(PToken* tok)
}
static int
-fetch_interval(UChar** src, UChar* end, PToken* tok, ScanEnv* env)
+fetch_interval(UChar** src, UChar* end, PToken* tok, ParseEnv* env)
{
int low, up, syn_allow, non_low = 0;
int r = 0;
@@ -4575,7 +4646,8 @@ fetch_interval(UChar** src, UChar* end, PToken* tok, ScanEnv* env)
/* \M-, \C-, \c, or \... */
static int
-fetch_escaped_value_raw(UChar** src, UChar* end, ScanEnv* env, OnigCodePoint* val)
+fetch_escaped_value_raw(UChar** src, UChar* end, ParseEnv* env,
+ OnigCodePoint* val)
{
int v;
OnigCodePoint c;
@@ -4646,7 +4718,7 @@ fetch_escaped_value_raw(UChar** src, UChar* end, ScanEnv* env, OnigCodePoint* va
}
static int
-fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env, OnigCodePoint* val)
+fetch_escaped_value(UChar** src, UChar* end, ParseEnv* env, OnigCodePoint* val)
{
int r;
int len;
@@ -4660,7 +4732,7 @@ fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env, OnigCodePoint* val)
return 0;
}
-static int fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env);
+static int fetch_token(PToken* tok, UChar** src, UChar* end, ParseEnv* env);
static OnigCodePoint
get_name_end_code_point(OnigCodePoint start)
@@ -4691,7 +4763,7 @@ enum REF_NUM {
*/
static int
fetch_name_with_level(OnigCodePoint start_code, UChar** src, UChar* end,
- UChar** rname_end, ScanEnv* env,
+ UChar** rname_end, ParseEnv* env,
int* rback_num, int* rlevel, enum REF_NUM* num_type)
{
int r, sign, exist_level;
@@ -4825,7 +4897,7 @@ fetch_name_with_level(OnigCodePoint start_code, UChar** src, UChar* end,
*/
static int
fetch_name(OnigCodePoint start_code, UChar** src, UChar* end,
- UChar** rname_end, ScanEnv* env, int* rback_num,
+ UChar** rname_end, ParseEnv* env, int* rback_num,
enum REF_NUM* num_type, int is_ref)
{
int r, sign;
@@ -4957,7 +5029,7 @@ fetch_name(OnigCodePoint start_code, UChar** src, UChar* end,
}
static void
-CC_ESC_WARN(ScanEnv* env, UChar *c)
+CC_ESC_WARN(ParseEnv* env, UChar *c)
{
if (onig_warn == onig_null_warn) return ;
@@ -4973,7 +5045,7 @@ CC_ESC_WARN(ScanEnv* env, UChar *c)
}
static void
-CLOSE_BRACKET_WITHOUT_ESC_WARN(ScanEnv* env, UChar* c)
+CLOSE_BRACKET_WITHOUT_ESC_WARN(ParseEnv* env, UChar* c)
{
if (onig_warn == onig_null_warn) return ;
@@ -5054,11 +5126,12 @@ str_exist_check_with_esc(OnigCodePoint s[], int n, UChar* from, UChar* to,
}
static int
-fetch_token_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env, int state)
+fetch_token_cc(PToken* tok, UChar** src, UChar* end, ParseEnv* env, int state)
{
int r;
OnigCodePoint code;
OnigCodePoint c, c2;
+ int mindigits, maxdigits;
OnigSyntaxType* syn = env->syntax;
OnigEncoding enc = env->enc;
UChar* prev;
@@ -5247,10 +5320,11 @@ fetch_token_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env, int state)
case 'u':
if (PEND) break;
-
prev = p;
if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) {
- r = scan_hexadecimal_number(&p, end, 4, 4, enc, &code);
+ mindigits = maxdigits = 4;
+ u_hex_digits:
+ r = scan_hexadecimal_number(&p, end, mindigits, maxdigits, enc, &code);
if (r < 0) return r;
if (p == prev) { /* can't read nothing. */
code = 0; /* but, it's not error */
@@ -5261,6 +5335,15 @@ fetch_token_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env, int state)
}
break;
+ case 'U':
+ if (PEND) break;
+ prev = p;
+ if (IS_SYNTAX_BV(syn, ONIG_SYN_PYTHON)) {
+ mindigits = maxdigits = 8;
+ goto u_hex_digits;
+ }
+ break;
+
case '0':
case '1': case '2': case '3': case '4': case '5': case '6': case '7':
if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) {
@@ -5327,15 +5410,22 @@ fetch_token_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env, int state)
}
static int
-fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env)
+fetch_token(PToken* tok, UChar** src, UChar* end, ParseEnv* env)
{
int r;
OnigCodePoint code;
OnigCodePoint c;
- OnigEncoding enc = env->enc;
- OnigSyntaxType* syn = env->syntax;
+ int mindigits, maxdigits;
UChar* prev;
- UChar* p = *src;
+ int allow_num;
+ OnigEncoding enc;
+ OnigSyntaxType* syn;
+ UChar* p;
+
+ enc = env->enc;
+ syn = env->syntax;
+ p = *src;
+
PFETCH_READY;
if (tok->code_point_continue != 0) {
@@ -5574,12 +5664,20 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env)
break;
case 'Z':
- if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
- tok->type = TK_ANCHOR;
- tok->u.subtype = ANCR_SEMI_END_BUF;
+ if (IS_SYNTAX_BV(syn, ONIG_SYN_PYTHON)) {
+ goto end_buf;
+ }
+ else {
+ if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
+ tok->type = TK_ANCHOR;
+ tok->u.subtype = ANCR_SEMI_END_BUF;
+ }
break;
case 'z':
+ if (IS_SYNTAX_BV(syn, ONIG_SYN_PYTHON))
+ return ONIGERR_UNDEFINED_OPERATOR;
+
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
end_buf:
tok->type = TK_ANCHOR;
@@ -5668,10 +5766,11 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env)
case 'u':
if (PEND) break;
-
prev = p;
+ mindigits = maxdigits = 4;
if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) {
- r = scan_hexadecimal_number(&p, end, 4, 4, enc, &code);
+ u_hex_digits:
+ r = scan_hexadecimal_number(&p, end, mindigits, maxdigits, enc, &code);
if (r < 0) return r;
if (p == prev) { /* can't read nothing. */
code = 0; /* but, it's not error */
@@ -5682,6 +5781,15 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env)
}
break;
+ case 'U':
+ if (PEND) break;
+ prev = p;
+ if (IS_SYNTAX_BV(syn, ONIG_SYN_PYTHON)) {
+ mindigits = maxdigits = 8;
+ goto u_hex_digits;
+ }
+ break;
+
case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
PUNFETCH;
@@ -5694,7 +5802,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env)
if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_DECIMAL_BACKREF) &&
(r <= env->num_mem || r <= 9)) { /* This spec. from GNU regex */
if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
- if (r > env->num_mem || IS_NULL(SCANENV_MEMENV(env)[r].mem_node))
+ if (r > env->num_mem || IS_NULL(PARSEENV_MEMENV(env)[r].mem_node))
return ONIGERR_INVALID_BACKREF;
}
@@ -5743,6 +5851,9 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env)
int back_num;
enum REF_NUM num_type;
+ allow_num = 1;
+
+ backref_start:
prev = p;
#ifdef USE_BACKREF_WITH_LEVEL
@@ -5757,6 +5868,8 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env)
if (r < 0) return r;
if (num_type != IS_NOT_NUM) {
+ if (allow_num == 0) return ONIGERR_INVALID_BACKREF;
+
if (num_type == IS_REL_NUM) {
back_num = backref_rel_to_abs(back_num, env);
}
@@ -5765,7 +5878,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env)
if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
if (back_num > env->num_mem ||
- IS_NULL(SCANENV_MEMENV(env)[back_num].mem_node))
+ IS_NULL(PARSEENV_MEMENV(env)[back_num].mem_node))
return ONIGERR_INVALID_BACKREF;
}
tok->type = TK_BACKREF;
@@ -5782,7 +5895,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env)
int i;
for (i = 0; i < num; i++) {
if (backs[i] > env->num_mem ||
- IS_NULL(SCANENV_MEMENV(env)[backs[i]].mem_node))
+ IS_NULL(PARSEENV_MEMENV(env)[backs[i]].mem_node))
return ONIGERR_INVALID_BACKREF;
}
}
@@ -5813,12 +5926,17 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env)
UChar* name_end;
enum REF_NUM num_type;
+ allow_num = 1;
+
+ call_start:
prev = p;
r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env,
&gnum, &num_type, TRUE);
if (r < 0) return r;
if (num_type != IS_NOT_NUM) {
+ if (allow_num == 0) return ONIGERR_UNDEFINED_GROUP_REFERENCE;
+
if (num_type == IS_REL_NUM) {
gnum = backref_rel_to_abs(gnum, env);
if (gnum < 0) {
@@ -5975,6 +6093,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env)
case '(':
if (!PEND && PPEEK_IS('?') &&
IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) {
+ prev = p;
PINC;
if (! PEND) {
c = PPEEK;
@@ -6062,11 +6181,35 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env)
break;
}
}
+ else if (c == 'P' &&
+ IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_CAPITAL_P_NAME)) {
+ PINC; /* skip 'P' */
+ if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
+ PFETCH(c);
+ allow_num = 0;
+ if (c == '=') {
+ c = '(';
+ goto backref_start;
+ }
+ else if (c == '>') {
+#ifdef USE_CALL
+ c = '(';
+ goto call_start;
+#else
+ return ONIGERR_UNDEFINED_OPERATOR;
+#endif
+ }
+ else {
+ p = prev;
+ goto lparen_qmark_end2;
+ }
+ }
}
lparen_qmark_end:
PUNFETCH;
}
+ lparen_qmark_end2:
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break;
tok->type = TK_SUBEXP_OPEN;
break;
@@ -6295,7 +6438,7 @@ add_ctype_to_cc_by_range_limit(CClassNode* cc, int ctype ARG_UNUSED, int not,
}
static int
-add_ctype_to_cc(CClassNode* cc, int ctype, int not, ScanEnv* env)
+add_ctype_to_cc(CClassNode* cc, int ctype, int not, ParseEnv* env)
{
int c, r;
int ascii_mode;
@@ -6398,7 +6541,7 @@ add_ctype_to_cc(CClassNode* cc, int ctype, int not, ScanEnv* env)
}
static int
-prs_posix_bracket(CClassNode* cc, UChar** src, UChar* end, ScanEnv* env)
+prs_posix_bracket(CClassNode* cc, UChar** src, UChar* end, ParseEnv* env)
{
#define POSIX_BRACKET_CHECK_LIMIT_LENGTH 20
#define POSIX_BRACKET_NAME_MIN_LEN 4
@@ -6472,7 +6615,7 @@ prs_posix_bracket(CClassNode* cc, UChar** src, UChar* end, ScanEnv* env)
}
static int
-fetch_char_property_to_ctype(UChar** src, UChar* end, ScanEnv* env)
+fetch_char_property_to_ctype(UChar** src, UChar* end, ParseEnv* env)
{
int r;
OnigCodePoint c;
@@ -6507,7 +6650,8 @@ fetch_char_property_to_ctype(UChar** src, UChar* end, ScanEnv* env)
}
static int
-prs_char_property(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env)
+prs_char_property(Node** np, PToken* tok, UChar** src, UChar* end,
+ ParseEnv* env)
{
int r, ctype;
CClassNode* cc;
@@ -6528,7 +6672,7 @@ prs_char_property(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env)
static int
cc_cprop_next(CClassNode* cc, OnigCodePoint* pcode, CVAL* val, CSTATE* state,
- ScanEnv* env)
+ ParseEnv* env)
{
int r;
@@ -6552,7 +6696,7 @@ cc_cprop_next(CClassNode* cc, OnigCodePoint* pcode, CVAL* val, CSTATE* state,
static int
cc_char_next(CClassNode* cc, OnigCodePoint *from, OnigCodePoint to,
int* from_raw, int to_raw, CVAL intype, CVAL* type,
- CSTATE* state, ScanEnv* env)
+ CSTATE* state, ParseEnv* env)
{
int r;
@@ -6621,7 +6765,7 @@ cc_char_next(CClassNode* cc, OnigCodePoint *from, OnigCodePoint to,
static int
code_exist_check(OnigCodePoint c, UChar* from, UChar* end, int ignore_escaped,
- ScanEnv* env)
+ ParseEnv* env)
{
int in_esc;
OnigCodePoint code;
@@ -6643,7 +6787,7 @@ code_exist_check(OnigCodePoint c, UChar* from, UChar* end, int ignore_escaped,
}
static int
-prs_cc(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env)
+prs_cc(Node** np, PToken* tok, UChar** src, UChar* end, ParseEnv* env)
{
int r, neg, len, fetched, and_start;
OnigCodePoint in_code, curr_code;
@@ -6995,13 +7139,14 @@ prs_cc(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env)
}
static int prs_alts(Node** top, PToken* tok, int term,
- UChar** src, UChar* end, ScanEnv* env, int group_head);
+ UChar** src, UChar* end, ParseEnv* env, int group_head);
#ifdef USE_CALLOUT
/* (?{...}[tag][+-]) (?{{...}}[tag][+-]) */
static int
-prs_callout_of_contents(Node** np, int cterm, UChar** src, UChar* end, ScanEnv* env)
+prs_callout_of_contents(Node** np, int cterm, UChar** src, UChar* end,
+ ParseEnv* env)
{
int r;
int i;
@@ -7184,7 +7329,7 @@ clear_callout_args(int n, unsigned int types[], OnigValue vals[])
static int
prs_callout_args(int skip_mode, int cterm, UChar** src, UChar* end,
int max_arg_num, unsigned int types[], OnigValue vals[],
- ScanEnv* env)
+ ParseEnv* env)
{
#define MAX_CALLOUT_ARG_BYTE_LENGTH 128
@@ -7347,7 +7492,8 @@ prs_callout_args(int skip_mode, int cterm, UChar** src, UChar* end,
/* (*name[TAG]) (*name[TAG]{a,b,..}) */
static int
-prs_callout_of_name(Node** np, int cterm, UChar** src, UChar* end, ScanEnv* env)
+prs_callout_of_name(Node** np, int cterm, UChar** src, UChar* end,
+ ParseEnv* env)
{
int r;
int i;
@@ -7514,7 +7660,7 @@ prs_callout_of_name(Node** np, int cterm, UChar** src, UChar* end, ScanEnv* env)
static int
prs_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end,
- ScanEnv* env)
+ ParseEnv* env)
{
int r, num;
Node *target;
@@ -7747,7 +7893,7 @@ prs_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end,
if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_STRICT_CHECK_BACKREF)) {
if (back_num > env->num_mem ||
- IS_NULL(SCANENV_MEMENV(env)[back_num].mem_node))
+ IS_NULL(PARSEENV_MEMENV(env)[back_num].mem_node))
return ONIGERR_INVALID_BACKREF;
}
@@ -7769,7 +7915,7 @@ prs_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end,
int i;
for (i = 0; i < num; i++) {
if (backs[i] > env->num_mem ||
- IS_NULL(SCANENV_MEMENV(env)[backs[i]].mem_node))
+ IS_NULL(PARSEENV_MEMENV(env)[backs[i]].mem_node))
return ONIGERR_INVALID_BACKREF;
}
}
@@ -7932,12 +8078,26 @@ prs_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end,
break;
#endif
+ case 'P':
+ if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_CAPITAL_P_NAME)) {
+ if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
+ PFETCH(c);
+ if (c == '<') goto named_group1;
+
+ return ONIGERR_UNDEFINED_GROUP_OPTION;
+ }
+ /* else fall */
+ case 'W': case 'D': case 'S':
+ case 'y':
+ if (! IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_ONIGURUMA))
+ return ONIGERR_UNDEFINED_GROUP_OPTION;
+ /* else fall */
+
#ifdef USE_POSIXLINE_OPTION
case 'p':
#endif
+ case 'a':
case '-': case 'i': case 'm': case 's': case 'x':
- case 'W': case 'D': case 'S': case 'P':
- case 'y':
{
int neg = 0;
@@ -7974,10 +8134,26 @@ prs_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end,
OPTION_NEGATE(option, ONIG_OPTION_MULTILINE|ONIG_OPTION_SINGLELINE, neg);
break;
#endif
- case 'W': OPTION_NEGATE(option, ONIG_OPTION_WORD_IS_ASCII, neg); break;
- case 'D': OPTION_NEGATE(option, ONIG_OPTION_DIGIT_IS_ASCII, neg); break;
- case 'S': OPTION_NEGATE(option, ONIG_OPTION_SPACE_IS_ASCII, neg); break;
- case 'P': OPTION_NEGATE(option, ONIG_OPTION_POSIX_IS_ASCII, neg); break;
+ case 'W':
+ if (! IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_ONIGURUMA))
+ return ONIGERR_UNDEFINED_GROUP_OPTION;
+ OPTION_NEGATE(option, ONIG_OPTION_WORD_IS_ASCII, neg);
+ break;
+ case 'D':
+ if (! IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_ONIGURUMA))
+ return ONIGERR_UNDEFINED_GROUP_OPTION;
+ OPTION_NEGATE(option, ONIG_OPTION_DIGIT_IS_ASCII, neg);
+ break;
+ case 'S':
+ if (! IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_ONIGURUMA))
+ return ONIGERR_UNDEFINED_GROUP_OPTION;
+ OPTION_NEGATE(option, ONIG_OPTION_SPACE_IS_ASCII, neg);
+ break;
+ case 'P':
+ if (! IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_ONIGURUMA))
+ return ONIGERR_UNDEFINED_GROUP_OPTION;
+ OPTION_NEGATE(option, ONIG_OPTION_POSIX_IS_ASCII, neg);
+ break;
case 'y': /* y{g}, y{w} */
{
@@ -8016,8 +8192,15 @@ prs_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end,
PFETCH(c);
if (c != '}')
return ONIGERR_UNDEFINED_GROUP_OPTION;
- break;
} /* case 'y' */
+ break;
+
+ case 'a':
+ if (! IS_SYNTAX_BV(env->syntax, ONIG_SYN_PYTHON))
+ return ONIGERR_UNDEFINED_GROUP_OPTION;
+
+ OPTION_NEGATE(option, ONIG_OPTION_POSIX_IS_ASCII, neg);
+ break;
default:
return ONIGERR_UNDEFINED_GROUP_OPTION;
@@ -8112,7 +8295,7 @@ static const char* ReduceQStr[] = {
};
static int
-assign_quantifier_body(Node* qnode, Node* target, int group, ScanEnv* env)
+assign_quantifier_body(Node* qnode, Node* target, int group, ParseEnv* env)
{
QuantNode* qn;
@@ -8260,35 +8443,38 @@ onig_new_cclass_with_code_list(Node** rnode, OnigEncoding enc,
}
typedef struct {
- ScanEnv* env;
+ ParseEnv* env;
CClassNode* cc;
Node* alt_root;
Node** ptail;
} IApplyCaseFoldArg;
static int
-i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[], int to_len, void* arg)
+i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[], int to_len,
+ void* arg)
{
IApplyCaseFoldArg* iarg;
- ScanEnv* env;
+ ParseEnv* env;
+ OnigEncoding enc;
CClassNode* cc;
iarg = (IApplyCaseFoldArg* )arg;
env = iarg->env;
cc = iarg->cc;
+ enc = env->enc;
if (to_len == 1) {
- int is_in = onig_is_code_in_cc(env->enc, from, cc);
+ int is_in = onig_is_code_in_cc(enc, from, cc);
#ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
if ((is_in != 0 && !IS_NCCLASS_NOT(cc)) ||
(is_in == 0 && IS_NCCLASS_NOT(cc))) {
- ADD_CODE_INTO_CC(cc, *to, env->enc);
+ ADD_CODE_INTO_CC(cc, *to, enc);
}
#else
if (is_in != 0) {
- if (ONIGENC_MBC_MINLEN(env->enc) > 1 ||
- ONIGENC_CODE_TO_MBCLEN(env->enc, *to) != 1) {
- if (IS_NCCLASS_NOT(cc)) clear_not_flag_cclass(cc, env->enc);
+ if (ONIGENC_MBC_MINLEN(enc) > 1 ||
+ ONIGENC_CODE_TO_MBCLEN(enc, *to) != 1) {
+ if (IS_NCCLASS_NOT(cc)) clear_not_flag_cclass(cc, enc);
add_code_range(&(cc->mbuf), env, *to, *to);
}
else {
@@ -8305,7 +8491,7 @@ i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[], int to_len, void* arg)
int r, i, len;
UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
- if (onig_is_code_in_cc(env->enc, from, cc)
+ if (onig_is_code_in_cc(enc, from, cc)
#ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
&& !IS_NCCLASS_NOT(cc)
#endif
@@ -8320,8 +8506,9 @@ i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[], int to_len, void* arg)
Node* csnode;
CClassNode* cs_cc;
- index = onigenc_unicode_fold1_key(&to[i]);
- if (index >= 0) {
+ index = 0;
+ if (ONIGENC_IS_UNICODE_ENCODING(enc) &&
+ (index = onigenc_unicode_fold1_key(&to[i])) >= 0) {
csnode = node_new_cclass();
cs_cc = CCLASS_(csnode);
if (IS_NULL(csnode)) {
@@ -8332,18 +8519,22 @@ i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[], int to_len, void* arg)
m = FOLDS1_UNFOLDS_NUM(index);
for (j = 0; j < m; j++) {
code = FOLDS1_UNFOLDS(index)[j];
- ADD_CODE_INTO_CC(cs_cc, code, env->enc);
+ ADD_CODE_INTO_CC(cs_cc, code, enc);
}
- ADD_CODE_INTO_CC(cs_cc, to[i], env->enc);
+ ADD_CODE_INTO_CC(cs_cc, to[i], enc);
ns[n++] = csnode;
}
else {
- len = ONIGENC_CODE_TO_MBC(env->enc, to[i], buf);
+ len = ONIGENC_CODE_TO_MBC(enc, to[i], buf);
if (n == 0 || NODE_TYPE(ns[n-1]) != NODE_STRING) {
csnode = node_new_str(buf, buf + len);
if (IS_NULL(csnode)) goto err_free_ns;
- NODE_STRING_SET_CASE_EXPANDED(csnode);
+ if (index == 0)
+ NODE_STATUS_ADD(csnode, IGNORECASE);
+ else
+ NODE_STRING_SET_CASE_EXPANDED(csnode);
+
ns[n++] = csnode;
}
else {
@@ -8372,7 +8563,7 @@ i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[], int to_len, void* arg)
static int
prs_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end,
- ScanEnv* env, int group_head)
+ ParseEnv* env, int group_head)
{
int r, len, group;
Node* qn;
@@ -8778,7 +8969,7 @@ prs_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end,
static int
prs_branch(Node** top, PToken* tok, int term, UChar** src, UChar* end,
- ScanEnv* env, int group_head)
+ ParseEnv* env, int group_head)
{
int r;
Node *node, **headp;
@@ -8829,7 +9020,7 @@ prs_branch(Node** top, PToken* tok, int term, UChar** src, UChar* end,
/* term_tok: TK_EOT or TK_SUBEXP_CLOSE */
static int
prs_alts(Node** top, PToken* tok, int term, UChar** src, UChar* end,
- ScanEnv* env, int group_head)
+ ParseEnv* env, int group_head)
{
int r;
Node *node, **headp;
@@ -8892,7 +9083,7 @@ prs_alts(Node** top, PToken* tok, int term, UChar** src, UChar* end,
}
static int
-prs_regexp(Node** top, UChar** src, UChar* end, ScanEnv* env)
+prs_regexp(Node** top, UChar** src, UChar* end, ParseEnv* env)
{
int r;
PToken tok;
@@ -8908,7 +9099,7 @@ prs_regexp(Node** top, UChar** src, UChar* end, ScanEnv* env)
#ifdef USE_CALL
static int
-make_call_zero_body(Node* node, ScanEnv* env, Node** rnode)
+make_call_zero_body(Node* node, ParseEnv* env, Node** rnode)
{
int r;
@@ -8930,7 +9121,7 @@ make_call_zero_body(Node* node, ScanEnv* env, Node** rnode)
extern int
onig_parse_tree(Node** root, const UChar* pattern, const UChar* end,
- regex_t* reg, ScanEnv* env)
+ regex_t* reg, ParseEnv* env)
{
int r;
UChar* p;
@@ -8945,7 +9136,6 @@ onig_parse_tree(Node** root, const UChar* pattern, const UChar* end,
reg->num_empty_check = 0;
reg->repeat_range_alloc = 0;
reg->repeat_range = (RepeatRange* )NULL;
- reg->empty_status_mem = 0;
names_clear(reg);
@@ -8990,7 +9180,7 @@ onig_parse_tree(Node** root, const UChar* pattern, const UChar* end,
}
extern void
-onig_scan_env_set_error_string(ScanEnv* env, int ecode ARG_UNUSED,
+onig_scan_env_set_error_string(ParseEnv* env, int ecode ARG_UNUSED,
UChar* arg, UChar* arg_end)
{
env->error = arg;
diff --git a/src/regparse.h b/src/regparse.h
index c60a42d..8875f78 100644
--- a/src/regparse.h
+++ b/src/regparse.h
@@ -4,7 +4,7 @@
regparse.h - Oniguruma (regular expression library)
**********************************************************************/
/*-
- * Copyright (c) 2002-2020 K.Kosako
+ * Copyright (c) 2002-2021 K.Kosako
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -73,6 +73,14 @@ enum BodyEmptyType {
BODY_MAY_BE_EMPTY_REC = 3
};
+/* bytes buffer: one UChar pointer plus two unsigned counters */
+typedef struct _BBuf {
+ UChar* p; /* pointer to the byte storage */
+ unsigned int used; /* NOTE(review): presumably number of bytes in use at p -- confirm against the buffer helpers */
+ unsigned int alloc; /* NOTE(review): presumably number of bytes allocated at p -- confirm against the buffer helpers */
+} BBuf;
+
+
struct _Node;
typedef struct {
@@ -110,6 +118,7 @@ typedef struct {
struct _Node* head_exact;
struct _Node* next_head_exact;
int include_referred; /* include called node. don't eliminate even if {0} */
+ MemStatusType empty_status_mem;
} QuantNode;
typedef struct {
@@ -340,6 +349,7 @@ typedef struct {
#define NODE_ST_ABSENT_WITH_SIDE_EFFECTS (1<<24) /* stopper or clear */
#define NODE_ST_FIXED_CLEN_MIN_SURE (1<<25)
#define NODE_ST_REFERENCED (1<<26)
+#define NODE_ST_INPEEK (1<<27)
#define NODE_STATUS(node) (((Node* )node)->u.base.status)
@@ -376,6 +386,7 @@ typedef struct {
#define NODE_IS_ABSENT_WITH_SIDE_EFFECTS(node) ((NODE_STATUS(node) & NODE_ST_ABSENT_WITH_SIDE_EFFECTS) != 0)
#define NODE_IS_FIXED_CLEN_MIN_SURE(node) ((NODE_STATUS(node) & NODE_ST_FIXED_CLEN_MIN_SURE) != 0)
#define NODE_IS_REFERENCED(node) ((NODE_STATUS(node) & NODE_ST_REFERENCED) != 0)
+#define NODE_IS_INPEEK(node) ((NODE_STATUS(node) & NODE_ST_INPEEK) != 0)
#define NODE_PARENT(node) ((node)->u.base.parent)
#define NODE_BODY(node) ((node)->u.base.body)
@@ -384,8 +395,8 @@ typedef struct {
#define NODE_CALL_BODY(node) ((node)->body)
#define NODE_ANCHOR_BODY(node) ((node)->body)
-#define SCANENV_MEMENV_SIZE 8
-#define SCANENV_MEMENV(senv) \
+#define PARSEENV_MEMENV_SIZE 8
+#define PARSEENV_MEMENV(senv) \
(IS_NOT_NULL((senv)->mem_env_dynamic) ? \
(senv)->mem_env_dynamic : (senv)->mem_env_static)
@@ -424,7 +435,7 @@ typedef struct {
int num_mem;
int num_named;
int mem_alloc;
- MemEnv mem_env_static[SCANENV_MEMENV_SIZE];
+ MemEnv mem_env_static[PARSEENV_MEMENV_SIZE];
MemEnv* mem_env_dynamic;
int backref_num;
int keep_num;
@@ -439,14 +450,14 @@ typedef struct {
#ifdef ONIG_DEBUG_PARSE
unsigned int max_parse_depth;
#endif
-} ScanEnv;
+} ParseEnv;
extern int onig_renumber_name_table P_((regex_t* reg, GroupNumMap* map));
extern int onig_strncmp P_((const UChar* s1, const UChar* s2, int n));
extern void onig_strcpy P_((UChar* dest, const UChar* src, const UChar* end));
-extern void onig_scan_env_set_error_string P_((ScanEnv* env, int ecode, UChar* arg, UChar* arg_end));
+extern void onig_scan_env_set_error_string P_((ParseEnv* env, int ecode, UChar* arg, UChar* arg_end));
extern int onig_reduce_nested_quantifier P_((Node* pnode));
extern int onig_node_copy(Node** rcopy, Node* from);
extern int onig_node_str_cat P_((Node* node, const UChar* s, const UChar* end));
@@ -460,7 +471,7 @@ extern Node* onig_node_new_str P_((const UChar* s, const UChar* end));
extern Node* onig_node_new_list P_((Node* left, Node* right));
extern Node* onig_node_new_alt P_((Node* left, Node* right));
extern int onig_names_free P_((regex_t* reg));
-extern int onig_parse_tree P_((Node** root, const UChar* pattern, const UChar* end, regex_t* reg, ScanEnv* env));
+extern int onig_parse_tree P_((Node** root, const UChar* pattern, const UChar* end, regex_t* reg, ParseEnv* env));
extern int onig_free_shared_cclass_table P_((void));
extern int onig_is_code_in_cc P_((OnigEncoding enc, OnigCodePoint code, CClassNode* cc));
extern int onig_new_cclass_with_code_list(Node** rnode, OnigEncoding enc, int n, OnigCodePoint codes[]);
diff --git a/src/regposix.c b/src/regposix.c
index 497ba02..494446f 100644
--- a/src/regposix.c
+++ b/src/regposix.c
@@ -2,7 +2,7 @@
regposix.c - Oniguruma (regular expression library)
**********************************************************************/
/*-
- * Copyright (c) 2002-2020 K.Kosako
+ * Copyright (c) 2002-2021 K.Kosako
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -120,6 +120,7 @@ onig2posix_error_code(int code)
{ ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED, REG_BADPAT },
{ ONIGERR_TOO_BIG_WIDE_CHAR_VALUE, REG_EONIG_BADWC },
{ ONIGERR_TOO_LONG_WIDE_CHAR_VALUE, REG_EONIG_BADWC },
+ { ONIGERR_UNDEFINED_OPERATOR, REG_BADPAT },
{ ONIGERR_INVALID_CODE_POINT_VALUE, REG_EONIG_BADWC },
{ ONIGERR_EMPTY_GROUP_NAME, REG_BADPAT },
{ ONIGERR_INVALID_GROUP_NAME, REG_BADPAT },
@@ -141,6 +142,7 @@ onig2posix_error_code(int code)
{ ONIGERR_INVALID_CALLOUT_TAG_NAME, REG_BADPAT },
{ ONIGERR_INVALID_CALLOUT_ARG, REG_BADPAT },
{ ONIGERR_NOT_SUPPORTED_ENCODING_COMBINATION, REG_EONIG_BADARG },
+ { ONIGERR_VERY_INEFFICIENT_PATTERN, REG_BADPAT },
{ ONIGERR_LIBRARY_IS_NOT_INITIALIZED, REG_EONIG_INTERNAL }
};
diff --git a/src/regsyntax.c b/src/regsyntax.c
index 984aac6..8e1c313 100644
--- a/src/regsyntax.c
+++ b/src/regsyntax.c
@@ -2,7 +2,7 @@
regsyntax.c - Oniguruma (regular expression library)
**********************************************************************/
/*-
- * Copyright (c) 2002-2020 K.Kosako
+ * Copyright (c) 2002-2021 K.Kosako
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -240,6 +240,35 @@ OnigSyntaxType OnigSyntaxPerl_NG = {
}
};
+/* Python 3.9 syntax definition.  Sets ONIG_SYN_PYTHON, which the parser tests with IS_SYNTAX_BV() for the \Z, \z, \U and (?a) handling, and ONIG_SYN_OP2_QMARK_CAPITAL_P_NAME, which gates the (?P...) forms. */
+OnigSyntaxType OnigSyntaxPython = {
+ (( SYN_GNU_REGEX_OP | ONIG_SYN_OP_QMARK_NON_GREEDY | /* 1st initializer: ONIG_SYN_OP_* flags on top of SYN_GNU_REGEX_OP */
+ ONIG_SYN_OP_ESC_OCTAL3 | ONIG_SYN_OP_ESC_X_HEX2 |
+ ONIG_SYN_OP_ESC_CONTROL_CHARS |
+ ONIG_SYN_OP_ESC_C_CONTROL )
+ & ~ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END ) /* clear this one flag from the set built above */
+ , ( ONIG_SYN_OP2_QMARK_GROUP_EFFECT | ONIG_SYN_OP2_OPTION_PERL | /* 2nd initializer: ONIG_SYN_OP2_* flags */
+ ONIG_SYN_OP2_QMARK_LPAREN_IF_ELSE |
+ ONIG_SYN_OP2_ASTERISK_CALLOUT_NAME |
+ ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY |
+ ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT |
+ ONIG_SYN_OP2_QMARK_CAPITAL_P_NAME |
+ ONIG_SYN_OP2_ESC_CAPITAL_K_KEEP |
+ ONIG_SYN_OP2_ESC_V_VTAB | ONIG_SYN_OP2_ESC_U_HEX4 )
+ , ( SYN_GNU_REGEX_BV | ONIG_SYN_ISOLATED_OPTION_CONTINUE_BRANCH | /* 3rd initializer: flags queried through IS_SYNTAX_BV() */
+ ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV | ONIG_SYN_PYTHON )
+ , ONIG_OPTION_SINGLELINE /* 4th initializer: a single ONIG_OPTION_* value */
+ ,
+ { /* 5th initializer: meta-character table; only the escape character is set */
+ (OnigCodePoint )'\\' /* esc */
+ , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */
+ , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */
+ , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */
+ , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */
+ , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */
+ }
+};
+
extern int
diff --git a/src/unicode.c b/src/unicode.c
index 6703d4b..efe5f73 100644
--- a/src/unicode.c
+++ b/src/unicode.c
@@ -2,7 +2,7 @@
unicode.c - Oniguruma (regular expression library)
**********************************************************************/
/*-
- * Copyright (c) 2002-2019 K.Kosako
+ * Copyright (c) 2002-2020 K.Kosako
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -77,9 +77,8 @@ static const unsigned short EncUNICODE_ISO_8859_1_CtypeTable[256] = {
#include "unicode_fold_data.c"
extern int
-onigenc_unicode_mbc_case_fold(OnigEncoding enc,
- OnigCaseFoldType flag ARG_UNUSED, const UChar** pp, const UChar* end,
- UChar* fold)
+onigenc_unicode_mbc_case_fold(OnigEncoding enc, OnigCaseFoldType flag,
+ const UChar** pp, const UChar* end, UChar* fold)
{
const struct ByUnfoldKey* buk;
@@ -104,23 +103,27 @@ onigenc_unicode_mbc_case_fold(OnigEncoding enc,
}
#endif
- buk = onigenc_unicode_unfold_key(code);
- if (buk != 0) {
- if (buk->fold_len == 1) {
- return ONIGENC_CODE_TO_MBC(enc, *FOLDS1_FOLD(buk->index), fold);
- }
- else {
- OnigCodePoint* addr;
-
- FOLDS_FOLD_ADDR_BUK(buk, addr);
- rlen = 0;
- for (i = 0; i < buk->fold_len; i++) {
- OnigCodePoint c = addr[i];
- len = ONIGENC_CODE_TO_MBC(enc, c, fold);
- fold += len;
- rlen += len;
+ if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(code)) {
+ buk = onigenc_unicode_unfold_key(code);
+ if (buk != 0) {
+ if (buk->fold_len == 1) {
+ if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) ||
+ ONIGENC_IS_ASCII_CODE(*FOLDS1_FOLD(buk->index)))
+ return ONIGENC_CODE_TO_MBC(enc, *FOLDS1_FOLD(buk->index), fold);
+ }
+ else {
+ OnigCodePoint* addr;
+
+ FOLDS_FOLD_ADDR_BUK(buk, addr);
+ rlen = 0;
+ for (i = 0; i < buk->fold_len; i++) {
+ OnigCodePoint c = addr[i];
+ len = ONIGENC_CODE_TO_MBC(enc, c, fold);
+ fold += len;
+ rlen += len;
+ }
+ return rlen;
}
- return rlen;
}
}
@@ -131,16 +134,22 @@ onigenc_unicode_mbc_case_fold(OnigEncoding enc,
}
static int
-apply_case_fold1(int from, int to, OnigApplyAllCaseFoldFunc f, void* arg)
+apply_case_fold1(OnigCaseFoldType flag, int from, int to,
+ OnigApplyAllCaseFoldFunc f, void* arg)
{
int i, j, k, n, r;
for (i = from; i < to; ) {
OnigCodePoint fold = *FOLDS1_FOLD(i);
+ if (CASE_FOLD_IS_ASCII_ONLY(flag) && ! ONIGENC_IS_ASCII_CODE(fold)) break;
+
n = FOLDS1_UNFOLDS_NUM(i);
for (j = 0; j < n; j++) {
OnigCodePoint unfold = FOLDS1_UNFOLDS(i)[j];
+ if (CASE_FOLD_IS_ASCII_ONLY(flag) && ! ONIGENC_IS_ASCII_CODE(unfold))
+ continue;
+
r = (*f)(fold, &unfold, 1, arg);
if (r != 0) return r;
r = (*f)(unfold, &fold, 1, arg);
@@ -148,6 +157,9 @@ apply_case_fold1(int from, int to, OnigApplyAllCaseFoldFunc f, void* arg)
for (k = 0; k < j; k++) {
OnigCodePoint unfold2 = FOLDS1_UNFOLDS(i)[k];
+ if (CASE_FOLD_IS_ASCII_ONLY(flag) &&
+ ! ONIGENC_IS_ASCII_CODE(unfold2)) continue;
+
r = (*f)(unfold, &unfold2, 1, arg);
if (r != 0) return r;
r = (*f)(unfold2, &unfold, 1, arg);
@@ -225,7 +237,7 @@ onigenc_unicode_apply_all_case_fold(OnigCaseFoldType flag,
{
int r;
- r = apply_case_fold1(0, FOLDS1_NORMAL_END_INDEX, f, arg);
+ r = apply_case_fold1(flag, 0, FOLDS1_NORMAL_END_INDEX, f, arg);
if (r != 0) return r;
#ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
@@ -246,7 +258,7 @@ onigenc_unicode_apply_all_case_fold(OnigCaseFoldType flag,
}
else {
#endif
- r = apply_case_fold1(FOLDS1_NORMAL_END_INDEX, FOLDS1_END_INDEX, f, arg);
+ r = apply_case_fold1(flag, FOLDS1_NORMAL_END_INDEX, FOLDS1_END_INDEX, f, arg);
if (r != 0) return r;
#ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
}
@@ -288,6 +300,9 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc,
n = 0;
code = ONIGENC_MBC_TO_CODE(enc, p, end);
+ if (CASE_FOLD_IS_ASCII_ONLY(flag)) {
+ if (! ONIGENC_IS_ASCII_CODE(code)) return n;
+ }
len = enclen(enc, p);
#ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
@@ -449,19 +464,26 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc,
if (buk1 != 0) {
if (buk1->fold_len == 1) {
int un;
- items[0].byte_len = lens[0];
- items[0].code_len = 1;
- items[0].code[0] = *FOLDS1_FOLD(buk1->index);
- n++;
+
+ if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) ||
+ ONIGENC_IS_ASCII_CODE(*FOLDS1_FOLD(buk1->index))) {
+ items[0].byte_len = lens[0];
+ items[0].code_len = 1;
+ items[0].code[0] = *FOLDS1_FOLD(buk1->index);
+ n++;
+ }
un = FOLDS1_UNFOLDS_NUM(buk1->index);
for (i = 0; i < un; i++) {
OnigCodePoint unfold = FOLDS1_UNFOLDS(buk1->index)[i];
if (unfold != orig_codes[0]) {
- items[n].byte_len = lens[0];
- items[n].code_len = 1;
- items[n].code[0] = unfold;
- n++;
+ if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) ||
+ ONIGENC_IS_ASCII_CODE(unfold)) {
+ items[n].byte_len = lens[0];
+ items[n].code_len = 1;
+ items[n].code[0] = unfold;
+ n++;
+ }
}
}
}
@@ -548,10 +570,13 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc,
if (index >= 0) {
int m = FOLDS1_UNFOLDS_NUM(index);
for (i = 0; i < m; i++) {
- items[n].byte_len = lens[0];
- items[n].code_len = 1;
- items[n].code[0] = FOLDS1_UNFOLDS(index)[i];
- n++;
+ code = FOLDS1_UNFOLDS(index)[i];
+ if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag)||ONIGENC_IS_ASCII_CODE(code)) {
+ items[n].byte_len = lens[0];
+ items[n].code_len = 1;
+ items[n].code[0] = code;
+ n++;
+ }
}
}
}