From 98f7065a3f7b386564840bb5b24b94f9335b2e97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Frings-F=C3=BCrst?= Date: Mon, 26 Apr 2021 17:40:17 +0200 Subject: New upstream version 6.9.7.1 --- src/Makefile.windows | 39 +- src/cp1251.c | 10 +- src/gb18030.c | 46 +- src/iso8859_1.c | 96 ++-- src/iso8859_10.c | 8 +- src/iso8859_13.c | 8 +- src/iso8859_14.c | 8 +- src/iso8859_15.c | 8 +- src/iso8859_16.c | 8 +- src/iso8859_2.c | 8 +- src/iso8859_3.c | 8 +- src/iso8859_4.c | 8 +- src/iso8859_5.c | 10 +- src/iso8859_7.c | 10 +- src/iso8859_9.c | 8 +- src/koi8.c | 10 +- src/koi8_r.c | 8 +- src/oniguruma.h | 32 +- src/regcomp.c | 1229 ++++++++++++++++++++++++++++++++------------------ src/regenc.c | 19 +- src/regenc.h | 16 +- src/regerror.c | 6 +- src/regexec.c | 417 ++++++++++------- src/regint.h | 93 +--- src/regparse.c | 408 ++++++++++++----- src/regparse.h | 25 +- src/regposix.c | 4 +- src/regsyntax.c | 31 +- src/unicode.c | 95 ++-- 29 files changed, 1720 insertions(+), 956 deletions(-) (limited to 'src') diff --git a/src/Makefile.windows b/src/Makefile.windows index 11d6fd8..b637772 100644 --- a/src/Makefile.windows +++ b/src/Makefile.windows @@ -2,8 +2,9 @@ product_name = oniguruma -TEST_DIR = $(ONIG_DIR)/../test -WIN_DIR = $(ONIG_DIR)/../windows +TEST_DIR = $(ONIG_DIR)/../test +SAMPLE_DIR = $(ONIG_DIR)/../sample +WIN_DIR = $(ONIG_DIR)/../windows CPPFLAGS = CFLAGS = -O2 -nologo /W3 @@ -15,6 +16,8 @@ ARDLL = cl ARDLL_FLAGS = -nologo -LD $(LINKFLAGS) -dll LINKFLAGS = -link -incremental:no -pdb:none +SAMPLE_CFLAGS = $(CFLAGS) /I$(ONIG_DIR) + INSTALL = install -c CP = copy CC = cl @@ -89,11 +92,6 @@ makeargs = $(MFLAGS) CPPFLAGS='$(CPPFLAGS)' CFLAGS='$(CFLAGS)' CC='$(CC)' # targets default: all -setup: - $(CP) ..\win32\config.h config.h - $(CP) ..\win32\testc.c testc.c - - all: $(libname) $(dllname) $(libname): $(libobjs) $(encobjs) @@ -155,7 +153,7 @@ $(BUILD_DIR)/unicode_fold1_key.obj: $(ONIG_DIR)/unicode_fold1_key.c $(ONIG_DIR)/ $(BUILD_DIR)/unicode_fold2_key.obj: $(ONIG_DIR)/unicode_fold2_key.c $(ONIG_DIR)/regenc.h $(BUILD_DIR)/config.h $(BUILD_DIR)/unicode_fold3_key.obj: $(ONIG_DIR)/unicode_fold3_key.c $(ONIG_DIR)/regenc.h $(BUILD_DIR)/config.h -all-test: test_syntax test_regset test_utf8 testc testp testu +all-test: test_syntax test_regset test_utf8 test_options test_back testc testp testu test_syntax: $(TEST_DIR)/test_syntax.c $(libname) $(CC) -nologo /Fe:$@ /I. /I$(ONIG_DIR) /DONIG_EXTERN=extern /utf-8 $(TEST_DIR)/test_syntax.c $(libname) @@ -166,6 +164,12 @@ test_regset: $(TEST_DIR)/test_regset.c $(libname) test_utf8: $(TEST_DIR)/test_utf8.c $(libname) $(CC) -nologo /Fe:$@ /I. /I$(ONIG_DIR) /DONIG_EXTERN=extern /utf-8 $(TEST_DIR)/test_utf8.c $(libname) +test_options: $(TEST_DIR)/test_options.c $(libname) + $(CC) -nologo /Fe:$@ /I. /I$(ONIG_DIR) /DONIG_EXTERN=extern /utf-8 $(TEST_DIR)/test_options.c $(libname) + +test_back: $(TEST_DIR)/test_back.c $(libname) + $(CC) -nologo /Fe:$@ /I. /I$(ONIG_DIR) /DONIG_EXTERN=extern /utf-8 $(TEST_DIR)/test_back.c $(libname) + testc: $(WIN_DIR)/testc.c $(libname) $(CC) -nologo /Fe:$@ /I. /I$(ONIG_DIR) /DONIG_EXTERN=extern $(WIN_DIR)/testc.c $(libname) @@ -176,14 +180,17 @@ testu: $(TEST_DIR)/testu.c $(libname) $(CC) -nologo /Fe:$@ /I. /I$(ONIG_DIR) /DONIG_EXTERN=extern $(TEST_DIR)/testu.c $(libname) clean: - del $(BUILD_DIR)\*.obj $(BUILD_DIR)\*.lib $(BUILD_DIR)\*.exp $(BUILD_DIR)\*.dll $(BUILD_DIR)\test_regset.exe $(BUILD_DIR)\test_syntax.exe $(BUILD_DIR)\test_utf8.exe $(BUILD_DIR)\testp.exe $(BUILD_DIR)\testc.exe $(BUILD_DIR)\testu.exe + del $(BUILD_DIR)\*.obj $(BUILD_DIR)\*.lib $(BUILD_DIR)\*.exp $(BUILD_DIR)\*.dll $(BUILD_DIR)\test_regset.exe $(BUILD_DIR)\test_syntax.exe $(BUILD_DIR)\test_utf8.exe $(BUILD_DIR)\test_options.exe $(BUILD_DIR)\test_back.exe $(BUILD_DIR)\testp.exe $(BUILD_DIR)\testc.exe $(BUILD_DIR)\testu.exe samples: all - $(CC) $(CFLAGS) -I. /Fe:simple $(ONIG_DIR)\sample\simple.c $(dlllib) - $(CC) $(CFLAGS) -I. /Fe:posix $(ONIG_DIR)\sample\posix.c $(dlllib) - $(CC) $(CFLAGS) -I. /Fe:names $(ONIG_DIR)\sample\names.c $(dlllib) - $(CC) $(CFLAGS) -I. /Fe:listcap $(ONIG_DIR)\sample\listcap.c $(dlllib) - $(CC) $(CFLAGS) -I. /Fe:sql $(ONIG_DIR)\sample\sql.c $(dlllib) - $(CC) $(CFLAGS) -I. /Fe:encode $(ONIG_DIR)\sample\encode.c $(dlllib) - $(CC) $(CFLAGS) -I. /Fe:syntax $(ONIG_DIR)\sample\syntax.c $(dlllib) + $(CC) $(SAMPLE_CFLAGS) /Fe:simple $(SAMPLE_DIR)\simple.c $(dlllib) + $(CC) $(SAMPLE_CFLAGS) /Fe:posix $(SAMPLE_DIR)\posix.c $(dlllib) + $(CC) $(SAMPLE_CFLAGS) /Fe:names $(SAMPLE_DIR)\names.c $(dlllib) + $(CC) $(SAMPLE_CFLAGS) /Fe:listcap $(SAMPLE_DIR)\listcap.c $(dlllib) + $(CC) $(SAMPLE_CFLAGS) /Fe:sql $(SAMPLE_DIR)\sql.c $(dlllib) + $(CC) $(SAMPLE_CFLAGS) /Fe:encode $(SAMPLE_DIR)\encode.c $(dlllib) + $(CC) $(SAMPLE_CFLAGS) /Fe:syntax $(SAMPLE_DIR)\syntax.c $(dlllib) + $(CC) $(SAMPLE_CFLAGS) /Fe:count $(SAMPLE_DIR)\count.c $(dlllib) + $(CC) $(SAMPLE_CFLAGS) /Fe:regset $(SAMPLE_DIR)\regset.c $(dlllib) + $(CC) $(SAMPLE_CFLAGS) /Fe:callback_each_match $(SAMPLE_DIR)\callback_each_match.c $(dlllib) diff --git a/src/cp1251.c b/src/cp1251.c index fa20780..36b36f6 100644 --- a/src/cp1251.c +++ b/src/cp1251.c @@ -2,7 +2,7 @@ cp1251.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2006-2019 Byte + * Copyright (c) 2006-2020 Byte * K.Kosako * All rights reserved. * @@ -105,12 +105,16 @@ static const unsigned short EncCP1251_CtypeTable[256] = { }; static int -cp1251_mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED, +cp1251_mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, const UChar* end ARG_UNUSED, UChar* lower) { const UChar* p = *pp; - *lower = ENC_CP1251_TO_LOWER_CASE(*p); + if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p)) + *lower = ENC_CP1251_TO_LOWER_CASE(*p); + else + *lower = *p; + (*pp)++; return 1; } diff --git a/src/gb18030.c b/src/gb18030.c index 7409d3e..1da19b4 100644 --- a/src/gb18030.c +++ b/src/gb18030.c @@ -30,9 +30,11 @@ #include "regenc.h" -#if 1 +/* #define DEBUG_GB18030 */ -#define DEBUG_GB18030(arg) +#ifndef DEBUG_GB18030 + +#define DEBUG_OUT(arg) #else @@ -43,7 +45,7 @@ /* for printf() */ #include "regint.h" -#define DEBUG_GB18030(arg) printf arg +#define DEBUG_OUT(arg) printf arg #endif @@ -177,8 +179,8 @@ gb18030_is_code_ctype(OnigCodePoint code, unsigned int ctype) } enum state { - S_START, - S_one_C2, + S_START = 0, + S_one_C2 = 1, S_one_C4, S_one_CM, @@ -210,15 +212,43 @@ enum state { S_odd_CM_even_C4CM, }; +#ifdef DEBUG_GB18030 +static char* StateNames[] = { + "S_START", + "S_one_C2", + "S_one_C4", + "S_one_CM", + "S_odd_CM_one_CX", + "S_even_CM_one_CX", + "S_one_CMC4", + "S_odd_CMC4", + "S_one_C4_odd_CMC4", + "S_even_CMC4", + "S_one_C4_even_CMC4", + "S_odd_CM_odd_CMC4", + "S_even_CM_odd_CMC4", + "S_odd_CM_even_CMC4", + "S_even_CM_even_CMC4", + "S_odd_C4CM", + "S_one_CM_odd_C4CM", + "S_even_C4CM", + "S_one_CM_even_C4CM", + "S_even_CM_odd_C4CM", + "S_odd_CM_odd_C4CM", + "S_even_CM_even_C4CM", + "S_odd_CM_even_C4CM" +}; +#endif + static UChar* gb18030_left_adjust_char_head(const UChar* start, const UChar* s) { const UChar *p; enum state state = S_START; - DEBUG_GB18030(("----------------\n")); + DEBUG_OUT(("----------------\n")); for (p = s; p >= start; p--) { - DEBUG_GB18030(("state %d --(%02x)-->\n", state, *p)); + DEBUG_OUT(("%5d: state %-19s (0x%02x)->\n", (int )(p - start), StateNames[state], *p)); switch (state) { case S_START: switch (GB18030_MAP[*p]) { @@ -499,7 +529,7 @@ gb18030_left_adjust_char_head(const UChar* start, const UChar* s) } } - DEBUG_GB18030(("state %d\n", state)); + DEBUG_OUT(("state %-19s\n", StateNames[state])); switch (state) { case S_START: return (UChar *)(s - 0); case S_one_C2: return (UChar *)(s - 0); diff --git a/src/iso8859_1.c b/src/iso8859_1.c index d75509e..2013e75 100644 --- a/src/iso8859_1.c +++ b/src/iso8859_1.c @@ -2,7 +2,7 @@ iso8859_1.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako + * Copyright (c) 2002-2020 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -114,7 +114,7 @@ apply_all_case_fold(OnigCaseFoldType flag, } static int -get_case_fold_codes_by_str(OnigCaseFoldType flag ARG_UNUSED, +get_case_fold_codes_by_str(OnigCaseFoldType flag, const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem items[]) { @@ -123,7 +123,8 @@ get_case_fold_codes_by_str(OnigCaseFoldType flag ARG_UNUSED, if (0x41 <= *p && *p <= 0x5a) { if (*p == LARGE_S && end > p + 1 - && (*(p+1) == LARGE_S || *(p+1) == SMALL_S)) { /* SS */ + && (*(p+1) == LARGE_S || *(p+1) == SMALL_S) + && CASE_FOLD_IS_NOT_ASCII_ONLY(flag)) { /* SS */ ss_combination: items[0].byte_len = 2; items[0].code_len = 1; @@ -152,7 +153,8 @@ get_case_fold_codes_by_str(OnigCaseFoldType flag ARG_UNUSED, } else if (0x61 <= *p && *p <= 0x7a) { if (*p == SMALL_S && end > p + 1 - && (*(p+1) == SMALL_S || *(p+1) == LARGE_S)) { /* ss */ + && (*(p+1) == SMALL_S || *(p+1) == LARGE_S) + && CASE_FOLD_IS_NOT_ASCII_ONLY(flag)) { /* ss */ goto ss_combination; } @@ -161,56 +163,58 @@ get_case_fold_codes_by_str(OnigCaseFoldType flag ARG_UNUSED, items[0].code[0] = (OnigCodePoint )(*p - 0x20); return 1; } - else if (0xc0 <= *p && *p <= 0xcf) { - items[0].byte_len = 1; - items[0].code_len = 1; - items[0].code[0] = (OnigCodePoint )(*p + 0x20); - return 1; - } - else if (0xd0 <= *p && *p <= 0xdf) { - if (*p == 0xdf) { + else if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag)) { + if (0xc0 <= *p && *p <= 0xcf) { items[0].byte_len = 1; - items[0].code_len = 2; - items[0].code[0] = (OnigCodePoint )'s'; - items[0].code[1] = (OnigCodePoint )'s'; + items[0].code_len = 1; + items[0].code[0] = (OnigCodePoint )(*p + 0x20); + return 1; + } + else if (0xd0 <= *p && *p <= 0xdf) { + if (*p == 0xdf) { + items[0].byte_len = 1; + items[0].code_len = 2; + items[0].code[0] = (OnigCodePoint )'s'; + items[0].code[1] = (OnigCodePoint )'s'; - items[1].byte_len = 1; - items[1].code_len = 2; - items[1].code[0] = (OnigCodePoint )'S'; - items[1].code[1] = (OnigCodePoint )'S'; + items[1].byte_len = 1; + items[1].code_len = 2; + items[1].code[0] = (OnigCodePoint )'S'; + items[1].code[1] = (OnigCodePoint )'S'; - items[2].byte_len = 1; - items[2].code_len = 2; - items[2].code[0] = (OnigCodePoint )'s'; - items[2].code[1] = (OnigCodePoint )'S'; + items[2].byte_len = 1; + items[2].code_len = 2; + items[2].code[0] = (OnigCodePoint )'s'; + items[2].code[1] = (OnigCodePoint )'S'; - items[3].byte_len = 1; - items[3].code_len = 2; - items[3].code[0] = (OnigCodePoint )'S'; - items[3].code[1] = (OnigCodePoint )'s'; + items[3].byte_len = 1; + items[3].code_len = 2; + items[3].code[0] = (OnigCodePoint )'S'; + items[3].code[1] = (OnigCodePoint )'s'; - return 4; - } - else if (*p != 0xd7) { - items[0].byte_len = 1; - items[0].code_len = 1; - items[0].code[0] = (OnigCodePoint )(*p + 0x20); - return 1; + return 4; + } + else if (*p != 0xd7) { + items[0].byte_len = 1; + items[0].code_len = 1; + items[0].code[0] = (OnigCodePoint )(*p + 0x20); + return 1; + } } - } - else if (0xe0 <= *p && *p <= 0xef) { - items[0].byte_len = 1; - items[0].code_len = 1; - items[0].code[0] = (OnigCodePoint )(*p - 0x20); - return 1; - } - else if (0xf0 <= *p && *p <= 0xfe) { - if (*p != 0xf7) { + else if (0xe0 <= *p && *p <= 0xef) { items[0].byte_len = 1; items[0].code_len = 1; items[0].code[0] = (OnigCodePoint )(*p - 0x20); return 1; } + else if (0xf0 <= *p && *p <= 0xfe) { + if (*p != 0xf7) { + items[0].byte_len = 1; + items[0].code_len = 1; + items[0].code[0] = (OnigCodePoint )(*p - 0x20); + return 1; + } + } } return 0; @@ -229,7 +233,11 @@ mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, return 2; } - *lower = ONIGENC_ISO_8859_1_TO_LOWER_CASE(*p); + if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p)) + *lower = ONIGENC_ISO_8859_1_TO_LOWER_CASE(*p); + else + *lower = *p; + (*pp)++; return 1; } diff --git a/src/iso8859_10.c b/src/iso8859_10.c index e98cffb..e4bf599 100644 --- a/src/iso8859_10.c +++ b/src/iso8859_10.c @@ -2,7 +2,7 @@ iso8859_10.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako + * Copyright (c) 2002-2020 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -116,7 +116,11 @@ mbc_case_fold(OnigCaseFoldType flag, return 2; } - *lower = ENC_ISO_8859_10_TO_LOWER_CASE(*p); + if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p)) + *lower = ENC_ISO_8859_10_TO_LOWER_CASE(*p); + else + *lower = *p; + (*pp)++; return 1; } diff --git a/src/iso8859_13.c b/src/iso8859_13.c index 2bd460f..dbf747f 100644 --- a/src/iso8859_13.c +++ b/src/iso8859_13.c @@ -2,7 +2,7 @@ iso8859_13.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako + * Copyright (c) 2002-2020 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -116,7 +116,11 @@ mbc_case_fold(OnigCaseFoldType flag, return 2; } - *lower = ENC_ISO_8859_13_TO_LOWER_CASE(*p); + if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p)) + *lower = ENC_ISO_8859_13_TO_LOWER_CASE(*p); + else + *lower = *p; + (*pp)++; return 1; } diff --git a/src/iso8859_14.c b/src/iso8859_14.c index 5030b55..a6d6b71 100644 --- a/src/iso8859_14.c +++ b/src/iso8859_14.c @@ -2,7 +2,7 @@ iso8859_14.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako + * Copyright (c) 2002-2020 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -116,7 +116,11 @@ mbc_case_fold(OnigCaseFoldType flag, return 2; } - *lower = ENC_ISO_8859_14_TO_LOWER_CASE(*p); + if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p)) + *lower = ENC_ISO_8859_14_TO_LOWER_CASE(*p); + else + *lower = *p; + (*pp)++; return 1; /* return byte length of converted char to lower */ } diff --git a/src/iso8859_15.c b/src/iso8859_15.c index f32c3de..0bb6b12 100644 --- a/src/iso8859_15.c +++ b/src/iso8859_15.c @@ -2,7 +2,7 @@ iso8859_15.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako + * Copyright (c) 2002-2020 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -116,7 +116,11 @@ mbc_case_fold(OnigCaseFoldType flag, return 2; } - *lower = ENC_ISO_8859_15_TO_LOWER_CASE(*p); + if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p)) + *lower = ENC_ISO_8859_15_TO_LOWER_CASE(*p); + else + *lower = *p; + (*pp)++; return 1; /* return byte length of converted char to lower */ } diff --git a/src/iso8859_16.c b/src/iso8859_16.c index 22a653a..bfd0a5b 100644 --- a/src/iso8859_16.c +++ b/src/iso8859_16.c @@ -2,7 +2,7 @@ iso8859_16.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako + * Copyright (c) 2002-2020 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -116,7 +116,11 @@ mbc_case_fold(OnigCaseFoldType flag, return 2; } - *lower = ENC_ISO_8859_16_TO_LOWER_CASE(*p); + if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p)) + *lower = ENC_ISO_8859_16_TO_LOWER_CASE(*p); + else + *lower = *p; + (*pp)++; return 1; /* return byte length of converted char to lower */ } diff --git a/src/iso8859_2.c b/src/iso8859_2.c index dc3d0a1..d08140e 100644 --- a/src/iso8859_2.c +++ b/src/iso8859_2.c @@ -2,7 +2,7 @@ iso8859_2.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako + * Copyright (c) 2002-2020 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -116,7 +116,11 @@ mbc_case_fold(OnigCaseFoldType flag, return 2; } - *lower = ENC_ISO_8859_2_TO_LOWER_CASE(*p); + if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p)) + *lower = ENC_ISO_8859_2_TO_LOWER_CASE(*p); + else + *lower = *p; + (*pp)++; return 1; /* return byte length of converted char to lower */ } diff --git a/src/iso8859_3.c b/src/iso8859_3.c index 49dc6b2..69b96fd 100644 --- a/src/iso8859_3.c +++ b/src/iso8859_3.c @@ -2,7 +2,7 @@ iso8859_3.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako + * Copyright (c) 2002-2020 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -116,7 +116,11 @@ mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, return 2; } - *lower = ENC_ISO_8859_3_TO_LOWER_CASE(*p); + if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p)) + *lower = ENC_ISO_8859_3_TO_LOWER_CASE(*p); + else + *lower = *p; + (*pp)++; return 1; } diff --git a/src/iso8859_4.c b/src/iso8859_4.c index f3f6ba9..949b7a1 100644 --- a/src/iso8859_4.c +++ b/src/iso8859_4.c @@ -2,7 +2,7 @@ iso8859_4.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako + * Copyright (c) 2002-2020 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -116,7 +116,11 @@ mbc_case_fold(OnigCaseFoldType flag, return 2; } - *lower = ENC_ISO_8859_4_TO_LOWER_CASE(*p); + if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p)) + *lower = ENC_ISO_8859_4_TO_LOWER_CASE(*p); + else + *lower = *p; + (*pp)++; return 1; /* return byte length of converted char to lower */ } diff --git a/src/iso8859_5.c b/src/iso8859_5.c index a5f587c..9e5d418 100644 --- a/src/iso8859_5.c +++ b/src/iso8859_5.c @@ -2,7 +2,7 @@ iso8859_5.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako + * Copyright (c) 2002-2020 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -104,12 +104,16 @@ static const unsigned short EncISO_8859_5_CtypeTable[256] = { }; static int -mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED, +mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, const UChar* end ARG_UNUSED, UChar* lower) { const UChar* p = *pp; - *lower = ENC_ISO_8859_5_TO_LOWER_CASE(*p); + if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p)) + *lower = ENC_ISO_8859_5_TO_LOWER_CASE(*p); + else + *lower = *p; + (*pp)++; return 1; } diff --git a/src/iso8859_7.c b/src/iso8859_7.c index 018efac..07b1360 100644 --- a/src/iso8859_7.c +++ b/src/iso8859_7.c @@ -2,7 +2,7 @@ iso8859_7.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako + * Copyright (c) 2002-2020 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -104,12 +104,16 @@ static const unsigned short EncISO_8859_7_CtypeTable[256] = { }; static int -mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED, +mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, const UChar* end ARG_UNUSED, UChar* lower) { const UChar* p = *pp; - *lower = ENC_ISO_8859_7_TO_LOWER_CASE(*p); + if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p)) + *lower = ENC_ISO_8859_7_TO_LOWER_CASE(*p); + else + *lower = *p; + (*pp)++; return 1; } diff --git a/src/iso8859_9.c b/src/iso8859_9.c index 1f9bdea..6f205e5 100644 --- a/src/iso8859_9.c +++ b/src/iso8859_9.c @@ -2,7 +2,7 @@ iso8859_9.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako + * Copyright (c) 2002-2020 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -116,7 +116,11 @@ mbc_case_fold(OnigCaseFoldType flag, return 2; } - *lower = ENC_ISO_8859_9_TO_LOWER_CASE(*p); + if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p)) + *lower = ENC_ISO_8859_9_TO_LOWER_CASE(*p); + else + *lower = *p; + (*pp)++; return 1; } diff --git a/src/koi8.c b/src/koi8.c index 37023c6..90a04f9 100644 --- a/src/koi8.c +++ b/src/koi8.c @@ -2,7 +2,7 @@ koi8.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako + * Copyright (c) 2002-2020 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -105,12 +105,16 @@ static const unsigned short EncKOI8_CtypeTable[256] = { static int -koi8_mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED, +koi8_mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, const UChar* end ARG_UNUSED, UChar* lower) { const UChar* p = *pp; - *lower = ENC_KOI8_TO_LOWER_CASE(*p); + if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p)) + *lower = ENC_KOI8_TO_LOWER_CASE(*p); + else + *lower = *p; + (*pp)++; return 1; } diff --git a/src/koi8_r.c b/src/koi8_r.c index c77302f..31cc870 100644 --- a/src/koi8_r.c +++ b/src/koi8_r.c @@ -2,7 +2,7 @@ koi8_r.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako + * Copyright (c) 2002-2020 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -109,7 +109,11 @@ koi8_r_mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED, { const UChar* p = *pp; - *lower = ENC_KOI8_R_TO_LOWER_CASE(*p); + if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p)) + *lower = ENC_KOI8_R_TO_LOWER_CASE(*p); + else + *lower = *p; + (*pp)++; return 1; } diff --git a/src/oniguruma.h b/src/oniguruma.h index d983fc9..a7b9d8f 100644 --- a/src/oniguruma.h +++ b/src/oniguruma.h @@ -4,7 +4,7 @@ oniguruma.h - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2020 K.Kosako + * Copyright (c) 2002-2021 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -36,9 +36,9 @@ extern "C" { #define ONIGURUMA #define ONIGURUMA_VERSION_MAJOR 6 #define ONIGURUMA_VERSION_MINOR 9 -#define ONIGURUMA_VERSION_TEENY 6 +#define ONIGURUMA_VERSION_TEENY 7 -#define ONIGURUMA_VERSION_INT 60906 +#define ONIGURUMA_VERSION_INT 60907 #ifndef P_ #if defined(__STDC__) || defined(_WIN32) @@ -91,6 +91,7 @@ typedef unsigned int OnigCaseFoldType; /* case fold flag */ ONIG_EXTERN OnigCaseFoldType OnigDefaultCaseFoldFlag; +#define ONIGENC_CASE_FOLD_ASCII_ONLY (1) /* #define ONIGENC_CASE_FOLD_HIRAGANA_KATAKANA (1<<1) */ /* #define ONIGENC_CASE_FOLD_KATAKANA_WIDTH (1<<2) */ #define ONIGENC_CASE_FOLD_TURKISH_AZERI (1<<20) @@ -387,9 +388,9 @@ typedef unsigned int OnigOptionType; #define ONIG_OPTION_NOTEOL (ONIG_OPTION_NOTBOL << 1) #define ONIG_OPTION_POSIX_REGION (ONIG_OPTION_NOTEOL << 1) #define ONIG_OPTION_CHECK_VALIDITY_OF_STRING (ONIG_OPTION_POSIX_REGION << 1) -/* #define ONIG_OPTION_CRLF_AS_LINE_SEPARATOR (ONIG_OPTION_CHECK_VALIDITY_OF_STRING << 1) */ /* options (compile time) */ -#define ONIG_OPTION_WORD_IS_ASCII (ONIG_OPTION_CHECK_VALIDITY_OF_STRING << 4) +#define ONIG_OPTION_IGNORECASE_IS_ASCII (ONIG_OPTION_CHECK_VALIDITY_OF_STRING << 3) +#define ONIG_OPTION_WORD_IS_ASCII (ONIG_OPTION_IGNORECASE_IS_ASCII << 1) #define ONIG_OPTION_DIGIT_IS_ASCII (ONIG_OPTION_WORD_IS_ASCII << 1) #define ONIG_OPTION_SPACE_IS_ASCII (ONIG_OPTION_DIGIT_IS_ASCII << 1) #define ONIG_OPTION_POSIX_IS_ASCII (ONIG_OPTION_SPACE_IS_ASCII << 1) @@ -399,8 +400,9 @@ typedef unsigned int OnigOptionType; #define ONIG_OPTION_NOT_BEGIN_STRING (ONIG_OPTION_TEXT_SEGMENT_WORD << 1) #define ONIG_OPTION_NOT_END_STRING (ONIG_OPTION_NOT_BEGIN_STRING << 1) #define ONIG_OPTION_NOT_BEGIN_POSITION (ONIG_OPTION_NOT_END_STRING << 1) +#define ONIG_OPTION_CALLBACK_EACH_MATCH (ONIG_OPTION_NOT_BEGIN_POSITION << 1) -#define ONIG_OPTION_MAXBIT ONIG_OPTION_NOT_BEGIN_POSITION +#define ONIG_OPTION_MAXBIT ONIG_OPTION_CALLBACK_EACH_MATCH #define ONIG_OPTION_ON(options,regopt) ((options) |= (regopt)) #define ONIG_OPTION_OFF(options,regopt) ((options) &= ~(regopt)) @@ -425,6 +427,7 @@ ONIG_EXTERN OnigSyntaxType OnigSyntaxJava; ONIG_EXTERN OnigSyntaxType OnigSyntaxPerl; ONIG_EXTERN OnigSyntaxType OnigSyntaxPerl_NG; ONIG_EXTERN OnigSyntaxType OnigSyntaxRuby; +ONIG_EXTERN OnigSyntaxType OnigSyntaxPython; ONIG_EXTERN OnigSyntaxType OnigSyntaxOniguruma; /* predefined syntaxes (see regsyntax.c) */ @@ -438,6 +441,7 @@ ONIG_EXTERN OnigSyntaxType OnigSyntaxOniguruma; #define ONIG_SYNTAX_PERL (&OnigSyntaxPerl) #define ONIG_SYNTAX_PERL_NG (&OnigSyntaxPerl_NG) #define ONIG_SYNTAX_RUBY (&OnigSyntaxRuby) +#define ONIG_SYNTAX_PYTHON (&OnigSyntaxPython) #define ONIG_SYNTAX_ONIGURUMA (&OnigSyntaxOniguruma) /* default syntax */ @@ -510,6 +514,7 @@ ONIG_EXTERN OnigSyntaxType* OnigDefaultSyntax; #define ONIG_SYN_OP2_QMARK_BRACE_CALLOUT_CONTENTS (1U<<28) /* (?{...}) (?{{...}}) */ #define ONIG_SYN_OP2_ASTERISK_CALLOUT_NAME (1U<<29) /* (*name) (*name{a,..}) */ #define ONIG_SYN_OP2_OPTION_ONIGURUMA (1U<<30) /* (?imxWDSPy) */ +#define ONIG_SYN_OP2_QMARK_CAPITAL_P_NAME (1U<<31) /* (?P...) (?P=name) */ /* syntax (behavior) */ #define ONIG_SYN_CONTEXT_INDEP_ANCHORS (1U<<31) /* not implemented */ @@ -525,6 +530,7 @@ ONIG_EXTERN OnigSyntaxType* OnigDefaultSyntax; #define ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY (1U<<9) /* a{n}?=(?:a{n})? */ #define ONIG_SYN_ISOLATED_OPTION_CONTINUE_BRANCH (1U<<10) /* ..(?i)...|... */ #define ONIG_SYN_VARIABLE_LEN_LOOK_BEHIND (1U<<11) /* (?<=a+|..) */ +#define ONIG_SYN_PYTHON (1U<<12) /* \UHHHHHHHH */ /* syntax (behavior) in char class [...] */ #define ONIG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC (1U<<20) /* [^...] */ @@ -548,8 +554,10 @@ ONIG_EXTERN OnigSyntaxType* OnigDefaultSyntax; /* error codes */ #define ONIG_IS_PATTERN_ERROR(ecode) ((ecode) <= -100 && (ecode) > -1000) + /* normal return */ #define ONIG_NORMAL 0 +#define ONIG_VALUE_IS_NOT_SET 1 #define ONIG_MISMATCH -1 #define ONIG_NO_SUPPORT_CONFIG -2 #define ONIG_ABORT -3 @@ -607,6 +615,7 @@ ONIG_EXTERN OnigSyntaxType* OnigDefaultSyntax; #define ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED -209 #define ONIGERR_TOO_MANY_CAPTURES -210 #define ONIGERR_TOO_LONG_WIDE_CHAR_VALUE -212 +#define ONIGERR_UNDEFINED_OPERATOR -213 #define ONIGERR_EMPTY_GROUP_NAME -214 #define ONIGERR_INVALID_GROUP_NAME -215 #define ONIGERR_INVALID_CHAR_IN_GROUP_NAME -216 @@ -633,6 +642,7 @@ ONIG_EXTERN OnigSyntaxType* OnigDefaultSyntax; #define ONIGERR_INVALID_COMBINATION_OF_OPTIONS -403 #define ONIGERR_TOO_MANY_USER_DEFINED_OBJECTS -404 #define ONIGERR_TOO_LONG_PROPERTY_NAME -405 +#define ONIGERR_VERY_INEFFICIENT_PATTERN -406 #define ONIGERR_LIBRARY_IS_NOT_INITIALIZED -500 /* errors related to thread */ @@ -717,6 +727,8 @@ typedef struct { OnigCaseFoldType case_fold_flag; } OnigCompileInfo; +typedef int (*OnigCallbackEachMatchFunc)(const OnigUChar* str, const OnigUChar* end, const OnigUChar* match_start, OnigRegion* region, void* user_data); + /* types for callout */ typedef enum { @@ -940,6 +952,12 @@ const char* onig_version P_((void)); ONIG_EXTERN const char* onig_copyright P_((void)); +/* for callback each match */ +ONIG_EXTERN +OnigCallbackEachMatchFunc onig_get_callback_each_match P_((void)); +ONIG_EXTERN +int onig_set_callback_each_match P_((OnigCallbackEachMatchFunc f)); + /* for OnigMatchParam */ ONIG_EXTERN OnigMatchParam* onig_new_match_param P_((void)); @@ -981,6 +999,8 @@ ONIG_EXTERN int onig_get_callout_data_by_tag P_((OnigRegex reg, OnigMatchParam* mp, const OnigUChar* tag, const OnigUChar* tag_end, int slot, OnigType* type, OnigValue* val)); ONIG_EXTERN int onig_set_callout_data_by_tag P_((OnigRegex reg, OnigMatchParam* mp, const OnigUChar* tag, const OnigUChar* tag_end, int slot, OnigType type, OnigValue* val)); +ONIG_EXTERN +int onig_get_callout_data_by_tag_dont_clear_old P_((regex_t* reg, OnigMatchParam* mp, const OnigUChar* tag, const OnigUChar* tag_end, int slot, OnigType* type, OnigValue* val)); /* used in callout functions */ ONIG_EXTERN diff --git a/src/regcomp.c b/src/regcomp.c index dd2b328..d80551d 100644 --- a/src/regcomp.c +++ b/src/regcomp.c @@ -2,7 +2,7 @@ regcomp.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2020 K.Kosako + * Copyright (c) 2002-2021 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -31,6 +31,9 @@ #define OPS_INIT_SIZE 8 +#define NODE_IS_REAL_IGNORECASE(node) \ + (NODE_IS_IGNORECASE(node) && !NODE_STRING_IS_CRUDE(node)) + typedef struct { OnigLen min; OnigLen max; @@ -44,7 +47,7 @@ typedef struct { OnigCaseFoldType OnigDefaultCaseFoldFlag = ONIGENC_CASE_FOLD_MIN; -static OnigLen node_min_byte_len(Node* node, ScanEnv* env); +static OnigLen node_min_byte_len(Node* node, ParseEnv* env); #if 0 typedef struct { @@ -129,27 +132,22 @@ ops_init(regex_t* reg, int init_alloc_size) Operation* p; size_t size; - if (init_alloc_size > 0) { - size = sizeof(Operation) * init_alloc_size; - p = (Operation* )xrealloc(reg->ops, size); - CHECK_NULL_RETURN_MEMERR(p); - reg->ops = p; + if (init_alloc_size <= 0) + return ONIGERR_PARSER_BUG; + + size = sizeof(Operation) * init_alloc_size; + p = (Operation* )xrealloc(reg->ops, size); + CHECK_NULL_RETURN_MEMERR(p); + reg->ops = p; #ifdef USE_DIRECT_THREADED_CODE - { - enum OpCode* cp; - size = sizeof(enum OpCode) * init_alloc_size; - cp = (enum OpCode* )xrealloc(reg->ocs, size); - CHECK_NULL_RETURN_MEMERR(cp); - reg->ocs = cp; - } -#endif + { + enum OpCode* cp; + size = sizeof(enum OpCode) * init_alloc_size; + cp = (enum OpCode* )xrealloc(reg->ocs, size); + CHECK_NULL_RETURN_MEMERR(cp); + reg->ocs = cp; } - else { - reg->ops = (Operation* )0; -#ifdef USE_DIRECT_THREADED_CODE - reg->ocs = (enum OpCode* )0; #endif - } reg->ops_curr = 0; /* !!! not yet done ops_new() */ reg->ops_alloc = init_alloc_size; @@ -159,19 +157,16 @@ ops_init(regex_t* reg, int init_alloc_size) } static int -ops_expand(regex_t* reg, int n) +ops_resize(regex_t* reg, int n) { -#define MIN_OPS_EXPAND_SIZE 4 - #ifdef USE_DIRECT_THREADED_CODE enum OpCode* cp; #endif Operation* p; size_t size; - if (n <= 0) n = MIN_OPS_EXPAND_SIZE; - - n += reg->ops_alloc; + if (n == reg->ops_alloc) return ONIG_NORMAL; + if (n <= 0) return ONIGERR_PARSER_BUG; size = sizeof(Operation) * n; p = (Operation* )xrealloc(reg->ops, size); @@ -197,10 +192,8 @@ ops_expand(regex_t* reg, int n) static int ops_new(regex_t* reg) { - int r; - if (reg->ops_used >= reg->ops_alloc) { - r = ops_expand(reg, reg->ops_alloc); + int r = ops_resize(reg, reg->ops_alloc << 1); if (r != ONIG_NORMAL) return r; } @@ -669,6 +662,8 @@ mmcl_alt_merge(MinMaxCharLen* to, MinMaxCharLen* alt) if (to->max < alt->max) to->max = alt->max; } +#ifndef ONIG_DONT_OPTIMIZE + static int mml_is_equal(MinMaxLen* a, MinMaxLen* b) { @@ -709,9 +704,11 @@ mml_alt_merge(MinMaxLen* to, MinMaxLen* alt) if (to->max < alt->max) to->max = alt->max; } +#endif + /* fixed size pattern node only */ static int -node_char_len1(Node* node, regex_t* reg, MinMaxCharLen* ci, ScanEnv* env, +node_char_len1(Node* node, regex_t* reg, MinMaxCharLen* ci, ParseEnv* env, int level) { MinMaxCharLen tci; @@ -768,7 +765,8 @@ node_char_len1(Node* node, regex_t* reg, MinMaxCharLen* ci, ScanEnv* env, StrNode* sn = STR_(node); UChar *s = sn->s; - if (NODE_IS_IGNORECASE(node) && ! NODE_STRING_IS_CRUDE(node)) { + if (NODE_IS_REAL_IGNORECASE(node) && + CASE_FOLD_IS_NOT_ASCII_ONLY(env->case_fold_flag)) { /* Such a case is possible. ex. /(?i)(?<=\1)(a)/ Backref node refer to capture group, but it doesn't tune yet. @@ -917,7 +915,7 @@ node_char_len1(Node* node, regex_t* reg, MinMaxCharLen* ci, ScanEnv* env, { int i; int* backs; - MemEnv* mem_env = SCANENV_MEMENV(env); + MemEnv* mem_env = PARSEENV_MEMENV(env); BackRefNode* br = BACKREF_(node); backs = BACKREFS_P(br); @@ -943,7 +941,7 @@ node_char_len1(Node* node, regex_t* reg, MinMaxCharLen* ci, ScanEnv* env, } static int -node_char_len(Node* node, regex_t* reg, MinMaxCharLen* ci, ScanEnv* env) +node_char_len(Node* node, regex_t* reg, MinMaxCharLen* ci, ParseEnv* env) { return node_char_len1(node, reg, ci, env, 0); } @@ -967,7 +965,7 @@ add_op(regex_t* reg, int opcode) } static int compile_length_tree(Node* node, regex_t* reg); -static int compile_tree(Node* node, regex_t* reg, ScanEnv* env); +static int compile_tree(Node* node, regex_t* reg, ParseEnv* env); #define IS_NEED_STR_LEN_OP(op) \ @@ -1035,7 +1033,7 @@ is_strict_real_node(Node* node) } static int -compile_quant_body_with_empty_check(QuantNode* qn, regex_t* reg, ScanEnv* env) +compile_quant_body_with_empty_check(QuantNode* qn, regex_t* reg, ParseEnv* env) { int r; int saved_num_empty_check; @@ -1060,14 +1058,20 @@ compile_quant_body_with_empty_check(QuantNode* qn, regex_t* reg, ScanEnv* env) if (emptiness == BODY_MAY_BE_EMPTY) r = add_op(reg, OP_EMPTY_CHECK_END); else if (emptiness == BODY_MAY_BE_EMPTY_MEM) { - if (NODE_IS_EMPTY_STATUS_CHECK(qn) != 0) + if (NODE_IS_EMPTY_STATUS_CHECK(qn) != 0 && qn->empty_status_mem != 0) { r = add_op(reg, OP_EMPTY_CHECK_END_MEMST); + if (r != 0) return r; + COP(reg)->empty_check_end.empty_status_mem = qn->empty_status_mem; + } else r = add_op(reg, OP_EMPTY_CHECK_END); } #ifdef USE_CALL - else if (emptiness == BODY_MAY_BE_EMPTY_REC) + else if (emptiness == BODY_MAY_BE_EMPTY_REC) { r = add_op(reg, OP_EMPTY_CHECK_END_MEMST_PUSH); + if (r != 0) return r; + COP(reg)->empty_check_end.empty_status_mem = qn->empty_status_mem; + } #endif if (r != 0) return r; @@ -1078,7 +1082,7 @@ compile_quant_body_with_empty_check(QuantNode* qn, regex_t* reg, ScanEnv* env) #ifdef USE_CALL static int -compile_call(CallNode* node, regex_t* reg, ScanEnv* env) +compile_call(CallNode* node, regex_t* reg, ParseEnv* env) { int r; int offset; @@ -1098,7 +1102,7 @@ compile_call(CallNode* node, regex_t* reg, ScanEnv* env) #endif static int -compile_tree_n_times(Node* node, int n, regex_t* reg, ScanEnv* env) +compile_tree_n_times(Node* node, int n, regex_t* reg, ParseEnv* env) { int i, r; @@ -1356,7 +1360,7 @@ entry_repeat_range(regex_t* reg, int id, int lower, int upper, int ops_index) static int compile_range_repeat_node(QuantNode* qn, int target_len, int emptiness, - regex_t* reg, ScanEnv* env) + regex_t* reg, ParseEnv* env) { int r; int num_repeat = reg->num_repeat++; @@ -1469,7 +1473,7 @@ compile_length_quantifier_node(QuantNode* qn, regex_t* reg) } static int -compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env) +compile_quantifier_node(QuantNode* qn, regex_t* reg, ParseEnv* env) { int i, r, mod_tlen; int infinite = IS_INFINITE_REPEAT(qn->upper); @@ -1649,7 +1653,7 @@ compile_length_option_node(BagNode* node, regex_t* reg) } static int -compile_option_node(BagNode* node, regex_t* reg, ScanEnv* env) +compile_option_node(BagNode* node, regex_t* reg, ParseEnv* env) { int r; @@ -1765,7 +1769,7 @@ compile_length_bag_node(BagNode* node, regex_t* reg) } static int -compile_bag_memory_node(BagNode* node, regex_t* reg, ScanEnv* env) +compile_bag_memory_node(BagNode* node, regex_t* reg, ParseEnv* env) { int r; @@ -1845,7 +1849,7 @@ compile_bag_memory_node(BagNode* node, regex_t* reg, ScanEnv* env) } static int -compile_bag_node(BagNode* node, regex_t* reg, ScanEnv* env) +compile_bag_node(BagNode* node, regex_t* reg, ParseEnv* env) { int r, len; @@ -2036,7 +2040,7 @@ compile_length_anchor_node(AnchorNode* node, regex_t* reg) } static int -compile_anchor_look_behind_node(AnchorNode* node, regex_t* reg, ScanEnv* env) +compile_anchor_look_behind_node(AnchorNode* node, regex_t* reg, ParseEnv* env) { int r; @@ -2150,7 +2154,7 @@ compile_anchor_look_behind_node(AnchorNode* node, regex_t* reg, ScanEnv* env) static int compile_anchor_look_behind_not_node(AnchorNode* node, regex_t* reg, - ScanEnv* env) + ParseEnv* env) { int r; int len; @@ -2279,7 +2283,7 @@ compile_anchor_look_behind_not_node(AnchorNode* node, regex_t* reg, } static int -compile_anchor_node(AnchorNode* node, regex_t* reg, ScanEnv* env) +compile_anchor_node(AnchorNode* node, regex_t* reg, ParseEnv* env) { int r, len; enum OpCode op; @@ -2573,7 +2577,7 @@ compile_length_tree(Node* node, regex_t* reg) } static int -compile_tree(Node* node, regex_t* reg, ScanEnv* env) +compile_tree(Node* node, regex_t* reg, ParseEnv* env) { int n, len, pos, r = 0; @@ -2983,7 +2987,7 @@ numbered_ref_check(Node* node) } static int -disable_noname_group_capture(Node** root, regex_t* reg, ScanEnv* env) +disable_noname_group_capture(Node** root, regex_t* reg, ParseEnv* env) { int r, i, pos, counter; MemStatusType loc; @@ -3003,7 +3007,7 @@ disable_noname_group_capture(Node** root, regex_t* reg, ScanEnv* env) for (i = 1, pos = 1; i <= env->num_mem; i++) { if (map[i].new_val > 0) { - SCANENV_MEMENV(env)[pos] = SCANENV_MEMENV(env)[i]; + PARSEENV_MEMENV(env)[pos] = PARSEENV_MEMENV(env)[i]; pos++; } } @@ -3285,8 +3289,7 @@ get_tree_head_literal(Node* node, int exact, regex_t* reg) if (sn->end <= sn->s) break; - if (exact == 0 || - ! NODE_IS_IGNORECASE(node) || NODE_STRING_IS_CRUDE(node)) { + if (exact == 0 || !NODE_IS_REAL_IGNORECASE(node)) { n = node; } } @@ -3381,7 +3384,7 @@ get_tree_tail_literal(Node* node, Node** rnode, regex_t* reg) break; } - if (NODE_IS_IGNORECASE(node) && ! NODE_STRING_IS_CRUDE(node)) { + if (NODE_IS_REAL_IGNORECASE(node)) { r = GET_VALUE_NONE; break; } @@ -3601,7 +3604,7 @@ check_node_in_look_behind(Node* node, int not, int* used) } static OnigLen -node_min_byte_len(Node* node, ScanEnv* env) +node_min_byte_len(Node* node, ParseEnv* env) { OnigLen len; OnigLen tmin; @@ -3612,7 +3615,7 @@ node_min_byte_len(Node* node, ScanEnv* env) if (! NODE_IS_CHECKER(node)) { int i; int* backs; - MemEnv* mem_env = SCANENV_MEMENV(env); + MemEnv* mem_env = PARSEENV_MEMENV(env); BackRefNode* br = BACKREF_(node); if (NODE_IS_RECURSION(node)) break; @@ -3629,10 +3632,8 @@ node_min_byte_len(Node* node, ScanEnv* env) case NODE_CALL: { Node* t = NODE_BODY(node); - if (NODE_IS_RECURSION(node)) { - if (NODE_IS_FIXED_MIN(t)) - len = BAG_(t)->min_len; - } + if (NODE_IS_FIXED_MIN(t)) + len = BAG_(t)->min_len; else len = node_min_byte_len(t, env); } @@ -3742,143 +3743,8 @@ node_min_byte_len(Node* node, ScanEnv* env) return len; } -static OnigLen -node_max_byte_len(Node* node, ScanEnv* env) -{ - OnigLen len; - OnigLen tmax; - - len = 0; - switch (NODE_TYPE(node)) { - case NODE_LIST: - do { - tmax = node_max_byte_len(NODE_CAR(node), env); - len = distance_add(len, tmax); - } while (IS_NOT_NULL(node = NODE_CDR(node))); - break; - - case NODE_ALT: - do { - tmax = node_max_byte_len(NODE_CAR(node), env); - if (len < tmax) len = tmax; - } while (IS_NOT_NULL(node = NODE_CDR(node))); - break; - - case NODE_STRING: - { - StrNode* sn = STR_(node); - len = (OnigLen )(sn->end - sn->s); - } - break; - - case NODE_CTYPE: - case NODE_CCLASS: - len = ONIGENC_MBC_MAXLEN_DIST(env->enc); - break; - - case NODE_BACKREF: - if (! NODE_IS_CHECKER(node)) { - int i; - int* backs; - MemEnv* mem_env = SCANENV_MEMENV(env); - BackRefNode* br = BACKREF_(node); - if (NODE_IS_RECURSION(node)) { -#ifdef USE_BACKREF_WITH_LEVEL - if (NODE_IS_NEST_LEVEL(node)) { - len = INFINITE_LEN; - } -#endif - break; - } - backs = BACKREFS_P(br); - for (i = 0; i < br->back_num; i++) { - tmax = node_max_byte_len(mem_env[backs[i]].mem_node, env); - if (len < tmax) len = tmax; - } - } - break; - -#ifdef USE_CALL - case NODE_CALL: - if (! NODE_IS_RECURSION(node)) - len = node_max_byte_len(NODE_BODY(node), env); - else - len = INFINITE_LEN; - break; -#endif - - case NODE_QUANT: - { - QuantNode* qn = QUANT_(node); - - if (qn->upper != 0) { - len = node_max_byte_len(NODE_BODY(node), env); - if (len != 0) { - if (! IS_INFINITE_REPEAT(qn->upper)) - len = distance_multiply(len, qn->upper); - else - len = INFINITE_LEN; - } - } - } - break; - - case NODE_BAG: - { - BagNode* en = BAG_(node); - switch (en->type) { - case BAG_MEMORY: - if (NODE_IS_FIXED_MAX(node)) - len = en->max_len; - else { - if (NODE_IS_MARK1(node)) - len = INFINITE_LEN; - else { - NODE_STATUS_ADD(node, MARK1); - len = node_max_byte_len(NODE_BODY(node), env); - NODE_STATUS_REMOVE(node, MARK1); - - en->max_len = len; - NODE_STATUS_ADD(node, FIXED_MAX); - } - } - break; - - case BAG_OPTION: - case BAG_STOP_BACKTRACK: - len = node_max_byte_len(NODE_BODY(node), env); - break; - case BAG_IF_ELSE: - { - OnigLen tlen, elen; - - len = node_max_byte_len(NODE_BODY(node), env); - if (IS_NOT_NULL(en->te.Then)) { - tlen = node_max_byte_len(en->te.Then, env); - len = distance_add(len, tlen); - } - if (IS_NOT_NULL(en->te.Else)) - elen = node_max_byte_len(en->te.Else, env); - else elen = 0; - - if (elen > len) len = elen; - } - break; - } - } - break; - - case NODE_ANCHOR: - case NODE_GIMMICK: - default: - break; - } - - return len; -} - static int -check_backrefs(Node* node, ScanEnv* env) +check_backrefs(Node* node, ParseEnv* env) { int r; @@ -3923,7 +3789,7 @@ check_backrefs(Node* node, ScanEnv* env) int i; BackRefNode* br = BACKREF_(node); int* backs = BACKREFS_P(br); - MemEnv* mem_env = SCANENV_MEMENV(env); + MemEnv* mem_env = PARSEENV_MEMENV(env); for (i = 0; i < br->back_num; i++) { if (backs[i] > env->num_mem) @@ -3944,7 +3810,7 @@ check_backrefs(Node* node, ScanEnv* env) } static int -set_empty_repeat_node_trav(Node* node, Node* empty, ScanEnv* env) +set_empty_repeat_node_trav(Node* node, Node* empty, ParseEnv* env) { int r; @@ -3998,7 +3864,7 @@ set_empty_repeat_node_trav(Node* node, Node* empty, ScanEnv* env) if (en->type == BAG_MEMORY) { if (NODE_IS_BACKREF(node)) { if (IS_NOT_NULL(empty)) - SCANENV_MEMENV(env)[en->m.regnum].empty_repeat_node = empty; + PARSEENV_MEMENV(env)[en->m.regnum].empty_repeat_node = empty; } } else if (en->type == BAG_IF_ELSE) { @@ -4034,7 +3900,7 @@ is_ancestor_node(Node* node, Node* me) } static void -set_empty_status_check_trav(Node* node, ScanEnv* env) +set_empty_status_check_trav(Node* node, ParseEnv* env) { switch (NODE_TYPE(node)) { case NODE_LIST: @@ -4078,14 +3944,14 @@ set_empty_status_check_trav(Node* node, ScanEnv* env) { int i; int* backs; - MemEnv* mem_env = SCANENV_MEMENV(env); + MemEnv* mem_env = PARSEENV_MEMENV(env); BackRefNode* br = BACKREF_(node); backs = BACKREFS_P(br); for (i = 0; i < br->back_num; i++) { Node* ernode = mem_env[backs[i]].empty_repeat_node; if (IS_NOT_NULL(ernode)) { if (! is_ancestor_node(ernode, node)) { - MEM_STATUS_LIMIT_ON(env->reg->empty_status_mem, backs[i]); + MEM_STATUS_LIMIT_ON(QUANT_(ernode)->empty_status_mem, backs[i]); NODE_STATUS_ADD(ernode, EMPTY_STATUS_CHECK); NODE_STATUS_ADD(mem_env[backs[i]].mem_node, EMPTY_STATUS_CHECK); } @@ -4150,7 +4016,7 @@ set_parent_node_trav(Node* node, Node* parent) #define RECURSION_INFINITE (1<<2) static int -infinite_recursive_call_check(Node* node, ScanEnv* env, int head) +infinite_recursive_call_check(Node* node, ParseEnv* env, int head) { int ret; int r = 0; @@ -4191,6 +4057,8 @@ infinite_recursive_call_check(Node* node, ScanEnv* env, int head) break; case NODE_QUANT: + if (QUANT_(node)->upper == 0) break; + r = infinite_recursive_call_check(NODE_BODY(node), env, head); if (r < 0) return r; if ((r & RECURSION_MUST) != 0) { @@ -4265,7 +4133,7 @@ infinite_recursive_call_check(Node* node, ScanEnv* env, int head) } static int -infinite_recursive_call_check_trav(Node* node, ScanEnv* env) +infinite_recursive_call_check_trav(Node* node, ParseEnv* env) { int r; @@ -4403,7 +4271,7 @@ recursive_call_check(Node* node) #define FOUND_CALLED_NODE 1 static int -recursive_call_check_trav(Node* node, ScanEnv* env, int state) +recursive_call_check_trav(Node* node, ParseEnv* env, int state) { int r = 0; @@ -4443,19 +4311,21 @@ recursive_call_check_trav(Node* node, ScanEnv* env, int state) BagNode* en = BAG_(node); if (en->type == BAG_MEMORY) { - if (NODE_IS_CALLED(node) || (state & IN_RECURSION) != 0) { + if (NODE_IS_CALLED(node)) { + r = FOUND_CALLED_NODE; + goto check_recursion; + } + else if ((state & IN_RECURSION) != 0) { + check_recursion: if (! NODE_IS_RECURSION(node)) { NODE_STATUS_ADD(node, MARK1); - r = recursive_call_check(NODE_BODY(node)); - if (r != 0) { + ret = recursive_call_check(NODE_BODY(node)); + if (ret != 0) { NODE_STATUS_ADD(node, RECURSION); MEM_STATUS_ON(env->backtrack_mem, en->m.regnum); } NODE_STATUS_REMOVE(node, MARK1); } - - if (NODE_IS_CALLED(node)) - r = FOUND_CALLED_NODE; } } @@ -4616,8 +4486,9 @@ reduce_string_list(Node* node, OnigEncoding enc) #define IN_VAR_REPEAT (1<<3) #define IN_ZERO_REPEAT (1<<4) #define IN_MULTI_ENTRY (1<<5) -#define IN_LOOK_BEHIND (1<<6) - +#define IN_PREC_READ (1<<6) +#define IN_LOOK_BEHIND (1<<7) +#define IN_PEEK (1<<8) /* divide different length alternatives in look-behind. (?<=A|B) ==> (?<=A)|(?<=B) @@ -4706,7 +4577,7 @@ list_reduce_in_look_behind(Node* node) } static int -alt_reduce_in_look_behind(Node* node, regex_t* reg, ScanEnv* env) +alt_reduce_in_look_behind(Node* node, regex_t* reg, ParseEnv* env) { int r; @@ -4725,10 +4596,10 @@ alt_reduce_in_look_behind(Node* node, regex_t* reg, ScanEnv* env) return r; } -static int tune_tree(Node* node, regex_t* reg, int state, ScanEnv* env); +static int tune_tree(Node* node, regex_t* reg, int state, ParseEnv* env); static int -tune_look_behind(Node* node, regex_t* reg, int state, ScanEnv* env) +tune_look_behind(Node* node, regex_t* reg, int state, ParseEnv* env) { int r; int state1; @@ -5183,7 +5054,7 @@ unravel_case_fold_string(Node* node, regex_t* reg, int state) return r; } -#ifdef USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT +#ifdef USE_RIGID_CHECK_CAPTURES_IN_EMPTY_REPEAT static enum BodyEmptyType quantifiers_memory_node_info(Node* node) { @@ -5265,7 +5136,7 @@ quantifiers_memory_node_info(Node* node) return r; } -#endif /* USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT */ +#endif /* USE_RIGID_CHECK_CAPTURES_IN_EMPTY_REPEAT */ #ifdef USE_CALL @@ -5274,9 +5145,9 @@ quantifiers_memory_node_info(Node* node) __inline #endif static int -check_call_reference(CallNode* cn, ScanEnv* env, int state) +check_call_reference(CallNode* cn, ParseEnv* env, int state) { - MemEnv* mem_env = SCANENV_MEMENV(env); + MemEnv* mem_env = PARSEENV_MEMENV(env); if (cn->by_number != 0) { int gnum = cn->called_gnum; @@ -5393,7 +5264,7 @@ tune_call2_call(Node* node) } static int -tune_call(Node* node, ScanEnv* env, int state) +tune_call(Node* node, ParseEnv* env, int state) { int r; @@ -5539,6 +5410,8 @@ tune_called_state_call(Node* node, int state) state |= IN_REAL_REPEAT; if (qn->lower != qn->upper) state |= IN_VAR_REPEAT; + if ((state & IN_PEEK) != 0) + NODE_STATUS_ADD(node, INPEEK); tune_called_state_call(NODE_QUANT_BODY(qn), state); } @@ -5551,10 +5424,12 @@ tune_called_state_call(Node* node, int state) switch (an->type) { case ANCR_PREC_READ_NOT: case ANCR_LOOK_BEHIND_NOT: - state |= IN_NOT; - /* fall */ + state |= (IN_NOT | IN_PEEK); + tune_called_state_call(NODE_ANCHOR_BODY(an), state); + break; case ANCR_PREC_READ: case ANCR_LOOK_BEHIND: + state |= IN_PEEK; tune_called_state_call(NODE_ANCHOR_BODY(an), state); break; default: @@ -5597,6 +5472,11 @@ tune_called_state_call(Node* node, int state) break; case NODE_CALL: + if ((state & IN_PEEK) != 0) + NODE_STATUS_ADD(node, INPEEK); + if ((state & IN_REAL_REPEAT) != 0) + NODE_STATUS_ADD(node, IN_REAL_REPEAT); + tune_called_state_call(NODE_BODY(node), state); break; @@ -5620,6 +5500,11 @@ tune_called_state(Node* node, int state) #ifdef USE_CALL case NODE_CALL: + if ((state & IN_PEEK) != 0) + NODE_STATUS_ADD(node, INPEEK); + if ((state & IN_REAL_REPEAT) != 0) + NODE_STATUS_ADD(node, IN_REAL_REPEAT); + tune_called_state_call(node, state); break; #endif @@ -5659,6 +5544,8 @@ tune_called_state(Node* node, int state) state |= IN_REAL_REPEAT; if (qn->lower != qn->upper) state |= IN_VAR_REPEAT; + if ((state & IN_PEEK) != 0) + NODE_STATUS_ADD(node, INPEEK); tune_called_state(NODE_QUANT_BODY(qn), state); } @@ -5671,10 +5558,12 @@ tune_called_state(Node* node, int state) switch (an->type) { case ANCR_PREC_READ_NOT: case ANCR_LOOK_BEHIND_NOT: - state |= IN_NOT; - /* fall */ + state |= (IN_NOT | IN_PEEK); + tune_called_state(NODE_ANCHOR_BODY(an), state); + break; case ANCR_PREC_READ: case ANCR_LOOK_BEHIND: + state |= IN_PEEK; tune_called_state(NODE_ANCHOR_BODY(an), state); break; default: @@ -5700,17 +5589,18 @@ tune_called_state(Node* node, int state) __inline #endif static int -tune_anchor(Node* node, regex_t* reg, int state, ScanEnv* env) +tune_anchor(Node* node, regex_t* reg, int state, ParseEnv* env) { int r; AnchorNode* an = ANCHOR_(node); switch (an->type) { case ANCR_PREC_READ: - r = tune_tree(NODE_ANCHOR_BODY(an), reg, state, env); + r = tune_tree(NODE_ANCHOR_BODY(an), reg, (state | IN_PREC_READ), env); break; case ANCR_PREC_READ_NOT: - r = tune_tree(NODE_ANCHOR_BODY(an), reg, (state | IN_NOT), env); + r = tune_tree(NODE_ANCHOR_BODY(an), reg, (state | IN_PREC_READ | IN_NOT), + env); break; case ANCR_LOOK_BEHIND: @@ -5730,7 +5620,7 @@ tune_anchor(Node* node, regex_t* reg, int state, ScanEnv* env) __inline #endif static int -tune_quant(Node* node, regex_t* reg, int state, ScanEnv* env) +tune_quant(Node* node, regex_t* reg, int state, ParseEnv* env) { int r; QuantNode* qn = QUANT_(node); @@ -5746,7 +5636,7 @@ tune_quant(Node* node, regex_t* reg, int state, ScanEnv* env) if (IS_INFINITE_REPEAT(qn->upper) || qn->upper >= 1) { OnigLen d = node_min_byte_len(body, env); if (d == 0) { -#ifdef USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT +#ifdef USE_RIGID_CHECK_CAPTURES_IN_EMPTY_REPEAT qn->emptiness = quantifiers_memory_node_info(body); #else qn->emptiness = BODY_MAY_BE_EMPTY; @@ -5807,7 +5697,7 @@ tune_quant(Node* node, regex_t* reg, int state, ScanEnv* env) 6. expand repeated string. */ static int -tune_tree(Node* node, regex_t* reg, int state, ScanEnv* env) +tune_tree(Node* node, regex_t* reg, int state, ParseEnv* env) { int r = 0; @@ -5832,7 +5722,7 @@ tune_tree(Node* node, regex_t* reg, int state, ScanEnv* env) break; case NODE_STRING: - if (NODE_IS_IGNORECASE(node) && ! NODE_STRING_IS_CRUDE(node)) { + if (NODE_IS_REAL_IGNORECASE(node)) { r = unravel_case_fold_string(node, reg, state); } break; @@ -5918,6 +5808,9 @@ tune_tree(Node* node, regex_t* reg, int state, ScanEnv* env) break; case NODE_QUANT: + if ((state & (IN_PREC_READ | IN_LOOK_BEHIND)) != 0) + NODE_STATUS_ADD(node, INPEEK); + r = tune_quant(node, reg, state, env); break; @@ -5938,6 +5831,7 @@ tune_tree(Node* node, regex_t* reg, int state, ScanEnv* env) return r; } +#ifndef ONIG_DONT_OPTIMIZE static int set_sunday_quick_search_or_bmh_skip_table(regex_t* reg, int case_expand, UChar* s, UChar* end, @@ -6007,6 +5901,7 @@ set_sunday_quick_search_or_bmh_skip_table(regex_t* reg, int case_expand, return 0; } +#endif #define OPT_EXACT_MAXLEN 24 @@ -6019,7 +5914,7 @@ typedef struct { MinMaxLen mm; OnigEncoding enc; OnigCaseFoldType case_fold_flag; - ScanEnv* scan_env; + ParseEnv* scan_env; } OptEnv; typedef struct { @@ -6052,6 +5947,8 @@ typedef struct { } OptNode; +#ifndef ONIG_DONT_OPTIMIZE + static int map_position_value(OnigEncoding enc, int i) { @@ -6540,85 +6437,219 @@ alt_merge_node_opt_info(OptNode* to, OptNode* add, OptEnv* env) mml_alt_merge(&to->len, &add->len); } - -#define MAX_NODE_OPT_INFO_REF_COUNT 5 - -static int -optimize_nodes(Node* node, OptNode* opt, OptEnv* env) +static OnigLen +node_max_byte_len(Node* node, ParseEnv* env) { - int i; - int r; - OptNode xo; - OnigEncoding enc; - - r = 0; - enc = env->enc; - clear_node_opt_info(opt); - set_bound_node_opt_info(opt, &env->mm); + OnigLen len; + OnigLen tmax; + len = 0; switch (NODE_TYPE(node)) { case NODE_LIST: - { - OptEnv nenv; - Node* nd = node; - - copy_opt_env(&nenv, env); - do { - r = optimize_nodes(NODE_CAR(nd), &xo, &nenv); - if (r == 0) { - mml_add(&nenv.mm, &xo.len); - concat_left_node_opt_info(enc, opt, &xo); - } - } while (r == 0 && IS_NOT_NULL(nd = NODE_CDR(nd))); - } + do { + tmax = node_max_byte_len(NODE_CAR(node), env); + len = distance_add(len, tmax); + } while (IS_NOT_NULL(node = NODE_CDR(node))); break; case NODE_ALT: - { - Node* nd = node; - - do { - r = optimize_nodes(NODE_CAR(nd), &xo, env); - if (r == 0) { - if (nd == node) copy_node_opt_info(opt, &xo); - else alt_merge_node_opt_info(opt, &xo, env); - } - } while ((r == 0) && IS_NOT_NULL(nd = NODE_CDR(nd))); - } + do { + tmax = node_max_byte_len(NODE_CAR(node), env); + if (len < tmax) len = tmax; + } while (IS_NOT_NULL(node = NODE_CDR(node))); break; case NODE_STRING: { StrNode* sn = STR_(node); - int slen = (int )(sn->end - sn->s); - - concat_opt_exact_str(&opt->sb, sn->s, sn->end, enc); - if (slen > 0) { - add_char_opt_map(&opt->map, *(sn->s), enc); - } - mml_set_min_max(&opt->len, slen, slen); + len = (OnigLen )(sn->end - sn->s); } break; + case NODE_CTYPE: case NODE_CCLASS: - { - int z; - CClassNode* cc = CCLASS_(node); - - /* no need to check ignore case. (set in tune_tree()) */ - - if (IS_NOT_NULL(cc->mbuf) || IS_NCCLASS_NOT(cc)) { - OnigLen min = ONIGENC_MBC_MINLEN(enc); - OnigLen max = ONIGENC_MBC_MAXLEN_DIST(enc); + len = ONIGENC_MBC_MAXLEN_DIST(env->enc); + break; - mml_set_min_max(&opt->len, min, max); - } - else { - for (i = 0; i < SINGLE_BYTE_SIZE; i++) { - z = BITSET_AT(cc->bs, i); - if ((z && ! IS_NCCLASS_NOT(cc)) || (! z && IS_NCCLASS_NOT(cc))) { - add_char_opt_map(&opt->map, (UChar )i, enc); - } + case NODE_BACKREF: + if (! NODE_IS_CHECKER(node)) { + int i; + int* backs; + MemEnv* mem_env = PARSEENV_MEMENV(env); + BackRefNode* br = BACKREF_(node); + if (NODE_IS_RECURSION(node)) { +#ifdef USE_BACKREF_WITH_LEVEL + if (NODE_IS_NEST_LEVEL(node)) { + len = INFINITE_LEN; + } +#endif + break; + } + backs = BACKREFS_P(br); + for (i = 0; i < br->back_num; i++) { + tmax = node_max_byte_len(mem_env[backs[i]].mem_node, env); + if (len < tmax) len = tmax; + } + } + break; + +#ifdef USE_CALL + case NODE_CALL: + if (! NODE_IS_RECURSION(node)) + len = node_max_byte_len(NODE_BODY(node), env); + else + len = INFINITE_LEN; + break; +#endif + + case NODE_QUANT: + { + QuantNode* qn = QUANT_(node); + + if (qn->upper != 0) { + len = node_max_byte_len(NODE_BODY(node), env); + if (len != 0) { + if (! IS_INFINITE_REPEAT(qn->upper)) + len = distance_multiply(len, qn->upper); + else + len = INFINITE_LEN; + } + } + } + break; + + case NODE_BAG: + { + BagNode* en = BAG_(node); + switch (en->type) { + case BAG_MEMORY: + if (NODE_IS_FIXED_MAX(node)) + len = en->max_len; + else { + if (NODE_IS_MARK1(node)) + len = INFINITE_LEN; + else { + NODE_STATUS_ADD(node, MARK1); + len = node_max_byte_len(NODE_BODY(node), env); + NODE_STATUS_REMOVE(node, MARK1); + + en->max_len = len; + NODE_STATUS_ADD(node, FIXED_MAX); + } + } + break; + + case BAG_OPTION: + case BAG_STOP_BACKTRACK: + len = node_max_byte_len(NODE_BODY(node), env); + break; + case BAG_IF_ELSE: + { + OnigLen tlen, elen; + + len = node_max_byte_len(NODE_BODY(node), env); + if (IS_NOT_NULL(en->te.Then)) { + tlen = node_max_byte_len(en->te.Then, env); + len = distance_add(len, tlen); + } + if (IS_NOT_NULL(en->te.Else)) + elen = node_max_byte_len(en->te.Else, env); + else elen = 0; + + if (elen > len) len = elen; + } + break; + } + } + break; + + case NODE_ANCHOR: + case NODE_GIMMICK: + default: + break; + } + + return len; +} + +#define MAX_NODE_OPT_INFO_REF_COUNT 5 + +static int +optimize_nodes(Node* node, OptNode* opt, OptEnv* env) +{ + int i; + int r; + OptNode xo; + OnigEncoding enc; + + r = 0; + enc = env->enc; + clear_node_opt_info(opt); + set_bound_node_opt_info(opt, &env->mm); + + switch (NODE_TYPE(node)) { + case NODE_LIST: + { + OptEnv nenv; + Node* nd = node; + + copy_opt_env(&nenv, env); + do { + r = optimize_nodes(NODE_CAR(nd), &xo, &nenv); + if (r == 0) { + mml_add(&nenv.mm, &xo.len); + concat_left_node_opt_info(enc, opt, &xo); + } + } while (r == 0 && IS_NOT_NULL(nd = NODE_CDR(nd))); + } + break; + + case NODE_ALT: + { + Node* nd = node; + + do { + r = optimize_nodes(NODE_CAR(nd), &xo, env); + if (r == 0) { + if (nd == node) copy_node_opt_info(opt, &xo); + else alt_merge_node_opt_info(opt, &xo, env); + } + } while ((r == 0) && IS_NOT_NULL(nd = NODE_CDR(nd))); + } + break; + + case NODE_STRING: + { + StrNode* sn = STR_(node); + int slen = (int )(sn->end - sn->s); + + concat_opt_exact_str(&opt->sb, sn->s, sn->end, enc); + if (slen > 0) { + add_char_opt_map(&opt->map, *(sn->s), enc); + } + mml_set_min_max(&opt->len, slen, slen); + } + break; + + case NODE_CCLASS: + { + int z; + CClassNode* cc = CCLASS_(node); + + /* no need to check ignore case. (set in tune_tree()) */ + + if (IS_NOT_NULL(cc->mbuf) || IS_NCCLASS_NOT(cc)) { + OnigLen min = ONIGENC_MBC_MINLEN(enc); + OnigLen max = ONIGENC_MBC_MAXLEN_DIST(enc); + + mml_set_min_max(&opt->len, min, max); + } + else { + for (i = 0; i < SINGLE_BYTE_SIZE; i++) { + z = BITSET_AT(cc->bs, i); + if ((z && ! IS_NCCLASS_NOT(cc)) || (! z && IS_NCCLASS_NOT(cc))) { + add_char_opt_map(&opt->map, (UChar )i, enc); + } } mml_set_min_max(&opt->len, 1, 1); } @@ -6822,22 +6853,22 @@ optimize_nodes(Node* node, OptNode* opt, OptEnv* env) { OptEnv nenv; - copy_opt_env(&nenv, env); - r = optimize_nodes(NODE_BAG_BODY(en), &xo, &nenv); - if (r == 0) { - mml_add(&nenv.mm, &xo.len); - concat_left_node_opt_info(enc, opt, &xo); - if (IS_NOT_NULL(en->te.Then)) { - r = optimize_nodes(en->te.Then, &xo, &nenv); - if (r == 0) { - concat_left_node_opt_info(enc, opt, &xo); + if (IS_NOT_NULL(en->te.Else)) { + copy_opt_env(&nenv, env); + r = optimize_nodes(NODE_BAG_BODY(en), &xo, &nenv); + if (r == 0) { + mml_add(&nenv.mm, &xo.len); + concat_left_node_opt_info(enc, opt, &xo); + if (IS_NOT_NULL(en->te.Then)) { + r = optimize_nodes(en->te.Then, &xo, &nenv); + if (r == 0) { + concat_left_node_opt_info(enc, opt, &xo); + } } - } - if (IS_NOT_NULL(en->te.Else)) { - r = optimize_nodes(en->te.Else, &xo, env); - if (r == 0) - alt_merge_node_opt_info(opt, &xo, env); + r = optimize_nodes(en->te.Else, &xo, env); + if (r == 0) + alt_merge_node_opt_info(opt, &xo, env); } } } @@ -6930,7 +6961,7 @@ static void print_optimize_info(FILE* f, regex_t* reg); #endif static int -set_optimize_info_from_tree(Node* node, regex_t* reg, ScanEnv* scan_env) +set_optimize_info_from_tree(Node* node, regex_t* reg, ParseEnv* scan_env) { int r; OptNode opt; @@ -6985,6 +7016,7 @@ set_optimize_info_from_tree(Node* node, regex_t* reg, ScanEnv* scan_env) #endif return r; } +#endif /* ONIG_DONT_OPTIMIZE */ static void clear_optimize_info(regex_t* reg) @@ -7031,14 +7063,43 @@ static void print_enc_string(FILE* fp, OnigEncoding enc, s++; } } +} - fprintf(fp, "/\n"); +static void +print_options(FILE* fp, OnigOptionType o) +{ + if ((o & ONIG_OPTION_IGNORECASE) != 0) fprintf(fp, " IGNORECASE"); + if ((o & ONIG_OPTION_EXTEND) != 0) fprintf(fp, " EXTEND"); + if ((o & ONIG_OPTION_MULTILINE) != 0) fprintf(fp, " MULTILINE"); + if ((o & ONIG_OPTION_SINGLELINE) != 0) fprintf(fp, " SINGLELINE"); + if ((o & ONIG_OPTION_FIND_LONGEST) != 0) fprintf(fp, " FIND_LONGEST"); + if ((o & ONIG_OPTION_FIND_NOT_EMPTY) != 0) fprintf(fp, " FIND_NOT_EMPTY"); + if ((o & ONIG_OPTION_NEGATE_SINGLELINE) != 0) fprintf(fp, " NEGATE_SINGLELINE"); + if ((o & ONIG_OPTION_DONT_CAPTURE_GROUP) != 0) fprintf(fp, " DONT_CAPTURE_GROUP"); + if ((o & ONIG_OPTION_CAPTURE_GROUP) != 0) fprintf(fp, " CAPTURE_GROUP"); + if ((o & ONIG_OPTION_NOTBOL) != 0) fprintf(fp, " NOTBOL"); + if ((o & ONIG_OPTION_NOTEOL) != 0) fprintf(fp, " NOTEOL"); + if ((o & ONIG_OPTION_POSIX_REGION) != 0) fprintf(fp, " POSIX_REGION"); + if ((o & ONIG_OPTION_CHECK_VALIDITY_OF_STRING) != 0) fprintf(fp, " CHECK_VALIDITY_OF_STRING"); + if ((o & ONIG_OPTION_IGNORECASE_IS_ASCII) != 0) fprintf(fp, " IGNORECASE_IS_ASCII"); + if ((o & ONIG_OPTION_WORD_IS_ASCII) != 0) fprintf(fp, " WORD_IS_ASCII"); + if ((o & ONIG_OPTION_DIGIT_IS_ASCII) != 0) fprintf(fp, " DIGIT_IS_ASCII"); + if ((o & ONIG_OPTION_SPACE_IS_ASCII) != 0) fprintf(fp, " SPACE_IS_ASCII"); + if ((o & ONIG_OPTION_POSIX_IS_ASCII) != 0) fprintf(fp, " POSIX_IS_ASCII"); + if ((o & ONIG_OPTION_TEXT_SEGMENT_EXTENDED_GRAPHEME_CLUSTER) != 0) fprintf(fp, " TEXT_SEGMENT_EXTENDED_GRAPHEME_CLUSTER"); + if ((o & ONIG_OPTION_TEXT_SEGMENT_WORD) != 0) fprintf(fp, " TEXT_SEGMENT_WORD"); + if ((o & ONIG_OPTION_NOT_BEGIN_STRING) != 0) fprintf(fp, " NOT_BIGIN_STRING"); + if ((o & ONIG_OPTION_NOT_END_STRING) != 0) fprintf(fp, " NOT_END_STRING"); + if ((o & ONIG_OPTION_NOT_BEGIN_POSITION) != 0) fprintf(fp, " NOT_BEGIN_POSITION"); + if ((o & ONIG_OPTION_CALLBACK_EACH_MATCH) != 0) fprintf(fp, " CALLBACK_EACH_MATCH"); } #endif /* ONIG_DEBUG */ #if defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH) +#ifndef ONIG_DONT_OPTIMIZE + static void print_distance_range(FILE* f, OnigLen a, OnigLen b) { @@ -7161,7 +7222,8 @@ print_optimize_info(FILE* f, regex_t* reg) } } } -#endif +#endif /* ONIG_DONT_OPTIMIZE */ +#endif /* defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH) */ extern RegexExt* @@ -7259,93 +7321,150 @@ static void print_tree P_((FILE* f, Node* node)); extern int onig_init_for_match_at(regex_t* reg); -extern int -onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end, - OnigErrorInfo* einfo) -{ - int r; - Node* root; - ScanEnv scan_env; +static int parse_and_tune(regex_t* reg, const UChar* pattern, + const UChar* pattern_end, ParseEnv *scan_env, Node** rroot, + OnigErrorInfo* einfo #ifdef USE_CALL - UnsetAddrList uslist = {0}; + , UnsetAddrList* uslist #endif +) +{ + int r; + Node* root; - root = 0; + root = NULL_NODE; if (IS_NOT_NULL(einfo)) { einfo->enc = reg->enc; einfo->par = (UChar* )NULL; } -#ifdef ONIG_DEBUG - fprintf(DBGFP, "\nPATTERN: /"); - print_enc_string(DBGFP, reg->enc, pattern, pattern_end); -#endif - - if (reg->ops_alloc == 0) { - r = ops_init(reg, OPS_INIT_SIZE); - if (r != 0) goto end; - } - else - reg->ops_used = 0; - - r = onig_parse_tree(&root, pattern, pattern_end, reg, &scan_env); + r = onig_parse_tree(&root, pattern, pattern_end, reg, scan_env); if (r != 0) goto err; r = reduce_string_list(root, reg->enc); if (r != 0) goto err; /* mixed use named group and no-named group */ - if (scan_env.num_named > 0 && - IS_SYNTAX_BV(scan_env.syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) && + if (scan_env->num_named > 0 && + IS_SYNTAX_BV(scan_env->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) && ! OPTON_CAPTURE_GROUP(reg->options)) { - if (scan_env.num_named != scan_env.num_mem) - r = disable_noname_group_capture(&root, reg, &scan_env); + if (scan_env->num_named != scan_env->num_mem) + r = disable_noname_group_capture(&root, reg, scan_env); else r = numbered_ref_check(root); if (r != 0) goto err; } - r = check_backrefs(root, &scan_env); + r = check_backrefs(root, scan_env); if (r != 0) goto err; #ifdef USE_CALL - if (scan_env.num_call > 0) { - r = unset_addr_list_init(&uslist, scan_env.num_call); + if (scan_env->num_call > 0) { + r = unset_addr_list_init(uslist, scan_env->num_call); if (r != 0) goto err; - scan_env.unset_addr_list = &uslist; - r = tune_call(root, &scan_env, 0); + scan_env->unset_addr_list = uslist; + r = tune_call(root, scan_env, 0); if (r != 0) goto err_unset; r = tune_call2(root); if (r != 0) goto err_unset; - r = recursive_call_check_trav(root, &scan_env, 0); + r = recursive_call_check_trav(root, scan_env, 0); if (r < 0) goto err_unset; - r = infinite_recursive_call_check_trav(root, &scan_env); + r = infinite_recursive_call_check_trav(root, scan_env); if (r != 0) goto err_unset; tune_called_state(root, 0); } - reg->num_call = scan_env.num_call; + reg->num_call = scan_env->num_call; #endif #ifdef ONIG_DEBUG_PARSE - fprintf(DBGFP, "MAX PARSE DEPTH: %d\n", scan_env.max_parse_depth); - fprintf(DBGFP, "TREE (parsed)\n"); - print_tree(DBGFP, root); - fprintf(DBGFP, "\n"); + fprintf(DBGFP, "MAX PARSE DEPTH: %d\n", scan_env->max_parse_depth); #endif - r = tune_tree(root, reg, 0, &scan_env); - if (r != 0) goto err_unset; + r = tune_tree(root, reg, 0, scan_env); + if (r != 0) { +#ifdef ONIG_DEBUG_PARSE + fprintf(DBGFP, "TREE (error in tune)\n"); + print_tree(DBGFP, root); + fprintf(DBGFP, "\n"); +#endif + goto err_unset; + } - if (scan_env.backref_num != 0) { + if (scan_env->backref_num != 0) { set_parent_node_trav(root, NULL_NODE); - r = set_empty_repeat_node_trav(root, NULL_NODE, &scan_env); + r = set_empty_repeat_node_trav(root, NULL_NODE, scan_env); if (r != 0) goto err_unset; - set_empty_status_check_trav(root, &scan_env); + set_empty_status_check_trav(root, scan_env); } + *rroot = root; + return r; + + err_unset: +#ifdef USE_CALL + if (scan_env->num_call > 0) { + unset_addr_list_end(uslist); + } +#endif + err: + if (IS_NOT_NULL(scan_env->error)) { + if (IS_NOT_NULL(einfo)) { + einfo->par = scan_env->error; + einfo->par_end = scan_env->error_end; + } + } + + onig_node_free(root); + if (IS_NOT_NULL(scan_env->mem_env_dynamic)) + xfree(scan_env->mem_env_dynamic); + + *rroot = NULL_NODE; + return r; +} + +extern int +onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end, + OnigErrorInfo* einfo) +{ + int r; + Node* root; + ParseEnv scan_env; +#ifdef USE_CALL + UnsetAddrList uslist = {0}; +#endif + +#ifdef ONIG_DEBUG + fprintf(DBGFP, "\nPATTERN: /"); + print_enc_string(DBGFP, reg->enc, pattern, pattern_end); + fprintf(DBGFP, "/\n"); + fprintf(DBGFP, "OPTIONS:"); + print_options(DBGFP, reg->options); + fprintf(DBGFP, "\n"); +#endif + + if (reg->ops_alloc == 0) { + r = ops_init(reg, OPS_INIT_SIZE); + if (r != 0) { + if (IS_NOT_NULL(einfo)) { + einfo->enc = reg->enc; + einfo->par = (UChar* )NULL; + } + return r; + } + } + else + reg->ops_used = 0; + + r = parse_and_tune(reg, pattern, pattern_end, &scan_env, &root, einfo +#ifdef USE_CALL + , &uslist +#endif + ); + if (r != 0) return r; + #ifdef ONIG_DEBUG_PARSE fprintf(DBGFP, "TREE (after tune)\n"); print_tree(DBGFP, root); @@ -7377,7 +7496,14 @@ onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end, clear_optimize_info(reg); #ifndef ONIG_DONT_OPTIMIZE r = set_optimize_info_from_tree(root, reg, &scan_env); - if (r != 0) goto err_unset; + if (r != 0) { +#ifdef USE_CALL + if (scan_env.num_call > 0) { + unset_addr_list_end(&uslist); + } +#endif + goto err; + } #endif if (IS_NOT_NULL(scan_env.mem_env_dynamic)) { @@ -7407,6 +7533,9 @@ onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end, } #endif + r = ops_resize(reg, reg->ops_used); + if (r != ONIG_NORMAL) goto err; + set_addr_in_repeat_range(reg); if ((reg->push_mem_end != 0) @@ -7449,15 +7578,8 @@ onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end, onig_init_for_match_at(reg); #endif - end: return r; - err_unset: -#ifdef USE_CALL - if (scan_env.num_call > 0) { - unset_addr_list_end(&uslist); - } -#endif err: if (IS_NOT_NULL(scan_env.error)) { if (IS_NOT_NULL(einfo)) { @@ -7513,6 +7635,12 @@ onig_reg_init(regex_t* reg, OnigOptionType option, OnigCaseFoldType case_fold_fl else option |= syntax->options; + if ((option & ONIG_OPTION_IGNORECASE_IS_ASCII) != 0) { + case_fold_flag &= ~(INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR | + ONIGENC_CASE_FOLD_TURKISH_AZERI); + case_fold_flag |= ONIGENC_CASE_FOLD_ASCII_ONLY; + } + (reg)->enc = enc; (reg)->options = option; (reg)->syntax = syntax; @@ -7703,15 +7831,145 @@ onig_is_code_in_cc(OnigEncoding enc, OnigCodePoint code, CClassNode* cc) return onig_is_code_in_cc_len(len, code, cc); } + +#define MANY_REPEAT_OF_ANYCHAR 20 + +typedef enum { + MJ_NO = 0, + MJ_YES = 1, + MJ_IGNORE = 2, +} MJ_RESULT; + +static MJ_RESULT +mostly_just_anychar(Node* node, int in_reluctant) +{ + MJ_RESULT r; + + r = MJ_NO; + switch (NODE_TYPE(node)) { + case NODE_LIST: + { + int found = FALSE; + do { + r = mostly_just_anychar(NODE_CAR(node), in_reluctant); + if (r == MJ_NO) break; + if (r == MJ_YES) found = TRUE; + } while (IS_NOT_NULL(node = NODE_CDR(node))); + if (r == MJ_IGNORE) { + if (found == TRUE) r = MJ_YES; + } + } + break; + + case NODE_ALT: + r = MJ_IGNORE; + do { + r = mostly_just_anychar(NODE_CAR(node), in_reluctant); + if (r == MJ_YES) break; + } while (IS_NOT_NULL(node = NODE_CDR(node))); + break; + + case NODE_QUANT: + { + QuantNode* qn = QUANT_(node); + + if (qn->upper == 0) + r = MJ_IGNORE; + else { + if (in_reluctant == FALSE) { + if (qn->greedy != 0 && + (! IS_INFINITE_REPEAT(qn->upper) && + qn->upper <= MANY_REPEAT_OF_ANYCHAR)) { + in_reluctant = TRUE; + } + } + r = mostly_just_anychar(NODE_BODY(node), in_reluctant); + } + } + break; + + case NODE_ANCHOR: + switch (ANCHOR_(node)->type) { + case ANCR_PREC_READ: + case ANCR_PREC_READ_NOT: + case ANCR_LOOK_BEHIND: + case ANCR_LOOK_BEHIND_NOT: + case ANCR_TEXT_SEGMENT_BOUNDARY: /* \y */ + r = MJ_IGNORE; + break; + default: + break; + } + break; + + case NODE_BAG: + { + BagNode* en = BAG_(node); + + if (en->type == BAG_IF_ELSE) { + if (IS_NOT_NULL(en->te.Then)) { + r = mostly_just_anychar(en->te.Then, in_reluctant); + if (r == MJ_YES) break; + } + if (IS_NOT_NULL(en->te.Else)) { + r = mostly_just_anychar(en->te.Else, in_reluctant); + } + } + else { + r = mostly_just_anychar(NODE_BODY(node), in_reluctant); + } + } + break; + + case NODE_CTYPE: + if (CTYPE_(node)->ctype == CTYPE_ANYCHAR) + r = MJ_YES; + else + r = MJ_NO; + break; + + case NODE_STRING: + if (NODE_STRING_LEN(node) == 0) { + r = MJ_IGNORE; + break; + } + /* fall */ + case NODE_CCLASS: + r = MJ_NO; + break; + +#ifdef USE_CALL + case NODE_CALL: + /* ignore call */ +#endif + case NODE_BACKREF: + case NODE_GIMMICK: + r = MJ_IGNORE; + break; + + default: + break; + } + + return r; +} + +#define MAX_CALLS_IN_DETECT 10 + typedef struct { int prec_read; int look_behind; + int backref; int backref_with_level; int call; + int anychar_reluctant_many; + int empty_check_nest_level; + int max_empty_check_nest_level; + int heavy_element; } SlowElementCount; static int -node_detect_can_be_slow(Node* node, SlowElementCount* ct) +detect_can_be_slow(Node* node, SlowElementCount* ct, int ncall, int calls[]) { int r; @@ -7720,13 +7978,45 @@ node_detect_can_be_slow(Node* node, SlowElementCount* ct) case NODE_LIST: case NODE_ALT: do { - r = node_detect_can_be_slow(NODE_CAR(node), ct); + r = detect_can_be_slow(NODE_CAR(node), ct, ncall, calls); if (r != 0) return r; } while (IS_NOT_NULL(node = NODE_CDR(node))); break; case NODE_QUANT: - r = node_detect_can_be_slow(NODE_BODY(node), ct); + { + int prev_heavy_element; + QuantNode* qn; + Node* body; + + qn = QUANT_(node); + body = NODE_BODY(node); + + if (qn->emptiness != BODY_IS_NOT_EMPTY) { + prev_heavy_element = ct->heavy_element; + ct->empty_check_nest_level++; + if (ct->empty_check_nest_level > ct->max_empty_check_nest_level) + ct->max_empty_check_nest_level = ct->empty_check_nest_level; + } + else if (IS_INFINITE_REPEAT(qn->upper) || + qn->upper > MANY_REPEAT_OF_ANYCHAR) { + MJ_RESULT mr = mostly_just_anychar(body, (qn->greedy == 0)); + if (mr == MJ_YES) + ct->anychar_reluctant_many++; + } + + r = detect_can_be_slow(body, ct, ncall, calls); + + if (qn->emptiness != BODY_IS_NOT_EMPTY) { + if (NODE_IS_INPEEK(node)) { + if (ct->empty_check_nest_level > 2) { + if (prev_heavy_element == ct->heavy_element) + ct->heavy_element++; + } + } + ct->empty_check_nest_level--; + } + } break; case NODE_ANCHOR: @@ -7744,23 +8034,23 @@ node_detect_can_be_slow(Node* node, SlowElementCount* ct) } if (ANCHOR_HAS_BODY(ANCHOR_(node))) - r = node_detect_can_be_slow(NODE_BODY(node), ct); + r = detect_can_be_slow(NODE_BODY(node), ct, ncall, calls); break; case NODE_BAG: { BagNode* en = BAG_(node); - r = node_detect_can_be_slow(NODE_BODY(node), ct); + r = detect_can_be_slow(NODE_BODY(node), ct, ncall, calls); if (r != 0) return r; if (en->type == BAG_IF_ELSE) { if (IS_NOT_NULL(en->te.Then)) { - r = node_detect_can_be_slow(en->te.Then, ct); + r = detect_can_be_slow(en->te.Then, ct, ncall, calls); if (r != 0) return r; } if (IS_NOT_NULL(en->te.Else)) { - r = node_detect_can_be_slow(en->te.Else, ct); + r = detect_can_be_slow(en->te.Else, ct, ncall, calls); if (r != 0) return r; } } @@ -7771,12 +8061,44 @@ node_detect_can_be_slow(Node* node, SlowElementCount* ct) case NODE_BACKREF: if (NODE_IS_NEST_LEVEL(node)) ct->backref_with_level++; + else + ct->backref++; break; #endif #ifdef USE_CALL case NODE_CALL: - ct->call++; + { + int i; + int found; + int gnum; + + gnum = CALL_(node)->called_gnum; + ct->call++; + + if (NODE_IS_RECURSION(node) && NODE_IS_INPEEK(node) && + NODE_IS_IN_REAL_REPEAT(node)) { + ct->heavy_element += 10; + } + + found = FALSE; + for (i = 0; i < ncall; i++) { + if (gnum == calls[i]) { + found = TRUE; + break; + } + } + + if (! found) { + if (ncall + 1 < MAX_CALLS_IN_DETECT) { + calls[ncall] = gnum; + r = detect_can_be_slow(NODE_BODY(node), ct, ncall + 1, calls); + } + else { + ct->heavy_element++; + } + } + } break; #endif @@ -7795,8 +8117,12 @@ onig_detect_can_be_slow_pattern(const UChar* pattern, int r; regex_t* reg; Node* root; - ScanEnv scan_env; + ParseEnv scan_env; SlowElementCount count; + int calls[MAX_CALLS_IN_DETECT]; +#ifdef USE_CALL + UnsetAddrList uslist = {0}; +#endif reg = (regex_t* )xmalloc(sizeof(regex_t)); if (IS_NULL(reg)) return ONIGERR_MEMORY; @@ -7807,25 +8133,44 @@ onig_detect_can_be_slow_pattern(const UChar* pattern, return r; } - root = 0; - r = onig_parse_tree(&root, pattern, pattern_end, reg, &scan_env); + r = parse_and_tune(reg, pattern, pattern_end, &scan_env, &root, NULL +#ifdef USE_CALL + , &uslist +#endif + ); + if (r != 0) goto err; + +#ifdef USE_CALL + if (scan_env.num_call > 0) { + unset_addr_list_end(&uslist); + } +#endif + + count.prec_read = 0; + count.look_behind = 0; + count.backref = 0; + count.backref_with_level = 0; + count.call = 0; + count.anychar_reluctant_many = 0; + count.empty_check_nest_level = 0; + count.max_empty_check_nest_level = 0; + count.heavy_element = 0; + + r = detect_can_be_slow(root, &count, 0, calls); if (r == 0) { - count.prec_read = 0; - count.look_behind = 0; - count.backref_with_level = 0; - count.call = 0; - - r = node_detect_can_be_slow(root, &count); - if (r == 0) { - int n = count.prec_read + count.look_behind - + count.backref_with_level + count.call; - r = n; - } + int n = count.prec_read + count.look_behind + + count.backref + count.backref_with_level + count.call + + count.anychar_reluctant_many; + if (count.heavy_element != 0) + n += count.heavy_element * 10; + + r = n; } if (IS_NOT_NULL(scan_env.mem_env_dynamic)) xfree(scan_env.mem_env_dynamic); + err: onig_node_free(root); onig_free(reg); return r; @@ -7853,6 +8198,8 @@ Indent(FILE* f, int indent) static void print_indent_tree(FILE* f, Node* node, int indent) { + static char* emptiness_name[] = { "", " empty", " empty_mem", " empty_rec" }; + int i; NodeType type; UChar* p; @@ -8019,69 +8366,83 @@ print_indent_tree(FILE* f, Node* node, int indent) fprintf(f, "", node); fprintf(f, " num: %d, name", cn->called_gnum); p_string(f, cn->name_end - cn->name, cn->name); + if (NODE_IS_RECURSION(node)) fprintf(f, ", recursion"); + if (NODE_IS_INPEEK(node)) fprintf(f, ", in-peek"); + if (NODE_IS_IN_REAL_REPEAT(node)) fprintf(f, ", in-real-repeat"); } break; #endif case NODE_QUANT: - fprintf(f, "{%d,%d}%s%s\n", node, - QUANT_(node)->lower, QUANT_(node)->upper, - (QUANT_(node)->greedy ? "" : "?"), - QUANT_(node)->include_referred == 0 ? "" : " referred"); - print_indent_tree(f, NODE_BODY(node), indent + add); + { + fprintf(f, "{%d,%d}%s%s%s", node, + QUANT_(node)->lower, QUANT_(node)->upper, + (QUANT_(node)->greedy ? "" : "?"), + QUANT_(node)->include_referred == 0 ? "" : " referred", + emptiness_name[QUANT_(node)->emptiness]); + if (NODE_IS_INPEEK(node)) fprintf(f, ", in-peek"); + fprintf(f, "\n"); + print_indent_tree(f, NODE_BODY(node), indent + add); + } break; case NODE_BAG: - fprintf(f, " ", node); - if (BAG_(node)->type == BAG_IF_ELSE) { - Node* Then; - Node* Else; - BagNode* bn; - - bn = BAG_(node); - fprintf(f, "if-else\n"); - print_indent_tree(f, NODE_BODY(node), indent + add); + { + BagNode* bn = BAG_(node); + fprintf(f, " ", node); + if (bn->type == BAG_IF_ELSE) { + Node* Then; + Node* Else; + + fprintf(f, "if-else\n"); + print_indent_tree(f, NODE_BODY(node), indent + add); + + Then = bn->te.Then; + Else = bn->te.Else; + if (IS_NULL(Then)) { + Indent(f, indent + add); + fprintf(f, "THEN empty\n"); + } + else + print_indent_tree(f, Then, indent + add); - Then = bn->te.Then; - Else = bn->te.Else; - if (IS_NULL(Then)) { - Indent(f, indent + add); - fprintf(f, "THEN empty\n"); + if (IS_NULL(Else)) { + Indent(f, indent + add); + fprintf(f, "ELSE empty\n"); + } + else + print_indent_tree(f, Else, indent + add); } - else - print_indent_tree(f, Then, indent + add); + else { + switch (bn->type) { + case BAG_OPTION: + fprintf(f, "option:%d", bn->o.options); + break; + case BAG_MEMORY: + fprintf(f, "memory:%d", bn->m.regnum); + if (NODE_IS_CALLED(node)) { + fprintf(f, ", called"); + if (NODE_IS_RECURSION(node)) + fprintf(f, ", recursion"); + } + else if (NODE_IS_REFERENCED(node)) + fprintf(f, ", referenced"); - if (IS_NULL(Else)) { - Indent(f, indent + add); - fprintf(f, "ELSE empty\n"); + if (NODE_IS_FIXED_ADDR(node)) + fprintf(f, ", fixed-addr"); + if ((bn->m.called_state & IN_PEEK) != 0) + fprintf(f, ", in-peek"); + break; + case BAG_STOP_BACKTRACK: + fprintf(f, "stop-bt"); + break; + default: + break; + } + fprintf(f, "\n"); + print_indent_tree(f, NODE_BODY(node), indent + add); } - else - print_indent_tree(f, Else, indent + add); - - break; } - - switch (BAG_(node)->type) { - case BAG_OPTION: - fprintf(f, "option:%d", BAG_(node)->o.options); - break; - case BAG_MEMORY: - fprintf(f, "memory:%d", BAG_(node)->m.regnum); - if (NODE_IS_CALLED(node)) - fprintf(f, ", called"); - else if (NODE_IS_REFERENCED(node)) - fprintf(f, ", referenced"); - if (NODE_IS_FIXED_ADDR(node)) - fprintf(f, ", fixed-addr"); - break; - case BAG_STOP_BACKTRACK: - fprintf(f, "stop-bt"); - break; - default: - break; - } - fprintf(f, "\n"); - print_indent_tree(f, NODE_BODY(node), indent + add); break; case NODE_GIMMICK: diff --git a/src/regenc.c b/src/regenc.c index 27e4549..84afd1e 100644 --- a/src/regenc.c +++ b/src/regenc.c @@ -2,7 +2,7 @@ regenc.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako + * Copyright (c) 2002-2020 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -569,6 +569,9 @@ onigenc_apply_all_case_fold_with_map(int map_size, r = onigenc_ascii_apply_all_case_fold(flag, f, arg); if (r != 0) return r; + if (CASE_FOLD_IS_ASCII_ONLY(flag)) + return 0; + for (i = 0; i < map_size; i++) { code = map[i].to; r = (*f)(map[i].from, &code, 1, arg); @@ -588,7 +591,7 @@ onigenc_apply_all_case_fold_with_map(int map_size, extern int onigenc_get_case_fold_codes_by_str_with_map(int map_size, const OnigPairCaseFoldCodes map[], - int ess_tsett_flag, OnigCaseFoldType flag ARG_UNUSED, + int ess_tsett_flag, OnigCaseFoldType flag, const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem items[]) { int i, j, n; @@ -596,7 +599,8 @@ onigenc_get_case_fold_codes_by_str_with_map(int map_size, if (0x41 <= *p && *p <= 0x5a) { /* A - Z */ if (*p == LARGE_S && ess_tsett_flag != 0 && end > p + 1 - && (*(p+1) == LARGE_S || *(p+1) == SMALL_S)) { /* SS */ + && (*(p+1) == LARGE_S || *(p+1) == SMALL_S) /* SS */ + && CASE_FOLD_IS_NOT_ASCII_ONLY(flag)) { ss_combination: items[0].byte_len = 2; items[0].code_len = 1; @@ -625,7 +629,8 @@ onigenc_get_case_fold_codes_by_str_with_map(int map_size, } else if (0x61 <= *p && *p <= 0x7a) { /* a - z */ if (*p == SMALL_S && ess_tsett_flag != 0 && end > p + 1 - && (*(p+1) == SMALL_S || *(p+1) == LARGE_S)) { + && (*(p+1) == SMALL_S || *(p+1) == LARGE_S) + && CASE_FOLD_IS_NOT_ASCII_ONLY(flag)) { goto ss_combination; } @@ -634,7 +639,8 @@ onigenc_get_case_fold_codes_by_str_with_map(int map_size, items[0].code[0] = (OnigCodePoint )(*p - 0x20); return 1; } - else if (*p == 0xdf && ess_tsett_flag != 0) { + else if (*p == 0xdf && ess_tsett_flag != 0 + && CASE_FOLD_IS_NOT_ASCII_ONLY(flag)) { items[0].byte_len = 1; items[0].code_len = 2; items[0].code[0] = (OnigCodePoint )'s'; @@ -660,6 +666,9 @@ onigenc_get_case_fold_codes_by_str_with_map(int map_size, else { int i; + if (CASE_FOLD_IS_ASCII_ONLY(flag)) + return 0; + for (i = 0; i < map_size; i++) { if (*p == map[i].from) { items[0].byte_len = 1; diff --git a/src/regenc.h b/src/regenc.h index d183b97..d0b447d 100644 --- a/src/regenc.h +++ b/src/regenc.h @@ -142,6 +142,10 @@ struct PropertyNameCtype { #define ENC_GET_SKIP_OFFSET(enc) \ (((enc)->flag & ENC_FLAG_SKIP_OFFSET_MASK)>>2) +#define CASE_FOLD_IS_ASCII_ONLY(flag) \ + (((flag) & ONIGENC_CASE_FOLD_ASCII_ONLY) != 0) +#define CASE_FOLD_IS_NOT_ASCII_ONLY(flag) \ + (((flag) & ONIGENC_CASE_FOLD_ASCII_ONLY) == 0) /* for encoding system implementation (internal) */ extern int onigenc_end(void); @@ -202,12 +206,12 @@ extern int onigenc_wb_is_break_position P_((OnigEncoding enc, UChar* p, UChar* p #define FOLDS1_UNFOLDS_NUM(i) (OnigUnicodeFolds1[(i)+1]) #define FOLDS2_UNFOLDS_NUM(i) (OnigUnicodeFolds2[(i)+2]) #define FOLDS3_UNFOLDS_NUM(i) (OnigUnicodeFolds3[(i)+3]) -#define FOLDS1_UNFOLDS(i) (OnigUnicodeFolds1 + (i) + 2) -#define FOLDS2_UNFOLDS(i) (OnigUnicodeFolds2 + (i) + 3) -#define FOLDS3_UNFOLDS(i) (OnigUnicodeFolds3 + (i) + 4) -#define FOLDS1_NEXT_INDEX(i) ((i) + 2 + OnigUnicodeFolds1[(i)+1]) -#define FOLDS2_NEXT_INDEX(i) ((i) + 3 + OnigUnicodeFolds2[(i)+2]) -#define FOLDS3_NEXT_INDEX(i) ((i) + 4 + OnigUnicodeFolds3[(i)+3]) +#define FOLDS1_UNFOLDS(i) (FOLDS1_FOLD(i) + 2) +#define FOLDS2_UNFOLDS(i) (FOLDS2_FOLD(i) + 3) +#define FOLDS3_UNFOLDS(i) (FOLDS3_FOLD(i) + 4) +#define FOLDS1_NEXT_INDEX(i) ((i) + 2 + FOLDS1_UNFOLDS_NUM(i)) +#define FOLDS2_NEXT_INDEX(i) ((i) + 3 + FOLDS2_UNFOLDS_NUM(i)) +#define FOLDS3_NEXT_INDEX(i) ((i) + 4 + FOLDS3_UNFOLDS_NUM(i)) #define FOLDS_FOLD_ADDR_BUK(buk, addr) do {\ if ((buk)->fold_len == 1)\ diff --git a/src/regerror.c b/src/regerror.c index dc1c8b6..18a5bdd 100644 --- a/src/regerror.c +++ b/src/regerror.c @@ -2,7 +2,7 @@ regerror.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2020 K.Kosako + * Copyright (c) 2002-2021 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -146,6 +146,8 @@ onig_error_code_to_format(int code) p = "too big wide-char value"; break; case ONIGERR_TOO_LONG_WIDE_CHAR_VALUE: p = "too long wide-char value"; break; + case ONIGERR_UNDEFINED_OPERATOR: + p = "undefined operator"; break; case ONIGERR_INVALID_CODE_POINT_VALUE: p = "invalid code point value"; break; case ONIGERR_EMPTY_GROUP_NAME: @@ -190,6 +192,8 @@ onig_error_code_to_format(int code) p = "not supported encoding combination"; break; case ONIGERR_INVALID_COMBINATION_OF_OPTIONS: p = "invalid combination of options"; break; + case ONIGERR_VERY_INEFFICIENT_PATTERN: + p = "very inefficient pattern"; break; case ONIGERR_LIBRARY_IS_NOT_INITIALIZED: p = "library is not initialized"; break; diff --git a/src/regexec.c b/src/regexec.c index bb6b474..a3cf60a 100644 --- a/src/regexec.c +++ b/src/regexec.c @@ -2,7 +2,7 @@ regexec.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2020 K.Kosako + * Copyright (c) 2002-2021 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -54,6 +54,13 @@ (MEM_STATUS_AT((reg)->push_mem_end, (idx)) != 0 ? \ STACK_AT(mem_end_stk[idx].i)->u.mem.pstr : mem_end_stk[idx].s) +#ifdef _MSC_VER +#define DIST_CAST(d) (size_t )(d) +#else +#define DIST_CAST(d) (d) +#endif + + static int forward_search(regex_t* reg, const UChar* str, const UChar* end, UChar* start, UChar* range, UChar** low, UChar** high); static int @@ -76,11 +83,12 @@ struct OnigMatchParamStruct { unsigned long retry_limit_in_match; unsigned long retry_limit_in_search; #endif + + void* callout_user_data; /* used in callback each match */ #ifdef USE_CALLOUT OnigCalloutFunc progress_callout_of_contents; OnigCalloutFunc retraction_callout_of_contents; int match_at_call_counter; - void* callout_user_data; CalloutData* callout_data; int callout_data_alloc_num; #endif @@ -143,12 +151,8 @@ onig_set_retraction_callout_of_match_param(OnigMatchParam* param, OnigCalloutFun extern int onig_set_callout_user_data_of_match_param(OnigMatchParam* param, void* user_data) { -#ifdef USE_CALLOUT param->callout_user_data = user_data; return ONIG_NORMAL; -#else - return ONIG_NO_SUPPORT_CONFIG; -#endif } @@ -873,6 +877,23 @@ onig_get_capture_tree(OnigRegion* region) } #endif /* USE_CAPTURE_HISTORY */ + +static OnigCallbackEachMatchFunc CallbackEachMatch; + +extern OnigCallbackEachMatchFunc +onig_get_callback_each_match(void) +{ + return CallbackEachMatch; +} + +extern int +onig_set_callback_each_match(OnigCallbackEachMatchFunc f) +{ + CallbackEachMatch = f; + return ONIG_NORMAL; +} + + extern void onig_region_clear(OnigRegion* region) { @@ -1238,7 +1259,7 @@ struct OnigCalloutArgsStruct { #ifdef USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE #define MATCH_ARG_INIT(msa, reg, arg_option, arg_region, arg_start, mpv) do { \ (msa).stack_p = (void* )0;\ - (msa).options = (arg_option);\ + (msa).options = (arg_option)|(reg)->options;\ (msa).region = (arg_region);\ (msa).start = (arg_start);\ (msa).match_stack_limit = (mpv)->match_stack_limit;\ @@ -1251,7 +1272,7 @@ struct OnigCalloutArgsStruct { #else #define MATCH_ARG_INIT(msa, reg, arg_option, arg_region, arg_start, mpv) do { \ (msa).stack_p = (void* )0;\ - (msa).options = (arg_option);\ + (msa).options = (arg_option)|(reg)->options;\ (msa).region = (arg_region);\ (msa).start = (arg_start);\ (msa).match_stack_limit = (mpv)->match_stack_limit;\ @@ -1405,6 +1426,7 @@ onig_set_subexp_call_limit_in_search(unsigned long n) #endif + #ifdef USE_CALLOUT static OnigCalloutFunc DefaultProgressCallout; static OnigCalloutFunc DefaultRetractionCallout; @@ -1452,11 +1474,12 @@ onig_initialize_match_param(OnigMatchParam* mp) mp->retry_limit_in_search = RetryLimitInSearch; #endif + mp->callout_user_data = 0; + #ifdef USE_CALLOUT mp->progress_callout_of_contents = DefaultProgressCallout; mp->retraction_callout_of_contents = DefaultRetractionCallout; mp->match_at_call_counter = 0; - mp->callout_user_data = 0; mp->callout_data = 0; mp->callout_data_alloc_num = 0; #endif @@ -1532,13 +1555,26 @@ onig_get_callout_data_dont_clear_old(regex_t* reg, OnigMatchParam* mp, t = d->slot[slot].type; if (IS_NOT_NULL(type)) *type = t; if (IS_NOT_NULL(val)) *val = d->slot[slot].val; - return (t == ONIG_TYPE_VOID ? 1 : ONIG_NORMAL); + return (t == ONIG_TYPE_VOID ? ONIG_VALUE_IS_NOT_SET : ONIG_NORMAL); +} + +extern int +onig_get_callout_data_by_tag_dont_clear_old(regex_t* reg, + OnigMatchParam* mp, const UChar* tag, const UChar* tag_end, int slot, + OnigType* type, OnigValue* val) +{ + int num; + + num = onig_get_callout_num_by_tag(reg, tag, tag_end); + if (num < 0) return num; + if (num == 0) return ONIGERR_INVALID_CALLOUT_TAG_NAME; + + return onig_get_callout_data_dont_clear_old(reg, mp, num, slot, type, val); } extern int -onig_get_callout_data_by_callout_args_self_dont_clear_old(OnigCalloutArgs* args, - int slot, OnigType* type, - OnigValue* val) +onig_get_callout_data_by_callout_args_self_dont_clear_old( + OnigCalloutArgs* args, int slot, OnigType* type, OnigValue* val) { return onig_get_callout_data_dont_clear_old(args->regex, args->msa->mp, args->num, slot, type, val); @@ -1563,7 +1599,7 @@ onig_get_callout_data(regex_t* reg, OnigMatchParam* mp, t = d->slot[slot].type; if (IS_NOT_NULL(type)) *type = t; if (IS_NOT_NULL(val)) *val = d->slot[slot].val; - return (t == ONIG_TYPE_VOID ? 1 : ONIG_NORMAL); + return (t == ONIG_TYPE_VOID ? ONIG_VALUE_IS_NOT_SET : ONIG_NORMAL); } extern int @@ -2171,65 +2207,90 @@ stack_double(int* is_alloca, char** arg_alloc_base, }\ } while (0) -#ifdef USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT -#define STACK_EMPTY_CHECK_MEM(isnull, sid, s, reg) do {\ - StackType* k;\ - GET_EMPTY_CHECK_START(sid, k);\ - if (k->u.empty_check.pstr != (s)) {\ +#ifdef USE_RIGID_CHECK_CAPTURES_IN_EMPTY_REPEAT +#define STACK_EMPTY_CHECK_MEM(isnull, sid, empty_status_mem, s, reg) do {\ + StackType* klow;\ + GET_EMPTY_CHECK_START(sid, klow);\ + if (klow->u.empty_check.pstr != (s)) {\ + stack_empty_check_mem_not_empty:\ (isnull) = 0;\ }\ else {\ - UChar* endp;\ + StackType *k, *kk;\ + MemStatusType ms = (empty_status_mem);\ (isnull) = 1;\ - while (k < stk) {\ - if (k->type == STK_MEM_START &&\ - MEM_STATUS_LIMIT_AT((reg)->empty_status_mem, k->zid)) {\ - STACK_MEM_START_GET_PREV_END_ADDR(k, reg, endp);\ - if (endp == 0) {\ - (isnull) = 0; break;\ - }\ - else if (STACK_AT(k->u.mem.prev_start.i)->u.mem.pstr != endp) {\ - (isnull) = 0; break;\ - }\ - else if (endp != s) {\ - (isnull) = -1; /* empty, but position changed */ \ + k = stk;\ + while (k > klow) {\ + k--;\ + if (k->type == STK_MEM_END && MEM_STATUS_LIMIT_AT(ms, k->zid)) {\ + kk = klow;\ + while (kk < k) {\ + if (kk->type == STK_MEM_START && kk->zid == k->zid) {\ + if (kk->u.mem.prev_end.i == INVALID_STACK_INDEX || \ + ((STACK_AT(kk->u.mem.prev_end.i)->u.mem.pstr != k->u.mem.pstr || STACK_AT(kk->u.mem.prev_start.i)->u.mem.pstr != STACK_AT(k->u.mem.prev_start.i)->u.mem.pstr) && (STACK_AT(k->u.mem.prev_start.i)->u.mem.pstr != k->u.mem.pstr || STACK_AT(kk->u.mem.prev_start.i)->u.mem.pstr != STACK_AT(kk->u.mem.prev_end.i)->u.mem.pstr))) {\ + goto stack_empty_check_mem_not_empty;\ + }\ + else {\ + ms &= ~((MemStatusType )1 << k->zid);\ + break;\ + }\ + }\ + kk++;\ }\ + if (ms == 0) break;\ }\ - k++;\ }\ }\ } while(0) -#define STACK_EMPTY_CHECK_MEM_REC(isnull,sid,s,reg) do {\ +#define STACK_EMPTY_CHECK_MEM_REC(isnull,sid,empty_status_mem,s,reg) do {\ int level = 0;\ - StackType* k = stk;\ + StackType* klow = stk;\ while (1) {\ - k--;\ - STACK_BASE_CHECK(k, "STACK_EMPTY_CHECK_MEM_REC");\ - if (k->type == STK_EMPTY_CHECK_START) {\ - if (k->zid == (sid)) {\ + klow--;\ + STACK_BASE_CHECK(klow, "STACK_EMPTY_CHECK_MEM_REC");\ + if (klow->type == STK_EMPTY_CHECK_START) {\ + if (klow->zid == (sid)) {\ if (level == 0) {\ - if (k->u.empty_check.pstr != (s)) {\ + if (klow->u.empty_check.pstr != (s)) {\ + stack_empty_check_mem_rec_not_empty:\ (isnull) = 0;\ break;\ }\ else {\ - UChar* endp;\ + StackType *k, *kk;\ + MemStatusType ms;\ (isnull) = 1;\ - while (k < stk) {\ - if (k->type == STK_MEM_START) {\ - if (level == 0 && \ - MEM_STATUS_LIMIT_AT((reg)->empty_status_mem, k->zid) !=0) {\ - STACK_MEM_START_GET_PREV_END_ADDR(k, reg, endp);\ - if (endp == 0) {\ - (isnull) = 0; break;\ - }\ - else if (STACK_AT(k->u.mem.prev_start.i)->u.mem.pstr != endp) { \ - (isnull) = 0; break;\ - }\ - else if (endp != s) {\ - (isnull) = -1; /* empty, but position changed */\ + if ((empty_status_mem) == 0) break;\ + ms = (empty_status_mem);\ + k = stk;\ + while (k > klow) {\ + k--;\ + if (k->type == STK_MEM_END) {\ + if (level == 0 && MEM_STATUS_LIMIT_AT(ms, k->zid)) {\ + kk = klow;\ + kk++;\ + while (kk < k) {\ + if (kk->type == STK_MEM_START && kk->zid == k->zid) {\ + if (kk->u.mem.prev_end.i == INVALID_STACK_INDEX || \ + ((STACK_AT(kk->u.mem.prev_end.i)->u.mem.pstr != k->u.mem.pstr || STACK_AT(kk->u.mem.prev_start.i)->u.mem.pstr != STACK_AT(k->u.mem.prev_start.i)->u.mem.pstr) && (STACK_AT(k->u.mem.prev_start.i)->u.mem.pstr != k->u.mem.pstr || STACK_AT(kk->u.mem.prev_start.i)->u.mem.pstr != STACK_AT(kk->u.mem.prev_end.i)->u.mem.pstr))) {\ + goto stack_empty_check_mem_rec_not_empty;\ + }\ + else {\ + ms &= ~((MemStatusType )1 << k->zid);\ + break;\ + }\ + }\ + else if (kk->type == STK_EMPTY_CHECK_START) {\ + if (kk->zid == (sid)) level++;\ + }\ + else if (kk->type == STK_EMPTY_CHECK_END) {\ + if (kk->zid == (sid)) level--;\ + }\ + kk++;\ }\ + level = 0;\ + if (ms == 0) break;\ }\ }\ else if (k->type == STK_EMPTY_CHECK_START) {\ @@ -2238,7 +2299,6 @@ stack_double(int* is_alloca, char** arg_alloc_base, else if (k->type == STK_EMPTY_CHECK_END) {\ if (k->zid == (sid)) level--;\ }\ - k++;\ }\ break;\ }\ @@ -2248,8 +2308,8 @@ stack_double(int* is_alloca, char** arg_alloc_base, }\ }\ }\ - else if (k->type == STK_EMPTY_CHECK_END) {\ - if (k->zid == (sid)) level++;\ + else if (klow->type == STK_EMPTY_CHECK_END) {\ + if (klow->zid == (sid)) level++;\ }\ }\ } while(0) @@ -2274,7 +2334,7 @@ stack_double(int* is_alloca, char** arg_alloc_base, }\ }\ } while(0) -#endif /* USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT */ +#endif /* USE_RIGID_CHECK_CAPTURES_IN_EMPTY_REPEAT */ #define STACK_GET_REPEAT_COUNT_SEARCH(sid, c) do {\ StackType* k = stk;\ @@ -2888,6 +2948,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, StackType *stkp; /* used as any purpose. */ StkPtrType *mem_start_stk, *mem_end_stk; UChar* keep; + OnigRegion* region; #ifdef USE_REPEAT_AND_EMPTY_CHECK_LOCAL_VAR StackIndex *repeat_stk; @@ -2905,8 +2966,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, unsigned long subexp_call_counters[MAX_SUBEXP_CALL_COUNTERS]; #endif + OnigOptionType options; Operation* p = reg->ops; - OnigOptionType option = reg->options; OnigEncoding encode = reg->enc; OnigCaseFoldType case_fold_flag = reg->case_fold_flag; @@ -2936,6 +2997,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, } #endif + options = msa->options; + #ifdef USE_CALLOUT msa->mp->match_at_call_counter++; #endif @@ -2976,102 +3039,113 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, BYTECODE_INTERPRETER_START { CASE_OP(END) n = (int )(s - sstart); + if (n == 0 && OPTON_FIND_NOT_EMPTY(options)) { + best_len = ONIG_MISMATCH; + goto fail; /* for retry */ + } + if (n > best_len) { - OnigRegion* region; #ifdef USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE - if (OPTON_FIND_LONGEST(option)) { + if (OPTON_FIND_LONGEST(options)) { if (n > msa->best_len) { msa->best_len = n; msa->best_s = (UChar* )sstart; - goto set_region; } - else - goto end_best_len; + else { + if (s >= in_right_range && msa->best_s == sstart) { + best_len = msa->best_len; /* end of find */ + } + else { + SOP_OUT; + goto fail; /* for retry */ + } + } } -#endif + else { + best_len = n; + } +#else best_len = n; +#endif + } - set_region: - region = msa->region; - if (region) { - if (keep > s) keep = s; + /* set region */ + region = msa->region; + if (region) { + if (keep > s) keep = s; #ifdef USE_POSIX_API - if (OPTON_POSIX_REGION(msa->options)) { - posix_regmatch_t* rmt = (posix_regmatch_t* )region; - - rmt[0].rm_so = (regoff_t )(keep - str); - rmt[0].rm_eo = (regoff_t )(s - str); - for (i = 1; i <= num_mem; i++) { - if (mem_end_stk[i].i != INVALID_STACK_INDEX) { - rmt[i].rm_so = (regoff_t )(STACK_MEM_START(reg, i) - str); - rmt[i].rm_eo = (regoff_t )(STACK_MEM_END(reg, i) - str); - } - else { - rmt[i].rm_so = rmt[i].rm_eo = ONIG_REGION_NOTPOS; - } + if (OPTON_POSIX_REGION(options)) { + posix_regmatch_t* rmt = (posix_regmatch_t* )region; + + rmt[0].rm_so = (regoff_t )(keep - str); + rmt[0].rm_eo = (regoff_t )(s - str); + for (i = 1; i <= num_mem; i++) { + if (mem_end_stk[i].i != INVALID_STACK_INDEX) { + rmt[i].rm_so = (regoff_t )(STACK_MEM_START(reg, i) - str); + rmt[i].rm_eo = (regoff_t )(STACK_MEM_END(reg, i) - str); + } + else { + rmt[i].rm_so = rmt[i].rm_eo = ONIG_REGION_NOTPOS; } } - else { + } + else { #endif /* USE_POSIX_API */ - region->beg[0] = (int )(keep - str); - region->end[0] = (int )(s - str); - for (i = 1; i <= num_mem; i++) { - if (mem_end_stk[i].i != INVALID_STACK_INDEX) { - region->beg[i] = (int )(STACK_MEM_START(reg, i) - str); - region->end[i] = (int )(STACK_MEM_END(reg, i) - str); - } - else { - region->beg[i] = region->end[i] = ONIG_REGION_NOTPOS; - } + region->beg[0] = (int )(keep - str); + region->end[0] = (int )(s - str); + for (i = 1; i <= num_mem; i++) { + if (mem_end_stk[i].i != INVALID_STACK_INDEX) { + region->beg[i] = (int )(STACK_MEM_START(reg, i) - str); + region->end[i] = (int )(STACK_MEM_END(reg, i) - str); + } + else { + region->beg[i] = region->end[i] = ONIG_REGION_NOTPOS; } + } #ifdef USE_CAPTURE_HISTORY - if (reg->capture_history != 0) { - int r; - OnigCaptureTreeNode* node; + if (reg->capture_history != 0) { + OnigCaptureTreeNode* node; - if (IS_NULL(region->history_root)) { - region->history_root = node = history_node_new(); - CHECK_NULL_RETURN_MEMERR(node); - } - else { - node = region->history_root; - history_tree_clear(node); - } + if (IS_NULL(region->history_root)) { + region->history_root = node = history_node_new(); + CHECK_NULL_RETURN_MEMERR(node); + } + else { + node = region->history_root; + history_tree_clear(node); + } - node->group = 0; - node->beg = (int )(keep - str); - node->end = (int )(s - str); + node->group = 0; + node->beg = (int )(keep - str); + node->end = (int )(s - str); - stkp = stk_base; - r = make_capture_history_tree(region->history_root, &stkp, - stk, (UChar* )str, reg); - if (r < 0) MATCH_AT_ERROR_RETURN(r); - } + stkp = stk_base; + i = make_capture_history_tree(region->history_root, &stkp, + stk, (UChar* )str, reg); + if (i < 0) MATCH_AT_ERROR_RETURN(i); + } #endif /* USE_CAPTURE_HISTORY */ #ifdef USE_POSIX_API - } /* else OPTON_POSIX_REGION() */ + } /* else OPTON_POSIX_REGION() */ #endif - } /* if (region) */ - } /* n > best_len */ + } /* if (region) */ -#ifdef USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE - end_best_len: -#endif SOP_OUT; - if (OPTON_FIND_CONDITION(option)) { - if (OPTON_FIND_NOT_EMPTY(option) && s == sstart) { + if (OPTON_CALLBACK_EACH_MATCH(options) && + IS_NOT_NULL(CallbackEachMatch)) { + i = CallbackEachMatch(str, end, sstart, region, + msa->mp->callout_user_data); + if (i < 0) MATCH_AT_ERROR_RETURN(i); + +#ifdef USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE + if (! OPTON_FIND_LONGEST(options)) +#endif best_len = ONIG_MISMATCH; - goto fail; /* for retry */ - } - if (OPTON_FIND_LONGEST(option)) { - if (s >= in_right_range && msa->best_s == sstart) - best_len = msa->best_len; - else - goto fail; /* for retry */ - } + + goto fail; } /* default behavior: return first-matching result. */ @@ -3564,23 +3638,23 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, CASE_OP(BEGIN_BUF) if (! ON_STR_BEGIN(s)) goto fail; - if (OPTON_NOTBOL(msa->options)) goto fail; - if (OPTON_NOT_BEGIN_STRING(msa->options)) goto fail; + if (OPTON_NOTBOL(options)) goto fail; + if (OPTON_NOT_BEGIN_STRING(options)) goto fail; INC_OP; JUMP_OUT; CASE_OP(END_BUF) if (! ON_STR_END(s)) goto fail; - if (OPTON_NOTEOL(msa->options)) goto fail; - if (OPTON_NOT_END_STRING(msa->options)) goto fail; + if (OPTON_NOTEOL(options)) goto fail; + if (OPTON_NOT_END_STRING(options)) goto fail; INC_OP; JUMP_OUT; CASE_OP(BEGIN_LINE) if (ON_STR_BEGIN(s)) { - if (OPTON_NOTBOL(msa->options)) goto fail; + if (OPTON_NOTBOL(options)) goto fail; INC_OP; JUMP_OUT; } @@ -3599,7 +3673,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, UChar* sprev = (UChar* )onigenc_get_prev_char_head(encode, str, s); if (IS_EMPTY_STR || !ONIGENC_IS_MBC_NEWLINE(encode, sprev, end)) { #endif - if (OPTON_NOTEOL(msa->options)) goto fail; + if (OPTON_NOTEOL(options)) goto fail; INC_OP; JUMP_OUT; #ifndef USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE @@ -3624,8 +3698,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, UChar* sprev = (UChar* )onigenc_get_prev_char_head(encode, str, s); if (IS_EMPTY_STR || !ONIGENC_IS_MBC_NEWLINE(encode, sprev, end)) { #endif - if (OPTON_NOTEOL(msa->options)) goto fail; - if (OPTON_NOT_END_STRING(msa->options)) goto fail; + if (OPTON_NOTEOL(options)) goto fail; + if (OPTON_NOT_END_STRING(options)) goto fail; INC_OP; JUMP_OUT; #ifndef USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE @@ -3634,8 +3708,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, } else if (ONIGENC_IS_MBC_NEWLINE(encode, s, end) && ON_STR_END(s + enclen(encode, s))) { - if (OPTON_NOTEOL(msa->options)) goto fail; - if (OPTON_NOT_END_STRING(msa->options)) goto fail; + if (OPTON_NOTEOL(options)) goto fail; + if (OPTON_NOT_END_STRING(options)) goto fail; INC_OP; JUMP_OUT; } @@ -3644,8 +3718,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, UChar* ss = s + enclen(encode, s); ss += enclen(encode, ss); if (ON_STR_END(ss)) { - if (OPTON_NOTEOL(msa->options)) goto fail; - if (OPTON_NOT_END_STRING(msa->options)) goto fail; + if (OPTON_NOTEOL(options)) goto fail; + if (OPTON_NOT_END_STRING(options)) goto fail; INC_OP; JUMP_OUT; } @@ -3657,7 +3731,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, switch (p->check_position.type) { case CHECK_POSITION_SEARCH_START: if (s != msa->start) goto fail; - if (OPTON_NOT_BEGIN_POSITION(msa->options)) goto fail; + if (OPTON_NOT_BEGIN_POSITION(options)) goto fail; break; case CHECK_POSITION_CURRENT_RIGHT_RANGE: if (s != right_range) goto fail; @@ -3924,13 +3998,13 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, } JUMP_OUT; -#ifdef USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT +#ifdef USE_RIGID_CHECK_CAPTURES_IN_EMPTY_REPEAT CASE_OP(EMPTY_CHECK_END_MEMST) { int is_empty; mem = p->empty_check_end.mem; /* mem: null check id */ - STACK_EMPTY_CHECK_MEM(is_empty, mem, s, reg); + STACK_EMPTY_CHECK_MEM(is_empty, mem, p->empty_check_end.empty_status_mem, s, reg); INC_OP; if (is_empty) { #ifdef ONIG_DEBUG_MATCH @@ -3949,8 +4023,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, int is_empty; mem = p->empty_check_end.mem; /* mem: null check id */ -#ifdef USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT - STACK_EMPTY_CHECK_MEM_REC(is_empty, mem, s, reg); +#ifdef USE_RIGID_CHECK_CAPTURES_IN_EMPTY_REPEAT + STACK_EMPTY_CHECK_MEM_REC(is_empty, mem, p->empty_check_end.empty_status_mem, s, reg); #else STACK_EMPTY_CHECK_REC(is_empty, mem, s); #endif @@ -4109,6 +4183,9 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, } } +#ifdef ONIG_DEBUG_CALL + fprintf(DBGFP, "CALL: id:%d, at:%ld, level:%lu\n", p->call.called_mem, s - str, subexp_call_nest_counter); +#endif addr = p->call.addr; INC_OP; STACK_PUSH_CALL_FRAME(p); p = reg->ops + addr; @@ -4425,7 +4502,7 @@ regset_search_body_position_lead(OnigRegSet* set, sr[i].state = SRS_DEAD; if (reg->optimize != OPTIMIZE_NONE) { if (reg->dist_max != INFINITE_LEN) { - if (end - range > reg->dist_max) + if (DIST_CAST(end - range) > reg->dist_max) sch_range = (UChar* )range + reg->dist_max; else sch_range = (UChar* )end; @@ -4609,7 +4686,7 @@ onig_regset_search_with_param(OnigRegSet* set, if (set->n == 0) return ONIG_MISMATCH; - if (OPTON_POSIX_REGION(option)) + if (OPTON_POSIX_REGION(option) || OPTON_CALLBACK_EACH_MATCH(option)) return ONIGERR_INVALID_ARGUMENT; r = 0; @@ -4884,7 +4961,7 @@ sunday_quick_search_step_forward(regex_t* reg, const UChar* text_range) { const UChar *s, *se, *t, *p, *end; - const UChar *tail; + const UChar *tail, *next; int skip, tlen1; int map_offset; OnigEncoding enc; @@ -4921,9 +4998,11 @@ sunday_quick_search_step_forward(regex_t* reg, s += enclen(enc, s); } while ((s - t) < skip && s < end); #else - s += skip; - if (s < end) - s = onigenc_get_right_adjust_char_head(enc, text, s); + next = s + skip; + if (next < end) + s = onigenc_get_right_adjust_char_head(enc, s, next); + else + break; #endif } @@ -5086,7 +5165,7 @@ forward_search(regex_t* reg, const UChar* str, const UChar* end, UChar* start, p = start; if (reg->dist_min != 0) { - if (end - p <= reg->dist_min) + if (DIST_CAST(end - p) <= reg->dist_min) return 0; /* fail */ if (ONIGENC_IS_SINGLEBYTE(reg->enc)) { @@ -5119,7 +5198,7 @@ forward_search(regex_t* reg, const UChar* str, const UChar* end, UChar* start, } if (p && p < range) { - if (p - start < reg->dist_min) { + if (DIST_CAST(p - start) < reg->dist_min) { retry_gate: pprev = p; p += enclen(reg->enc, p); @@ -5164,7 +5243,7 @@ forward_search(regex_t* reg, const UChar* str, const UChar* end, UChar* start, } else { if (reg->dist_max != INFINITE_LEN) { - if (p - str < reg->dist_max) { + if (DIST_CAST(p - str) < reg->dist_max) { *low = (UChar* )str; } else { @@ -5175,7 +5254,7 @@ forward_search(regex_t* reg, const UChar* str, const UChar* end, UChar* start, } } /* no needs to adjust *high, *high is used as range check only */ - if (p - str < reg->dist_min) + if (DIST_CAST(p - str) < reg->dist_min) *high = (UChar* )str; else *high = p - reg->dist_min; @@ -5260,13 +5339,13 @@ backward_search(regex_t* reg, const UChar* str, const UChar* end, UChar* s, } if (reg->dist_max != INFINITE_LEN) { - if (p - str < reg->dist_max) + if (DIST_CAST(p - str) < reg->dist_max) *low = (UChar* )str; else *low = p - reg->dist_max; if (reg->dist_min != 0) { - if (p - str < reg->dist_min) + if (DIST_CAST(p - str) < reg->dist_min) *high = (UChar* )str; else *high = p - reg->dist_min; @@ -5410,13 +5489,13 @@ search_in_range(regex_t* reg, const UChar* str, const UChar* end, if (range > start) { if (reg->anc_dist_max != INFINITE_LEN && - min_semi_end - start > reg->anc_dist_max) { + DIST_CAST(min_semi_end - start) > reg->anc_dist_max) { start = min_semi_end - reg->anc_dist_max; if (start < end) start = onigenc_get_right_adjust_char_head(reg->enc, str, start); } - if (max_semi_end - (range - 1) < reg->anc_dist_min) { - if (max_semi_end - str + 1 < reg->anc_dist_min) + if (DIST_CAST(max_semi_end - (range - 1)) < reg->anc_dist_min) { + if (DIST_CAST(max_semi_end - str + 1) < reg->anc_dist_min) goto mismatch_no_msa; else range = max_semi_end - reg->anc_dist_min + 1; @@ -5428,11 +5507,11 @@ search_in_range(regex_t* reg, const UChar* str, const UChar* end, } else { if (reg->anc_dist_max != INFINITE_LEN && - min_semi_end - range > reg->anc_dist_max) { + DIST_CAST(min_semi_end - range) > reg->anc_dist_max) { range = min_semi_end - reg->anc_dist_max; } - if (max_semi_end - start < reg->anc_dist_min) { - if (max_semi_end - str < reg->anc_dist_min) + if (DIST_CAST(max_semi_end - start) < reg->anc_dist_min) { + if (DIST_CAST(max_semi_end - str) < reg->anc_dist_min) goto mismatch_no_msa; else { start = max_semi_end - reg->anc_dist_min; @@ -5503,7 +5582,7 @@ search_in_range(regex_t* reg, const UChar* str, const UChar* end, if (reg->dist_max == INFINITE_LEN) sch_range = (UChar* )end; else { - if ((end - range) < reg->dist_max) + if (DIST_CAST(end - range) < reg->dist_max) sch_range = (UChar* )end; else { sch_range = (UChar* )range + reg->dist_max; @@ -5579,14 +5658,14 @@ search_in_range(regex_t* reg, const UChar* str, const UChar* end, else adjrange = (UChar* )end; - if (end - range > reg->dist_min) + if (DIST_CAST(end - range) > reg->dist_min) min_range = range + reg->dist_min; else min_range = end; if (reg->dist_max != INFINITE_LEN) { do { - if (end - s > reg->dist_max) + if (DIST_CAST(end - s) > reg->dist_max) sch_start = s + reg->dist_max; else { sch_start = onigenc_get_prev_char_head(reg->enc, str, end); @@ -5887,8 +5966,10 @@ onig_regset_add(OnigRegSet* set, regex_t* reg) { OnigRegion* region; +#ifdef USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE if (OPTON_FIND_LONGEST(reg->options)) return ONIGERR_INVALID_ARGUMENT; +#endif if (set->n != 0 && reg->enc != set->enc) return ONIGERR_INVALID_ARGUMENT; @@ -5933,8 +6014,10 @@ onig_regset_replace(OnigRegSet* set, int at, regex_t* reg) set->n--; } else { +#ifdef USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE if (OPTON_FIND_LONGEST(reg->options)) return ONIGERR_INVALID_ARGUMENT; +#endif if (set->n > 1 && reg->enc != set->enc) return ONIGERR_INVALID_ARGUMENT; @@ -6573,7 +6656,7 @@ onig_builtin_monitor(OnigCalloutArgs* args, void* user_data) tag_len = tag_end - tag_start; if (tag_len >= sizeof(buf)) tag_len = sizeof(buf) - 1; - for (i = 0; i < tag_len; i++) buf[i] = tag_start[i]; + for (i = 0; i < (int )tag_len; i++) buf[i] = tag_start[i]; buf[tag_len] = '\0'; } diff --git a/src/regint.h b/src/regint.h index 74a5c61..9856a96 100644 --- a/src/regint.h +++ b/src/regint.h @@ -4,7 +4,7 @@ regint.h - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2020 K.Kosako + * Copyright (c) 2002-2021 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -35,6 +35,7 @@ /* #define ONIG_DEBUG_SEARCH */ /* #define ONIG_DEBUG_MATCH */ /* #define ONIG_DEBUG_MATCH_COUNTER */ +/* #define ONIG_DEBUG_CALL */ /* #define ONIG_DONT_OPTIMIZE */ /* for byte-code statistical data. */ @@ -42,7 +43,8 @@ #if defined(ONIG_DEBUG_PARSE) || defined(ONIG_DEBUG_MATCH) || \ defined(ONIG_DEBUG_SEARCH) || defined(ONIG_DEBUG_COMPILE) || \ - defined(ONIG_DEBUG_MATCH_COUNTER) || defined(ONIG_DEBUG_STATISTICS) + defined(ONIG_DEBUG_MATCH_COUNTER) || defined(ONIG_DEBUG_CALL) || \ + defined(ONIG_DEBUG_STATISTICS) #ifndef ONIG_DEBUG #define ONIG_DEBUG #define DBGFP stderr @@ -61,7 +63,7 @@ #define USE_CALL #define USE_CALLOUT #define USE_BACKREF_WITH_LEVEL /* \k, \k */ -#define USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT /* /(?:()|())*\2/ */ +#define USE_RIGID_CHECK_CAPTURES_IN_EMPTY_REPEAT /* /(?:()|())*\2/ */ #define USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE /* /\n$/ =~ "\n" */ #define USE_WARNING_REDUNDANT_NESTED_REPEAT_OPERATOR #define USE_RETRY_LIMIT @@ -388,10 +390,10 @@ typedef unsigned int MemStatusType; (IS_CODE_DIGIT_ASCII(enc,code) ? DIGITVAL(code) \ : (ONIGENC_IS_CODE_UPPER(enc,code) ? (code) - 'A' + 10 : (code) - 'a' + 10)) +#define OPTON_CALLBACK_EACH_MATCH(option) \ + ((option) & ONIG_OPTION_CALLBACK_EACH_MATCH) #define OPTON_FIND_LONGEST(option) ((option) & ONIG_OPTION_FIND_LONGEST) #define OPTON_FIND_NOT_EMPTY(option) ((option) & ONIG_OPTION_FIND_NOT_EMPTY) -#define OPTON_FIND_CONDITION(option) ((option) & \ - (ONIG_OPTION_FIND_LONGEST | ONIG_OPTION_FIND_NOT_EMPTY)) #define OPTON_NEGATE_SINGLELINE(option) ((option) & \ ONIG_OPTION_NEGATE_SINGLELINE) #define OPTON_DONT_CAPTURE_GROUP(option) ((option) & \ @@ -406,8 +408,6 @@ typedef unsigned int MemStatusType; #define OPTON_NOT_END_STRING(option) ((option) & ONIG_OPTION_NOT_END_STRING) #define OPTON_NOT_BEGIN_POSITION(option) ((option) & ONIG_OPTION_NOT_BEGIN_POSITION) -#define DISABLE_CASE_FOLD_MULTI_CHAR(case_fold_flag) \ - ((case_fold_flag) & ~INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) #define INFINITE_REPEAT -1 #define IS_INFINITE_REPEAT(n) ((n) == INFINITE_REPEAT) @@ -437,81 +437,6 @@ typedef Bits* BitSetRef; #define BITSET_CLEAR_BIT(bs, pos) BS_ROOM(bs,pos) &= ~(BS_BIT(pos)) #define BITSET_INVERT_BIT(bs, pos) BS_ROOM(bs,pos) ^= BS_BIT(pos) -/* bytes buffer */ -typedef struct _BBuf { - UChar* p; - unsigned int used; - unsigned int alloc; -} BBuf; - -#define BB_INIT(buf,size) bbuf_init((BBuf* )(buf), (size)) - -#define BB_EXPAND(buf,low) do{\ - do { (buf)->alloc *= 2; } while ((buf)->alloc < (unsigned int )low);\ - (buf)->p = (UChar* )xrealloc((buf)->p, (buf)->alloc);\ - if (IS_NULL((buf)->p)) return(ONIGERR_MEMORY);\ -} while (0) - -#define BB_ENSURE_SIZE(buf,size) do{\ - unsigned int new_alloc = (buf)->alloc;\ - while (new_alloc < (unsigned int )(size)) { new_alloc *= 2; }\ - if ((buf)->alloc != new_alloc) {\ - (buf)->p = (UChar* )xrealloc((buf)->p, new_alloc);\ - if (IS_NULL((buf)->p)) return(ONIGERR_MEMORY);\ - (buf)->alloc = new_alloc;\ - }\ -} while (0) - -#define BB_WRITE(buf,pos,bytes,n) do{\ - int used = (pos) + (n);\ - if ((buf)->alloc < (unsigned int )used) BB_EXPAND((buf),used);\ - xmemcpy((buf)->p + (pos), (bytes), (n));\ - if ((buf)->used < (unsigned int )used) (buf)->used = used;\ -} while (0) - -#define BB_WRITE1(buf,pos,byte) do{\ - int used = (pos) + 1;\ - if ((buf)->alloc < (unsigned int )used) BB_EXPAND((buf),used);\ - (buf)->p[(pos)] = (byte);\ - if ((buf)->used < (unsigned int )used) (buf)->used = used;\ -} while (0) - -#define BB_ADD(buf,bytes,n) BB_WRITE((buf),(buf)->used,(bytes),(n)) -#define BB_ADD1(buf,byte) BB_WRITE1((buf),(buf)->used,(byte)) -#define BB_GET_ADD_ADDRESS(buf) ((buf)->p + (buf)->used) -#define BB_GET_OFFSET_POS(buf) ((buf)->used) - -/* from < to */ -#define BB_MOVE_RIGHT(buf,from,to,n) do {\ - if ((unsigned int )((to)+(n)) > (buf)->alloc) BB_EXPAND((buf),(to) + (n));\ - xmemmove((buf)->p + (to), (buf)->p + (from), (n));\ - if ((unsigned int )((to)+(n)) > (buf)->used) (buf)->used = (to) + (n);\ -} while (0) - -/* from > to */ -#define BB_MOVE_LEFT(buf,from,to,n) do {\ - xmemmove((buf)->p + (to), (buf)->p + (from), (n));\ -} while (0) - -/* from > to */ -#define BB_MOVE_LEFT_REDUCE(buf,from,to) do {\ - xmemmove((buf)->p + (to), (buf)->p + (from), (buf)->used - (from));\ - (buf)->used -= (from - to);\ -} while (0) - -#define BB_INSERT(buf,pos,bytes,n) do {\ - if (pos >= (buf)->used) {\ - BB_WRITE(buf,pos,bytes,n);\ - }\ - else {\ - BB_MOVE_RIGHT((buf),(pos),(pos) + (n),((buf)->used - (pos)));\ - xmemcpy((buf)->p + (pos), (bytes), (n));\ - }\ -} while (0) - -#define BB_GET_BYTE(buf, pos) (buf)->p[(pos)] - - /* has body */ #define ANCR_PREC_READ (1<<0) #define ANCR_PREC_READ_NOT (1<<1) @@ -884,6 +809,7 @@ typedef struct { } empty_check_start; struct { MemNumType mem; + MemStatusType empty_status_mem; } empty_check_end; /* EMPTY_CHECK_END, EMPTY_CHECK_END_MEMST, EMPTY_CHECK_END_MEMST_PUSH */ struct { RelAddrType addr; @@ -922,7 +848,7 @@ typedef struct { } update_var; struct { AbsAddrType addr; -#ifdef ONIG_DEBUG_MATCH_COUNTER +#if defined(ONIG_DEBUG_MATCH_COUNTER) || defined(ONIG_DEBUG_CALL) MemNumType called_mem; #endif } call; @@ -977,7 +903,6 @@ struct re_pattern_buffer { MemStatusType capture_history; /* (?@...) flag (1-31) */ MemStatusType push_mem_start; /* need backtrack flag */ MemStatusType push_mem_end; /* need backtrack flag */ - MemStatusType empty_status_mem; int stack_pop_level; int repeat_range_alloc; RepeatRange* repeat_range; diff --git a/src/regparse.c b/src/regparse.c index dd2824b..938a569 100644 --- a/src/regparse.c +++ b/src/regparse.c @@ -2,7 +2,7 @@ regparse.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2020 K.Kosako + * Copyright (c) 2002-2021 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -159,6 +159,75 @@ OnigSyntaxType OnigSyntaxRuby = { OnigSyntaxType* OnigDefaultSyntax = ONIG_SYNTAX_ONIGURUMA; + +#define BB_INIT(buf,size) bbuf_init((BBuf* )(buf), (size)) + +#define BB_EXPAND(buf,low) do{\ + do { (buf)->alloc *= 2; } while ((buf)->alloc < (unsigned int )low);\ + (buf)->p = (UChar* )xrealloc((buf)->p, (buf)->alloc);\ + if (IS_NULL((buf)->p)) return(ONIGERR_MEMORY);\ +} while (0) + +#define BB_ENSURE_SIZE(buf,size) do{\ + unsigned int new_alloc = (buf)->alloc;\ + while (new_alloc < (unsigned int )(size)) { new_alloc *= 2; }\ + if ((buf)->alloc != new_alloc) {\ + (buf)->p = (UChar* )xrealloc((buf)->p, new_alloc);\ + if (IS_NULL((buf)->p)) return(ONIGERR_MEMORY);\ + (buf)->alloc = new_alloc;\ + }\ +} while (0) + +#define BB_WRITE(buf,pos,bytes,n) do{\ + int used = (pos) + (n);\ + if ((buf)->alloc < (unsigned int )used) BB_EXPAND((buf),used);\ + xmemcpy((buf)->p + (pos), (bytes), (n));\ + if ((buf)->used < (unsigned int )used) (buf)->used = used;\ +} while (0) + +#define BB_WRITE1(buf,pos,byte) do{\ + int used = (pos) + 1;\ + if ((buf)->alloc < (unsigned int )used) BB_EXPAND((buf),used);\ + (buf)->p[(pos)] = (byte);\ + if ((buf)->used < (unsigned int )used) (buf)->used = used;\ +} while (0) + +#define BB_ADD(buf,bytes,n) BB_WRITE((buf),(buf)->used,(bytes),(n)) +#define BB_ADD1(buf,byte) BB_WRITE1((buf),(buf)->used,(byte)) +#define BB_GET_ADD_ADDRESS(buf) ((buf)->p + (buf)->used) +#define BB_GET_OFFSET_POS(buf) ((buf)->used) + +/* from < to */ +#define BB_MOVE_RIGHT(buf,from,to,n) do {\ + if ((unsigned int )((to)+(n)) > (buf)->alloc) BB_EXPAND((buf),(to) + (n));\ + xmemmove((buf)->p + (to), (buf)->p + (from), (n));\ + if ((unsigned int )((to)+(n)) > (buf)->used) (buf)->used = (to) + (n);\ +} while (0) + +/* from > to */ +#define BB_MOVE_LEFT(buf,from,to,n) do {\ + xmemmove((buf)->p + (to), (buf)->p + (from), (n));\ +} while (0) + +/* from > to */ +#define BB_MOVE_LEFT_REDUCE(buf,from,to) do {\ + xmemmove((buf)->p + (to), (buf)->p + (from), (buf)->used - (from));\ + (buf)->used -= (from - to);\ +} while (0) + +#define BB_INSERT(buf,pos,bytes,n) do {\ + if (pos >= (buf)->used) {\ + BB_WRITE(buf,pos,bytes,n);\ + }\ + else {\ + BB_MOVE_RIGHT((buf),(pos),(pos) + (n),((buf)->used - (pos)));\ + xmemcpy((buf)->p + (pos), (bytes), (n));\ + }\ +} while (0) + +#define BB_GET_BYTE(buf, pos) (buf)->p[(pos)] + + typedef enum { CS_VALUE, CS_RANGE, @@ -300,7 +369,7 @@ bbuf_clone(BBuf** rto, BBuf* from) } static int -backref_rel_to_abs(int rel_no, ScanEnv* env) +backref_rel_to_abs(int rel_no, ParseEnv* env) { if (rel_no > 0) { if (rel_no > ONIG_INT_MAX - env->num_mem) @@ -981,7 +1050,7 @@ onig_number_of_names(regex_t* reg) #endif /* else USE_ST_LIBRARY */ static int -name_add(regex_t* reg, UChar* name, UChar* name_end, int backref, ScanEnv* env) +name_add(regex_t* reg, UChar* name, UChar* name_end, int backref, ParseEnv* env) { int r; int alloc; @@ -1115,7 +1184,7 @@ onig_name_to_group_numbers(regex_t* reg, const UChar* name, } static int -name_to_group_numbers(ScanEnv* env, const UChar* name, const UChar* name_end, +name_to_group_numbers(ParseEnv* env, const UChar* name, const UChar* name_end, int** nums) { regex_t* reg; @@ -1920,7 +1989,7 @@ callout_tag_table_new(CalloutTagTable** rt) } static int -callout_tag_entry_raw(ScanEnv* env, CalloutTagTable* t, UChar* name, +callout_tag_entry_raw(ParseEnv* env, CalloutTagTable* t, UChar* name, UChar* name_end, CalloutTagVal entry_val) { int r; @@ -1963,7 +2032,7 @@ ext_ensure_tag_table(regex_t* reg) } static int -callout_tag_entry(ScanEnv* env, regex_t* reg, UChar* name, UChar* name_end, +callout_tag_entry(ParseEnv* env, regex_t* reg, UChar* name, UChar* name_end, CalloutTagVal entry_val) { int r; @@ -1988,10 +2057,10 @@ callout_tag_entry(ScanEnv* env, regex_t* reg, UChar* name, UChar* name_end, #endif /* USE_CALLOUT */ -#define INIT_SCANENV_MEMENV_ALLOC_SIZE 16 +#define INIT_PARSEENV_MEMENV_ALLOC_SIZE 16 static void -scan_env_clear(ScanEnv* env) +scan_env_clear(ParseEnv* env) { MEM_STATUS_CLEAR(env->cap_history); MEM_STATUS_CLEAR(env->backtrack_mem); @@ -2024,7 +2093,7 @@ scan_env_clear(ScanEnv* env) } static int -scan_env_add_mem_entry(ScanEnv* env) +scan_env_add_mem_entry(ParseEnv* env) { int i, need, alloc; MemEnv* p; @@ -2033,10 +2102,10 @@ scan_env_add_mem_entry(ScanEnv* env) if (need > MaxCaptureNum && MaxCaptureNum != 0) return ONIGERR_TOO_MANY_CAPTURES; - if (need >= SCANENV_MEMENV_SIZE) { + if (need >= PARSEENV_MEMENV_SIZE) { if (env->mem_alloc <= need) { if (IS_NULL(env->mem_env_dynamic)) { - alloc = INIT_SCANENV_MEMENV_ALLOC_SIZE; + alloc = INIT_PARSEENV_MEMENV_ALLOC_SIZE; p = (MemEnv* )xmalloc(sizeof(MemEnv) * alloc); CHECK_NULL_RETURN_MEMERR(p); xmemcpy(p, env->mem_env_static, sizeof(env->mem_env_static)); @@ -2062,10 +2131,10 @@ scan_env_add_mem_entry(ScanEnv* env) } static int -scan_env_set_mem_node(ScanEnv* env, int num, Node* node) +scan_env_set_mem_node(ParseEnv* env, int num, Node* node) { if (env->num_mem >= num) - SCANENV_MEMENV(env)[num].mem_node = node; + PARSEENV_MEMENV(env)[num].mem_node = node; else return ONIGERR_PARSER_BUG; return 0; @@ -2285,7 +2354,7 @@ node_new_anychar(OnigOptionType options) } static int -node_new_no_newline(Node** node, ScanEnv* env) +node_new_no_newline(Node** node, ParseEnv* env) { Node* n; @@ -2425,7 +2494,7 @@ node_new_backref(int back_num, int* backrefs, int by_name, #ifdef USE_BACKREF_WITH_LEVEL int exist_level, int nest_level, #endif - ScanEnv* env) + ParseEnv* env) { int i; Node* node; @@ -2451,7 +2520,7 @@ node_new_backref(int back_num, int* backrefs, int by_name, for (i = 0; i < back_num; i++) { if (backrefs[i] <= env->num_mem && - IS_NULL(SCANENV_MEMENV(env)[backrefs[i]].mem_node)) { + IS_NULL(PARSEENV_MEMENV(env)[backrefs[i]].mem_node)) { NODE_STATUS_ADD(node, RECURSION); /* /...(\1).../ */ break; } @@ -2481,7 +2550,7 @@ node_new_backref_checker(int back_num, int* backrefs, int by_name, #ifdef USE_BACKREF_WITH_LEVEL int exist_level, int nest_level, #endif - ScanEnv* env) + ParseEnv* env) { Node* node; @@ -2527,6 +2596,7 @@ node_new_quantifier(int lower, int upper, int by_number) QUANT_(node)->head_exact = NULL_NODE; QUANT_(node)->next_head_exact = NULL_NODE; QUANT_(node)->include_referred = 0; + QUANT_(node)->empty_status_mem = 0; if (by_number != 0) NODE_STATUS_ADD(node, BY_NUMBER); @@ -2640,7 +2710,7 @@ node_set_fail(Node* node) } static int -node_new_fail(Node** node, ScanEnv* env) +node_new_fail(Node** node, ParseEnv* env) { *node = node_new(); CHECK_NULL_RETURN_MEMERR(*node); @@ -2656,7 +2726,7 @@ onig_node_reset_fail(Node* node) } static int -node_new_save_gimmick(Node** node, enum SaveType save_type, ScanEnv* env) +node_new_save_gimmick(Node** node, enum SaveType save_type, ParseEnv* env) { int id; @@ -2675,7 +2745,7 @@ node_new_save_gimmick(Node** node, enum SaveType save_type, ScanEnv* env) static int node_new_update_var_gimmick(Node** node, enum UpdateVarType update_var_type, - int id, ScanEnv* env) + int id, ParseEnv* env) { *node = node_new(); CHECK_NULL_RETURN_MEMERR(*node); @@ -2689,7 +2759,7 @@ node_new_update_var_gimmick(Node** node, enum UpdateVarType update_var_type, } static int -node_new_keep(Node** node, ScanEnv* env) +node_new_keep(Node** node, ParseEnv* env) { int r; @@ -2743,7 +2813,7 @@ onig_reg_callout_list_at(regex_t* reg, int num) } static int -reg_callout_list_entry(ScanEnv* env, int* rnum) +reg_callout_list_entry(ParseEnv* env, int* rnum) { #define INIT_CALLOUT_LIST_NUM 3 @@ -2795,7 +2865,7 @@ reg_callout_list_entry(ScanEnv* env, int* rnum) static int node_new_callout(Node** node, OnigCalloutOf callout_of, int num, int id, - ScanEnv* env) + ParseEnv* env) { *node = node_new(); CHECK_NULL_RETURN_MEMERR(*node); @@ -2811,7 +2881,7 @@ node_new_callout(Node** node, OnigCalloutOf callout_of, int num, int id, #endif static int -make_text_segment(Node** node, ScanEnv* env) +make_text_segment(Node** node, ParseEnv* env) { int r; int i; @@ -2868,7 +2938,7 @@ make_text_segment(Node** node, ScanEnv* env) static int make_absent_engine(Node** node, int pre_save_right_id, Node* absent, Node* step_one, int lower, int upper, int possessive, - int is_range_cutter, ScanEnv* env) + int is_range_cutter, ParseEnv* env) { int r; int i; @@ -2950,7 +3020,7 @@ make_absent_engine(Node** node, int pre_save_right_id, Node* absent, static int make_absent_tail(Node** node1, Node** node2, int pre_save_right_id, - ScanEnv* env) + ParseEnv* env) { int r; int id; @@ -2998,7 +3068,7 @@ make_absent_tail(Node** node1, Node** node2, int pre_save_right_id, } static int -make_range_clear(Node** node, ScanEnv* env) +make_range_clear(Node** node, ParseEnv* env) { int r; int id; @@ -3057,7 +3127,7 @@ make_range_clear(Node** node, ScanEnv* env) static int is_simple_one_char_repeat(Node* node, Node** rquant, Node** rbody, - int* is_possessive, ScanEnv* env) + int* is_possessive, ParseEnv* env) { Node* quant; Node* body; @@ -3123,8 +3193,8 @@ is_simple_one_char_repeat(Node* node, Node** rquant, Node** rbody, } static int -make_absent_tree_for_simple_one_char_repeat(Node** node, Node* absent, Node* quant, - Node* body, int possessive, ScanEnv* env) +make_absent_tree_for_simple_one_char_repeat(Node** node, + Node* absent, Node* quant, Node* body, int possessive, ParseEnv* env) { int r; int i; @@ -3171,7 +3241,7 @@ make_absent_tree_for_simple_one_char_repeat(Node** node, Node* absent, Node* qua static int make_absent_tree(Node** node, Node* absent, Node* expr, int is_range_cutter, - ScanEnv* env) + ParseEnv* env) { int r; int i; @@ -3844,7 +3914,7 @@ add_code_range_to_buf(BBuf** pbuf, OnigCodePoint from, OnigCodePoint to) } static int -add_code_range(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to) +add_code_range(BBuf** pbuf, ParseEnv* env, OnigCodePoint from, OnigCodePoint to) { if (from > to) { if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC)) @@ -4172,7 +4242,7 @@ or_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc) } static OnigCodePoint -conv_backslash_value(OnigCodePoint c, ScanEnv* env) +conv_backslash_value(OnigCodePoint c, ParseEnv* env) { if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_CONTROL_CHARS)) { switch (c) { @@ -4258,10 +4328,10 @@ enum ReduceType { RQ_ASIS = 0, /* as is */ RQ_DEL = 1, /* delete parent */ RQ_A, /* to '*' */ + RQ_P, /* to '+' */ RQ_AQ, /* to '*?' */ RQ_QQ, /* to '??' */ RQ_P_QQ, /* to '+)??' */ - RQ_PQ_Q /* to '+?)?' */ }; static enum ReduceType ReduceTypeTable[6][6] = { @@ -4270,7 +4340,7 @@ static enum ReduceType ReduceTypeTable[6][6] = { {RQ_A, RQ_A, RQ_DEL, RQ_ASIS, RQ_P_QQ, RQ_DEL}, /* '+' */ {RQ_DEL, RQ_AQ, RQ_AQ, RQ_DEL, RQ_AQ, RQ_AQ}, /* '??' */ {RQ_DEL, RQ_DEL, RQ_DEL, RQ_DEL, RQ_DEL, RQ_DEL}, /* '*?' */ - {RQ_ASIS, RQ_PQ_Q, RQ_DEL, RQ_AQ, RQ_AQ, RQ_DEL} /* '+?' */ + {RQ_ASIS, RQ_A, RQ_P, RQ_AQ, RQ_AQ, RQ_DEL} /* '+?' */ }; extern int @@ -4309,6 +4379,11 @@ onig_reduce_nested_quantifier(Node* pnode) p->lower = 0; p->upper = INFINITE_REPEAT; p->greedy = 1; goto remove_cnode; break; + case RQ_P: + NODE_BODY(pnode) = NODE_BODY(cnode); + p->lower = 1; p->upper = INFINITE_REPEAT; p->greedy = 1; + goto remove_cnode; + break; case RQ_AQ: NODE_BODY(pnode) = NODE_BODY(cnode); p->lower = 0; p->upper = INFINITE_REPEAT; p->greedy = 0; @@ -4323,10 +4398,6 @@ onig_reduce_nested_quantifier(Node* pnode) p->lower = 0; p->upper = 1; p->greedy = 0; c->lower = 1; c->upper = INFINITE_REPEAT; c->greedy = 1; break; - case RQ_PQ_Q: - p->lower = 0; p->upper = 1; p->greedy = 1; - c->lower = 1; c->upper = INFINITE_REPEAT; c->greedy = 0; - break; case RQ_ASIS: break; } @@ -4340,7 +4411,7 @@ onig_reduce_nested_quantifier(Node* pnode) } static int -node_new_general_newline(Node** node, ScanEnv* env) +node_new_general_newline(Node** node, ParseEnv* env) { int r; int dlen, alen; @@ -4472,7 +4543,7 @@ ptoken_init(PToken* tok) } static int -fetch_interval(UChar** src, UChar* end, PToken* tok, ScanEnv* env) +fetch_interval(UChar** src, UChar* end, PToken* tok, ParseEnv* env) { int low, up, syn_allow, non_low = 0; int r = 0; @@ -4575,7 +4646,8 @@ fetch_interval(UChar** src, UChar* end, PToken* tok, ScanEnv* env) /* \M-, \C-, \c, or \... */ static int -fetch_escaped_value_raw(UChar** src, UChar* end, ScanEnv* env, OnigCodePoint* val) +fetch_escaped_value_raw(UChar** src, UChar* end, ParseEnv* env, + OnigCodePoint* val) { int v; OnigCodePoint c; @@ -4646,7 +4718,7 @@ fetch_escaped_value_raw(UChar** src, UChar* end, ScanEnv* env, OnigCodePoint* va } static int -fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env, OnigCodePoint* val) +fetch_escaped_value(UChar** src, UChar* end, ParseEnv* env, OnigCodePoint* val) { int r; int len; @@ -4660,7 +4732,7 @@ fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env, OnigCodePoint* val) return 0; } -static int fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env); +static int fetch_token(PToken* tok, UChar** src, UChar* end, ParseEnv* env); static OnigCodePoint get_name_end_code_point(OnigCodePoint start) @@ -4691,7 +4763,7 @@ enum REF_NUM { */ static int fetch_name_with_level(OnigCodePoint start_code, UChar** src, UChar* end, - UChar** rname_end, ScanEnv* env, + UChar** rname_end, ParseEnv* env, int* rback_num, int* rlevel, enum REF_NUM* num_type) { int r, sign, exist_level; @@ -4825,7 +4897,7 @@ fetch_name_with_level(OnigCodePoint start_code, UChar** src, UChar* end, */ static int fetch_name(OnigCodePoint start_code, UChar** src, UChar* end, - UChar** rname_end, ScanEnv* env, int* rback_num, + UChar** rname_end, ParseEnv* env, int* rback_num, enum REF_NUM* num_type, int is_ref) { int r, sign; @@ -4957,7 +5029,7 @@ fetch_name(OnigCodePoint start_code, UChar** src, UChar* end, } static void -CC_ESC_WARN(ScanEnv* env, UChar *c) +CC_ESC_WARN(ParseEnv* env, UChar *c) { if (onig_warn == onig_null_warn) return ; @@ -4973,7 +5045,7 @@ CC_ESC_WARN(ScanEnv* env, UChar *c) } static void -CLOSE_BRACKET_WITHOUT_ESC_WARN(ScanEnv* env, UChar* c) +CLOSE_BRACKET_WITHOUT_ESC_WARN(ParseEnv* env, UChar* c) { if (onig_warn == onig_null_warn) return ; @@ -5054,11 +5126,12 @@ str_exist_check_with_esc(OnigCodePoint s[], int n, UChar* from, UChar* to, } static int -fetch_token_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env, int state) +fetch_token_cc(PToken* tok, UChar** src, UChar* end, ParseEnv* env, int state) { int r; OnigCodePoint code; OnigCodePoint c, c2; + int mindigits, maxdigits; OnigSyntaxType* syn = env->syntax; OnigEncoding enc = env->enc; UChar* prev; @@ -5247,10 +5320,11 @@ fetch_token_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env, int state) case 'u': if (PEND) break; - prev = p; if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) { - r = scan_hexadecimal_number(&p, end, 4, 4, enc, &code); + mindigits = maxdigits = 4; + u_hex_digits: + r = scan_hexadecimal_number(&p, end, mindigits, maxdigits, enc, &code); if (r < 0) return r; if (p == prev) { /* can't read nothing. */ code = 0; /* but, it's not error */ @@ -5261,6 +5335,15 @@ fetch_token_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env, int state) } break; + case 'U': + if (PEND) break; + prev = p; + if (IS_SYNTAX_BV(syn, ONIG_SYN_PYTHON)) { + mindigits = maxdigits = 8; + goto u_hex_digits; + } + break; + case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) { @@ -5327,15 +5410,22 @@ fetch_token_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env, int state) } static int -fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) +fetch_token(PToken* tok, UChar** src, UChar* end, ParseEnv* env) { int r; OnigCodePoint code; OnigCodePoint c; - OnigEncoding enc = env->enc; - OnigSyntaxType* syn = env->syntax; + int mindigits, maxdigits; UChar* prev; - UChar* p = *src; + int allow_num; + OnigEncoding enc; + OnigSyntaxType* syn; + UChar* p; + + enc = env->enc; + syn = env->syntax; + p = *src; + PFETCH_READY; if (tok->code_point_continue != 0) { @@ -5574,12 +5664,20 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) break; case 'Z': - if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break; - tok->type = TK_ANCHOR; - tok->u.subtype = ANCR_SEMI_END_BUF; + if (IS_SYNTAX_BV(syn, ONIG_SYN_PYTHON)) { + goto end_buf; + } + else { + if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break; + tok->type = TK_ANCHOR; + tok->u.subtype = ANCR_SEMI_END_BUF; + } break; case 'z': + if (IS_SYNTAX_BV(syn, ONIG_SYN_PYTHON)) + return ONIGERR_UNDEFINED_OPERATOR; + if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break; end_buf: tok->type = TK_ANCHOR; @@ -5668,10 +5766,11 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) case 'u': if (PEND) break; - prev = p; + mindigits = maxdigits = 4; if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) { - r = scan_hexadecimal_number(&p, end, 4, 4, enc, &code); + u_hex_digits: + r = scan_hexadecimal_number(&p, end, mindigits, maxdigits, enc, &code); if (r < 0) return r; if (p == prev) { /* can't read nothing. */ code = 0; /* but, it's not error */ @@ -5682,6 +5781,15 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) } break; + case 'U': + if (PEND) break; + prev = p; + if (IS_SYNTAX_BV(syn, ONIG_SYN_PYTHON)) { + mindigits = maxdigits = 8; + goto u_hex_digits; + } + break; + case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': PUNFETCH; @@ -5694,7 +5802,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_DECIMAL_BACKREF) && (r <= env->num_mem || r <= 9)) { /* This spec. from GNU regex */ if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) { - if (r > env->num_mem || IS_NULL(SCANENV_MEMENV(env)[r].mem_node)) + if (r > env->num_mem || IS_NULL(PARSEENV_MEMENV(env)[r].mem_node)) return ONIGERR_INVALID_BACKREF; } @@ -5743,6 +5851,9 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) int back_num; enum REF_NUM num_type; + allow_num = 1; + + backref_start: prev = p; #ifdef USE_BACKREF_WITH_LEVEL @@ -5757,6 +5868,8 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) if (r < 0) return r; if (num_type != IS_NOT_NUM) { + if (allow_num == 0) return ONIGERR_INVALID_BACKREF; + if (num_type == IS_REL_NUM) { back_num = backref_rel_to_abs(back_num, env); } @@ -5765,7 +5878,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) { if (back_num > env->num_mem || - IS_NULL(SCANENV_MEMENV(env)[back_num].mem_node)) + IS_NULL(PARSEENV_MEMENV(env)[back_num].mem_node)) return ONIGERR_INVALID_BACKREF; } tok->type = TK_BACKREF; @@ -5782,7 +5895,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) int i; for (i = 0; i < num; i++) { if (backs[i] > env->num_mem || - IS_NULL(SCANENV_MEMENV(env)[backs[i]].mem_node)) + IS_NULL(PARSEENV_MEMENV(env)[backs[i]].mem_node)) return ONIGERR_INVALID_BACKREF; } } @@ -5813,12 +5926,17 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) UChar* name_end; enum REF_NUM num_type; + allow_num = 1; + + call_start: prev = p; r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &gnum, &num_type, TRUE); if (r < 0) return r; if (num_type != IS_NOT_NUM) { + if (allow_num == 0) return ONIGERR_UNDEFINED_GROUP_REFERENCE; + if (num_type == IS_REL_NUM) { gnum = backref_rel_to_abs(gnum, env); if (gnum < 0) { @@ -5975,6 +6093,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) case '(': if (!PEND && PPEEK_IS('?') && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) { + prev = p; PINC; if (! PEND) { c = PPEEK; @@ -6062,11 +6181,35 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) break; } } + else if (c == 'P' && + IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_CAPITAL_P_NAME)) { + PINC; /* skip 'P' */ + if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; + PFETCH(c); + allow_num = 0; + if (c == '=') { + c = '('; + goto backref_start; + } + else if (c == '>') { +#ifdef USE_CALL + c = '('; + goto call_start; +#else + return ONIGERR_UNDEFINED_OPERATOR; +#endif + } + else { + p = prev; + goto lparen_qmark_end2; + } + } } lparen_qmark_end: PUNFETCH; } + lparen_qmark_end2: if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break; tok->type = TK_SUBEXP_OPEN; break; @@ -6295,7 +6438,7 @@ add_ctype_to_cc_by_range_limit(CClassNode* cc, int ctype ARG_UNUSED, int not, } static int -add_ctype_to_cc(CClassNode* cc, int ctype, int not, ScanEnv* env) +add_ctype_to_cc(CClassNode* cc, int ctype, int not, ParseEnv* env) { int c, r; int ascii_mode; @@ -6398,7 +6541,7 @@ add_ctype_to_cc(CClassNode* cc, int ctype, int not, ScanEnv* env) } static int -prs_posix_bracket(CClassNode* cc, UChar** src, UChar* end, ScanEnv* env) +prs_posix_bracket(CClassNode* cc, UChar** src, UChar* end, ParseEnv* env) { #define POSIX_BRACKET_CHECK_LIMIT_LENGTH 20 #define POSIX_BRACKET_NAME_MIN_LEN 4 @@ -6472,7 +6615,7 @@ prs_posix_bracket(CClassNode* cc, UChar** src, UChar* end, ScanEnv* env) } static int -fetch_char_property_to_ctype(UChar** src, UChar* end, ScanEnv* env) +fetch_char_property_to_ctype(UChar** src, UChar* end, ParseEnv* env) { int r; OnigCodePoint c; @@ -6507,7 +6650,8 @@ fetch_char_property_to_ctype(UChar** src, UChar* end, ScanEnv* env) } static int -prs_char_property(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) +prs_char_property(Node** np, PToken* tok, UChar** src, UChar* end, + ParseEnv* env) { int r, ctype; CClassNode* cc; @@ -6528,7 +6672,7 @@ prs_char_property(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) static int cc_cprop_next(CClassNode* cc, OnigCodePoint* pcode, CVAL* val, CSTATE* state, - ScanEnv* env) + ParseEnv* env) { int r; @@ -6552,7 +6696,7 @@ cc_cprop_next(CClassNode* cc, OnigCodePoint* pcode, CVAL* val, CSTATE* state, static int cc_char_next(CClassNode* cc, OnigCodePoint *from, OnigCodePoint to, int* from_raw, int to_raw, CVAL intype, CVAL* type, - CSTATE* state, ScanEnv* env) + CSTATE* state, ParseEnv* env) { int r; @@ -6621,7 +6765,7 @@ cc_char_next(CClassNode* cc, OnigCodePoint *from, OnigCodePoint to, static int code_exist_check(OnigCodePoint c, UChar* from, UChar* end, int ignore_escaped, - ScanEnv* env) + ParseEnv* env) { int in_esc; OnigCodePoint code; @@ -6643,7 +6787,7 @@ code_exist_check(OnigCodePoint c, UChar* from, UChar* end, int ignore_escaped, } static int -prs_cc(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) +prs_cc(Node** np, PToken* tok, UChar** src, UChar* end, ParseEnv* env) { int r, neg, len, fetched, and_start; OnigCodePoint in_code, curr_code; @@ -6995,13 +7139,14 @@ prs_cc(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) } static int prs_alts(Node** top, PToken* tok, int term, - UChar** src, UChar* end, ScanEnv* env, int group_head); + UChar** src, UChar* end, ParseEnv* env, int group_head); #ifdef USE_CALLOUT /* (?{...}[tag][+-]) (?{{...}}[tag][+-]) */ static int -prs_callout_of_contents(Node** np, int cterm, UChar** src, UChar* end, ScanEnv* env) +prs_callout_of_contents(Node** np, int cterm, UChar** src, UChar* end, + ParseEnv* env) { int r; int i; @@ -7184,7 +7329,7 @@ clear_callout_args(int n, unsigned int types[], OnigValue vals[]) static int prs_callout_args(int skip_mode, int cterm, UChar** src, UChar* end, int max_arg_num, unsigned int types[], OnigValue vals[], - ScanEnv* env) + ParseEnv* env) { #define MAX_CALLOUT_ARG_BYTE_LENGTH 128 @@ -7347,7 +7492,8 @@ prs_callout_args(int skip_mode, int cterm, UChar** src, UChar* end, /* (*name[TAG]) (*name[TAG]{a,b,..}) */ static int -prs_callout_of_name(Node** np, int cterm, UChar** src, UChar* end, ScanEnv* env) +prs_callout_of_name(Node** np, int cterm, UChar** src, UChar* end, + ParseEnv* env) { int r; int i; @@ -7514,7 +7660,7 @@ prs_callout_of_name(Node** np, int cterm, UChar** src, UChar* end, ScanEnv* env) static int prs_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, - ScanEnv* env) + ParseEnv* env) { int r, num; Node *target; @@ -7747,7 +7893,7 @@ prs_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_STRICT_CHECK_BACKREF)) { if (back_num > env->num_mem || - IS_NULL(SCANENV_MEMENV(env)[back_num].mem_node)) + IS_NULL(PARSEENV_MEMENV(env)[back_num].mem_node)) return ONIGERR_INVALID_BACKREF; } @@ -7769,7 +7915,7 @@ prs_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, int i; for (i = 0; i < num; i++) { if (backs[i] > env->num_mem || - IS_NULL(SCANENV_MEMENV(env)[backs[i]].mem_node)) + IS_NULL(PARSEENV_MEMENV(env)[backs[i]].mem_node)) return ONIGERR_INVALID_BACKREF; } } @@ -7932,12 +8078,26 @@ prs_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, break; #endif + case 'P': + if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_CAPITAL_P_NAME)) { + if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; + PFETCH(c); + if (c == '<') goto named_group1; + + return ONIGERR_UNDEFINED_GROUP_OPTION; + } + /* else fall */ + case 'W': case 'D': case 'S': + case 'y': + if (! IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_ONIGURUMA)) + return ONIGERR_UNDEFINED_GROUP_OPTION; + /* else fall */ + #ifdef USE_POSIXLINE_OPTION case 'p': #endif + case 'a': case '-': case 'i': case 'm': case 's': case 'x': - case 'W': case 'D': case 'S': case 'P': - case 'y': { int neg = 0; @@ -7974,10 +8134,26 @@ prs_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, OPTION_NEGATE(option, ONIG_OPTION_MULTILINE|ONIG_OPTION_SINGLELINE, neg); break; #endif - case 'W': OPTION_NEGATE(option, ONIG_OPTION_WORD_IS_ASCII, neg); break; - case 'D': OPTION_NEGATE(option, ONIG_OPTION_DIGIT_IS_ASCII, neg); break; - case 'S': OPTION_NEGATE(option, ONIG_OPTION_SPACE_IS_ASCII, neg); break; - case 'P': OPTION_NEGATE(option, ONIG_OPTION_POSIX_IS_ASCII, neg); break; + case 'W': + if (! IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_ONIGURUMA)) + return ONIGERR_UNDEFINED_GROUP_OPTION; + OPTION_NEGATE(option, ONIG_OPTION_WORD_IS_ASCII, neg); + break; + case 'D': + if (! IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_ONIGURUMA)) + return ONIGERR_UNDEFINED_GROUP_OPTION; + OPTION_NEGATE(option, ONIG_OPTION_DIGIT_IS_ASCII, neg); + break; + case 'S': + if (! IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_ONIGURUMA)) + return ONIGERR_UNDEFINED_GROUP_OPTION; + OPTION_NEGATE(option, ONIG_OPTION_SPACE_IS_ASCII, neg); + break; + case 'P': + if (! IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_ONIGURUMA)) + return ONIGERR_UNDEFINED_GROUP_OPTION; + OPTION_NEGATE(option, ONIG_OPTION_POSIX_IS_ASCII, neg); + break; case 'y': /* y{g}, y{w} */ { @@ -8016,8 +8192,15 @@ prs_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, PFETCH(c); if (c != '}') return ONIGERR_UNDEFINED_GROUP_OPTION; - break; } /* case 'y' */ + break; + + case 'a': + if (! IS_SYNTAX_BV(env->syntax, ONIG_SYN_PYTHON)) + return ONIGERR_UNDEFINED_GROUP_OPTION; + + OPTION_NEGATE(option, ONIG_OPTION_POSIX_IS_ASCII, neg); + break; default: return ONIGERR_UNDEFINED_GROUP_OPTION; @@ -8112,7 +8295,7 @@ static const char* ReduceQStr[] = { }; static int -assign_quantifier_body(Node* qnode, Node* target, int group, ScanEnv* env) +assign_quantifier_body(Node* qnode, Node* target, int group, ParseEnv* env) { QuantNode* qn; @@ -8260,35 +8443,38 @@ onig_new_cclass_with_code_list(Node** rnode, OnigEncoding enc, } typedef struct { - ScanEnv* env; + ParseEnv* env; CClassNode* cc; Node* alt_root; Node** ptail; } IApplyCaseFoldArg; static int -i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[], int to_len, void* arg) +i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[], int to_len, + void* arg) { IApplyCaseFoldArg* iarg; - ScanEnv* env; + ParseEnv* env; + OnigEncoding enc; CClassNode* cc; iarg = (IApplyCaseFoldArg* )arg; env = iarg->env; cc = iarg->cc; + enc = env->enc; if (to_len == 1) { - int is_in = onig_is_code_in_cc(env->enc, from, cc); + int is_in = onig_is_code_in_cc(enc, from, cc); #ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS if ((is_in != 0 && !IS_NCCLASS_NOT(cc)) || (is_in == 0 && IS_NCCLASS_NOT(cc))) { - ADD_CODE_INTO_CC(cc, *to, env->enc); + ADD_CODE_INTO_CC(cc, *to, enc); } #else if (is_in != 0) { - if (ONIGENC_MBC_MINLEN(env->enc) > 1 || - ONIGENC_CODE_TO_MBCLEN(env->enc, *to) != 1) { - if (IS_NCCLASS_NOT(cc)) clear_not_flag_cclass(cc, env->enc); + if (ONIGENC_MBC_MINLEN(enc) > 1 || + ONIGENC_CODE_TO_MBCLEN(enc, *to) != 1) { + if (IS_NCCLASS_NOT(cc)) clear_not_flag_cclass(cc, enc); add_code_range(&(cc->mbuf), env, *to, *to); } else { @@ -8305,7 +8491,7 @@ i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[], int to_len, void* arg) int r, i, len; UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN]; - if (onig_is_code_in_cc(env->enc, from, cc) + if (onig_is_code_in_cc(enc, from, cc) #ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS && !IS_NCCLASS_NOT(cc) #endif @@ -8320,8 +8506,9 @@ i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[], int to_len, void* arg) Node* csnode; CClassNode* cs_cc; - index = onigenc_unicode_fold1_key(&to[i]); - if (index >= 0) { + index = 0; + if (ONIGENC_IS_UNICODE_ENCODING(enc) && + (index = onigenc_unicode_fold1_key(&to[i])) >= 0) { csnode = node_new_cclass(); cs_cc = CCLASS_(csnode); if (IS_NULL(csnode)) { @@ -8332,18 +8519,22 @@ i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[], int to_len, void* arg) m = FOLDS1_UNFOLDS_NUM(index); for (j = 0; j < m; j++) { code = FOLDS1_UNFOLDS(index)[j]; - ADD_CODE_INTO_CC(cs_cc, code, env->enc); + ADD_CODE_INTO_CC(cs_cc, code, enc); } - ADD_CODE_INTO_CC(cs_cc, to[i], env->enc); + ADD_CODE_INTO_CC(cs_cc, to[i], enc); ns[n++] = csnode; } else { - len = ONIGENC_CODE_TO_MBC(env->enc, to[i], buf); + len = ONIGENC_CODE_TO_MBC(enc, to[i], buf); if (n == 0 || NODE_TYPE(ns[n-1]) != NODE_STRING) { csnode = node_new_str(buf, buf + len); if (IS_NULL(csnode)) goto err_free_ns; - NODE_STRING_SET_CASE_EXPANDED(csnode); + if (index == 0) + NODE_STATUS_ADD(csnode, IGNORECASE); + else + NODE_STRING_SET_CASE_EXPANDED(csnode); + ns[n++] = csnode; } else { @@ -8372,7 +8563,7 @@ i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[], int to_len, void* arg) static int prs_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, - ScanEnv* env, int group_head) + ParseEnv* env, int group_head) { int r, len, group; Node* qn; @@ -8778,7 +8969,7 @@ prs_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, static int prs_branch(Node** top, PToken* tok, int term, UChar** src, UChar* end, - ScanEnv* env, int group_head) + ParseEnv* env, int group_head) { int r; Node *node, **headp; @@ -8829,7 +9020,7 @@ prs_branch(Node** top, PToken* tok, int term, UChar** src, UChar* end, /* term_tok: TK_EOT or TK_SUBEXP_CLOSE */ static int prs_alts(Node** top, PToken* tok, int term, UChar** src, UChar* end, - ScanEnv* env, int group_head) + ParseEnv* env, int group_head) { int r; Node *node, **headp; @@ -8892,7 +9083,7 @@ prs_alts(Node** top, PToken* tok, int term, UChar** src, UChar* end, } static int -prs_regexp(Node** top, UChar** src, UChar* end, ScanEnv* env) +prs_regexp(Node** top, UChar** src, UChar* end, ParseEnv* env) { int r; PToken tok; @@ -8908,7 +9099,7 @@ prs_regexp(Node** top, UChar** src, UChar* end, ScanEnv* env) #ifdef USE_CALL static int -make_call_zero_body(Node* node, ScanEnv* env, Node** rnode) +make_call_zero_body(Node* node, ParseEnv* env, Node** rnode) { int r; @@ -8930,7 +9121,7 @@ make_call_zero_body(Node* node, ScanEnv* env, Node** rnode) extern int onig_parse_tree(Node** root, const UChar* pattern, const UChar* end, - regex_t* reg, ScanEnv* env) + regex_t* reg, ParseEnv* env) { int r; UChar* p; @@ -8945,7 +9136,6 @@ onig_parse_tree(Node** root, const UChar* pattern, const UChar* end, reg->num_empty_check = 0; reg->repeat_range_alloc = 0; reg->repeat_range = (RepeatRange* )NULL; - reg->empty_status_mem = 0; names_clear(reg); @@ -8990,7 +9180,7 @@ onig_parse_tree(Node** root, const UChar* pattern, const UChar* end, } extern void -onig_scan_env_set_error_string(ScanEnv* env, int ecode ARG_UNUSED, +onig_scan_env_set_error_string(ParseEnv* env, int ecode ARG_UNUSED, UChar* arg, UChar* arg_end) { env->error = arg; diff --git a/src/regparse.h b/src/regparse.h index c60a42d..8875f78 100644 --- a/src/regparse.h +++ b/src/regparse.h @@ -4,7 +4,7 @@ regparse.h - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2020 K.Kosako + * Copyright (c) 2002-2021 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -73,6 +73,14 @@ enum BodyEmptyType { BODY_MAY_BE_EMPTY_REC = 3 }; +/* bytes buffer */ +typedef struct _BBuf { + UChar* p; + unsigned int used; + unsigned int alloc; +} BBuf; + + struct _Node; typedef struct { @@ -110,6 +118,7 @@ typedef struct { struct _Node* head_exact; struct _Node* next_head_exact; int include_referred; /* include called node. don't eliminate even if {0} */ + MemStatusType empty_status_mem; } QuantNode; typedef struct { @@ -340,6 +349,7 @@ typedef struct { #define NODE_ST_ABSENT_WITH_SIDE_EFFECTS (1<<24) /* stopper or clear */ #define NODE_ST_FIXED_CLEN_MIN_SURE (1<<25) #define NODE_ST_REFERENCED (1<<26) +#define NODE_ST_INPEEK (1<<27) #define NODE_STATUS(node) (((Node* )node)->u.base.status) @@ -376,6 +386,7 @@ typedef struct { #define NODE_IS_ABSENT_WITH_SIDE_EFFECTS(node) ((NODE_STATUS(node) & NODE_ST_ABSENT_WITH_SIDE_EFFECTS) != 0) #define NODE_IS_FIXED_CLEN_MIN_SURE(node) ((NODE_STATUS(node) & NODE_ST_FIXED_CLEN_MIN_SURE) != 0) #define NODE_IS_REFERENCED(node) ((NODE_STATUS(node) & NODE_ST_REFERENCED) != 0) +#define NODE_IS_INPEEK(node) ((NODE_STATUS(node) & NODE_ST_INPEEK) != 0) #define NODE_PARENT(node) ((node)->u.base.parent) #define NODE_BODY(node) ((node)->u.base.body) @@ -384,8 +395,8 @@ typedef struct { #define NODE_CALL_BODY(node) ((node)->body) #define NODE_ANCHOR_BODY(node) ((node)->body) -#define SCANENV_MEMENV_SIZE 8 -#define SCANENV_MEMENV(senv) \ +#define PARSEENV_MEMENV_SIZE 8 +#define PARSEENV_MEMENV(senv) \ (IS_NOT_NULL((senv)->mem_env_dynamic) ? \ (senv)->mem_env_dynamic : (senv)->mem_env_static) @@ -424,7 +435,7 @@ typedef struct { int num_mem; int num_named; int mem_alloc; - MemEnv mem_env_static[SCANENV_MEMENV_SIZE]; + MemEnv mem_env_static[PARSEENV_MEMENV_SIZE]; MemEnv* mem_env_dynamic; int backref_num; int keep_num; @@ -439,14 +450,14 @@ typedef struct { #ifdef ONIG_DEBUG_PARSE unsigned int max_parse_depth; #endif -} ScanEnv; +} ParseEnv; extern int onig_renumber_name_table P_((regex_t* reg, GroupNumMap* map)); extern int onig_strncmp P_((const UChar* s1, const UChar* s2, int n)); extern void onig_strcpy P_((UChar* dest, const UChar* src, const UChar* end)); -extern void onig_scan_env_set_error_string P_((ScanEnv* env, int ecode, UChar* arg, UChar* arg_end)); +extern void onig_scan_env_set_error_string P_((ParseEnv* env, int ecode, UChar* arg, UChar* arg_end)); extern int onig_reduce_nested_quantifier P_((Node* pnode)); extern int onig_node_copy(Node** rcopy, Node* from); extern int onig_node_str_cat P_((Node* node, const UChar* s, const UChar* end)); @@ -460,7 +471,7 @@ extern Node* onig_node_new_str P_((const UChar* s, const UChar* end)); extern Node* onig_node_new_list P_((Node* left, Node* right)); extern Node* onig_node_new_alt P_((Node* left, Node* right)); extern int onig_names_free P_((regex_t* reg)); -extern int onig_parse_tree P_((Node** root, const UChar* pattern, const UChar* end, regex_t* reg, ScanEnv* env)); +extern int onig_parse_tree P_((Node** root, const UChar* pattern, const UChar* end, regex_t* reg, ParseEnv* env)); extern int onig_free_shared_cclass_table P_((void)); extern int onig_is_code_in_cc P_((OnigEncoding enc, OnigCodePoint code, CClassNode* cc)); extern int onig_new_cclass_with_code_list(Node** rnode, OnigEncoding enc, int n, OnigCodePoint codes[]); diff --git a/src/regposix.c b/src/regposix.c index 497ba02..494446f 100644 --- a/src/regposix.c +++ b/src/regposix.c @@ -2,7 +2,7 @@ regposix.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2020 K.Kosako + * Copyright (c) 2002-2021 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -120,6 +120,7 @@ onig2posix_error_code(int code) { ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED, REG_BADPAT }, { ONIGERR_TOO_BIG_WIDE_CHAR_VALUE, REG_EONIG_BADWC }, { ONIGERR_TOO_LONG_WIDE_CHAR_VALUE, REG_EONIG_BADWC }, + { ONIGERR_UNDEFINED_OPERATOR, REG_BADPAT }, { ONIGERR_INVALID_CODE_POINT_VALUE, REG_EONIG_BADWC }, { ONIGERR_EMPTY_GROUP_NAME, REG_BADPAT }, { ONIGERR_INVALID_GROUP_NAME, REG_BADPAT }, @@ -141,6 +142,7 @@ onig2posix_error_code(int code) { ONIGERR_INVALID_CALLOUT_TAG_NAME, REG_BADPAT }, { ONIGERR_INVALID_CALLOUT_ARG, REG_BADPAT }, { ONIGERR_NOT_SUPPORTED_ENCODING_COMBINATION, REG_EONIG_BADARG }, + { ONIGERR_VERY_INEFFICIENT_PATTERN, REG_BADPAT }, { ONIGERR_LIBRARY_IS_NOT_INITIALIZED, REG_EONIG_INTERNAL } }; diff --git a/src/regsyntax.c b/src/regsyntax.c index 984aac6..8e1c313 100644 --- a/src/regsyntax.c +++ b/src/regsyntax.c @@ -2,7 +2,7 @@ regsyntax.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2020 K.Kosako + * Copyright (c) 2002-2021 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -240,6 +240,35 @@ OnigSyntaxType OnigSyntaxPerl_NG = { } }; +/* Python 3.9 */ +OnigSyntaxType OnigSyntaxPython = { + (( SYN_GNU_REGEX_OP | ONIG_SYN_OP_QMARK_NON_GREEDY | + ONIG_SYN_OP_ESC_OCTAL3 | ONIG_SYN_OP_ESC_X_HEX2 | + ONIG_SYN_OP_ESC_CONTROL_CHARS | + ONIG_SYN_OP_ESC_C_CONTROL ) + & ~ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END ) + , ( ONIG_SYN_OP2_QMARK_GROUP_EFFECT | ONIG_SYN_OP2_OPTION_PERL | + ONIG_SYN_OP2_QMARK_LPAREN_IF_ELSE | + ONIG_SYN_OP2_ASTERISK_CALLOUT_NAME | + ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY | + ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT | + ONIG_SYN_OP2_QMARK_CAPITAL_P_NAME | + ONIG_SYN_OP2_ESC_CAPITAL_K_KEEP | + ONIG_SYN_OP2_ESC_V_VTAB | ONIG_SYN_OP2_ESC_U_HEX4 ) + , ( SYN_GNU_REGEX_BV | ONIG_SYN_ISOLATED_OPTION_CONTINUE_BRANCH | + ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV | ONIG_SYN_PYTHON ) + , ONIG_OPTION_SINGLELINE + , + { + (OnigCodePoint )'\\' /* esc */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */ + } +}; + extern int diff --git a/src/unicode.c b/src/unicode.c index 6703d4b..efe5f73 100644 --- a/src/unicode.c +++ b/src/unicode.c @@ -2,7 +2,7 @@ unicode.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako + * Copyright (c) 2002-2020 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -77,9 +77,8 @@ static const unsigned short EncUNICODE_ISO_8859_1_CtypeTable[256] = { #include "unicode_fold_data.c" extern int -onigenc_unicode_mbc_case_fold(OnigEncoding enc, - OnigCaseFoldType flag ARG_UNUSED, const UChar** pp, const UChar* end, - UChar* fold) +onigenc_unicode_mbc_case_fold(OnigEncoding enc, OnigCaseFoldType flag, + const UChar** pp, const UChar* end, UChar* fold) { const struct ByUnfoldKey* buk; @@ -104,23 +103,27 @@ onigenc_unicode_mbc_case_fold(OnigEncoding enc, } #endif - buk = onigenc_unicode_unfold_key(code); - if (buk != 0) { - if (buk->fold_len == 1) { - return ONIGENC_CODE_TO_MBC(enc, *FOLDS1_FOLD(buk->index), fold); - } - else { - OnigCodePoint* addr; - - FOLDS_FOLD_ADDR_BUK(buk, addr); - rlen = 0; - for (i = 0; i < buk->fold_len; i++) { - OnigCodePoint c = addr[i]; - len = ONIGENC_CODE_TO_MBC(enc, c, fold); - fold += len; - rlen += len; + if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(code)) { + buk = onigenc_unicode_unfold_key(code); + if (buk != 0) { + if (buk->fold_len == 1) { + if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || + ONIGENC_IS_ASCII_CODE(*FOLDS1_FOLD(buk->index))) + return ONIGENC_CODE_TO_MBC(enc, *FOLDS1_FOLD(buk->index), fold); + } + else { + OnigCodePoint* addr; + + FOLDS_FOLD_ADDR_BUK(buk, addr); + rlen = 0; + for (i = 0; i < buk->fold_len; i++) { + OnigCodePoint c = addr[i]; + len = ONIGENC_CODE_TO_MBC(enc, c, fold); + fold += len; + rlen += len; + } + return rlen; } - return rlen; } } @@ -131,16 +134,22 @@ onigenc_unicode_mbc_case_fold(OnigEncoding enc, } static int -apply_case_fold1(int from, int to, OnigApplyAllCaseFoldFunc f, void* arg) +apply_case_fold1(OnigCaseFoldType flag, int from, int to, + OnigApplyAllCaseFoldFunc f, void* arg) { int i, j, k, n, r; for (i = from; i < to; ) { OnigCodePoint fold = *FOLDS1_FOLD(i); + if (CASE_FOLD_IS_ASCII_ONLY(flag) && ! ONIGENC_IS_ASCII_CODE(fold)) break; + n = FOLDS1_UNFOLDS_NUM(i); for (j = 0; j < n; j++) { OnigCodePoint unfold = FOLDS1_UNFOLDS(i)[j]; + if (CASE_FOLD_IS_ASCII_ONLY(flag) && ! ONIGENC_IS_ASCII_CODE(unfold)) + continue; + r = (*f)(fold, &unfold, 1, arg); if (r != 0) return r; r = (*f)(unfold, &fold, 1, arg); @@ -148,6 +157,9 @@ apply_case_fold1(int from, int to, OnigApplyAllCaseFoldFunc f, void* arg) for (k = 0; k < j; k++) { OnigCodePoint unfold2 = FOLDS1_UNFOLDS(i)[k]; + if (CASE_FOLD_IS_ASCII_ONLY(flag) && + ! ONIGENC_IS_ASCII_CODE(unfold2)) continue; + r = (*f)(unfold, &unfold2, 1, arg); if (r != 0) return r; r = (*f)(unfold2, &unfold, 1, arg); @@ -225,7 +237,7 @@ onigenc_unicode_apply_all_case_fold(OnigCaseFoldType flag, { int r; - r = apply_case_fold1(0, FOLDS1_NORMAL_END_INDEX, f, arg); + r = apply_case_fold1(flag, 0, FOLDS1_NORMAL_END_INDEX, f, arg); if (r != 0) return r; #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI @@ -246,7 +258,7 @@ onigenc_unicode_apply_all_case_fold(OnigCaseFoldType flag, } else { #endif - r = apply_case_fold1(FOLDS1_NORMAL_END_INDEX, FOLDS1_END_INDEX, f, arg); + r = apply_case_fold1(flag, FOLDS1_NORMAL_END_INDEX, FOLDS1_END_INDEX, f, arg); if (r != 0) return r; #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI } @@ -288,6 +300,9 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc, n = 0; code = ONIGENC_MBC_TO_CODE(enc, p, end); + if (CASE_FOLD_IS_ASCII_ONLY(flag)) { + if (! ONIGENC_IS_ASCII_CODE(code)) return n; + } len = enclen(enc, p); #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI @@ -449,19 +464,26 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc, if (buk1 != 0) { if (buk1->fold_len == 1) { int un; - items[0].byte_len = lens[0]; - items[0].code_len = 1; - items[0].code[0] = *FOLDS1_FOLD(buk1->index); - n++; + + if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || + ONIGENC_IS_ASCII_CODE(*FOLDS1_FOLD(buk1->index))) { + items[0].byte_len = lens[0]; + items[0].code_len = 1; + items[0].code[0] = *FOLDS1_FOLD(buk1->index); + n++; + } un = FOLDS1_UNFOLDS_NUM(buk1->index); for (i = 0; i < un; i++) { OnigCodePoint unfold = FOLDS1_UNFOLDS(buk1->index)[i]; if (unfold != orig_codes[0]) { - items[n].byte_len = lens[0]; - items[n].code_len = 1; - items[n].code[0] = unfold; - n++; + if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || + ONIGENC_IS_ASCII_CODE(unfold)) { + items[n].byte_len = lens[0]; + items[n].code_len = 1; + items[n].code[0] = unfold; + n++; + } } } } @@ -548,10 +570,13 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc, if (index >= 0) { int m = FOLDS1_UNFOLDS_NUM(index); for (i = 0; i < m; i++) { - items[n].byte_len = lens[0]; - items[n].code_len = 1; - items[n].code[0] = FOLDS1_UNFOLDS(index)[i]; - n++; + code = FOLDS1_UNFOLDS(index)[i]; + if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag)||ONIGENC_IS_ASCII_CODE(code)) { + items[n].byte_len = lens[0]; + items[n].code_len = 1; + items[n].code[0] = code; + n++; + } } } } -- cgit v1.2.3