diff options
author | Jörg Frings-Fürst <debian@jff.email> | 2019-12-23 07:44:50 +0100 |
---|---|---|
committer | Jörg Frings-Fürst <debian@jff.email> | 2019-12-23 07:44:50 +0100 |
commit | 9e629c8f43b43617fa5b7d3654f7d81e81b8a427 (patch) | |
tree | 581dcb2708a7eac0bcc7bbfa6478cfa50dfcf5a8 /src | |
parent | 7bbf4ae1401bc6e40f71a32d3f97952796d85690 (diff) | |
parent | 091456e1a135d4674701a264495bd34918779391 (diff) |
Merge branch 'release/debian/6.9.4-1' debian/6.9.4-1
Diffstat (limited to 'src')
75 files changed, 3685 insertions, 2931 deletions
diff --git a/src/Makefile.windows b/src/Makefile.windows index 762cf07..1e87504 100644 --- a/src/Makefile.windows +++ b/src/Makefile.windows @@ -2,6 +2,9 @@ product_name = oniguruma +TEST_DIR = $(ONIG_DIR)/../test +WIN_DIR = $(ONIG_DIR)/../windows + CPPFLAGS = CFLAGS = -O2 -nologo /W3 LDFLAGS = @@ -152,25 +155,24 @@ $(BUILD_DIR)/unicode_fold1_key.obj: $(ONIG_DIR)/unicode_fold1_key.c $(ONIG_DIR)/ $(BUILD_DIR)/unicode_fold2_key.obj: $(ONIG_DIR)/unicode_fold2_key.c $(ONIG_DIR)/regenc.h $(BUILD_DIR)/config.h $(BUILD_DIR)/unicode_fold3_key.obj: $(ONIG_DIR)/unicode_fold3_key.c $(ONIG_DIR)/regenc.h $(BUILD_DIR)/config.h -# C library test -ctest: $(testc) - .\$(testc) -# POSIX C library test -ptest: $(testp) - .\$(testp) +test_regset: $(TEST_DIR)/test_regset.c $(libname) + $(CC) -nologo /Fe:$@ /I. /I$(ONIG_DIR) /DONIG_EXTERN=extern /utf-8 $(TEST_DIR)/test_regset.c $(libname) + +test_utf8: $(TEST_DIR)/test_utf8.c $(libname) + $(CC) -nologo /Fe:$@ /I. /I$(ONIG_DIR) /DONIG_EXTERN=extern /utf-8 $(TEST_DIR)/test_utf8.c $(libname) -$(testc): $(testc).c $(libname) - $(CC) -nologo /Fe:$(testc) -DONIG_EXTERN=extern $(testc).c $(libname) +testc: $(WIN_DIR)/testc.c $(libname) + $(CC) -nologo /Fe:$@ /I. /I$(ONIG_DIR) /DONIG_EXTERN=extern $(WIN_DIR)/testc.c $(libname) -$(testp): $(testc).c $(dlllib) - $(CC) -nologo -DPOSIX_TEST /Fe:$(testp) $(testc).c $(dlllib) +testp: $(WIN_DIR)/testc.c $(libname) + $(CC) -nologo /Fe:$@ /I. /I$(ONIG_DIR) /DONIG_EXTERN=extern /DPOSIX_TEST $(WIN_DIR)/testc.c $(libname) -$(testc)u: $(testc)u.c $(libname) - $(CC) -nologo /Fe:$(testc)u -DONIG_EXTERN=extern $(testc)u.c $(libname) +testu: $(TEST_DIR)/testu.c $(libname) + $(CC) -nologo /Fe:$@ /I. 
/I$(ONIG_DIR) /DONIG_EXTERN=extern $(TEST_DIR)/testu.c $(libname) clean: - del $(BUILD_DIR)\*.obj $(BUILD_DIR)\*.lib $(BUILD_DIR)\*.exp $(BUILD_DIR)\*.dll $(BUILD_DIR)\$(testp).exe $(BUILD_DIR)\$(testc).exe $(BUILD_DIR)\$(testc).obj + del $(BUILD_DIR)\*.obj $(BUILD_DIR)\*.lib $(BUILD_DIR)\*.exp $(BUILD_DIR)\*.dll $(BUILD_DIR)\test_regset.exe $(BUILD_DIR)\test_utf8.exe $(BUILD_DIR)\testp.exe $(BUILD_DIR)\testc.exe $(BUILD_DIR)\testu.exe samples: all diff --git a/src/ascii.c b/src/ascii.c index e83e4d6..f2dc0d3 100644 --- a/src/ascii.c +++ b/src/ascii.c @@ -2,7 +2,7 @@ ascii.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -2,7 +2,7 @@ big5.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -55,6 +55,16 @@ big5_mbc_enc_len(const UChar* p) } static int +big5_code_to_mbclen(OnigCodePoint code) +{ + if ((code & (~0xffff)) != 0) return ONIGERR_INVALID_CODE_POINT_VALUE; + if ((code & 0xff00) != 0) return 2; + if (EncLen_BIG5[(int )(code & 0xff)] == 1) return 1; + + return ONIGERR_INVALID_CODE_POINT_VALUE; +} + +static int is_valid_mbc_string(const UChar* p, const UChar* end) { while (p < end) { @@ -99,15 +109,6 @@ big5_mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, const UChar* end, pp, end, lower); } -#if 0 -static int -big5_is_mbc_ambiguous(OnigCaseFoldType flag, - const UChar** pp, const UChar* end) -{ - return onigenc_mbn_is_mbc_ambiguous(ONIG_ENCODING_BIG5, flag, pp, end); -} -#endif - static int big5_is_code_ctype(OnigCodePoint code, unsigned int ctype) { @@ -174,7 +175,7 @@ OnigEncodingType OnigEncodingBIG5 = { 1, /* min enc length */ onigenc_is_mbc_newline_0x0a, big5_mbc_to_code, - onigenc_mb2_code_to_mbclen, + big5_code_to_mbclen, big5_code_to_mbc, big5_mbc_case_fold, onigenc_ascii_apply_all_case_fold, diff --git a/src/config.h.win32 b/src/config.h.win32 index 1f848e2..82a35b9 100644 --- a/src/config.h.win32 +++ b/src/config.h.win32 @@ -1,3 +1,9 @@ +#if defined(__MINGW32__) || _MSC_VER >= 1600 +#define HAVE_STDINT_H 1 +#endif +#if defined(__MINGW32__) || _MSC_VER >= 1800 +#define HAVE_INTTYPES_H 1 +#endif #define HAVE_SYS_TYPES_H 1 #define HAVE_SYS_STAT_H 1 #define HAVE_MEMORY_H 1 diff --git a/src/config.h.win64 b/src/config.h.win64 index f72671b..7f19699 100644 --- a/src/config.h.win64 +++ b/src/config.h.win64 @@ -1,3 +1,9 @@ +#if defined(__MINGW32__) || _MSC_VER >= 1600 +#define HAVE_STDINT_H 1 +#endif +#if defined(__MINGW32__) || _MSC_VER >= 1800 +#define HAVE_INTTYPES_H 1 +#endif #define HAVE_SYS_TYPES_H 1 #define HAVE_SYS_STAT_H 1 #define HAVE_MEMORY_H 1 diff --git a/src/config.h.windows.in b/src/config.h.windows.in index d8de1dd..d4f73d7 100644 --- 
a/src/config.h.windows.in +++ b/src/config.h.windows.in @@ -1,7 +1,14 @@ +#if defined(__MINGW32__) || _MSC_VER >= 1600 +#define HAVE_STDINT_H 1 +#endif +#if defined(__MINGW32__) || _MSC_VER >= 1800 +#define HAVE_INTTYPES_H 1 +#endif #define HAVE_SYS_TYPES_H 1 #define HAVE_SYS_STAT_H 1 #define HAVE_MEMORY_H 1 #define HAVE_OFF_T 1 + #define SIZEOF_INT 4 #define SIZEOF_LONG 4 #define SIZEOF_LONG_LONG 8 diff --git a/src/cp1251.c b/src/cp1251.c index b4ce4d8..fa20780 100644 --- a/src/cp1251.c +++ b/src/cp1251.c @@ -2,8 +2,8 @@ cp1251.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2006-2018 Byte <byte AT mail DOT kna DOT ru> - * K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2006-2019 Byte <byte AT mail DOT kna DOT ru> + * K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/euc_jp.c b/src/euc_jp.c index d17386d..640b3e3 100644 --- a/src/euc_jp.c +++ b/src/euc_jp.c @@ -2,7 +2,7 @@ euc_jp.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -120,25 +120,6 @@ code_to_mbclen(OnigCodePoint code) return ONIGERR_INVALID_CODE_POINT_VALUE; } -#if 0 -static int -code_to_mbc_first(OnigCodePoint code) -{ - int first; - - if ((code & 0xff0000) != 0) { - first = (code >> 16) & 0xff; - } - else if ((code & 0xff00) != 0) { - first = (code >> 8) & 0xff; - } - else { - return (int )code; - } - return first; -} -#endif - static int code_to_mbc(OnigCodePoint code, UChar *buf) { diff --git a/src/euc_jp_prop.c b/src/euc_jp_prop.c index be719cf..a816f48 100644 --- a/src/euc_jp_prop.c +++ b/src/euc_jp_prop.c @@ -1,5 +1,5 @@ /* ANSI-C code produced by gperf version 3.1 */ -/* Command-line: /usr/local/bin/gperf -pt -T -L ANSI-C -N onigenc_euc_jp_lookup_property_name --output-file gperf1.tmp euc_jp_prop.gperf */ +/* Command-line: gperf -pt -T -L ANSI-C -N onigenc_euc_jp_lookup_property_name --output-file gperf1.tmp euc_jp_prop.gperf */ /* Computed positions: -k'1,3' */ #if !((' ' == 32) && ('!' == 33) && ('"' == 34) && ('#' == 35) \ diff --git a/src/euc_kr.c b/src/euc_kr.c index bb968b0..7fa50af 100644 --- a/src/euc_kr.c +++ b/src/euc_kr.c @@ -2,7 +2,7 @@ euc_kr.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -55,6 +55,16 @@ euckr_mbc_enc_len(const UChar* p) } static int +euckr_code_to_mbclen(OnigCodePoint code) +{ + if ((code & (~0xffff)) != 0) return ONIGERR_INVALID_CODE_POINT_VALUE; + if ((code & 0xff00) != 0) return 2; + if (EncLen_EUCKR[(int )(code & 0xff)] == 1) return 1; + + return ONIGERR_INVALID_CODE_POINT_VALUE; +} + +static int is_valid_mbc_string(const UChar* p, const UChar* end) { while (p < end) { @@ -98,15 +108,6 @@ euckr_mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, const UChar* end, pp, end, lower); } -#if 0 -static int -euckr_is_mbc_ambiguous(OnigCaseFoldType flag, - const UChar** pp, const UChar* end) -{ - return onigenc_mbn_is_mbc_ambiguous(ONIG_ENCODING_EUC_KR, flag, pp, end); -} -#endif - static int euckr_is_code_ctype(OnigCodePoint code, unsigned int ctype) { @@ -149,7 +150,7 @@ OnigEncodingType OnigEncodingEUC_KR = { 1, /* min enc length */ onigenc_is_mbc_newline_0x0a, euckr_mbc_to_code, - onigenc_mb2_code_to_mbclen, + euckr_code_to_mbclen, euckr_code_to_mbc, euckr_mbc_case_fold, onigenc_ascii_apply_all_case_fold, @@ -174,7 +175,7 @@ OnigEncodingType OnigEncodingEUC_CN = { 1, /* min enc length */ onigenc_is_mbc_newline_0x0a, euckr_mbc_to_code, - onigenc_mb2_code_to_mbclen, + euckr_code_to_mbclen, euckr_code_to_mbc, euckr_mbc_case_fold, onigenc_ascii_apply_all_case_fold, diff --git a/src/euc_tw.c b/src/euc_tw.c index c9acaf1..8e72b97 100644 --- a/src/euc_tw.c +++ b/src/euc_tw.c @@ -2,7 +2,7 @@ euc_tw.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -55,6 +55,20 @@ euctw_mbc_enc_len(const UChar* p) } static int +euctw_code_to_mbclen(OnigCodePoint code) +{ + if ((code & 0xff000000) != 0) return 4; + else if ((code & 0xff0000) != 0) return ONIGERR_INVALID_CODE_POINT_VALUE; + else if ((code & 0xff00) != 0) return 2; + else { + if (EncLen_EUCTW[(int )(code & 0xff)] == 1) + return 1; + + return ONIGERR_INVALID_CODE_POINT_VALUE; + } +} + +static int is_valid_mbc_string(const UChar* p, const UChar* end) { while (p < end) { @@ -155,7 +169,7 @@ OnigEncodingType OnigEncodingEUC_TW = { 1, /* min enc length */ onigenc_is_mbc_newline_0x0a, euctw_mbc_to_code, - onigenc_mb4_code_to_mbclen, + euctw_code_to_mbclen, euctw_code_to_mbc, euctw_mbc_case_fold, onigenc_ascii_apply_all_case_fold, diff --git a/src/gb18030.c b/src/gb18030.c index 7654432..50898eb 100644 --- a/src/gb18030.c +++ b/src/gb18030.c @@ -2,8 +2,8 @@ gb18030.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2005-2018 KUBO Takehiro <kubo AT jiubao DOT org> - * K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2005-2019 KUBO Takehiro <kubo AT jiubao DOT org> + * K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -33,6 +33,7 @@ #if 1 #define DEBUG_GB18030(arg) #else +#include <stdio.h> #define DEBUG_GB18030(arg) printf arg #endif @@ -67,15 +68,29 @@ gb18030_mbc_enc_len(const UChar* p) { if (GB18030_MAP[*p] != CM) return 1; + p++; if (GB18030_MAP[*p] == C4) return 4; - if (GB18030_MAP[*p] == C1) - return 1; /* illegal sequence */ + return 2; } static int +gb18030_code_to_mbclen(OnigCodePoint code) +{ + if ((code & 0xff000000) != 0) return 4; + else if ((code & 0xff0000) != 0) return ONIGERR_INVALID_CODE_POINT_VALUE; + else if ((code & 0xff00) != 0) return 2; + else { + if (GB18030_MAP[(int )(code & 0xff)] == CM) + return ONIGERR_INVALID_CODE_POINT_VALUE; + + return 1; + } +} + +static int is_valid_mbc_string(const UChar* p, const UChar* end) { while (p < end) { @@ -135,15 +150,6 @@ gb18030_mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, const UChar* end, pp, end, lower); } -#if 0 -static int -gb18030_is_mbc_ambiguous(OnigCaseFoldType flag, - const UChar** pp, const UChar* end) -{ - return onigenc_mbn_is_mbc_ambiguous(ONIG_ENCODING_GB18030, flag, pp, end); -} -#endif - static int gb18030_is_code_ctype(OnigCodePoint code, unsigned int ctype) { @@ -522,7 +528,7 @@ OnigEncodingType OnigEncodingGB18030 = { 1, /* min enc length */ onigenc_is_mbc_newline_0x0a, gb18030_mbc_to_code, - onigenc_mb4_code_to_mbclen, + gb18030_code_to_mbclen, gb18030_code_to_mbc, gb18030_mbc_case_fold, onigenc_ascii_apply_all_case_fold, diff --git a/src/gperf_fold_key_conv.py b/src/gperf_fold_key_conv.py index f453186..c633100 100755 --- a/src/gperf_fold_key_conv.py +++ b/src/gperf_fold_key_conv.py @@ -12,7 +12,7 @@ REG_STR_AT = re.compile('str\[(\d+)\]') REG_RETURN_TYPE = re.compile('^const\s+short\s+int\s*\*') REG_FOLD_KEY = re.compile('unicode_fold(\d)_key\s*\(register\s+const\s+char\s*\*\s*str,\s*register\s+size_t\s+len\)') REG_ENTRY = re.compile('\{".*?",\s*(-?\d+)\s*\}') -REG_IF_LEN = 
re.compile('if\s*\(\s*len\s*<=\s*MAX_WORD_LENGTH.+') +REG_IF_LEN = re.compile('\s*if\s*\(\s*len\s*<=\s*MAX_WORD_LENGTH.+') REG_GET_HASH = re.compile('(?:register\s+)?(?:unsigned\s+)?int\s+key\s*=\s*hash\s*\(str,\s*len\);') REG_GET_CODE = re.compile('(?:register\s+)?const\s+char\s*\*\s*s\s*=\s*wordlist\[key\]\.name;') REG_CODE_CHECK = re.compile('if\s*\(\*str\s*==\s*\*s\s*&&\s*!strncmp.+\)') @@ -34,7 +34,7 @@ def parse_line(s, key_len): if r != s: return r r = re.sub(REG_ENTRY, '\\1', s) if r != s: return r - r = re.sub(REG_IF_LEN, 'if (0 == 0)', s) + r = re.sub(REG_IF_LEN, '', s) if r != s: return r r = re.sub(REG_GET_HASH, 'int key = hash(codes);', s) if r != s: return r diff --git a/src/gperf_unfold_key_conv.py b/src/gperf_unfold_key_conv.py index 3cf4836..d999d4e 100755 --- a/src/gperf_unfold_key_conv.py +++ b/src/gperf_unfold_key_conv.py @@ -12,7 +12,7 @@ REG_STR_AT = re.compile('str\[(\d+)\]') REG_UNFOLD_KEY = re.compile('onigenc_unicode_unfold_key\s*\(register\s+const\s+char\s*\*\s*str,\s*register\s+size_t\s+len\)') REG_ENTRY = re.compile('\{".+?",\s*/\*(.+?)\*/\s*(-?\d+),\s*(\d)\}') REG_EMPTY_ENTRY = re.compile('\{"",\s*(-?\d+),\s*(\d)\}') -REG_IF_LEN = re.compile('if\s*\(\s*len\s*<=\s*MAX_WORD_LENGTH.+') +REG_IF_LEN = re.compile('\s*if\s*\(\s*len\s*<=\s*MAX_WORD_LENGTH.+') REG_GET_HASH = re.compile('(?:register\s+)?(?:unsigned\s+)?int\s+key\s*=\s*hash\s*\(str,\s*len\);') REG_GET_CODE = re.compile('(?:register\s+)?const\s+char\s*\*\s*s\s*=\s*wordlist\[key\]\.name;') REG_CODE_CHECK = re.compile('if\s*\(\*str\s*==\s*\*s\s*&&\s*!strncmp.+\)') @@ -32,7 +32,7 @@ def parse_line(s): if r != s: return r r = re.sub(REG_EMPTY_ENTRY, '{0xffffffff, \\1, \\2}', s) if r != s: return r - r = re.sub(REG_IF_LEN, 'if (0 == 0)', s) + r = re.sub(REG_IF_LEN, '', s) if r != s: return r r = re.sub(REG_GET_HASH, 'int key = hash(&code);', s) if r != s: return r diff --git a/src/iso8859_1.c b/src/iso8859_1.c index 3b64942..e681c2a 100644 --- a/src/iso8859_1.c +++ b/src/iso8859_1.c @@ 
-2,7 +2,7 @@ iso8859_1.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -216,32 +216,6 @@ mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, return 1; } -#if 0 -static int -is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end) -{ - int v; - const UChar* p = *pp; - - if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) { - (*pp)++; - return TRUE; - } - - (*pp)++; - v = (EncISO_8859_1_CtypeTable[*p] & (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER)); - if ((v | BIT_CTYPE_LOWER) != 0) { - /* 0xdf, 0xaa, 0xb5, 0xba are lower case letter, but can't convert. */ - if (*p >= 0xaa && *p <= 0xba) - return FALSE; - else - return TRUE; - } - - return (v != 0 ? TRUE : FALSE); -} -#endif - static int is_code_ctype(OnigCodePoint code, unsigned int ctype) { diff --git a/src/iso8859_10.c b/src/iso8859_10.c index f5882bc..e98cffb 100644 --- a/src/iso8859_10.c +++ b/src/iso8859_10.c @@ -2,7 +2,7 @@ iso8859_10.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -121,28 +121,6 @@ mbc_case_fold(OnigCaseFoldType flag, return 1; } -#if 0 -static int -is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end) -{ - int v; - const UChar* p = *pp; - - if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) { - (*pp)++; - return TRUE; - } - - (*pp)++; - v = (EncISO_8859_10_CtypeTable[*p] & (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER)); - if ((v | BIT_CTYPE_LOWER) != 0) { - return TRUE; - } - - return (v != 0 ? TRUE : FALSE); -} -#endif - static int is_code_ctype(OnigCodePoint code, unsigned int ctype) { diff --git a/src/iso8859_11.c b/src/iso8859_11.c index da8fda0..8639ce2 100644 --- a/src/iso8859_11.c +++ b/src/iso8859_11.c @@ -2,7 +2,7 @@ iso8859_11.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/iso8859_13.c b/src/iso8859_13.c index 0cf251c..2bd460f 100644 --- a/src/iso8859_13.c +++ b/src/iso8859_13.c @@ -2,7 +2,7 @@ iso8859_13.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -121,32 +121,6 @@ mbc_case_fold(OnigCaseFoldType flag, return 1; } -#if 0 -static int -is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end) -{ - int v; - const UChar* p = *pp; - - if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) { - (*pp)++; - return TRUE; - } - - (*pp)++; - v = (EncISO_8859_13_CtypeTable[*p] & (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER)); - if ((v | BIT_CTYPE_LOWER) != 0) { - /* 0xdf, 0xb5 are lower case letter, but can't convert. */ - if (*p == 0xb5) - return FALSE; - else - return TRUE; - } - - return (v != 0 ? TRUE : FALSE); -} -#endif - static int is_code_ctype(OnigCodePoint code, unsigned int ctype) { diff --git a/src/iso8859_14.c b/src/iso8859_14.c index 030e9f5..5030b55 100644 --- a/src/iso8859_14.c +++ b/src/iso8859_14.c @@ -2,7 +2,7 @@ iso8859_14.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -121,29 +121,6 @@ mbc_case_fold(OnigCaseFoldType flag, return 1; /* return byte length of converted char to lower */ } -#if 0 -static int -is_mbc_ambiguous(OnigCaseFoldType flag, - const UChar** pp, const UChar* end) -{ - int v; - const UChar* p = *pp; - - if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) { - (*pp)++; - return TRUE; - } - - (*pp)++; - v = (EncISO_8859_14_CtypeTable[*p] & (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER)); - if ((v | BIT_CTYPE_LOWER) != 0) { - return TRUE; - } - - return (v != 0 ? 
TRUE : FALSE); -} -#endif - static int is_code_ctype(OnigCodePoint code, unsigned int ctype) { diff --git a/src/iso8859_15.c b/src/iso8859_15.c index 859d727..f32c3de 100644 --- a/src/iso8859_15.c +++ b/src/iso8859_15.c @@ -2,7 +2,7 @@ iso8859_15.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -121,32 +121,6 @@ mbc_case_fold(OnigCaseFoldType flag, return 1; /* return byte length of converted char to lower */ } -#if 0 -static int -is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end) -{ - int v; - const UChar* p = *pp; - - if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) { - (*pp)++; - return TRUE; - } - - (*pp)++; - v = (EncISO_8859_15_CtypeTable[*p] & (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER)); - if ((v | BIT_CTYPE_LOWER) != 0) { - /* 0xdf etc.. are lower case letter, but can't convert. */ - if (*p == 0xaa || *p == 0xb5 || *p == 0xba) - return FALSE; - else - return TRUE; - } - - return (v != 0 ? TRUE : FALSE); -} -#endif - static int is_code_ctype(OnigCodePoint code, unsigned int ctype) { diff --git a/src/iso8859_16.c b/src/iso8859_16.c index 2614e56..22a653a 100644 --- a/src/iso8859_16.c +++ b/src/iso8859_16.c @@ -2,7 +2,7 @@ iso8859_16.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -121,28 +121,6 @@ mbc_case_fold(OnigCaseFoldType flag, return 1; /* return byte length of converted char to lower */ } -#if 0 -static int -is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end) -{ - int v; - const UChar* p = *pp; - - if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) { - (*pp)++; - return TRUE; - } - - (*pp)++; - v = (EncISO_8859_16_CtypeTable[*p] & (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER)); - if ((v | BIT_CTYPE_LOWER) != 0) { - return TRUE; - } - - return (v != 0 ? TRUE : FALSE); -} -#endif - static int is_code_ctype(OnigCodePoint code, unsigned int ctype) { diff --git a/src/iso8859_2.c b/src/iso8859_2.c index ba030d5..dc3d0a1 100644 --- a/src/iso8859_2.c +++ b/src/iso8859_2.c @@ -2,7 +2,7 @@ iso8859_2.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -121,28 +121,6 @@ mbc_case_fold(OnigCaseFoldType flag, return 1; /* return byte length of converted char to lower */ } -#if 0 -static int -is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end) -{ - int v; - const UChar* p = *pp; - - if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) { - (*pp)++; - return TRUE; - } - - (*pp)++; - v = (EncISO_8859_2_CtypeTable[*p] & (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER)); - if ((v | BIT_CTYPE_LOWER) != 0) { - return TRUE; - } - - return (v != 0 ? 
TRUE : FALSE); -} -#endif - static const OnigPairCaseFoldCodes CaseFoldMap[] = { { 0xa1, 0xb1 }, { 0xa3, 0xb3 }, diff --git a/src/iso8859_3.c b/src/iso8859_3.c index f090d0b..49dc6b2 100644 --- a/src/iso8859_3.c +++ b/src/iso8859_3.c @@ -2,7 +2,7 @@ iso8859_3.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -121,32 +121,6 @@ mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, return 1; } -#if 0 -static int -is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end) -{ - int v; - const UChar* p = *pp; - - if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) { - (*pp)++; - return TRUE; - } - - (*pp)++; - v = (EncISO_8859_3_CtypeTable[*p] & (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER)); - if ((v | BIT_CTYPE_LOWER) != 0) { - /* 0xaa, 0xb5, 0xba are lower case letter, but can't convert. */ - if (*p == 0xb5) - return FALSE; - else - return TRUE; - } - - return (v != 0 ? TRUE : FALSE); -} -#endif - static int is_code_ctype(OnigCodePoint code, unsigned int ctype) { diff --git a/src/iso8859_4.c b/src/iso8859_4.c index 57dc9fe..f3f6ba9 100644 --- a/src/iso8859_4.c +++ b/src/iso8859_4.c @@ -2,7 +2,7 @@ iso8859_4.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -121,31 +121,6 @@ mbc_case_fold(OnigCaseFoldType flag, return 1; /* return byte length of converted char to lower */ } -#if 0 -static int -is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end) -{ - int v; - const UChar* p = *pp; - - if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) { - (*pp)++; - return TRUE; - } - - (*pp)++; - v = (EncISO_8859_4_CtypeTable[*p] & (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER)); - if ((v | BIT_CTYPE_LOWER) != 0) { - if (*p == 0xa2) - return FALSE; - else - return TRUE; - } - - return (v != 0 ? TRUE : FALSE); -} -#endif - static int is_code_ctype(OnigCodePoint code, unsigned int ctype) { diff --git a/src/iso8859_5.c b/src/iso8859_5.c index a090d25..a5f587c 100644 --- a/src/iso8859_5.c +++ b/src/iso8859_5.c @@ -2,7 +2,7 @@ iso8859_5.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -114,19 +114,6 @@ mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED, return 1; } -#if 0 -static int -is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end) -{ - int v; - const UChar* p = *pp; - - (*pp)++; - v = (EncISO_8859_5_CtypeTable[*p] & (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER)); - return (v != 0 ? 
TRUE : FALSE); -} -#endif - static int is_code_ctype(OnigCodePoint code, unsigned int ctype) { diff --git a/src/iso8859_6.c b/src/iso8859_6.c index 1c16c79..fb72442 100644 --- a/src/iso8859_6.c +++ b/src/iso8859_6.c @@ -2,7 +2,7 @@ iso8859_6.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/iso8859_7.c b/src/iso8859_7.c index 8c88351..018efac 100644 --- a/src/iso8859_7.c +++ b/src/iso8859_7.c @@ -2,7 +2,7 @@ iso8859_7.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -114,26 +114,6 @@ mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED, return 1; } -#if 0 -static int -is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end) -{ - int v; - const UChar* p = *pp; - - (*pp)++; - v = (EncISO_8859_7_CtypeTable[*p] & (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER)); - if ((v | BIT_CTYPE_LOWER) != 0) { - if (*p == 0xc0 || *p == 0xe0) - return FALSE; - else - return TRUE; - } - - return (v != 0 ? TRUE : FALSE); -} -#endif - static int is_code_ctype(OnigCodePoint code, unsigned int ctype) { diff --git a/src/iso8859_8.c b/src/iso8859_8.c index bd3e94d..92a5eb1 100644 --- a/src/iso8859_8.c +++ b/src/iso8859_8.c @@ -2,7 +2,7 @@ iso8859_8.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without diff --git a/src/iso8859_9.c b/src/iso8859_9.c index 1d291d5..1f9bdea 100644 --- a/src/iso8859_9.c +++ b/src/iso8859_9.c @@ -2,7 +2,7 @@ iso8859_9.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -121,32 +121,6 @@ mbc_case_fold(OnigCaseFoldType flag, return 1; } -#if 0 -static int -is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end) -{ - int v; - const UChar* p = *pp; - - if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) { - (*pp)++; - return TRUE; - } - - (*pp)++; - v = (EncISO_8859_9_CtypeTable[*p] & (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER)); - if ((v | BIT_CTYPE_LOWER) != 0) { - /* 0xdf etc.. are lower case letter, but can't convert. */ - if (*p >= 0xaa && *p <= 0xba) - return FALSE; - else - return TRUE; - } - - return (v != 0 ? TRUE : FALSE); -} -#endif - static int is_code_ctype(OnigCodePoint code, unsigned int ctype) { @@ -2,7 +2,7 @@ koi8.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -115,25 +115,6 @@ koi8_mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED, return 1; } -#if 0 -static int -koi8_is_mbc_ambiguous(OnigAmbigType flag, const OnigUChar** pp, const OnigUChar* end) -{ - const OnigUChar* p = *pp; - - (*pp)++; - if (((flag & ONIGENC_CASE_FOLD_ASCII_CASE) != 0 && - ONIGENC_IS_MBC_ASCII(p)) || - ((flag & ONIGENC_CASE_FOLD_NONASCII_CASE) != 0 && - !ONIGENC_IS_MBC_ASCII(p))) { - int v = (EncKOI8_CtypeTable[*p] & - (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER)); - return (v != 0 ? TRUE : FALSE); - } - return FALSE; -} -#endif - static int koi8_is_code_ctype(OnigCodePoint code, unsigned int ctype) { diff --git a/src/koi8_r.c b/src/koi8_r.c index 1284f7f..c77302f 100644 --- a/src/koi8_r.c +++ b/src/koi8_r.c @@ -2,7 +2,7 @@ koi8_r.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -114,19 +114,6 @@ koi8_r_mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED, return 1; } -#if 0 -static int -koi8_r_is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end) -{ - int v; - const UChar* p = *pp; - - (*pp)++; - v = (EncKOI8_R_CtypeTable[*p] & (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER)); - return (v != 0 ? 
TRUE : FALSE); -} -#endif - static int koi8_r_is_code_ctype(OnigCodePoint code, unsigned int ctype) { diff --git a/src/make_property.sh b/src/make_property.sh index bc5cf98..cef0a96 100755 --- a/src/make_property.sh +++ b/src/make_property.sh @@ -1,8 +1,9 @@ #!/bin/sh +GPERF=gperf + TMP1=gperf1.tmp TMP2=gperf2.tmp -GPERF=/usr/local/bin/gperf GPERF_OPT='-pt -T -L ANSI-C' diff --git a/src/make_unicode_egcb_data.py b/src/make_unicode_egcb_data.py index 0f63f97..9c71796 100755 --- a/src/make_unicode_egcb_data.py +++ b/src/make_unicode_egcb_data.py @@ -1,7 +1,7 @@ #!/usr/bin/python # -*- coding: utf-8 -*- # make_unicode_egcb_data.py -# Copyright (c) 2017-2018 K.Kosako +# Copyright (c) 2017-2019 K.Kosako import sys import re @@ -13,18 +13,19 @@ PR_LINE_REG = re.compile("([0-9A-Fa-f]+)(?:..([0-9A-Fa-f]+))?\s*;\s*(\w+)") PA_LINE_REG = re.compile("(\w+)\s*;\s*(\w+)") PVA_LINE_REG = re.compile("(sc|gc)\s*;\s*(\w+)\s*;\s*(\w+)(?:\s*;\s*(\w+))?") BL_LINE_REG = re.compile("([0-9A-Fa-f]+)\.\.([0-9A-Fa-f]+)\s*;\s*(.*)") -VERSION_REG = re.compile("#\s*.*-(\d+\.\d+\.\d+)\.txt") +VERSION_REG = re.compile("#\s*.*-(\d+)\.(\d+)\.(\d+)\.txt") -VERSION_INFO = None +VERSION_INFO = [-1, -1, -1] DIC = { } PROPS = [] PropIndex = { } def check_version_info(s): - global VERSION_INFO m = VERSION_REG.match(s) if m is not None: - VERSION_INFO = m.group(1) + VERSION_INFO[0] = int(m.group(1)) + VERSION_INFO[1] = int(m.group(2)) + VERSION_INFO[2] = int(m.group(3)) def print_ranges(ranges): for (start, end) in ranges: @@ -160,7 +161,7 @@ def parse_properties(path): continue if s[0] == '#': - if VERSION_INFO is None: + if VERSION_INFO[0] < 0: check_version_info(s) m = PR_LINE_REG.match(s) @@ -194,7 +195,7 @@ PROPS = sorted(PROPS) print '/* unicode_egcb_data.c: Generated by make_unicode_egcb_data.py. */' COPYRIGHT = ''' /*- - * Copyright (c) 2017-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2017-2019 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -222,9 +223,11 @@ COPYRIGHT = ''' print COPYRIGHT print '' -if VERSION_INFO is not None: - print "#define GRAPHEME_BREAK_PROPERTY_VERSION %s" % re.sub(r'[\.-]', '_', VERSION_INFO) - print '' +if VERSION_INFO[0] < 0: + raise RuntimeError("Version is not found") + +print "#define GRAPHEME_BREAK_PROPERTY_VERSION %02d%02d%02d" % (VERSION_INFO[0], VERSION_INFO[1], VERSION_INFO[2]) +print '' ranges = [] for prop in PROPS: diff --git a/src/make_unicode_fold.sh b/src/make_unicode_fold.sh index 35ce974..1d5cc1e 100755 --- a/src/make_unicode_fold.sh +++ b/src/make_unicode_fold.sh @@ -1,6 +1,6 @@ #!/bin/sh -GPERF=/usr/local/bin/gperf +GPERF=gperf TMP0=gperf0.tmp TMP1=gperf1.tmp diff --git a/src/make_unicode_fold_data.py b/src/make_unicode_fold_data.py index 783988c..55d5b88 100755 --- a/src/make_unicode_fold_data.py +++ b/src/make_unicode_fold_data.py @@ -1,7 +1,7 @@ #!/usr/bin/python # -*- coding: utf-8 -*- # make_unicode_fold_data.py -# Copyright (c) 2016-2018 K.Kosako +# Copyright (c) 2016-2019 K.Kosako import sys import re @@ -16,9 +16,9 @@ DataName = 'OnigUnicodeFolds' ENCODING = 'utf-8' LINE_REG = re.compile("([0-9A-F]{1,6}); (.); ([0-9A-F]{1,6})(?: ([0-9A-F]{1,6}))?(?: ([0-9A-F]{1,6}))?;(?:\s*#\s*)(.*)") -VERSION_REG = re.compile("#.*-(\d+\.\d+\.\d+)\.txt") +VERSION_REG = re.compile("#.*-(\d+)\.(\d+)\.(\d+)\.txt") -VERSION_INFO = None +VERSION_INFO = [-1, -1, -1] FOLDS = {} TURKISH_FOLDS = {} @@ -56,18 +56,19 @@ def form3bytes(x): return "\\x%02x\\x%02x\\x%02x" % (x2, x1, x0) def check_version_info(s): - global VERSION_INFO - if VERSION_INFO is None: - m = VERSION_REG.match(s) - if m is not None: - VERSION_INFO = m.group(1) + m = VERSION_REG.match(s) + if m is not None: + VERSION_INFO[0] = int(m.group(1)) + VERSION_INFO[1] = int(m.group(2)) + VERSION_INFO[2] = int(m.group(3)) def parse_line(s): if len(s) == 0: - return False + return False if s[0] == '#': + if VERSION_INFO[0] < 0: 
check_version_info(s) - return False + return False m = LINE_REG.match(s) if m is None: @@ -232,9 +233,11 @@ def output_fold_source(f, out_comment): print >> f, "/* This file was generated by make_unicode_fold_data.py. */" print >> f, '#include "regenc.h"' print >> f, '' - if VERSION_INFO is not None: - print "#define UNICODE_CASEFOLD_VERSION %s" % re.sub(r'[\.-]', '_', VERSION_INFO) - print '' + if VERSION_INFO[0] < 0: + raise RuntimeError("Version is not found") + + print "#define UNICODE_CASEFOLD_VERSION %02d%02d%02d" % (VERSION_INFO[0], VERSION_INFO[1], VERSION_INFO[2]) + print '' #output_macros(f, DataName) print >> f, '' #output_typedef(f) @@ -246,7 +249,7 @@ HEAD = ''' /* This gperf source file was generated by make_unicode_fold_data.py */ /*- - * Copyright (c) 2017-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2017-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/make_unicode_property.sh b/src/make_unicode_property.sh index 124d76a..51c8951 100755 --- a/src/make_unicode_property.sh +++ b/src/make_unicode_property.sh @@ -1,10 +1,11 @@ #!/bin/sh +GPERF=gperf + NAME=unicode_property_data TMP1=gperf1.tmp TMP2=gperf2.tmp TMP= -GPERF=/usr/local/bin/gperf GPERF_OPT='-T -C -c -t -j1 -L ANSI-C --ignore-case --pic -Q unicode_prop_name_pool' POOL_CAST='s/\(int *\)\(size_t *\)&\(\(struct +unicode_prop_name_pool_t *\* *\) *0\)->unicode_prop_name_pool_str([^,]+)/pool_offset(\1)/g' diff --git a/src/make_unicode_property_data.py b/src/make_unicode_property_data.py index dc3071a..9776628 100755 --- a/src/make_unicode_property_data.py +++ b/src/make_unicode_property_data.py @@ -1,7 +1,7 @@ #!/usr/bin/python # -*- coding: utf-8 -*- # make_unicode_property_data.py -# Copyright (c) 2016-2018 K.Kosako +# Copyright (c) 2016-2019 K.Kosako import sys import re @@ -22,9 +22,12 @@ PR_LINE_REG = re.compile("([0-9A-Fa-f]+)(?:..([0-9A-Fa-f]+))?\s*;\s*(\w+)") PA_LINE_REG = 
re.compile("(\w+)\s*;\s*(\w+)") PVA_LINE_REG = re.compile("(sc|gc)\s*;\s*(\w+)\s*;\s*(\w+)(?:\s*;\s*(\w+))?") BL_LINE_REG = re.compile("([0-9A-Fa-f]+)\.\.([0-9A-Fa-f]+)\s*;\s*(.*)") -VERSION_REG = re.compile("#\s*.*-(\d+\.\d+\.\d+)\.txt") +UNICODE_VERSION_REG = re.compile("#\s*.*-(\d+)\.(\d+)\.(\d+)\.txt") +EMOJI_VERSION_REG = re.compile("(?i)#\s*Version:\s*(\d+)\.(\d+)") + +VERSION_INFO = [-1, -1, -1] +EMOJI_VERSION_INFO = [-1, -1] -VERSION_INFO = None DIC = { } KDIC = { } PropIndex = { } @@ -40,13 +43,6 @@ def fix_block_name(name): s = re.sub(r'[- ]+', '_', name) return 'In_' + s -def check_version_info(s): - global VERSION_INFO - m = VERSION_REG.match(s) - if m is not None: - VERSION_INFO = m.group(1) - - def print_ranges(ranges): for (start, end) in ranges: print "0x%06x, 0x%06x" % (start, end) @@ -233,7 +229,8 @@ def parse_unicode_data_file(f): normalize_ranges_in_dic(dic) return dic, assigned -def parse_properties(path, klass, prop_prefix = None): +def parse_properties(path, klass, prop_prefix = None, version_reg = None): + version_match = None with open(path, 'r') as f: dic = { } prop = None @@ -243,9 +240,10 @@ def parse_properties(path, klass, prop_prefix = None): if len(s) == 0: continue - if s[0] == '#': - if VERSION_INFO is None: - check_version_info(s) + if s[0] == '#' and version_reg is not None and version_match is None: + version_match = version_reg.match(s) + if version_match is not None: + continue m = PR_LINE_REG.match(s) if m: @@ -266,7 +264,7 @@ def parse_properties(path, klass, prop_prefix = None): props.append(prop) normalize_ranges_in_dic(dic) - return (dic, props) + return (dic, props, version_match) def parse_property_aliases(path): a = { } @@ -414,11 +412,11 @@ def entry_and_print_prop_and_index(name, index): nname = normalize_prop_name(name) print_prop_and_index(nname, index) -def parse_and_merge_properties(path, klass): - dic, props = parse_properties(path, klass) +def parse_and_merge_properties(path, klass, prop_prefix = None, 
version_reg = None): + dic, props, ver_m = parse_properties(path, klass, prop_prefix, version_reg) merge_dic(DIC, dic) merge_props(PROPS, props) - return dic, props + return dic, props, ver_m ### main ### argv = sys.argv @@ -447,11 +445,21 @@ with open('UnicodeData.txt', 'r') as f: PROPS = DIC.keys() PROPS = list_sub(PROPS, POSIX_LIST) -parse_and_merge_properties('DerivedCoreProperties.txt', 'Derived Property') -dic, props = parse_and_merge_properties('Scripts.txt', 'Script') +_, _, ver_m = parse_and_merge_properties('DerivedCoreProperties.txt', 'Derived Property', None, UNICODE_VERSION_REG) +if ver_m is not None: + VERSION_INFO[0] = int(ver_m.group(1)) + VERSION_INFO[1] = int(ver_m.group(2)) + VERSION_INFO[2] = int(ver_m.group(3)) + +dic, props, _ = parse_and_merge_properties('Scripts.txt', 'Script') DIC['Unknown'] = inverse_ranges(add_ranges_in_dic(dic)) + parse_and_merge_properties('PropList.txt', 'Binary Property') -parse_and_merge_properties('emoji-data.txt', 'Emoji Property') + +_, _, ver_m = parse_and_merge_properties('emoji-data.txt', 'Emoji Property', None, EMOJI_VERSION_REG) +if ver_m is not None: + EMOJI_VERSION_INFO[0] = int(ver_m.group(1)) + EMOJI_VERSION_INFO[1] = int(ver_m.group(2)) PROPS.append('Unknown') KDIC['Unknown'] = 'Script' @@ -464,9 +472,9 @@ dic, BLOCKS = parse_blocks('Blocks.txt') merge_dic(DIC, dic) if INCLUDE_GRAPHEME_CLUSTER_DATA: - dic, props = parse_properties('GraphemeBreakProperty.txt', - 'GraphemeBreak Property', - GRAPHEME_CLUSTER_BREAK_NAME_PREFIX) + dic, props, _ = parse_properties('GraphemeBreakProperty.txt', + 'GraphemeBreak Property', + GRAPHEME_CLUSTER_BREAK_NAME_PREFIX) merge_dic(DIC, dic) merge_props(PROPS, props) #prop = GRAPHEME_CLUSTER_BREAK_NAME_PREFIX + 'Other' @@ -533,9 +541,13 @@ sys.stdout.write(s) if OUTPUT_LIST_MODE: UPF = open("UNICODE_PROPERTIES", "w") - if VERSION_INFO is not None: - print >> UPF, "Unicode Properties (from Unicode Version: %s)" % VERSION_INFO - print >> UPF, '' + if VERSION_INFO[0] < 0: + 
raise RuntimeError("Unicode Version is not found") + if EMOJI_VERSION_INFO[0] < 0: + raise RuntimeError("Emoji Version is not found") + + print >> UPF, "Unicode Properties (Unicode Version: %d.%d.%d, Emoji: %d.%d)" % (VERSION_INFO[0], VERSION_INFO[1], VERSION_INFO[2], EMOJI_VERSION_INFO[0], EMOJI_VERSION_INFO[1]) + print >> UPF, '' index = -1 for prop in POSIX_LIST: @@ -569,9 +581,14 @@ if not(POSIX_ONLY): print '%%' print '' if not(POSIX_ONLY): - if VERSION_INFO is not None: - print "#define UNICODE_PROPERTY_VERSION %s" % re.sub(r'[\.-]', '_', VERSION_INFO) - print '' + if VERSION_INFO[0] < 0: + raise RuntimeError("Unicode Version is not found") + if EMOJI_VERSION_INFO[0] < 0: + raise RuntimeError("Emoji Version is not found") + + print "#define UNICODE_PROPERTY_VERSION %02d%02d%02d" % (VERSION_INFO[0], VERSION_INFO[1], VERSION_INFO[2]) + print "#define UNICODE_EMOJI_VERSION %02d%02d" % (EMOJI_VERSION_INFO[0], EMOJI_VERSION_INFO[1]) + print '' print "#define PROPERTY_NAME_MAX_SIZE %d" % (PROPERTY_NAME_MAX_LEN + 10) print "#define CODE_RANGES_NUM %d" % (index + 1) diff --git a/src/make_unicode_wb_data.py b/src/make_unicode_wb_data.py index 624fa7e..ddedd5d 100755 --- a/src/make_unicode_wb_data.py +++ b/src/make_unicode_wb_data.py @@ -13,18 +13,19 @@ PR_LINE_REG = re.compile("([0-9A-Fa-f]+)(?:..([0-9A-Fa-f]+))?\s*;\s*(\w+)") PA_LINE_REG = re.compile("(\w+)\s*;\s*(\w+)") PVA_LINE_REG = re.compile("(sc|gc)\s*;\s*(\w+)\s*;\s*(\w+)(?:\s*;\s*(\w+))?") BL_LINE_REG = re.compile("([0-9A-Fa-f]+)\.\.([0-9A-Fa-f]+)\s*;\s*(.*)") -VERSION_REG = re.compile("#\s*.*-(\d+\.\d+\.\d+)\.txt") +VERSION_REG = re.compile("#\s*.*-(\d+)\.(\d+)\.(\d+)\.txt") -VERSION_INFO = None +VERSION_INFO = [-1, -1, -1] DIC = { } PROPS = [] PropIndex = { } def check_version_info(s): - global VERSION_INFO m = VERSION_REG.match(s) if m is not None: - VERSION_INFO = m.group(1) + VERSION_INFO[0] = int(m.group(1)) + VERSION_INFO[1] = int(m.group(2)) + VERSION_INFO[2] = int(m.group(3)) def 
print_ranges(ranges): for (start, end) in ranges: @@ -160,7 +161,7 @@ def parse_properties(path): continue if s[0] == '#': - if VERSION_INFO is None: + if VERSION_INFO[0] < 0: check_version_info(s) m = PR_LINE_REG.match(s) @@ -194,7 +195,7 @@ PROPS = sorted(PROPS) print '/* unicode_wb_data.c: Generated by make_unicode_wb_data.py. */' COPYRIGHT = ''' /*- - * Copyright (c) 2019 K.Kosako <kkosako0 AT gmail DOT com> + * Copyright (c) 2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -222,9 +223,11 @@ COPYRIGHT = ''' print COPYRIGHT print '' -if VERSION_INFO is not None: - print "#define WORD_BREAK_PROPERTY_VERSION %s" % re.sub(r'[\.-]', '_', VERSION_INFO) - print '' +if VERSION_INFO[0] < 0: + raise RuntimeError("Version is not found.") + +print "#define WORD_BREAK_PROPERTY_VERSION %02d%02d%02d" % (VERSION_INFO[0], VERSION_INFO[1], VERSION_INFO[2]) +print '' ranges = [] for prop in PROPS: diff --git a/src/mktable.c b/src/mktable.c index 80ac08a..318bac0 100644 --- a/src/mktable.c +++ b/src/mktable.c @@ -2,7 +2,7 @@ mktable.c **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/onig_init.c b/src/onig_init.c index 7ad98b7..c660e7d 100644 --- a/src/onig_init.c +++ b/src/onig_init.c @@ -2,7 +2,7 @@ onig_init.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2016-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2016-2019 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without diff --git a/src/oniggnu.h b/src/oniggnu.h index d688883..96d9085 100644 --- a/src/oniggnu.h +++ b/src/oniggnu.h @@ -4,7 +4,7 @@ oniggnu.h - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2005 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/onigposix.h b/src/onigposix.h index da0f919..5ff779f 100644 --- a/src/onigposix.h +++ b/src/onigposix.h @@ -4,7 +4,7 @@ onigposix.h - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -95,6 +95,7 @@ typedef struct { #endif #endif +#ifndef ONIG_STATIC #ifndef ONIG_EXTERN #if defined(_WIN32) && !defined(__GNUC__) #if defined(ONIGURUMA_EXPORT) @@ -108,6 +109,9 @@ typedef struct { #ifndef ONIG_EXTERN #define ONIG_EXTERN extern #endif +#else +#define ONIG_EXTERN extern +#endif #ifndef ONIGURUMA_H typedef unsigned int OnigOptionType; diff --git a/src/oniguruma.h b/src/oniguruma.h index f6aa5ba..08ac6f7 100644 --- a/src/oniguruma.h +++ b/src/oniguruma.h @@ -4,7 +4,7 @@ oniguruma.h - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -36,9 +36,9 @@ extern "C" { #define ONIGURUMA #define ONIGURUMA_VERSION_MAJOR 6 #define ONIGURUMA_VERSION_MINOR 9 -#define ONIGURUMA_VERSION_TEENY 2 +#define ONIGURUMA_VERSION_TEENY 4 -#define ONIGURUMA_VERSION_INT 60902 +#define ONIGURUMA_VERSION_INT 60904 #ifndef P_ #if defined(__STDC__) || defined(_WIN32) @@ -52,6 +52,7 @@ extern "C" { # define PV_(args) args #endif +#ifndef ONIG_STATIC #ifndef ONIG_EXTERN #if defined(_WIN32) && !defined(__GNUC__) #if defined(ONIGURUMA_EXPORT) @@ -65,6 +66,9 @@ extern "C" { #ifndef ONIG_EXTERN #define ONIG_EXTERN extern #endif +#else +#define ONIG_EXTERN extern +#endif /* PART: character encoding */ @@ -517,6 +521,7 @@ ONIG_EXTERN OnigSyntaxType* OnigDefaultSyntax; #define ONIG_SYN_BACKSLASH_ESCAPE_IN_CC (1U<<21) /* [..\w..] etc.. */ #define ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC (1U<<22) #define ONIG_SYN_ALLOW_DOUBLE_RANGE_OP_IN_CC (1U<<23) /* [0-9-a]=[0-9\-a] */ +#define ONIG_SYN_ALLOW_INVALID_CODE_END_OF_RANGE_IN_CC (1U<<26) /* syntax (behavior) warning */ #define ONIG_SYN_WARN_CC_OP_NOT_ESCAPED (1U<<24) /* [,-,] */ #define ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT (1U<<25) /* (?:a*)+ */ @@ -682,6 +687,14 @@ typedef OnigRegexType* OnigRegex; typedef OnigRegexType regex_t; #endif +struct OnigRegSetStruct; +typedef struct OnigRegSetStruct OnigRegSet; + +typedef enum { + ONIG_REGSET_POSITION_LEAD = 0, + ONIG_REGSET_REGEX_LEAD = 1, + ONIG_REGSET_PRIORITY_TO_REGEX_ORDER = 2 +} OnigRegSetLead; typedef struct { int num_of_elements; @@ -766,6 +779,8 @@ int onig_init P_((void)); ONIG_EXTERN int onig_error_code_to_str PV_((OnigUChar* s, int err_code, ...)); ONIG_EXTERN +int onig_is_error_code_needs_param PV_((int code)); +ONIG_EXTERN void onig_set_warn_func P_((OnigWarnFunc f)); ONIG_EXTERN void onig_set_verb_warn_func P_((OnigWarnFunc f)); @@ -790,6 +805,26 @@ ONIG_EXTERN int onig_match P_((OnigRegex, const OnigUChar* str, const OnigUChar* end, const OnigUChar* at, 
OnigRegion* region, OnigOptionType option)); ONIG_EXTERN int onig_match_with_param P_((OnigRegex, const OnigUChar* str, const OnigUChar* end, const OnigUChar* at, OnigRegion* region, OnigOptionType option, OnigMatchParam* mp)); + +ONIG_EXTERN +int onig_regset_new P_((OnigRegSet** rset, int n, regex_t* regs[])); +ONIG_EXTERN +int onig_regset_add P_((OnigRegSet* set, regex_t* reg)); +ONIG_EXTERN +int onig_regset_replace P_((OnigRegSet* set, int at, regex_t* reg)); +ONIG_EXTERN +void onig_regset_free P_((OnigRegSet* set)); +ONIG_EXTERN +int onig_regset_number_of_regex P_((OnigRegSet* set)); +ONIG_EXTERN +regex_t* onig_regset_get_regex P_((OnigRegSet* set, int at)); +ONIG_EXTERN +OnigRegion* onig_regset_get_region P_((OnigRegSet* set, int at)); +ONIG_EXTERN +int onig_regset_search P_((OnigRegSet* set, const OnigUChar* str, const OnigUChar* end, const OnigUChar* start, const OnigUChar* range, OnigRegSetLead lead, OnigOptionType option, int* rmatch_pos)); +ONIG_EXTERN +int onig_regset_search_with_param P_((OnigRegSet* set, const OnigUChar* str, const OnigUChar* end, const OnigUChar* start, const OnigUChar* range, OnigRegSetLead lead, OnigOptionType option, OnigMatchParam* mps[], int* rmatch_pos)); + ONIG_EXTERN OnigRegion* onig_region_new P_((void)); ONIG_EXTERN diff --git a/src/regcomp.c b/src/regcomp.c index c2c04a4..69d4b95 100644 --- a/src/regcomp.c +++ b/src/regcomp.c @@ -2,7 +2,7 @@ regcomp.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -224,17 +224,17 @@ ops_free(regex_t* reg) #endif switch (opcode) { - case OP_EXACTMBN: + case OP_STR_MBN: if (! 
is_in_string_pool(reg, op->exact_len_n.s)) xfree(op->exact_len_n.s); break; - case OP_EXACTN: case OP_EXACTMB2N: case OP_EXACTMB3N: case OP_EXACTN_IC: + case OP_STR_N: case OP_STR_MB2N: case OP_STR_MB3N: case OP_STR_N_IC: if (! is_in_string_pool(reg, op->exact_n.s)) xfree(op->exact_n.s); break; - case OP_EXACT1: case OP_EXACT2: case OP_EXACT3: case OP_EXACT4: - case OP_EXACT5: case OP_EXACTMB2N1: case OP_EXACTMB2N2: - case OP_EXACTMB2N3: case OP_EXACT1_IC: + case OP_STR_1: case OP_STR_2: case OP_STR_3: case OP_STR_4: + case OP_STR_5: case OP_STR_MB2N1: case OP_STR_MB2N2: + case OP_STR_MB2N3: case OP_STR_1_IC: break; case OP_CCLASS_NOT: case OP_CCLASS: @@ -298,17 +298,17 @@ ops_calc_size_of_string_pool(regex_t* reg) #endif switch (opcode) { - case OP_EXACTMBN: + case OP_STR_MBN: total += op->exact_len_n.len * op->exact_len_n.n; break; - case OP_EXACTN: - case OP_EXACTN_IC: + case OP_STR_N: + case OP_STR_N_IC: total += op->exact_n.n; break; - case OP_EXACTMB2N: + case OP_STR_MB2N: total += op->exact_n.n * 2; break; - case OP_EXACTMB3N: + case OP_STR_MB3N: total += op->exact_n.n * 3; break; @@ -349,15 +349,15 @@ ops_make_string_pool(regex_t* reg) #endif switch (opcode) { - case OP_EXACTMBN: + case OP_STR_MBN: len = op->exact_len_n.len * op->exact_len_n.n; xmemcpy(curr, op->exact_len_n.s, len); xfree(op->exact_len_n.s); op->exact_len_n.s = curr; curr += len; break; - case OP_EXACTN: - case OP_EXACTN_IC: + case OP_STR_N: + case OP_STR_N_IC: len = op->exact_n.n; copy: xmemcpy(curr, op->exact_n.s, len); @@ -365,11 +365,11 @@ ops_make_string_pool(regex_t* reg) op->exact_n.s = curr; curr += len; break; - case OP_EXACTMB2N: + case OP_STR_MB2N: len = op->exact_n.n * 2; goto copy; break; - case OP_EXACTMB3N: + case OP_STR_MB3N: len = op->exact_n.n * 3; goto copy; break; @@ -427,7 +427,7 @@ onig_positive_int_multiply(int x, int y) static void -swap_node(Node* a, Node* b) +node_swap(Node* a, Node* b) { Node c; @@ -452,6 +452,81 @@ swap_node(Node* a, Node* b) } } +static int 
+node_list_len(Node* list) +{ + int len; + + len = 1; + while (IS_NOT_NULL(NODE_CDR(list))) { + list = NODE_CDR(list); + len++; + } + + return len; +} + +static Node* +node_list_add(Node* list, Node* x) +{ + Node *n; + + n = onig_node_new_list(x, NULL); + if (IS_NULL(n)) return NULL_NODE; + + if (IS_NOT_NULL(list)) { + while (IS_NOT_NULL(NODE_CDR(list))) + list = NODE_CDR(list); + + NODE_CDR(list) = n; + } + + return n; +} + +static int +node_str_node_cat(Node* node, Node* add) +{ + int r; + + if (STR_(node)->flag != STR_(add)->flag) + return ONIGERR_TYPE_BUG; + + r = onig_node_str_cat(node, STR_(add)->s, STR_(add)->end); + if (r != 0) return r; + + if (NODE_STRING_IS_CASE_FOLD_MATCH(node)) + STR_(node)->case_min_len += STR_(add)->case_min_len; + + return 0; +} + +static int +node_str_cat_case_fold(Node* node, const UChar* s, const UChar* end, int case_min_len) +{ + int r; + + if (! NODE_STRING_IS_CASE_FOLD_MATCH(node)) + return ONIGERR_TYPE_BUG; + + r = onig_node_str_cat(node, s, end); + if (r != 0) return r; + + STR_(node)->case_min_len += case_min_len; + return 0; +} + +static void +node_conv_to_str_node(Node* node, int flag) +{ + NODE_SET_TYPE(node, NODE_STRING); + STR_(node)->flag = flag; + STR_(node)->s = STR_(node)->buf; + STR_(node)->end = STR_(node)->buf; + STR_(node)->capacity = 0; + STR_(node)->case_min_len = 0; +} + static OnigLen distance_add(OnigLen d1, OnigLen d2) { @@ -549,81 +624,108 @@ static int compile_length_tree(Node* node, regex_t* reg); static int compile_tree(Node* node, regex_t* reg, ScanEnv* env); -#define IS_NEED_STR_LEN_OP_EXACT(op) \ - ((op) == OP_EXACTN || (op) == OP_EXACTMB2N ||\ - (op) == OP_EXACTMB3N || (op) == OP_EXACTMBN || (op) == OP_EXACTN_IC) +#define IS_NEED_STR_LEN_OP(op) \ + ((op) == OP_STR_N || (op) == OP_STR_MB2N ||\ + (op) == OP_STR_MB3N || (op) == OP_STR_MBN || (op) == OP_STR_N_IC) static int -select_str_opcode(int mb_len, int str_len, int ignore_case) +select_str_opcode(int mb_len, int str_len) { int op; - if 
(ignore_case) { + switch (mb_len) { + case 1: switch (str_len) { - case 1: op = OP_EXACT1_IC; break; - default: op = OP_EXACTN_IC; break; + case 1: op = OP_STR_1; break; + case 2: op = OP_STR_2; break; + case 3: op = OP_STR_3; break; + case 4: op = OP_STR_4; break; + case 5: op = OP_STR_5; break; + default: op = OP_STR_N; break; } - } - else { - switch (mb_len) { - case 1: - switch (str_len) { - case 1: op = OP_EXACT1; break; - case 2: op = OP_EXACT2; break; - case 3: op = OP_EXACT3; break; - case 4: op = OP_EXACT4; break; - case 5: op = OP_EXACT5; break; - default: op = OP_EXACTN; break; - } - break; + break; - case 2: - switch (str_len) { - case 1: op = OP_EXACTMB2N1; break; - case 2: op = OP_EXACTMB2N2; break; - case 3: op = OP_EXACTMB2N3; break; - default: op = OP_EXACTMB2N; break; - } - break; + case 2: + switch (str_len) { + case 1: op = OP_STR_MB2N1; break; + case 2: op = OP_STR_MB2N2; break; + case 3: op = OP_STR_MB2N3; break; + default: op = OP_STR_MB2N; break; + } + break; - case 3: - op = OP_EXACTMB3N; - break; + case 3: + op = OP_STR_MB3N; + break; - default: - op = OP_EXACTMBN; - break; - } + default: + op = OP_STR_MBN; + break; } + return op; } static int -compile_tree_empty_check(Node* node, regex_t* reg, int empty_info, ScanEnv* env) +is_strict_real_node(Node* node) +{ + switch (NODE_TYPE(node)) { + case NODE_STRING: + { + StrNode* sn = STR_(node); + return (sn->end != sn->s); + } + break; + + case NODE_CCLASS: + case NODE_CTYPE: + return 1; + break; + + default: + return 0; + break; + } +} + +static int +compile_quant_body_with_empty_check(QuantNode* qn, regex_t* reg, ScanEnv* env) { int r; - int saved_num_null_check = reg->num_null_check; + int saved_num_empty_check; + int emptiness; + Node* body; - if (empty_info != BODY_IS_NOT_EMPTY) { + body = NODE_BODY((Node* )qn); + emptiness = qn->emptiness; + saved_num_empty_check = reg->num_empty_check; + + if (emptiness != BODY_IS_NOT_EMPTY) { r = add_op(reg, OP_EMPTY_CHECK_START); if (r != 0) return r; - 
COP(reg)->empty_check_start.mem = reg->num_null_check; /* NULL CHECK ID */ - reg->num_null_check++; + COP(reg)->empty_check_start.mem = reg->num_empty_check; /* NULL CHECK ID */ + reg->num_empty_check++; } - r = compile_tree(node, reg, env); + r = compile_tree(body, reg, env); if (r != 0) return r; - if (empty_info != BODY_IS_NOT_EMPTY) { - if (empty_info == BODY_IS_EMPTY) + if (emptiness != BODY_IS_NOT_EMPTY) { + if (emptiness == BODY_IS_EMPTY_POSSIBILITY) r = add_op(reg, OP_EMPTY_CHECK_END); - else if (empty_info == BODY_IS_EMPTY_MEM) - r = add_op(reg, OP_EMPTY_CHECK_END_MEMST); - else if (empty_info == BODY_IS_EMPTY_REC) + else if (emptiness == BODY_IS_EMPTY_POSSIBILITY_MEM) { + if (NODE_IS_EMPTY_STATUS_CHECK(qn) != 0) + r = add_op(reg, OP_EMPTY_CHECK_END_MEMST); + else + r = add_op(reg, OP_EMPTY_CHECK_END); + } +#ifdef USE_CALL + else if (emptiness == BODY_IS_EMPTY_POSSIBILITY_REC) r = add_op(reg, OP_EMPTY_CHECK_END_MEMST_PUSH); +#endif if (r != 0) return r; - COP(reg)->empty_check_end.mem = saved_num_null_check; /* NULL CHECK ID */ + COP(reg)->empty_check_end.mem = saved_num_empty_check; /* NULL CHECK ID */ } return r; } @@ -660,14 +762,13 @@ compile_tree_n_times(Node* node, int n, regex_t* reg, ScanEnv* env) static int add_compile_string_length(UChar* s ARG_UNUSED, int mb_len, int str_len, - regex_t* reg ARG_UNUSED, int ignore_case) + regex_t* reg ARG_UNUSED) { return 1; } static int -add_compile_string(UChar* s, int mb_len, int str_len, - regex_t* reg, int ignore_case) +add_compile_string(UChar* s, int mb_len, int str_len, regex_t* reg) { int op; int r; @@ -675,14 +776,14 @@ add_compile_string(UChar* s, int mb_len, int str_len, UChar* p; UChar* end; - op = select_str_opcode(mb_len, str_len, ignore_case); + op = select_str_opcode(mb_len, str_len); r = add_op(reg, op); if (r != 0) return r; byte_len = mb_len * str_len; end = s + byte_len; - if (op == OP_EXACTMBN) { + if (op == OP_STR_MBN) { p = onigenc_strdup(reg->enc, s, end); CHECK_NULL_RETURN_MEMERR(p); @@ 
-690,11 +791,11 @@ add_compile_string(UChar* s, int mb_len, int str_len, COP(reg)->exact_len_n.n = str_len; COP(reg)->exact_len_n.s = p; } - else if (IS_NEED_STR_LEN_OP_EXACT(op)) { + else if (IS_NEED_STR_LEN_OP(op)) { p = onigenc_strdup(reg->enc, s, end); CHECK_NULL_RETURN_MEMERR(p); - if (op == OP_EXACTN_IC) + if (op == OP_STR_N_IC) COP(reg)->exact_n.n = byte_len; else COP(reg)->exact_n.n = str_len; @@ -702,8 +803,8 @@ add_compile_string(UChar* s, int mb_len, int str_len, COP(reg)->exact_n.s = p; } else { + xmemset(COP(reg)->exact.s, 0, sizeof(COP(reg)->exact.s)); xmemcpy(COP(reg)->exact.s, s, (size_t )byte_len); - COP(reg)->exact.s[byte_len] = '\0'; } return 0; @@ -712,7 +813,7 @@ add_compile_string(UChar* s, int mb_len, int str_len, static int compile_length_string_node(Node* node, regex_t* reg) { - int rlen, r, len, prev_len, slen, ambig; + int rlen, r, len, prev_len, slen; UChar *p, *prev; StrNode* sn; OnigEncoding enc = reg->enc; @@ -721,7 +822,7 @@ compile_length_string_node(Node* node, regex_t* reg) if (sn->end <= sn->s) return 0; - ambig = NODE_STRING_IS_AMBIG(node); + if (NODE_STRING_IS_CASE_FOLD_MATCH(node) != 0) return 1; p = prev = sn->s; prev_len = enclen(enc, p); @@ -735,7 +836,7 @@ compile_length_string_node(Node* node, regex_t* reg) slen++; } else { - r = add_compile_string_length(prev, prev_len, slen, reg, ambig); + r = add_compile_string_length(prev, prev_len, slen, reg); rlen += r; prev = p; slen = 1; @@ -744,25 +845,59 @@ compile_length_string_node(Node* node, regex_t* reg) p += len; } - r = add_compile_string_length(prev, prev_len, slen, reg, ambig); + r = add_compile_string_length(prev, prev_len, slen, reg); rlen += r; return rlen; } static int -compile_length_string_raw_node(StrNode* sn, regex_t* reg) +compile_length_string_crude_node(StrNode* sn, regex_t* reg) { if (sn->end <= sn->s) return 0; return add_compile_string_length(sn->s, 1 /* sb */, (int )(sn->end - sn->s), - reg, 0); + reg); +} + +static int +compile_ambig_string_node(Node* 
node, regex_t* reg) +{ + int r; + int len; + int byte_len; + UChar* p; + StrNode* sn; + OnigEncoding enc = reg->enc; + + sn = STR_(node); + len = enclen(enc, sn->s); + byte_len = (int )(sn->end - sn->s); + if (len == byte_len) { + r = add_op(reg, OP_STR_1_IC); + if (r != 0) return r; + + xmemset(COP(reg)->exact.s, 0, sizeof(COP(reg)->exact.s)); + xmemcpy(COP(reg)->exact.s, sn->s, (size_t )byte_len); + } + else { + r = add_op(reg, OP_STR_N_IC); + if (r != 0) return r; + + p = onigenc_strdup(enc, sn->s, sn->end); + CHECK_NULL_RETURN_MEMERR(p); + + COP(reg)->exact_n.s = p; + COP(reg)->exact_n.n = byte_len; + } + + return 0; } static int compile_string_node(Node* node, regex_t* reg) { - int r, len, prev_len, slen, ambig; + int r, len, prev_len, slen; UChar *p, *prev, *end; StrNode* sn; OnigEncoding enc = reg->enc; @@ -772,7 +907,9 @@ compile_string_node(Node* node, regex_t* reg) return 0; end = sn->end; - ambig = NODE_STRING_IS_AMBIG(node); + if (NODE_STRING_IS_CASE_FOLD_MATCH(node) != 0) { + return compile_ambig_string_node(node, reg); + } p = prev = sn->s; prev_len = enclen(enc, p); @@ -785,7 +922,7 @@ compile_string_node(Node* node, regex_t* reg) slen++; } else { - r = add_compile_string(prev, prev_len, slen, reg, ambig); + r = add_compile_string(prev, prev_len, slen, reg); if (r != 0) return r; prev = p; @@ -796,16 +933,16 @@ compile_string_node(Node* node, regex_t* reg) p += len; } - return add_compile_string(prev, prev_len, slen, reg, ambig); + return add_compile_string(prev, prev_len, slen, reg); } static int -compile_string_raw_node(StrNode* sn, regex_t* reg) +compile_string_crude_node(StrNode* sn, regex_t* reg) { if (sn->end <= sn->s) return 0; - return add_compile_string(sn->s, 1 /* sb */, (int )(sn->end - sn->s), reg, 0); + return add_compile_string(sn->s, 1 /* sb */, (int )(sn->end - sn->s), reg); } static void* @@ -869,15 +1006,27 @@ compile_cclass_node(CClassNode* cc, regex_t* reg) return 0; } +static void +set_addr_in_repeat_range(regex_t* reg) +{ + int 
i; + + for (i = 0; i < reg->num_repeat; i++) { + RepeatRange* p = reg->repeat_range + i; + int offset = p->u.offset; + p->u.pcode = reg->ops + offset; + } +} + static int -entry_repeat_range(regex_t* reg, int id, int lower, int upper) +entry_repeat_range(regex_t* reg, int id, int lower, int upper, int ops_index) { #define REPEAT_RANGE_ALLOC 4 - OnigRepeatRange* p; + RepeatRange* p; if (reg->repeat_range_alloc == 0) { - p = (OnigRepeatRange* )xmalloc(sizeof(OnigRepeatRange) * REPEAT_RANGE_ALLOC); + p = (RepeatRange* )xmalloc(sizeof(RepeatRange) * REPEAT_RANGE_ALLOC); CHECK_NULL_RETURN_MEMERR(p); reg->repeat_range = p; reg->repeat_range_alloc = REPEAT_RANGE_ALLOC; @@ -885,7 +1034,7 @@ entry_repeat_range(regex_t* reg, int id, int lower, int upper) else if (reg->repeat_range_alloc <= id) { int n; n = reg->repeat_range_alloc + REPEAT_RANGE_ALLOC; - p = (OnigRepeatRange* )xrealloc(reg->repeat_range, sizeof(OnigRepeatRange) * n); + p = (RepeatRange* )xrealloc(reg->repeat_range, sizeof(RepeatRange) * n); CHECK_NULL_RETURN_MEMERR(p); reg->repeat_range = p; reg->repeat_range_alloc = n; @@ -894,13 +1043,14 @@ entry_repeat_range(regex_t* reg, int id, int lower, int upper) p = reg->repeat_range; } - p[id].lower = lower; - p[id].upper = (IS_REPEAT_INFINITE(upper) ? 0x7fffffff : upper); + p[id].lower = lower; + p[id].upper = (IS_INFINITE_REPEAT(upper) ? 
0x7fffffff : upper); + p[id].u.offset = ops_index; return 0; } static int -compile_range_repeat_node(QuantNode* qn, int target_len, int empty_info, +compile_range_repeat_node(QuantNode* qn, int target_len, int emptiness, regex_t* reg, ScanEnv* env) { int r; @@ -910,24 +1060,16 @@ compile_range_repeat_node(QuantNode* qn, int target_len, int empty_info, if (r != 0) return r; COP(reg)->repeat.id = num_repeat; - COP(reg)->repeat.addr = SIZE_INC_OP + target_len + SIZE_OP_REPEAT_INC; + COP(reg)->repeat.addr = SIZE_INC + target_len + OPSIZE_REPEAT_INC; - r = entry_repeat_range(reg, num_repeat, qn->lower, qn->upper); + r = entry_repeat_range(reg, num_repeat, qn->lower, qn->upper, + COP_CURR_OFFSET(reg) + OPSIZE_REPEAT); if (r != 0) return r; - r = compile_tree_empty_check(NODE_QUANT_BODY(qn), reg, empty_info, env); + r = compile_quant_body_with_empty_check(qn, reg, env); if (r != 0) return r; - if ( -#ifdef USE_CALL - NODE_IS_IN_MULTI_ENTRY(qn) || -#endif - NODE_IS_IN_REAL_REPEAT(qn)) { - r = add_op(reg, qn->greedy ? OP_REPEAT_INC_SG : OP_REPEAT_INC_NG_SG); - } - else { - r = add_op(reg, qn->greedy ? OP_REPEAT_INC : OP_REPEAT_INC_NG); - } + r = add_op(reg, qn->greedy ? 
OP_REPEAT_INC : OP_REPEAT_INC_NG); if (r != 0) return r; COP(reg)->repeat_inc.id = num_repeat; @@ -937,7 +1079,7 @@ compile_range_repeat_node(QuantNode* qn, int target_len, int empty_info, static int is_anychar_infinite_greedy(QuantNode* qn) { - if (qn->greedy && IS_REPEAT_INFINITE(qn->upper) && + if (qn->greedy && IS_INFINITE_REPEAT(qn->upper) && NODE_IS_ANYCHAR(NODE_QUANT_BODY(qn))) return 1; else @@ -951,8 +1093,8 @@ static int compile_length_quantifier_node(QuantNode* qn, regex_t* reg) { int len, mod_tlen; - int infinite = IS_REPEAT_INFINITE(qn->upper); - enum BodyEmpty empty_info = qn->empty_info; + int infinite = IS_INFINITE_REPEAT(qn->upper); + enum BodyEmptyType emptiness = qn->emptiness; int tlen = compile_length_tree(NODE_QUANT_BODY(qn), reg); if (tlen < 0) return tlen; @@ -963,22 +1105,21 @@ compile_length_quantifier_node(QuantNode* qn, regex_t* reg) if (qn->lower <= 1 || int_multiply_cmp(tlen, qn->lower, QUANTIFIER_EXPAND_LIMIT_SIZE) <= 0) { if (IS_NOT_NULL(qn->next_head_exact)) - return SIZE_OP_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower; + return OPSIZE_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower; else - return SIZE_OP_ANYCHAR_STAR + tlen * qn->lower; + return OPSIZE_ANYCHAR_STAR + tlen * qn->lower; } } - if (empty_info == BODY_IS_NOT_EMPTY) - mod_tlen = tlen; - else - mod_tlen = tlen + (SIZE_OP_EMPTY_CHECK_START + SIZE_OP_EMPTY_CHECK_END); + mod_tlen = tlen; + if (emptiness != BODY_IS_NOT_EMPTY) + mod_tlen += OPSIZE_EMPTY_CHECK_START + OPSIZE_EMPTY_CHECK_END; if (infinite && (qn->lower <= 1 || int_multiply_cmp(tlen, qn->lower, QUANTIFIER_EXPAND_LIMIT_SIZE) <= 0)) { if (qn->lower == 1 && tlen > QUANTIFIER_EXPAND_LIMIT_SIZE) { - len = SIZE_OP_JUMP; + len = OPSIZE_JUMP; } else { len = tlen * qn->lower; @@ -987,36 +1128,36 @@ compile_length_quantifier_node(QuantNode* qn, regex_t* reg) if (qn->greedy) { #ifdef USE_OP_PUSH_OR_JUMP_EXACT if (IS_NOT_NULL(qn->head_exact)) - len += SIZE_OP_PUSH_OR_JUMP_EXACT1 + mod_tlen + SIZE_OP_JUMP; + len += 
OPSIZE_PUSH_OR_JUMP_EXACT1 + mod_tlen + OPSIZE_JUMP; else #endif if (IS_NOT_NULL(qn->next_head_exact)) - len += SIZE_OP_PUSH_IF_PEEK_NEXT + mod_tlen + SIZE_OP_JUMP; + len += OPSIZE_PUSH_IF_PEEK_NEXT + mod_tlen + OPSIZE_JUMP; else - len += SIZE_OP_PUSH + mod_tlen + SIZE_OP_JUMP; + len += OPSIZE_PUSH + mod_tlen + OPSIZE_JUMP; } else - len += SIZE_OP_JUMP + mod_tlen + SIZE_OP_PUSH; + len += OPSIZE_JUMP + mod_tlen + OPSIZE_PUSH; } else if (qn->upper == 0) { - if (qn->is_refered != 0) { /* /(?<n>..){0}/ */ - len = SIZE_OP_JUMP + tlen; + if (qn->include_referred != 0) { /* /(?<n>..){0}/ */ + len = OPSIZE_JUMP + tlen; } else len = 0; } else if (!infinite && qn->greedy && (qn->upper == 1 || - int_multiply_cmp(tlen + SIZE_OP_PUSH, qn->upper, + int_multiply_cmp(tlen + OPSIZE_PUSH, qn->upper, QUANTIFIER_EXPAND_LIMIT_SIZE) <= 0)) { len = tlen * qn->lower; - len += (SIZE_OP_PUSH + tlen) * (qn->upper - qn->lower); + len += (OPSIZE_PUSH + tlen) * (qn->upper - qn->lower); } else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' 
*/ - len = SIZE_OP_PUSH + SIZE_OP_JUMP + tlen; + len = OPSIZE_PUSH + OPSIZE_JUMP + tlen; } else { - len = SIZE_OP_REPEAT_INC + mod_tlen + SIZE_OP_REPEAT; + len = OPSIZE_REPEAT_INC + mod_tlen + OPSIZE_REPEAT; } return len; @@ -1026,8 +1167,8 @@ static int compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env) { int i, r, mod_tlen; - int infinite = IS_REPEAT_INFINITE(qn->upper); - enum BodyEmpty empty_info = qn->empty_info; + int infinite = IS_INFINITE_REPEAT(qn->upper); + enum BodyEmptyType emptiness = qn->emptiness; int tlen = compile_length_tree(NODE_QUANT_BODY(qn), reg); if (tlen < 0) return tlen; @@ -1055,10 +1196,9 @@ compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env) } } - if (empty_info == BODY_IS_NOT_EMPTY) - mod_tlen = tlen; - else - mod_tlen = tlen + (SIZE_OP_EMPTY_CHECK_START + SIZE_OP_EMPTY_CHECK_END); + mod_tlen = tlen; + if (emptiness != BODY_IS_NOT_EMPTY) + mod_tlen += OPSIZE_EMPTY_CHECK_START + OPSIZE_EMPTY_CHECK_END; if (infinite && (qn->lower <= 1 || @@ -1071,16 +1211,16 @@ compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env) if (qn->greedy) { #ifdef USE_OP_PUSH_OR_JUMP_EXACT if (IS_NOT_NULL(qn->head_exact)) - COP(reg)->jump.addr = SIZE_OP_PUSH_OR_JUMP_EXACT1 + SIZE_INC_OP; + COP(reg)->jump.addr = OPSIZE_PUSH_OR_JUMP_EXACT1 + SIZE_INC; else #endif if (IS_NOT_NULL(qn->next_head_exact)) - COP(reg)->jump.addr = SIZE_OP_PUSH_IF_PEEK_NEXT + SIZE_INC_OP; + COP(reg)->jump.addr = OPSIZE_PUSH_IF_PEEK_NEXT + SIZE_INC; else - COP(reg)->jump.addr = SIZE_OP_PUSH + SIZE_INC_OP; + COP(reg)->jump.addr = OPSIZE_PUSH + SIZE_INC; } else { - COP(reg)->jump.addr = SIZE_OP_JUMP + SIZE_INC_OP; + COP(reg)->jump.addr = OPSIZE_JUMP + SIZE_INC; } } else { @@ -1093,36 +1233,36 @@ compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env) if (IS_NOT_NULL(qn->head_exact)) { r = add_op(reg, OP_PUSH_OR_JUMP_EXACT1); if (r != 0) return r; - COP(reg)->push_or_jump_exact1.addr = SIZE_INC_OP + mod_tlen + SIZE_OP_JUMP; + 
COP(reg)->push_or_jump_exact1.addr = SIZE_INC + mod_tlen + OPSIZE_JUMP; COP(reg)->push_or_jump_exact1.c = STR_(qn->head_exact)->s[0]; - r = compile_tree_empty_check(NODE_QUANT_BODY(qn), reg, empty_info, env); + r = compile_quant_body_with_empty_check(qn, reg, env); if (r != 0) return r; - addr = -(mod_tlen + (int )SIZE_OP_PUSH_OR_JUMP_EXACT1); + addr = -(mod_tlen + (int )OPSIZE_PUSH_OR_JUMP_EXACT1); } else #endif if (IS_NOT_NULL(qn->next_head_exact)) { r = add_op(reg, OP_PUSH_IF_PEEK_NEXT); if (r != 0) return r; - COP(reg)->push_if_peek_next.addr = SIZE_INC_OP + mod_tlen + SIZE_OP_JUMP; + COP(reg)->push_if_peek_next.addr = SIZE_INC + mod_tlen + OPSIZE_JUMP; COP(reg)->push_if_peek_next.c = STR_(qn->next_head_exact)->s[0]; - r = compile_tree_empty_check(NODE_QUANT_BODY(qn), reg, empty_info, env); + r = compile_quant_body_with_empty_check(qn, reg, env); if (r != 0) return r; - addr = -(mod_tlen + (int )SIZE_OP_PUSH_IF_PEEK_NEXT); + addr = -(mod_tlen + (int )OPSIZE_PUSH_IF_PEEK_NEXT); } else { r = add_op(reg, OP_PUSH); if (r != 0) return r; - COP(reg)->push.addr = SIZE_INC_OP + mod_tlen + SIZE_OP_JUMP; + COP(reg)->push.addr = SIZE_INC + mod_tlen + OPSIZE_JUMP; - r = compile_tree_empty_check(NODE_QUANT_BODY(qn), reg, empty_info, env); + r = compile_quant_body_with_empty_check(qn, reg, env); if (r != 0) return r; - addr = -(mod_tlen + (int )SIZE_OP_PUSH); + addr = -(mod_tlen + (int )OPSIZE_PUSH); } r = add_op(reg, OP_JUMP); @@ -1132,9 +1272,9 @@ compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env) else { r = add_op(reg, OP_JUMP); if (r != 0) return r; - COP(reg)->jump.addr = mod_tlen + SIZE_INC_OP; + COP(reg)->jump.addr = mod_tlen + SIZE_INC; - r = compile_tree_empty_check(NODE_QUANT_BODY(qn), reg, empty_info, env); + r = compile_quant_body_with_empty_check(qn, reg, env); if (r != 0) return r; r = add_op(reg, OP_PUSH); @@ -1143,10 +1283,10 @@ compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env) } } else if (qn->upper == 0) { - if 
(qn->is_refered != 0) { /* /(?<n>..){0}/ */ + if (qn->include_referred != 0) { /* /(?<n>..){0}/ */ r = add_op(reg, OP_JUMP); if (r != 0) return r; - COP(reg)->jump.addr = tlen + SIZE_INC_OP; + COP(reg)->jump.addr = tlen + SIZE_INC; r = compile_tree(NODE_QUANT_BODY(qn), reg, env); } @@ -1157,7 +1297,7 @@ compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env) } else if (! infinite && qn->greedy && (qn->upper == 1 || - int_multiply_cmp(tlen + SIZE_OP_PUSH, qn->upper, + int_multiply_cmp(tlen + OPSIZE_PUSH, qn->upper, QUANTIFIER_EXPAND_LIMIT_SIZE) <= 0)) { int n = qn->upper - qn->lower; @@ -1165,7 +1305,7 @@ compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env) if (r != 0) return r; for (i = 0; i < n; i++) { - int v = onig_positive_int_multiply(n - i, tlen + SIZE_OP_PUSH); + int v = onig_positive_int_multiply(n - i, tlen + OPSIZE_PUSH); if (v < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE; r = add_op(reg, OP_PUSH); @@ -1179,16 +1319,16 @@ compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env) else if (! qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' 
*/ r = add_op(reg, OP_PUSH); if (r != 0) return r; - COP(reg)->push.addr = SIZE_INC_OP + SIZE_OP_JUMP; + COP(reg)->push.addr = SIZE_INC + OPSIZE_JUMP; r = add_op(reg, OP_JUMP); if (r != 0) return r; - COP(reg)->jump.addr = tlen + SIZE_INC_OP; + COP(reg)->jump.addr = tlen + SIZE_INC; r = compile_tree(NODE_QUANT_BODY(qn), reg, env); } else { - r = compile_range_repeat_node(qn, mod_tlen, empty_info, reg, env); + r = compile_range_repeat_node(qn, mod_tlen, emptiness, reg, env); } return r; } @@ -1240,40 +1380,40 @@ compile_length_bag_node(BagNode* node, regex_t* reg) #ifdef USE_CALL if (node->m.regnum == 0 && NODE_IS_CALLED(node)) { - len = tlen + SIZE_OP_CALL + SIZE_OP_JUMP + SIZE_OP_RETURN; + len = tlen + OPSIZE_CALL + OPSIZE_JUMP + OPSIZE_RETURN; return len; } if (NODE_IS_CALLED(node)) { - len = SIZE_OP_MEMORY_START_PUSH + tlen - + SIZE_OP_CALL + SIZE_OP_JUMP + SIZE_OP_RETURN; - if (MEM_STATUS_AT0(reg->bt_mem_end, node->m.regnum)) + len = OPSIZE_MEM_START_PUSH + tlen + + OPSIZE_CALL + OPSIZE_JUMP + OPSIZE_RETURN; + if (MEM_STATUS_AT0(reg->push_mem_end, node->m.regnum)) len += (NODE_IS_RECURSION(node) - ? SIZE_OP_MEMORY_END_PUSH_REC : SIZE_OP_MEMORY_END_PUSH); + ? OPSIZE_MEM_END_PUSH_REC : OPSIZE_MEM_END_PUSH); else len += (NODE_IS_RECURSION(node) - ? SIZE_OP_MEMORY_END_REC : SIZE_OP_MEMORY_END); + ? OPSIZE_MEM_END_REC : OPSIZE_MEM_END); } else if (NODE_IS_RECURSION(node)) { - len = SIZE_OP_MEMORY_START_PUSH; - len += tlen + (MEM_STATUS_AT0(reg->bt_mem_end, node->m.regnum) - ? SIZE_OP_MEMORY_END_PUSH_REC : SIZE_OP_MEMORY_END_REC); + len = OPSIZE_MEM_START_PUSH; + len += tlen + (MEM_STATUS_AT0(reg->push_mem_end, node->m.regnum) + ? 
OPSIZE_MEM_END_PUSH_REC : OPSIZE_MEM_END_REC); } else #endif { - if (MEM_STATUS_AT0(reg->bt_mem_start, node->m.regnum)) - len = SIZE_OP_MEMORY_START_PUSH; + if (MEM_STATUS_AT0(reg->push_mem_start, node->m.regnum)) + len = OPSIZE_MEM_START_PUSH; else - len = SIZE_OP_MEMORY_START; + len = OPSIZE_MEM_START; - len += tlen + (MEM_STATUS_AT0(reg->bt_mem_end, node->m.regnum) - ? SIZE_OP_MEMORY_END_PUSH : SIZE_OP_MEMORY_END); + len += tlen + (MEM_STATUS_AT0(reg->push_mem_end, node->m.regnum) + ? OPSIZE_MEM_END_PUSH : OPSIZE_MEM_END); } break; case BAG_STOP_BACKTRACK: - if (NODE_IS_STOP_BT_SIMPLE_REPEAT(node)) { + if (NODE_IS_STRICT_REAL_REPEAT(node)) { int v; QuantNode* qn; @@ -1283,10 +1423,10 @@ compile_length_bag_node(BagNode* node, regex_t* reg) v = onig_positive_int_multiply(qn->lower, tlen); if (v < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE; - len = v + SIZE_OP_PUSH + tlen + SIZE_OP_POP_OUT + SIZE_OP_JUMP; + len = v + OPSIZE_PUSH + tlen + OPSIZE_POP_OUT + OPSIZE_JUMP; } else { - len = SIZE_OP_ATOMIC_START + tlen + SIZE_OP_ATOMIC_END; + len = OPSIZE_ATOMIC_START + tlen + OPSIZE_ATOMIC_END; } break; @@ -1298,8 +1438,8 @@ compile_length_bag_node(BagNode* node, regex_t* reg) len = compile_length_tree(cond, reg); if (len < 0) return len; - len += SIZE_OP_PUSH; - len += SIZE_OP_ATOMIC_START + SIZE_OP_ATOMIC_END; + len += OPSIZE_PUSH; + len += OPSIZE_ATOMIC_START + OPSIZE_ATOMIC_END; if (IS_NOT_NULL(Then)) { tlen = compile_length_tree(Then, reg); @@ -1307,8 +1447,9 @@ compile_length_bag_node(BagNode* node, regex_t* reg) len += tlen; } + len += OPSIZE_JUMP + OPSIZE_ATOMIC_END; + if (IS_NOT_NULL(Else)) { - len += SIZE_OP_JUMP; tlen = compile_length_tree(Else, reg); if (tlen < 0) return tlen; len += tlen; @@ -1331,24 +1472,25 @@ static int compile_bag_memory_node(BagNode* node, regex_t* reg, ScanEnv* env) { int r; - int len; #ifdef USE_CALL if (NODE_IS_CALLED(node)) { + int len; + r = add_op(reg, OP_CALL); if (r != 0) return r; - node->m.called_addr = 
COP_CURR_OFFSET(reg) + 1 + SIZE_OP_JUMP; + node->m.called_addr = COP_CURR_OFFSET(reg) + 1 + OPSIZE_JUMP; NODE_STATUS_ADD(node, ADDR_FIXED); COP(reg)->call.addr = (int )node->m.called_addr; if (node->m.regnum == 0) { len = compile_length_tree(NODE_BAG_BODY(node), reg); - len += SIZE_OP_RETURN; + len += OPSIZE_RETURN; r = add_op(reg, OP_JUMP); if (r != 0) return r; - COP(reg)->jump.addr = len + SIZE_INC_OP; + COP(reg)->jump.addr = len + SIZE_INC; r = compile_tree(NODE_BAG_BODY(node), reg, env); if (r != 0) return r; @@ -1358,25 +1500,24 @@ compile_bag_memory_node(BagNode* node, regex_t* reg, ScanEnv* env) } else { len = compile_length_tree(NODE_BAG_BODY(node), reg); - len += (SIZE_OP_MEMORY_START_PUSH + SIZE_OP_RETURN); - if (MEM_STATUS_AT0(reg->bt_mem_end, node->m.regnum)) + len += (OPSIZE_MEM_START_PUSH + OPSIZE_RETURN); + if (MEM_STATUS_AT0(reg->push_mem_end, node->m.regnum)) len += (NODE_IS_RECURSION(node) - ? SIZE_OP_MEMORY_END_PUSH_REC : SIZE_OP_MEMORY_END_PUSH); + ? OPSIZE_MEM_END_PUSH_REC : OPSIZE_MEM_END_PUSH); else - len += (NODE_IS_RECURSION(node) - ? SIZE_OP_MEMORY_END_REC : SIZE_OP_MEMORY_END); + len += (NODE_IS_RECURSION(node) ? OPSIZE_MEM_END_REC : OPSIZE_MEM_END); r = add_op(reg, OP_JUMP); if (r != 0) return r; - COP(reg)->jump.addr = len + SIZE_INC_OP; + COP(reg)->jump.addr = len + SIZE_INC; } } #endif - if (MEM_STATUS_AT0(reg->bt_mem_start, node->m.regnum)) - r = add_op(reg, OP_MEMORY_START_PUSH); + if (MEM_STATUS_AT0(reg->push_mem_start, node->m.regnum)) + r = add_op(reg, OP_MEM_START_PUSH); else - r = add_op(reg, OP_MEMORY_START); + r = add_op(reg, OP_MEM_START); if (r != 0) return r; COP(reg)->memory_start.num = node->m.regnum; @@ -1384,11 +1525,11 @@ compile_bag_memory_node(BagNode* node, regex_t* reg, ScanEnv* env) if (r != 0) return r; #ifdef USE_CALL - if (MEM_STATUS_AT0(reg->bt_mem_end, node->m.regnum)) + if (MEM_STATUS_AT0(reg->push_mem_end, node->m.regnum)) r = add_op(reg, (NODE_IS_RECURSION(node) - ? 
OP_MEMORY_END_PUSH_REC : OP_MEMORY_END_PUSH)); + ? OP_MEM_END_PUSH_REC : OP_MEM_END_PUSH)); else - r = add_op(reg, (NODE_IS_RECURSION(node) ? OP_MEMORY_END_REC : OP_MEMORY_END)); + r = add_op(reg, (NODE_IS_RECURSION(node) ? OP_MEM_END_REC : OP_MEM_END)); if (r != 0) return r; COP(reg)->memory_end.num = node->m.regnum; @@ -1397,10 +1538,10 @@ compile_bag_memory_node(BagNode* node, regex_t* reg, ScanEnv* env) r = add_op(reg, OP_RETURN); } #else - if (MEM_STATUS_AT0(reg->bt_mem_end, node->m.regnum)) - r = add_op(reg, OP_MEMORY_END_PUSH); + if (MEM_STATUS_AT0(reg->push_mem_end, node->m.regnum)) + r = add_op(reg, OP_MEM_END_PUSH); else - r = add_op(reg, OP_MEMORY_END); + r = add_op(reg, OP_MEM_END); if (r != 0) return r; COP(reg)->memory_end.num = node->m.regnum; #endif @@ -1423,7 +1564,7 @@ compile_bag_node(BagNode* node, regex_t* reg, ScanEnv* env) break; case BAG_STOP_BACKTRACK: - if (NODE_IS_STOP_BT_SIMPLE_REPEAT(node)) { + if (NODE_IS_STRICT_REAL_REPEAT(node)) { QuantNode* qn = QUANT_(NODE_BAG_BODY(node)); r = compile_tree_n_times(NODE_QUANT_BODY(qn), qn->lower, reg, env); if (r != 0) return r; @@ -1433,7 +1574,7 @@ compile_bag_node(BagNode* node, regex_t* reg, ScanEnv* env) r = add_op(reg, OP_PUSH); if (r != 0) return r; - COP(reg)->push.addr = SIZE_INC_OP + len + SIZE_OP_POP_OUT + SIZE_OP_JUMP; + COP(reg)->push.addr = SIZE_INC + len + OPSIZE_POP_OUT + OPSIZE_JUMP; r = compile_tree(NODE_QUANT_BODY(qn), reg, env); if (r != 0) return r; @@ -1442,7 +1583,7 @@ compile_bag_node(BagNode* node, regex_t* reg, ScanEnv* env) r = add_op(reg, OP_JUMP); if (r != 0) return r; - COP(reg)->jump.addr = -((int )SIZE_OP_PUSH + len + (int )SIZE_OP_POP_OUT); + COP(reg)->jump.addr = -((int )OPSIZE_PUSH + len + (int )OPSIZE_POP_OUT); } else { r = add_op(reg, OP_ATOMIC_START); @@ -1455,7 +1596,7 @@ compile_bag_node(BagNode* node, regex_t* reg, ScanEnv* env) case BAG_IF_ELSE: { - int cond_len, then_len, jump_len; + int cond_len, then_len, else_len, jump_len; Node* cond = 
NODE_BAG_BODY(node); Node* Then = node->te.Then; Node* Else = node->te.Else; @@ -1472,12 +1613,11 @@ compile_bag_node(BagNode* node, regex_t* reg, ScanEnv* env) else then_len = 0; - jump_len = cond_len + then_len + SIZE_OP_ATOMIC_END; - if (IS_NOT_NULL(Else)) jump_len += SIZE_OP_JUMP; + jump_len = cond_len + then_len + OPSIZE_ATOMIC_END + OPSIZE_JUMP; r = add_op(reg, OP_PUSH); if (r != 0) return r; - COP(reg)->push.addr = SIZE_INC_OP + jump_len; + COP(reg)->push.addr = SIZE_INC + jump_len; r = compile_tree(cond, reg, env); if (r != 0) return r; @@ -1490,11 +1630,20 @@ compile_bag_node(BagNode* node, regex_t* reg, ScanEnv* env) } if (IS_NOT_NULL(Else)) { - int else_len = compile_length_tree(Else, reg); - r = add_op(reg, OP_JUMP); - if (r != 0) return r; - COP(reg)->jump.addr = else_len + SIZE_INC_OP; + else_len = compile_length_tree(Else, reg); + if (else_len < 0) return else_len; + } + else + else_len = 0; + + r = add_op(reg, OP_JUMP); + if (r != 0) return r; + COP(reg)->jump.addr = OPSIZE_ATOMIC_END + else_len + SIZE_INC; + r = add_op(reg, OP_ATOMIC_END); + if (r != 0) return r; + + if (IS_NOT_NULL(Else)) { r = compile_tree(Else, reg, env); } } @@ -1517,16 +1666,16 @@ compile_length_anchor_node(AnchorNode* node, regex_t* reg) switch (node->type) { case ANCR_PREC_READ: - len = SIZE_OP_PREC_READ_START + tlen + SIZE_OP_PREC_READ_END; + len = OPSIZE_PREC_READ_START + tlen + OPSIZE_PREC_READ_END; break; case ANCR_PREC_READ_NOT: - len = SIZE_OP_PREC_READ_NOT_START + tlen + SIZE_OP_PREC_READ_NOT_END; + len = OPSIZE_PREC_READ_NOT_START + tlen + OPSIZE_PREC_READ_NOT_END; break; case ANCR_LOOK_BEHIND: - len = SIZE_OP_LOOK_BEHIND + tlen; + len = OPSIZE_LOOK_BEHIND + tlen; break; case ANCR_LOOK_BEHIND_NOT: - len = SIZE_OP_LOOK_BEHIND_NOT_START + tlen + SIZE_OP_LOOK_BEHIND_NOT_END; + len = OPSIZE_LOOK_BEHIND_NOT_START + tlen + OPSIZE_LOOK_BEHIND_NOT_END; break; case ANCR_WORD_BOUNDARY: @@ -1535,7 +1684,7 @@ compile_length_anchor_node(AnchorNode* node, regex_t* reg) case 
ANCR_WORD_BEGIN: case ANCR_WORD_END: #endif - len = SIZE_OP_WORD_BOUNDARY; + len = OPSIZE_WORD_BOUNDARY; break; case ANCR_TEXT_SEGMENT_BOUNDARY: @@ -1619,7 +1768,7 @@ compile_anchor_node(AnchorNode* node, regex_t* reg, ScanEnv* env) r = add_op(reg, OP_PREC_READ_NOT_START); if (r != 0) return r; - COP(reg)->prec_read_not_start.addr = SIZE_INC_OP + len + SIZE_OP_PREC_READ_NOT_END; + COP(reg)->prec_read_not_start.addr = SIZE_INC + len + OPSIZE_PREC_READ_NOT_END; r = compile_tree(NODE_ANCHOR_BODY(node), reg, env); if (r != 0) return r; r = add_op(reg, OP_PREC_READ_NOT_END); @@ -1649,7 +1798,7 @@ compile_anchor_node(AnchorNode* node, regex_t* reg, ScanEnv* env) len = compile_length_tree(NODE_ANCHOR_BODY(node), reg); r = add_op(reg, OP_LOOK_BEHIND_NOT_START); if (r != 0) return r; - COP(reg)->look_behind_not_start.addr = SIZE_INC_OP + len + SIZE_OP_LOOK_BEHIND_NOT_END; + COP(reg)->look_behind_not_start.addr = SIZE_INC + len + OPSIZE_LOOK_BEHIND_NOT_END; if (node->char_len < 0) { r = get_char_len_node(NODE_ANCHOR_BODY(node), reg, &n); @@ -1735,25 +1884,25 @@ compile_length_gimmick_node(GimmickNode* node, regex_t* reg) switch (node->type) { case GIMMICK_FAIL: - len = SIZE_OP_FAIL; + len = OPSIZE_FAIL; break; case GIMMICK_SAVE: - len = SIZE_OP_PUSH_SAVE_VAL; + len = OPSIZE_PUSH_SAVE_VAL; break; case GIMMICK_UPDATE_VAR: - len = SIZE_OP_UPDATE_VAR; + len = OPSIZE_UPDATE_VAR; break; #ifdef USE_CALLOUT case GIMMICK_CALLOUT: switch (node->detail_type) { case ONIG_CALLOUT_OF_CONTENTS: - len = SIZE_OP_CALLOUT_CONTENTS; + len = OPSIZE_CALLOUT_CONTENTS; break; case ONIG_CALLOUT_OF_NAME: - len = SIZE_OP_CALLOUT_NAME; + len = OPSIZE_CALLOUT_NAME; break; default: @@ -1792,13 +1941,13 @@ compile_length_tree(Node* node, regex_t* reg) r += compile_length_tree(NODE_CAR(node), reg); n++; } while (IS_NOT_NULL(node = NODE_CDR(node))); - r += (SIZE_OP_PUSH + SIZE_OP_JUMP) * (n - 1); + r += (OPSIZE_PUSH + OPSIZE_JUMP) * (n - 1); } break; case NODE_STRING: - if (NODE_STRING_IS_RAW(node)) - r = 
compile_length_string_raw_node(STR_(node), reg); + if (NODE_STRING_IS_CRUDE(node)) + r = compile_length_string_crude_node(STR_(node), reg); else r = compile_length_string_node(node, reg); break; @@ -1812,12 +1961,12 @@ compile_length_tree(Node* node, regex_t* reg) break; case NODE_BACKREF: - r = SIZE_OP_BACKREF; + r = OPSIZE_BACKREF; break; #ifdef USE_CALL case NODE_CALL: - r = SIZE_OP_CALL; + r = OPSIZE_CALL; break; #endif @@ -1864,7 +2013,7 @@ compile_tree(Node* node, regex_t* reg, ScanEnv* env) do { len += compile_length_tree(NODE_CAR(x), reg); if (IS_NOT_NULL(NODE_CDR(x))) { - len += SIZE_OP_PUSH + SIZE_OP_JUMP; + len += OPSIZE_PUSH + OPSIZE_JUMP; } } while (IS_NOT_NULL(x = NODE_CDR(x))); pos = COP_CURR_OFFSET(reg) + 1 + len; /* goal position */ @@ -1875,7 +2024,7 @@ compile_tree(Node* node, regex_t* reg, ScanEnv* env) enum OpCode push = NODE_IS_SUPER(node) ? OP_PUSH_SUPER : OP_PUSH; r = add_op(reg, push); if (r != 0) break; - COP(reg)->push.addr = SIZE_INC_OP + len + SIZE_OP_JUMP; + COP(reg)->push.addr = SIZE_INC + len + OPSIZE_JUMP; } r = compile_tree(NODE_CAR(node), reg, env); if (r != 0) break; @@ -1890,8 +2039,8 @@ compile_tree(Node* node, regex_t* reg, ScanEnv* env) break; case NODE_STRING: - if (NODE_STRING_IS_RAW(node)) - r = compile_string_raw_node(STR_(node), reg); + if (NODE_STRING_IS_CRUDE(node)) + r = compile_string_crude_node(STR_(node), reg); else r = compile_string_node(node, reg); break; @@ -2061,8 +2210,9 @@ noname_disable_map(Node** plink, GroupNumRemap* map, int* counter) Node** ptarget = &(NODE_BODY(node)); Node* old = *ptarget; r = noname_disable_map(ptarget, map, counter); + if (r != 0) return r; if (*ptarget != old && NODE_TYPE(*ptarget) == NODE_QUANT) { - onig_reduce_nested_quantifier(node, *ptarget); + r = onig_reduce_nested_quantifier(node); } } break; @@ -2274,11 +2424,11 @@ disable_noname_group_capture(Node** root, regex_t* reg, ScanEnv* env) } } - loc = env->capture_history; - MEM_STATUS_CLEAR(env->capture_history); + loc = 
env->cap_history; + MEM_STATUS_CLEAR(env->cap_history); for (i = 1; i <= ONIG_MAX_CAPTURE_HISTORY_GROUP; i++) { if (MEM_STATUS_AT(loc, i)) { - MEM_STATUS_ON_SIMPLE(env->capture_history, map[i].new_val); + MEM_STATUS_ON_SIMPLE(env->cap_history, map[i].new_val); } } @@ -2654,7 +2804,7 @@ is_exclusive(Node* x, Node* y, regex_t* reg) len = NODE_STRING_LEN(x); if (len > NODE_STRING_LEN(y)) len = NODE_STRING_LEN(y); - if (NODE_STRING_IS_AMBIG(x) || NODE_STRING_IS_AMBIG(y)) { + if (NODE_STRING_IS_CASE_FOLD_MATCH(x) || NODE_STRING_IS_CASE_FOLD_MATCH(y)) { /* tiny version */ return 0; } @@ -2714,7 +2864,7 @@ get_head_value_node(Node* node, int exact, regex_t* reg) break; if (exact == 0 || - ! IS_IGNORECASE(reg->options) || NODE_STRING_IS_RAW(node)) { + ! IS_IGNORECASE(reg->options) || NODE_STRING_IS_CRUDE(node)) { n = node; } } @@ -2842,9 +2992,9 @@ tree_min_len(Node* node, ScanEnv* env) if (NODE_IS_RECURSION(node)) break; backs = BACKREFS_P(br); - len = tree_min_len(mem_env[backs[0]].node, env); + len = tree_min_len(mem_env[backs[0]].mem_node, env); for (i = 1; i < br->back_num; i++) { - tmin = tree_min_len(mem_env[backs[i]].node, env); + tmin = tree_min_len(mem_env[backs[i]].mem_node, env); if (len > tmin) len = tmin; } } @@ -3013,7 +3163,7 @@ tree_max_len(Node* node, ScanEnv* env) } backs = BACKREFS_P(br); for (i = 0; i < br->back_num; i++) { - tmax = tree_max_len(mem_env[backs[i]].node, env); + tmax = tree_max_len(mem_env[backs[i]].mem_node, env); if (len < tmax) len = tmax; } } @@ -3035,7 +3185,7 @@ tree_max_len(Node* node, ScanEnv* env) if (qn->upper != 0) { len = tree_max_len(NODE_BODY(node), env); if (len != 0) { - if (! IS_REPEAT_INFINITE(qn->upper)) + if (! 
IS_INFINITE_REPEAT(qn->upper)) len = distance_multiply(len, qn->upper); else len = INFINITE_LEN; @@ -3150,7 +3300,7 @@ check_backrefs(Node* node, ScanEnv* env) if (backs[i] > env->num_mem) return ONIGERR_INVALID_BACKREF; - NODE_STATUS_ADD(mem_env[backs[i]].node, BACKREF); + NODE_STATUS_ADD(mem_env[backs[i]].mem_node, BACKREF); } r = 0; } @@ -3164,6 +3314,204 @@ check_backrefs(Node* node, ScanEnv* env) return r; } +static int +set_empty_repeat_node_trav(Node* node, Node* empty, ScanEnv* env) +{ + int r; + + switch (NODE_TYPE(node)) { + case NODE_LIST: + case NODE_ALT: + do { + r = set_empty_repeat_node_trav(NODE_CAR(node), empty, env); + } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node))); + break; + + case NODE_ANCHOR: + { + AnchorNode* an = ANCHOR_(node); + + if (! ANCHOR_HAS_BODY(an)) { + r = 0; + break; + } + + switch (an->type) { + case ANCR_PREC_READ: + case ANCR_LOOK_BEHIND: + empty = NULL_NODE; + break; + default: + break; + } + r = set_empty_repeat_node_trav(NODE_BODY(node), empty, env); + } + break; + + case NODE_QUANT: + { + QuantNode* qn = QUANT_(node); + + if (qn->emptiness != BODY_IS_NOT_EMPTY) empty = node; + r = set_empty_repeat_node_trav(NODE_BODY(node), empty, env); + } + break; + + case NODE_BAG: + if (IS_NOT_NULL(NODE_BODY(node))) { + r = set_empty_repeat_node_trav(NODE_BODY(node), empty, env); + if (r != 0) return r; + } + { + BagNode* en = BAG_(node); + + if (en->type == BAG_MEMORY) { + if (NODE_IS_BACKREF(node)) { + if (IS_NOT_NULL(empty)) + SCANENV_MEMENV(env)[en->m.regnum].empty_repeat_node = empty; + } + } + else if (en->type == BAG_IF_ELSE) { + if (IS_NOT_NULL(en->te.Then)) { + r = set_empty_repeat_node_trav(en->te.Then, empty, env); + if (r != 0) return r; + } + if (IS_NOT_NULL(en->te.Else)) { + r = set_empty_repeat_node_trav(en->te.Else, empty, env); + } + } + } + break; + + default: + r = 0; + break; + } + + return r; +} + +static int +is_ancestor_node(Node* node, Node* me) +{ + Node* parent; + + while ((parent = NODE_PARENT(me)) != 
NULL_NODE) { + if (parent == node) return 1; + me = parent; + } + return 0; +} + +static void +set_empty_status_check_trav(Node* node, ScanEnv* env) +{ + switch (NODE_TYPE(node)) { + case NODE_LIST: + case NODE_ALT: + do { + set_empty_status_check_trav(NODE_CAR(node), env); + } while (IS_NOT_NULL(node = NODE_CDR(node))); + break; + + case NODE_ANCHOR: + { + AnchorNode* an = ANCHOR_(node); + + if (! ANCHOR_HAS_BODY(an)) break; + set_empty_status_check_trav(NODE_BODY(node), env); + } + break; + + case NODE_QUANT: + set_empty_status_check_trav(NODE_BODY(node), env); + break; + + case NODE_BAG: + if (IS_NOT_NULL(NODE_BODY(node))) + set_empty_status_check_trav(NODE_BODY(node), env); + { + BagNode* en = BAG_(node); + + if (en->type == BAG_IF_ELSE) { + if (IS_NOT_NULL(en->te.Then)) { + set_empty_status_check_trav(en->te.Then, env); + } + if (IS_NOT_NULL(en->te.Else)) { + set_empty_status_check_trav(en->te.Else, env); + } + } + } + break; + + case NODE_BACKREF: + { + int i; + int* backs; + MemEnv* mem_env = SCANENV_MEMENV(env); + BackRefNode* br = BACKREF_(node); + backs = BACKREFS_P(br); + for (i = 0; i < br->back_num; i++) { + Node* ernode = mem_env[backs[i]].empty_repeat_node; + if (IS_NOT_NULL(ernode)) { + if (! is_ancestor_node(ernode, node)) { + MEM_STATUS_LIMIT_ON(env->reg->empty_status_mem, backs[i]); + NODE_STATUS_ADD(ernode, EMPTY_STATUS_CHECK); + NODE_STATUS_ADD(mem_env[backs[i]].mem_node, EMPTY_STATUS_CHECK); + } + } + } + } + break; + + default: + break; + } +} + +static void +set_parent_node_trav(Node* node, Node* parent) +{ + NODE_PARENT(node) = parent; + + switch (NODE_TYPE(node)) { + case NODE_LIST: + case NODE_ALT: + do { + set_parent_node_trav(NODE_CAR(node), node); + } while (IS_NOT_NULL(node = NODE_CDR(node))); + break; + + case NODE_ANCHOR: + if (! 
ANCHOR_HAS_BODY(ANCHOR_(node))) break; + set_parent_node_trav(NODE_BODY(node), node); + break; + + case NODE_QUANT: + set_parent_node_trav(NODE_BODY(node), node); + break; + + case NODE_BAG: + if (IS_NOT_NULL(NODE_BODY(node))) + set_parent_node_trav(NODE_BODY(node), node); + { + BagNode* en = BAG_(node); + + if (en->type == BAG_IF_ELSE) { + if (IS_NOT_NULL(en->te.Then)) + set_parent_node_trav(en->te.Then, node); + if (IS_NOT_NULL(en->te.Else)) { + set_parent_node_trav(en->te.Else, node); + } + } + } + break; + + default: + break; + } +} + #ifdef USE_CALL @@ -3269,6 +3617,9 @@ infinite_recursive_call_check(Node* node, ScanEnv* env, int head) if ((eret & RECURSION_MUST) == 0) r &= ~RECURSION_MUST; } + else { + r &= ~RECURSION_MUST; + } } else { r = infinite_recursive_call_check(NODE_BODY(node), env, head); @@ -3443,7 +3794,7 @@ recursive_call_check_trav(Node* node, ScanEnv* env, int state) r = recursive_call_check_trav(NODE_BODY(node), env, state); if (QUANT_(node)->upper == 0) { if (r == FOUND_CALLED_NODE) - QUANT_(node)->is_refered = 1; + QUANT_(node)->include_referred = 1; } break; @@ -3466,8 +3817,10 @@ recursive_call_check_trav(Node* node, ScanEnv* env, int state) if (! 
NODE_IS_RECURSION(node)) { NODE_STATUS_ADD(node, MARK1); r = recursive_call_check(NODE_BODY(node)); - if (r != 0) + if (r != 0) { NODE_STATUS_ADD(node, RECURSION); + MEM_STATUS_ON(env->backtrack_mem, en->m.regnum); + } NODE_STATUS_REMOVE(node, MARK1); } @@ -3508,6 +3861,96 @@ recursive_call_check_trav(Node* node, ScanEnv* env, int state) #endif +static void +remove_from_list(Node* prev, Node* a) +{ + if (NODE_CDR(prev) != a) return ; + + NODE_CDR(prev) = NODE_CDR(a); + NODE_CDR(a) = NULL_NODE; +} + +static int +reduce_string_list(Node* node) +{ + int r = 0; + + switch (NODE_TYPE(node)) { + case NODE_LIST: + { + Node* prev; + Node* curr; + Node* prev_node; + Node* next_node; + + prev = NULL_NODE; + do { + next_node = NODE_CDR(node); + curr = NODE_CAR(node); + if (NODE_TYPE(curr) == NODE_STRING) { + if (IS_NULL(prev) || STR_(curr)->flag != STR_(prev)->flag) { + prev = curr; + prev_node = node; + } + else { + r = node_str_node_cat(prev, curr); + if (r != 0) return r; + remove_from_list(prev_node, node); + onig_node_free(node); + } + } + else { + prev = NULL_NODE; + prev_node = node; + } + + node = next_node; + } while (r == 0 && IS_NOT_NULL(node)); + } + break; + + case NODE_ALT: + do { + r = reduce_string_list(NODE_CAR(node)); + } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node))); + break; + + case NODE_ANCHOR: + if (IS_NULL(NODE_BODY(node))) + break; + /* fall */ + case NODE_QUANT: + r = reduce_string_list(NODE_BODY(node)); + break; + + case NODE_BAG: + { + BagNode* en = BAG_(node); + + r = reduce_string_list(NODE_BODY(node)); + if (r != 0) return r; + + if (en->type == BAG_IF_ELSE) { + if (IS_NOT_NULL(en->te.Then)) { + r = reduce_string_list(en->te.Then); + if (r != 0) return r; + } + if (IS_NOT_NULL(en->te.Else)) { + r = reduce_string_list(en->te.Else); + if (r != 0) return r; + } + } + } + break; + + default: + break; + } + + return r; +} + + #define IN_ALT (1<<0) #define IN_NOT (1<<1) #define IN_REAL_REPEAT (1<<2) @@ -3530,7 +3973,7 @@ 
divide_look_behind_alternatives(Node* node) head = NODE_ANCHOR_BODY(an); np = NODE_CAR(head); - swap_node(node, head); + node_swap(node, head); NODE_CAR(node) = head; NODE_BODY(head) = np; @@ -3552,7 +3995,7 @@ divide_look_behind_alternatives(Node* node) } static int -setup_look_behind(Node* node, regex_t* reg, ScanEnv* env) +tune_look_behind(Node* node, regex_t* reg, ScanEnv* env) { int r, len; AnchorNode* an = ANCHOR_(node); @@ -3573,7 +4016,7 @@ setup_look_behind(Node* node, regex_t* reg, ScanEnv* env) } static int -next_setup(Node* node, Node* next_node, regex_t* reg) +tune_next(Node* node, Node* next_node, regex_t* reg) { NodeType type; @@ -3581,7 +4024,7 @@ next_setup(Node* node, Node* next_node, regex_t* reg) type = NODE_TYPE(node); if (type == NODE_QUANT) { QuantNode* qn = QUANT_(node); - if (qn->greedy && IS_REPEAT_INFINITE(qn->upper)) { + if (qn->greedy && IS_INFINITE_REPEAT(qn->upper)) { #ifdef USE_QUANT_PEEK_NEXT Node* n = get_head_value_node(next_node, 1, reg); /* '\0': for UTF-16BE etc... 
*/ @@ -3591,7 +4034,7 @@ next_setup(Node* node, Node* next_node, regex_t* reg) #endif /* automatic posseivation a*b ==> (?>a*)b */ if (qn->lower <= 1) { - if (NODE_IS_SIMPLE_TYPE(NODE_BODY(node))) { + if (is_strict_real_node(NODE_BODY(node))) { Node *x, *y; x = get_head_value_node(NODE_BODY(node), 0, reg); if (IS_NOT_NULL(x)) { @@ -3599,8 +4042,8 @@ next_setup(Node* node, Node* next_node, regex_t* reg) if (IS_NOT_NULL(y) && is_exclusive(x, y, reg)) { Node* en = onig_node_new_bag(BAG_STOP_BACKTRACK); CHECK_NULL_RETURN_MEMERR(en); - NODE_STATUS_ADD(en, STOP_BT_SIMPLE_REPEAT); - swap_node(node, en); + NODE_STATUS_ADD(en, STRICT_REAL_REPEAT); + node_swap(node, en); NODE_BODY(node) = en; } } @@ -3620,23 +4063,57 @@ next_setup(Node* node, Node* next_node, regex_t* reg) static int -update_string_node_case_fold(regex_t* reg, Node *node) +is_all_code_len_1_items(int n, OnigCaseFoldCodeItem items[]) { - UChar *p, *end, buf[ONIGENC_MBC_CASE_FOLD_MAXLEN]; + int i; + + for (i = 0; i < n; i++) { + OnigCaseFoldCodeItem* item = items + i; + if (item->code_len != 1) return 0; + } + + return 1; +} + +static int +get_min_max_byte_len_case_fold_items(int n, OnigCaseFoldCodeItem items[], int* rmin, int* rmax) +{ + int i, len, minlen, maxlen; + + minlen = INT_MAX; + maxlen = 0; + for (i = 0; i < n; i++) { + OnigCaseFoldCodeItem* item = items + i; + + len = item->byte_len; + if (len < minlen) minlen = len; + if (len > maxlen) maxlen = len; + } + + *rmin = minlen; + *rmax = maxlen; + return 0; +} + +static int +conv_string_case_fold(OnigEncoding enc, OnigCaseFoldType case_fold_flag, + UChar* s, UChar* end, UChar** rs, UChar** rend, int* rcase_min_len) +{ + UChar *p, buf[ONIGENC_MBC_CASE_FOLD_MAXLEN]; UChar *sbuf, *ebuf, *sp; - int r, i, len, sbuf_size; - StrNode* sn = STR_(node); + int i, n, len, sbuf_size; - end = sn->end; - sbuf_size = (int )(end - sn->s) * 2; + *rs = NULL; + sbuf_size = (int )(end - s) * 2; sbuf = (UChar* )xmalloc(sbuf_size); CHECK_NULL_RETURN_MEMERR(sbuf); ebuf = sbuf 
+ sbuf_size; + n = 0; sp = sbuf; - p = sn->s; + p = s; while (p < end) { - len = ONIGENC_MBC_CASE_FOLD(reg->enc, reg->case_fold_flag, &p, end, buf); + len = ONIGENC_MBC_CASE_FOLD(enc, case_fold_flag, &p, end, buf); for (i = 0; i < len; i++) { if (sp >= ebuf) { sbuf = (UChar* )xrealloc(sbuf, sbuf_size * 2); @@ -3648,364 +4125,310 @@ update_string_node_case_fold(regex_t* reg, Node *node) *sp++ = buf[i]; } + n++; } - r = onig_node_str_set(node, sbuf, sp); - if (r != 0) { - xfree(sbuf); - return r; - } - - xfree(sbuf); + *rs = sbuf; + *rend = sp; + *rcase_min_len = n; return 0; } static int -expand_case_fold_make_rem_string(Node** rnode, UChar *s, UChar *end, regex_t* reg) +make_code_list_to_string(Node** rnode, OnigEncoding enc, + int n, OnigCodePoint codes[]) { - int r; - Node *node; + int r, i, len; + Node* node; + UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN]; - node = onig_node_new_str(s, end); - if (IS_NULL(node)) return ONIGERR_MEMORY; + *rnode = NULL_NODE; + node = onig_node_new_str(NULL, NULL); + CHECK_NULL_RETURN_MEMERR(node); - r = update_string_node_case_fold(reg, node); - if (r != 0) { - onig_node_free(node); - return r; + for (i = 0; i < n; i++) { + len = ONIGENC_CODE_TO_MBC(enc, codes[i], buf); + if (len < 0) { + r = len; + goto err; + } + + r = onig_node_str_cat(node, buf, buf + len); + if (r != 0) goto err; } - NODE_STRING_SET_AMBIG(node); - NODE_STRING_SET_DONT_GET_OPT_INFO(node); *rnode = node; return 0; + + err: + onig_node_free(node); + return r; } static int -expand_case_fold_string_alt(int item_num, OnigCaseFoldCodeItem items[], UChar *p, - int slen, UChar *end, regex_t* reg, Node **rnode) +unravel_cf_node_add(Node** rlist, Node* add) { - int r, i, j; - int len; - int varlen; - Node *anode, *var_anode, *snode, *xnode, *an; - UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN]; + Node *list; - *rnode = var_anode = NULL_NODE; - - varlen = 0; - for (i = 0; i < item_num; i++) { - if (items[i].byte_len != slen) { - varlen = 1; - break; - } + list = *rlist; + if 
(IS_NULL(list)) { + list = onig_node_new_list(add, NULL); + CHECK_NULL_RETURN_MEMERR(list); + *rlist = list; + } + else { + Node* r = node_list_add(list, add); + CHECK_NULL_RETURN_MEMERR(r); } - if (varlen != 0) { - *rnode = var_anode = onig_node_new_alt(NULL_NODE, NULL_NODE); - if (IS_NULL(var_anode)) return ONIGERR_MEMORY; + return 0; +} + +static int +unravel_cf_string_add(Node** rlist, Node** rsn, UChar* s, UChar* end, + unsigned int flag, int case_min_len) +{ + int r; + Node *sn, *list; - xnode = onig_node_new_list(NULL, NULL); - if (IS_NULL(xnode)) goto mem_err; - NODE_CAR(var_anode) = xnode; + list = *rlist; + sn = *rsn; - anode = onig_node_new_alt(NULL_NODE, NULL_NODE); - if (IS_NULL(anode)) goto mem_err; - NODE_CAR(xnode) = anode; + if (IS_NOT_NULL(sn) && STR_(sn)->flag == flag) { + if (NODE_STRING_IS_CASE_FOLD_MATCH(sn)) + r = node_str_cat_case_fold(sn, s, end, case_min_len); + else + r = onig_node_str_cat(sn, s, end); } else { - *rnode = anode = onig_node_new_alt(NULL_NODE, NULL_NODE); - if (IS_NULL(anode)) return ONIGERR_MEMORY; + sn = onig_node_new_str(s, end); + CHECK_NULL_RETURN_MEMERR(sn); + + STR_(sn)->flag = flag; + STR_(sn)->case_min_len = case_min_len; + r = unravel_cf_node_add(&list, sn); } - snode = onig_node_new_str(p, p + slen); - if (IS_NULL(snode)) goto mem_err; + if (r == 0) { + *rlist = list; + *rsn = sn; + } + return r; +} - NODE_CAR(anode) = snode; +static int +unravel_cf_string_fold_add(Node** rlist, Node** rsn, OnigEncoding enc, + OnigCaseFoldType case_fold_flag, UChar* s, UChar* end) +{ + int r; + int case_min_len; + UChar *rs, *rend; - for (i = 0; i < item_num; i++) { - snode = onig_node_new_str(NULL, NULL); - if (IS_NULL(snode)) goto mem_err; + r = conv_string_case_fold(enc, case_fold_flag, s, end, + &rs, &rend, &case_min_len); + if (r != 0) return r; - for (j = 0; j < items[i].code_len; j++) { - len = ONIGENC_CODE_TO_MBC(reg->enc, items[i].code[j], buf); - if (len < 0) { - r = len; - goto mem_err2; - } + r = 
unravel_cf_string_add(rlist, rsn, rs, rend, + NODE_STRING_CASE_FOLD_MATCH, case_min_len); + xfree(rs); - r = onig_node_str_cat(snode, buf, buf + len); - if (r != 0) goto mem_err2; - } + return r; +} - an = onig_node_new_alt(NULL_NODE, NULL_NODE); - if (IS_NULL(an)) { - goto mem_err2; - } +static int +unravel_cf_string_alt_or_cc_add(Node** rlist, int n, + OnigCaseFoldCodeItem items[], int byte_len, OnigEncoding enc, + OnigCaseFoldType case_fold_flag, UChar* s, UChar* end) +{ + int r, i; + Node* node; - if (items[i].byte_len != slen && IS_NOT_NULL(var_anode)) { - Node *rem; - UChar *q = p + items[i].byte_len; + if (is_all_code_len_1_items(n, items)) { + OnigCodePoint codes[14];/* least ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM + 1 */ - if (q < end) { - r = expand_case_fold_make_rem_string(&rem, q, end, reg); - if (r != 0) { - onig_node_free(an); - goto mem_err2; - } + codes[0] = ONIGENC_MBC_TO_CODE(enc, s, end); + for (i = 0; i < n; i++) { + OnigCaseFoldCodeItem* item = items + i; + codes[i+1] = item->code[0]; + } + r = onig_new_cclass_with_code_list(&node, enc, n + 1, codes); + if (r != 0) return r; + } + else { + Node *snode, *alt, *curr; - xnode = onig_node_list_add(NULL_NODE, snode); - if (IS_NULL(xnode)) { - onig_node_free(an); - onig_node_free(rem); - goto mem_err2; - } - if (IS_NULL(onig_node_list_add(xnode, rem))) { - onig_node_free(an); - onig_node_free(xnode); - onig_node_free(rem); - goto mem_err; - } + snode = onig_node_new_str(s, end); + CHECK_NULL_RETURN_MEMERR(snode); + node = curr = onig_node_new_alt(snode, NULL_NODE); + if (IS_NULL(curr)) { + onig_node_free(snode); + return ONIGERR_MEMORY; + } - NODE_CAR(an) = xnode; + r = 0; + for (i = 0; i < n; i++) { + OnigCaseFoldCodeItem* item = items + i; + r = make_code_list_to_string(&snode, enc, item->code_len, item->code); + if (r != 0) { + onig_node_free(node); + return r; } - else { - NODE_CAR(an) = snode; + + alt = onig_node_new_alt(snode, NULL_NODE); + if (IS_NULL(alt)) { + onig_node_free(snode); + 
onig_node_free(node); + return ONIGERR_MEMORY; } - NODE_CDR(var_anode) = an; - var_anode = an; - } - else { - NODE_CAR(an) = snode; - NODE_CDR(anode) = an; - anode = an; + NODE_CDR(curr) = alt; + curr = alt; } } - return varlen; - - mem_err2: - onig_node_free(snode); - - mem_err: - onig_node_free(*rnode); - - return ONIGERR_MEMORY; + r = unravel_cf_node_add(rlist, node); + if (r != 0) onig_node_free(node); + return r; } static int -is_good_case_fold_items_for_search(OnigEncoding enc, int slen, - int n, OnigCaseFoldCodeItem items[]) +unravel_cf_look_behind_add(Node** rlist, Node** rsn, + int n, OnigCaseFoldCodeItem items[], OnigEncoding enc, + UChar* s, int one_len) { - int i, len; - UChar buf[ONIGENC_MBC_CASE_FOLD_MAXLEN]; + int r, i, found; + found = 0; for (i = 0; i < n; i++) { OnigCaseFoldCodeItem* item = items + i; + if (item->byte_len == one_len) { + if (item->code_len == 1) { + found = 1; + } + } + } - if (item->code_len != 1) return 0; - if (item->byte_len != slen) return 0; - len = ONIGENC_CODE_TO_MBC(enc, item->code[0], buf); - if (len != slen) return 0; + if (found == 0) { + r = unravel_cf_string_add(rlist, rsn, s, s + one_len, 0 /* flag */, 0); } + else { + Node* node; + OnigCodePoint codes[14];/* least ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM + 1 */ - return 1; -} + found = 0; + codes[found++] = ONIGENC_MBC_TO_CODE(enc, s, s + one_len); + for (i = 0; i < n; i++) { + OnigCaseFoldCodeItem* item = items + i; + if (item->byte_len == one_len) { + if (item->code_len == 1) { + codes[found++] = item->code[0]; + } + } + } + r = onig_new_cclass_with_code_list(&node, enc, found, codes); + if (r != 0) return r; + + r = unravel_cf_node_add(rlist, node); + if (r != 0) onig_node_free(node); + + *rsn = NULL_NODE; + } -#define THRESHOLD_CASE_FOLD_ALT_FOR_EXPANSION 8 + return r; +} static int -expand_case_fold_string(Node* node, regex_t* reg, int state) -{ - int r, n, len, alt_num; - int fold_len; - int prev_is_ambig, prev_is_good, is_good, is_in_look_behind; - UChar *start, 
*end, *p; - UChar* foldp; - Node *top_root, *root, *snode, *prev_node; +unravel_case_fold_string(Node* node, regex_t* reg, int state) +{ + int r, n, one_len, min_len, max_len, in_look_behind; + UChar *start, *end, *p, *q; + StrNode* snode; + Node *sn, *list; + OnigEncoding enc; OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM]; - UChar buf[ONIGENC_MBC_CASE_FOLD_MAXLEN]; - StrNode* sn; - if (NODE_STRING_IS_AMBIG(node)) return 0; + if (NODE_STRING_IS_CASE_EXPANDED(node)) return 0; - sn = STR_(node); + snode = STR_(node); - start = sn->s; - end = sn->end; + start = snode->s; + end = snode->end; if (start >= end) return 0; - is_in_look_behind = (state & IN_LOOK_BEHIND) != 0; + in_look_behind = (state & IN_LOOK_BEHIND) != 0; + enc = reg->enc; - r = 0; - top_root = root = prev_node = snode = NULL_NODE; - alt_num = 1; + list = sn = NULL_NODE; p = start; while (p < end) { - n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(reg->enc, reg->case_fold_flag, - p, end, items); + n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, reg->case_fold_flag, p, end, + items); if (n < 0) { r = n; goto err; } - len = enclen(reg->enc, p); - is_good = is_good_case_fold_items_for_search(reg->enc, len, n, items); - - if (is_in_look_behind || - (IS_NOT_NULL(snode) || - (is_good - /* expand single char case: ex. 
/(?i:a)/ */ - && !(p == start && p + len >= end)))) { - if (IS_NULL(snode)) { - if (IS_NULL(root) && IS_NOT_NULL(prev_node)) { - top_root = root = onig_node_list_add(NULL_NODE, prev_node); - if (IS_NULL(root)) { - onig_node_free(prev_node); - goto mem_err; - } - } - - prev_node = snode = onig_node_new_str(NULL, NULL); - if (IS_NULL(snode)) goto mem_err; - if (IS_NOT_NULL(root)) { - if (IS_NULL(onig_node_list_add(root, snode))) { - onig_node_free(snode); - goto mem_err; - } - } - - prev_is_ambig = -1; /* -1: new */ - prev_is_good = 0; /* escape compiler warning */ - } - else { - prev_is_ambig = NODE_STRING_IS_AMBIG(snode); - prev_is_good = NODE_STRING_IS_GOOD_AMBIG(snode); - } - - if (n != 0) { - foldp = p; - fold_len = ONIGENC_MBC_CASE_FOLD(reg->enc, reg->case_fold_flag, - &foldp, end, buf); - foldp = buf; - } - else { - foldp = p; fold_len = len; - } - - if ((prev_is_ambig == 0 && n != 0) || - (prev_is_ambig > 0 && (n == 0 || prev_is_good != is_good))) { - if (IS_NULL(root) /* && IS_NOT_NULL(prev_node) */) { - top_root = root = onig_node_list_add(NULL_NODE, prev_node); - if (IS_NULL(root)) { - onig_node_free(prev_node); - goto mem_err; - } - } - - prev_node = snode = onig_node_new_str(foldp, foldp + fold_len); - if (IS_NULL(snode)) goto mem_err; - if (IS_NULL(onig_node_list_add(root, snode))) { - onig_node_free(snode); - goto mem_err; - } - } - else { - r = onig_node_str_cat(snode, foldp, foldp + fold_len); - if (r != 0) goto err; - } - - if (n != 0) NODE_STRING_SET_AMBIG(snode); - if (is_good != 0) NODE_STRING_SET_GOOD_AMBIG(snode); + one_len = enclen(enc, p); + if (n == 0) { + q = p + one_len; + r = unravel_cf_string_add(&list, &sn, p, q, 0 /* flag */, 0); + if (r != 0) goto err; } else { - alt_num *= (n + 1); - if (alt_num > THRESHOLD_CASE_FOLD_ALT_FOR_EXPANSION) break; - - if (IS_NULL(root) && IS_NOT_NULL(prev_node)) { - top_root = root = onig_node_list_add(NULL_NODE, prev_node); - if (IS_NULL(root)) { - onig_node_free(prev_node); - goto mem_err; - } + if 
(in_look_behind != 0) { + q = p + one_len; + r = unravel_cf_look_behind_add(&list, &sn, n, items, enc, p, one_len); + if (r != 0) goto err; } - - r = expand_case_fold_string_alt(n, items, p, len, end, reg, &prev_node); - if (r < 0) goto mem_err; - if (r == 1) { - if (IS_NULL(root)) { - top_root = prev_node; + else { + get_min_max_byte_len_case_fold_items(n, items, &min_len, &max_len); + q = p + max_len; + if (one_len == max_len && min_len == max_len) { + r = unravel_cf_string_alt_or_cc_add(&list, n, items, max_len, enc, + reg->case_fold_flag, p, q); + if (r != 0) goto err; + sn = NULL_NODE; } else { - if (IS_NULL(onig_node_list_add(root, prev_node))) { - onig_node_free(prev_node); - goto mem_err; - } - } - - root = NODE_CAR(prev_node); - } - else { /* r == 0 */ - if (IS_NOT_NULL(root)) { - if (IS_NULL(onig_node_list_add(root, prev_node))) { - onig_node_free(prev_node); - goto mem_err; - } + r = unravel_cf_string_fold_add(&list, &sn, enc, reg->case_fold_flag, + p, q); + if (r != 0) goto err; } } - - snode = NULL_NODE; } - p += len; + p = q; } - if (p < end) { - Node *srem; - - r = expand_case_fold_make_rem_string(&srem, p, end, reg); - if (r != 0) goto mem_err; - - if (IS_NOT_NULL(prev_node) && IS_NULL(root)) { - top_root = root = onig_node_list_add(NULL_NODE, prev_node); - if (IS_NULL(root)) { - onig_node_free(srem); - onig_node_free(prev_node); - goto mem_err; - } - } - - if (IS_NULL(root)) { - prev_node = srem; + if (IS_NOT_NULL(list)) { + if (node_list_len(list) == 1) { + node_swap(node, NODE_CAR(list)); } else { - if (IS_NULL(onig_node_list_add(root, srem))) { - onig_node_free(srem); - goto mem_err; - } + node_swap(node, list); } + onig_node_free(list); + } + else { + node_swap(node, sn); + onig_node_free(sn); } - - /* ending */ - top_root = (IS_NOT_NULL(top_root) ? 
top_root : prev_node); - swap_node(node, top_root); - onig_node_free(top_root); return 0; - mem_err: - r = ONIGERR_MEMORY; - err: - onig_node_free(top_root); + if (IS_NOT_NULL(list)) + onig_node_free(list); + else if (IS_NOT_NULL(sn)) + onig_node_free(sn); + return r; } -#ifdef USE_INSISTENT_CHECK_CAPTURES_IN_EMPTY_REPEAT -static enum BodyEmpty +#ifdef USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT +static enum BodyEmptyType quantifiers_memory_node_info(Node* node) { - int r = BODY_IS_EMPTY; + int r = BODY_IS_EMPTY_POSSIBILITY; switch (NODE_TYPE(node)) { case NODE_LIST: @@ -4022,7 +4445,7 @@ quantifiers_memory_node_info(Node* node) #ifdef USE_CALL case NODE_CALL: if (NODE_IS_RECURSION(node)) { - return BODY_IS_EMPTY_REC; /* tiny version */ + return BODY_IS_EMPTY_POSSIBILITY_REC; /* tiny version */ } else r = quantifiers_memory_node_info(NODE_BODY(node)); @@ -4044,9 +4467,9 @@ quantifiers_memory_node_info(Node* node) switch (en->type) { case BAG_MEMORY: if (NODE_IS_RECURSION(node)) { - return BODY_IS_EMPTY_REC; + return BODY_IS_EMPTY_POSSIBILITY_REC; } - return BODY_IS_EMPTY_MEM; + return BODY_IS_EMPTY_POSSIBILITY_MEM; break; case BAG_OPTION: @@ -4083,7 +4506,7 @@ quantifiers_memory_node_info(Node* node) return r; } -#endif /* USE_INSISTENT_CHECK_CAPTURES_IN_EMPTY_REPEAT */ +#endif /* USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT */ #ifdef USE_CALL @@ -4092,7 +4515,7 @@ quantifiers_memory_node_info(Node* node) __inline #endif static int -setup_call_node_call(CallNode* cn, ScanEnv* env, int state) +tune_call_node_call(CallNode* cn, ScanEnv* env, int state) { MemEnv* mem_env = SCANENV_MEMENV(env); @@ -4112,7 +4535,7 @@ setup_call_node_call(CallNode* cn, ScanEnv* env, int state) } set_call_attr: - NODE_CALL_BODY(cn) = mem_env[cn->group_num].node; + NODE_CALL_BODY(cn) = mem_env[cn->group_num].mem_node; if (IS_NULL(NODE_CALL_BODY(cn))) { onig_scan_env_set_error_string(env, ONIGERR_UNDEFINED_NAME_REFERENCE, cn->name, cn->name_end); @@ -4143,23 +4566,23 @@ 
setup_call_node_call(CallNode* cn, ScanEnv* env, int state) } static void -setup_call2_call(Node* node) +tune_call2_call(Node* node) { switch (NODE_TYPE(node)) { case NODE_LIST: case NODE_ALT: do { - setup_call2_call(NODE_CAR(node)); + tune_call2_call(NODE_CAR(node)); } while (IS_NOT_NULL(node = NODE_CDR(node))); break; case NODE_QUANT: - setup_call2_call(NODE_BODY(node)); + tune_call2_call(NODE_BODY(node)); break; case NODE_ANCHOR: if (ANCHOR_HAS_BODY(ANCHOR_(node))) - setup_call2_call(NODE_BODY(node)); + tune_call2_call(NODE_BODY(node)); break; case NODE_BAG: @@ -4169,19 +4592,19 @@ setup_call2_call(Node* node) if (en->type == BAG_MEMORY) { if (! NODE_IS_MARK1(node)) { NODE_STATUS_ADD(node, MARK1); - setup_call2_call(NODE_BODY(node)); + tune_call2_call(NODE_BODY(node)); NODE_STATUS_REMOVE(node, MARK1); } } else if (en->type == BAG_IF_ELSE) { - setup_call2_call(NODE_BODY(node)); + tune_call2_call(NODE_BODY(node)); if (IS_NOT_NULL(en->te.Then)) - setup_call2_call(en->te.Then); + tune_call2_call(en->te.Then); if (IS_NOT_NULL(en->te.Else)) - setup_call2_call(en->te.Else); + tune_call2_call(en->te.Else); } else { - setup_call2_call(NODE_BODY(node)); + tune_call2_call(NODE_BODY(node)); } } break; @@ -4197,7 +4620,7 @@ setup_call2_call(Node* node) NODE_STATUS_ADD(called, CALLED); BAG_(called)->m.entry_count++; - setup_call2_call(called); + tune_call2_call(called); } NODE_STATUS_REMOVE(node, MARK1); } @@ -4209,7 +4632,7 @@ setup_call2_call(Node* node) } static int -setup_call(Node* node, ScanEnv* env, int state) +tune_call(Node* node, ScanEnv* env, int state) { int r; @@ -4217,7 +4640,7 @@ setup_call(Node* node, ScanEnv* env, int state) case NODE_LIST: case NODE_ALT: do { - r = setup_call(NODE_CAR(node), env, state); + r = tune_call(NODE_CAR(node), env, state); } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node))); break; @@ -4225,12 +4648,12 @@ setup_call(Node* node, ScanEnv* env, int state) if (QUANT_(node)->upper == 0) state |= IN_ZERO_REPEAT; - r = 
setup_call(NODE_BODY(node), env, state); + r = tune_call(NODE_BODY(node), env, state); break; case NODE_ANCHOR: if (ANCHOR_HAS_BODY(ANCHOR_(node))) - r = setup_call(NODE_BODY(node), env, state); + r = tune_call(NODE_BODY(node), env, state); else r = 0; break; @@ -4244,20 +4667,20 @@ setup_call(Node* node, ScanEnv* env, int state) NODE_STATUS_ADD(node, IN_ZERO_REPEAT); BAG_(node)->m.entry_count--; } - r = setup_call(NODE_BODY(node), env, state); + r = tune_call(NODE_BODY(node), env, state); } else if (en->type == BAG_IF_ELSE) { - r = setup_call(NODE_BODY(node), env, state); + r = tune_call(NODE_BODY(node), env, state); if (r != 0) return r; if (IS_NOT_NULL(en->te.Then)) { - r = setup_call(en->te.Then, env, state); + r = tune_call(en->te.Then, env, state); if (r != 0) return r; } if (IS_NOT_NULL(en->te.Else)) - r = setup_call(en->te.Else, env, state); + r = tune_call(en->te.Else, env, state); } else - r = setup_call(NODE_BODY(node), env, state); + r = tune_call(NODE_BODY(node), env, state); } break; @@ -4267,7 +4690,7 @@ setup_call(Node* node, ScanEnv* env, int state) CALL_(node)->entry_count--; } - r = setup_call_node_call(CALL_(node), env, state); + r = tune_call_node_call(CALL_(node), env, state); break; default: @@ -4279,7 +4702,7 @@ setup_call(Node* node, ScanEnv* env, int state) } static int -setup_call2(Node* node) +tune_call2(Node* node) { int r = 0; @@ -4287,23 +4710,23 @@ setup_call2(Node* node) case NODE_LIST: case NODE_ALT: do { - r = setup_call2(NODE_CAR(node)); + r = tune_call2(NODE_CAR(node)); } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node))); break; case NODE_QUANT: if (QUANT_(node)->upper != 0) - r = setup_call2(NODE_BODY(node)); + r = tune_call2(NODE_BODY(node)); break; case NODE_ANCHOR: if (ANCHOR_HAS_BODY(ANCHOR_(node))) - r = setup_call2(NODE_BODY(node)); + r = tune_call2(NODE_BODY(node)); break; case NODE_BAG: if (! 
NODE_IS_IN_ZERO_REPEAT(node)) - r = setup_call2(NODE_BODY(node)); + r = tune_call2(NODE_BODY(node)); { BagNode* en = BAG_(node); @@ -4311,18 +4734,18 @@ setup_call2(Node* node) if (r != 0) return r; if (en->type == BAG_IF_ELSE) { if (IS_NOT_NULL(en->te.Then)) { - r = setup_call2(en->te.Then); + r = tune_call2(en->te.Then); if (r != 0) return r; } if (IS_NOT_NULL(en->te.Else)) - r = setup_call2(en->te.Else); + r = tune_call2(en->te.Else); } } break; case NODE_CALL: if (! NODE_IS_IN_ZERO_REPEAT(node)) { - setup_call2_call(node); + tune_call2_call(node); } break; @@ -4335,7 +4758,7 @@ setup_call2(Node* node) static void -setup_called_state_call(Node* node, int state) +tune_called_state_call(Node* node, int state) { switch (NODE_TYPE(node)) { case NODE_ALT: @@ -4343,7 +4766,7 @@ setup_called_state_call(Node* node, int state) /* fall */ case NODE_LIST: do { - setup_called_state_call(NODE_CAR(node), state); + tune_called_state_call(NODE_CAR(node), state); } while (IS_NOT_NULL(node = NODE_CDR(node))); break; @@ -4351,12 +4774,12 @@ setup_called_state_call(Node* node, int state) { QuantNode* qn = QUANT_(node); - if (IS_REPEAT_INFINITE(qn->upper) || qn->upper >= 2) + if (IS_INFINITE_REPEAT(qn->upper) || qn->upper >= 2) state |= IN_REAL_REPEAT; if (qn->lower != qn->upper) state |= IN_VAR_REPEAT; - setup_called_state_call(NODE_QUANT_BODY(qn), state); + tune_called_state_call(NODE_QUANT_BODY(qn), state); } break; @@ -4371,7 +4794,7 @@ setup_called_state_call(Node* node, int state) /* fall */ case ANCR_PREC_READ: case ANCR_LOOK_BEHIND: - setup_called_state_call(NODE_ANCHOR_BODY(an), state); + tune_called_state_call(NODE_ANCHOR_BODY(an), state); break; default: break; @@ -4387,31 +4810,33 @@ setup_called_state_call(Node* node, int state) if (NODE_IS_MARK1(node)) { if ((~en->m.called_state & state) != 0) { en->m.called_state |= state; - setup_called_state_call(NODE_BODY(node), state); + tune_called_state_call(NODE_BODY(node), state); } } else { NODE_STATUS_ADD(node, MARK1); 
en->m.called_state |= state; - setup_called_state_call(NODE_BODY(node), state); + tune_called_state_call(NODE_BODY(node), state); NODE_STATUS_REMOVE(node, MARK1); } } else if (en->type == BAG_IF_ELSE) { + state |= IN_ALT; + tune_called_state_call(NODE_BODY(node), state); if (IS_NOT_NULL(en->te.Then)) { - setup_called_state_call(en->te.Then, state); + tune_called_state_call(en->te.Then, state); } if (IS_NOT_NULL(en->te.Else)) - setup_called_state_call(en->te.Else, state); + tune_called_state_call(en->te.Else, state); } else { - setup_called_state_call(NODE_BODY(node), state); + tune_called_state_call(NODE_BODY(node), state); } } break; case NODE_CALL: - setup_called_state_call(NODE_BODY(node), state); + tune_called_state_call(NODE_BODY(node), state); break; default: @@ -4420,7 +4845,7 @@ setup_called_state_call(Node* node, int state) } static void -setup_called_state(Node* node, int state) +tune_called_state(Node* node, int state) { switch (NODE_TYPE(node)) { case NODE_ALT: @@ -4428,13 +4853,13 @@ setup_called_state(Node* node, int state) /* fall */ case NODE_LIST: do { - setup_called_state(NODE_CAR(node), state); + tune_called_state(NODE_CAR(node), state); } while (IS_NOT_NULL(node = NODE_CDR(node))); break; #ifdef USE_CALL case NODE_CALL: - setup_called_state_call(node, state); + tune_called_state_call(node, state); break; #endif @@ -4451,14 +4876,15 @@ setup_called_state(Node* node, int state) /* fall */ case BAG_OPTION: case BAG_STOP_BACKTRACK: - setup_called_state(NODE_BODY(node), state); + tune_called_state(NODE_BODY(node), state); break; case BAG_IF_ELSE: - setup_called_state(NODE_BODY(node), state); + state |= IN_ALT; + tune_called_state(NODE_BODY(node), state); if (IS_NOT_NULL(en->te.Then)) - setup_called_state(en->te.Then, state); + tune_called_state(en->te.Then, state); if (IS_NOT_NULL(en->te.Else)) - setup_called_state(en->te.Else, state); + tune_called_state(en->te.Else, state); break; } } @@ -4468,12 +4894,12 @@ setup_called_state(Node* node, int 
state) { QuantNode* qn = QUANT_(node); - if (IS_REPEAT_INFINITE(qn->upper) || qn->upper >= 2) + if (IS_INFINITE_REPEAT(qn->upper) || qn->upper >= 2) state |= IN_REAL_REPEAT; if (qn->lower != qn->upper) state |= IN_VAR_REPEAT; - setup_called_state(NODE_QUANT_BODY(qn), state); + tune_called_state(NODE_QUANT_BODY(qn), state); } break; @@ -4488,7 +4914,7 @@ setup_called_state(Node* node, int state) /* fall */ case ANCR_PREC_READ: case ANCR_LOOK_BEHIND: - setup_called_state(NODE_ANCHOR_BODY(an), state); + tune_called_state(NODE_ANCHOR_BODY(an), state); break; default: break; @@ -4509,13 +4935,13 @@ setup_called_state(Node* node, int state) #endif /* USE_CALL */ -static int setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env); +static int tune_tree(Node* node, regex_t* reg, int state, ScanEnv* env); #ifdef __GNUC__ __inline #endif static int -setup_anchor(Node* node, regex_t* reg, int state, ScanEnv* env) +tune_anchor(Node* node, regex_t* reg, int state, ScanEnv* env) { /* allowed node types in look-behind */ #define ALLOWED_TYPE_IN_LB \ @@ -4543,10 +4969,10 @@ setup_anchor(Node* node, regex_t* reg, int state, ScanEnv* env) switch (an->type) { case ANCR_PREC_READ: - r = setup_tree(NODE_ANCHOR_BODY(an), reg, state, env); + r = tune_tree(NODE_ANCHOR_BODY(an), reg, state, env); break; case ANCR_PREC_READ_NOT: - r = setup_tree(NODE_ANCHOR_BODY(an), reg, (state | IN_NOT), env); + r = tune_tree(NODE_ANCHOR_BODY(an), reg, (state | IN_NOT), env); break; case ANCR_LOOK_BEHIND: @@ -4555,9 +4981,9 @@ setup_anchor(Node* node, regex_t* reg, int state, ScanEnv* env) ALLOWED_BAG_IN_LB, ALLOWED_ANCHOR_IN_LB); if (r < 0) return r; if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN; - r = setup_tree(NODE_ANCHOR_BODY(an), reg, (state|IN_LOOK_BEHIND), env); + r = tune_tree(NODE_ANCHOR_BODY(an), reg, (state|IN_LOOK_BEHIND), env); if (r != 0) return r; - r = setup_look_behind(node, reg, env); + r = tune_look_behind(node, reg, env); } break; @@ -4567,10 +4993,10 @@ 
setup_anchor(Node* node, regex_t* reg, int state, ScanEnv* env) ALLOWED_BAG_IN_LB_NOT, ALLOWED_ANCHOR_IN_LB_NOT); if (r < 0) return r; if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN; - r = setup_tree(NODE_ANCHOR_BODY(an), reg, (state|IN_NOT|IN_LOOK_BEHIND), - env); + r = tune_tree(NODE_ANCHOR_BODY(an), reg, (state|IN_NOT|IN_LOOK_BEHIND), + env); if (r != 0) return r; - r = setup_look_behind(node, reg, env); + r = tune_look_behind(node, reg, env); } break; @@ -4586,7 +5012,7 @@ setup_anchor(Node* node, regex_t* reg, int state, ScanEnv* env) __inline #endif static int -setup_quant(Node* node, regex_t* reg, int state, ScanEnv* env) +tune_quant(Node* node, regex_t* reg, int state, ScanEnv* env) { int r; OnigLen d; @@ -4600,44 +5026,37 @@ setup_quant(Node* node, regex_t* reg, int state, ScanEnv* env) NODE_STATUS_ADD(node, IN_MULTI_ENTRY); } - if (IS_REPEAT_INFINITE(qn->upper) || qn->upper >= 1) { + if (IS_INFINITE_REPEAT(qn->upper) || qn->upper >= 1) { d = tree_min_len(body, env); if (d == 0) { -#ifdef USE_INSISTENT_CHECK_CAPTURES_IN_EMPTY_REPEAT - qn->empty_info = quantifiers_memory_node_info(body); - if (qn->empty_info == BODY_IS_EMPTY_REC) { - if (NODE_TYPE(body) == NODE_BAG && - BAG_(body)->type == BAG_MEMORY) { - MEM_STATUS_ON(env->bt_mem_end, BAG_(body)->m.regnum); - } - } +#ifdef USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT + qn->emptiness = quantifiers_memory_node_info(body); #else - qn->empty_info = BODY_IS_EMPTY; + qn->emptiness = BODY_IS_EMPTY_POSSIBILITY; #endif } } - if (IS_REPEAT_INFINITE(qn->upper) || qn->upper >= 2) + if (IS_INFINITE_REPEAT(qn->upper) || qn->upper >= 2) state |= IN_REAL_REPEAT; if (qn->lower != qn->upper) state |= IN_VAR_REPEAT; - r = setup_tree(body, reg, state, env); + r = tune_tree(body, reg, state, env); if (r != 0) return r; /* expand string */ #define EXPAND_STRING_MAX_LENGTH 100 if (NODE_TYPE(body) == NODE_STRING) { - if (!IS_REPEAT_INFINITE(qn->lower) && qn->lower == qn->upper && + if (!IS_INFINITE_REPEAT(qn->lower) && 
qn->lower == qn->upper && qn->lower > 1 && qn->lower <= EXPAND_STRING_MAX_LENGTH) { int len = NODE_STRING_LEN(body); - StrNode* sn = STR_(body); if (len * qn->lower <= EXPAND_STRING_MAX_LENGTH) { int i, n = qn->lower; - onig_node_conv_to_str_node(node, STR_(body)->flag); + node_conv_to_str_node(node, STR_(body)->flag); for (i = 0; i < n; i++) { - r = onig_node_str_cat(node, sn->s, sn->end); + r = node_str_node_cat(node, body); if (r != 0) return r; } onig_node_free(body); @@ -4646,7 +5065,7 @@ setup_quant(Node* node, regex_t* reg, int state, ScanEnv* env) } } - if (qn->greedy && (qn->empty_info == BODY_IS_NOT_EMPTY)) { + if (qn->greedy && (qn->emptiness == BODY_IS_NOT_EMPTY)) { if (NODE_TYPE(body) == NODE_QUANT) { QuantNode* tqn = QUANT_(body); if (IS_NOT_NULL(tqn->head_exact)) { @@ -4662,8 +5081,8 @@ setup_quant(Node* node, regex_t* reg, int state, ScanEnv* env) return r; } -/* setup_tree does the following work. - 1. check empty loop. (set qn->empty_info) +/* tune_tree does the following work. + 1. check empty loop. (set qn->emptiness) 2. expand ignore-case in char class. 3. set memory status bit flags. (reg->mem_stats) 4. set qn->head_exact for [push, exact] -> [push_or_jump_exact1, exact]. @@ -4671,7 +5090,7 @@ setup_quant(Node* node, regex_t* reg, int state, ScanEnv* env) 6. expand repeated string. 
*/ static int -setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env) +tune_tree(Node* node, regex_t* reg, int state, ScanEnv* env) { int r = 0; @@ -4680,9 +5099,9 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env) { Node* prev = NULL_NODE; do { - r = setup_tree(NODE_CAR(node), reg, state, env); + r = tune_tree(NODE_CAR(node), reg, state, env); if (IS_NOT_NULL(prev) && r == 0) { - r = next_setup(prev, NODE_CAR(node), reg); + r = tune_next(prev, NODE_CAR(node), reg); } prev = NODE_CAR(node); } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node))); @@ -4691,13 +5110,13 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env) case NODE_ALT: do { - r = setup_tree(NODE_CAR(node), reg, (state | IN_ALT), env); + r = tune_tree(NODE_CAR(node), reg, (state | IN_ALT), env); } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node))); break; case NODE_STRING: - if (IS_IGNORECASE(reg->options) && !NODE_STRING_IS_RAW(node)) { - r = expand_case_fold_string(node, reg, state); + if (IS_IGNORECASE(reg->options) && !NODE_STRING_IS_CRUDE(node)) { + r = unravel_case_fold_string(node, reg, state); } break; @@ -4710,12 +5129,18 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env) for (i = 0; i < br->back_num; i++) { if (p[i] > env->num_mem) return ONIGERR_INVALID_BACKREF; MEM_STATUS_ON(env->backrefed_mem, p[i]); - MEM_STATUS_ON(env->bt_mem_start, p[i]); +#if 0 #ifdef USE_BACKREF_WITH_LEVEL if (NODE_IS_NEST_LEVEL(node)) { - MEM_STATUS_ON(env->bt_mem_end, p[i]); + MEM_STATUS_ON(env->backtrack_mem, p[i]); } #endif +#else + /* More precisely, it should be checked whether alt/repeat exists before + the subject capture node, and then this backreference position + exists before (or in) the capture node. 
*/ + MEM_STATUS_ON(env->backtrack_mem, p[i]); +#endif } } break; @@ -4729,7 +5154,7 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env) { OnigOptionType options = reg->options; reg->options = BAG_(node)->o.options; - r = setup_tree(NODE_BODY(node), reg, state, env); + r = tune_tree(NODE_BODY(node), reg, state, env); reg->options = options; } break; @@ -4741,46 +5166,46 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env) if ((state & (IN_ALT | IN_NOT | IN_VAR_REPEAT | IN_MULTI_ENTRY)) != 0 || NODE_IS_RECURSION(node)) { - MEM_STATUS_ON(env->bt_mem_start, en->m.regnum); + MEM_STATUS_ON(env->backtrack_mem, en->m.regnum); } - r = setup_tree(NODE_BODY(node), reg, state, env); + r = tune_tree(NODE_BODY(node), reg, state, env); break; case BAG_STOP_BACKTRACK: { Node* target = NODE_BODY(node); - r = setup_tree(target, reg, state, env); + r = tune_tree(target, reg, state, env); if (NODE_TYPE(target) == NODE_QUANT) { QuantNode* tqn = QUANT_(target); - if (IS_REPEAT_INFINITE(tqn->upper) && tqn->lower <= 1 && + if (IS_INFINITE_REPEAT(tqn->upper) && tqn->lower <= 1 && tqn->greedy != 0) { /* (?>a*), a*+ etc... 
*/ - if (NODE_IS_SIMPLE_TYPE(NODE_BODY(target))) - NODE_STATUS_ADD(node, STOP_BT_SIMPLE_REPEAT); + if (is_strict_real_node(NODE_BODY(target))) + NODE_STATUS_ADD(node, STRICT_REAL_REPEAT); } } } break; case BAG_IF_ELSE: - r = setup_tree(NODE_BODY(node), reg, (state | IN_ALT), env); + r = tune_tree(NODE_BODY(node), reg, (state | IN_ALT), env); if (r != 0) return r; if (IS_NOT_NULL(en->te.Then)) { - r = setup_tree(en->te.Then, reg, (state | IN_ALT), env); + r = tune_tree(en->te.Then, reg, (state | IN_ALT), env); if (r != 0) return r; } if (IS_NOT_NULL(en->te.Else)) - r = setup_tree(en->te.Else, reg, (state | IN_ALT), env); + r = tune_tree(en->te.Else, reg, (state | IN_ALT), env); break; } } break; case NODE_QUANT: - r = setup_quant(node, reg, state, env); + r = tune_quant(node, reg, state, env); break; case NODE_ANCHOR: - r = setup_anchor(node, reg, state, env); + r = tune_anchor(node, reg, state, env); break; #ifdef USE_CALL @@ -4879,7 +5304,7 @@ typedef struct { } MinMax; typedef struct { - MinMax mmd; + MinMax mm; OnigEncoding enc; OnigOptionType options; OnigCaseFoldType case_fold_flag; @@ -4892,17 +5317,16 @@ typedef struct { } OptAnc; typedef struct { - MinMax mmd; /* position */ + MinMax mm; /* position */ OptAnc anc; int reach_end; int case_fold; - int good_case_fold; int len; UChar s[OPT_EXACT_MAXLEN]; } OptStr; typedef struct { - MinMax mmd; /* position */ + MinMax mm; /* position */ OptAnc anc; int value; /* weighted value */ UChar map[CHAR_MAP_SIZE]; @@ -5119,11 +5543,10 @@ is_full_opt_exact(OptStr* e) static void clear_opt_exact(OptStr* e) { - clear_mml(&e->mmd); + clear_mml(&e->mm); clear_opt_anc_info(&e->anc); e->reach_end = 0; e->case_fold = 0; - e->good_case_fold = 0; e->len = 0; e->s[0] = '\0'; } @@ -5147,11 +5570,6 @@ concat_opt_exact(OptStr* to, OptStr* add, OnigEncoding enc) to->case_fold = 1; } - else { - if (to->good_case_fold != 0) { - if (add->good_case_fold == 0) return 0; - } - } } r = 0; @@ -5206,7 +5624,7 @@ alt_merge_opt_exact(OptStr* to, 
OptStr* add, OptEnv* env) return ; } - if (! is_equal_mml(&to->mmd, &add->mmd)) { + if (! is_equal_mml(&to->mm, &add->mm)) { clear_opt_exact(to); return ; } @@ -5228,8 +5646,6 @@ alt_merge_opt_exact(OptStr* to, OptStr* add, OptEnv* env) to->len = i; if (add->case_fold != 0) to->case_fold = 1; - if (add->good_case_fold == 0) - to->good_case_fold = 0; alt_merge_opt_anc_info(&to->anc, &add->anc); if (! to->reach_end) to->anc.right = 0; @@ -5262,10 +5678,7 @@ select_opt_exact(OnigEncoding enc, OptStr* now, OptStr* alt) if (now->case_fold == 0) vn *= 2; if (alt->case_fold == 0) va *= 2; - if (now->good_case_fold != 0) vn *= 4; - if (alt->good_case_fold != 0) va *= 4; - - if (comp_distance_value(&now->mmd, &alt->mmd, vn, va) > 0) + if (comp_distance_value(&now->mm, &alt->mm, vn, va) > 0) copy_opt_exact(now, alt); } @@ -5349,7 +5762,7 @@ select_opt_map(OptMap* now, OptMap* alt) vn = z / now->value; va = z / alt->value; - if (comp_distance_value(&now->mmd, &alt->mmd, vn, va) > 0) + if (comp_distance_value(&now->mm, &alt->mm, vn, va) > 0) copy_opt_map(now, alt); } @@ -5363,17 +5776,14 @@ comp_opt_exact_or_map(OptStr* e, OptMap* m) if (m->value <= 0) return -1; if (e->case_fold != 0) { - if (e->good_case_fold != 0) - case_value = 2; - else - case_value = 1; + case_value = 1; } else case_value = 3; ae = COMP_EM_BASE * e->len * case_value; am = COMP_EM_BASE * 5 * 2 / m->value; - return comp_distance_value(&e->mmd, &m->mmd, ae, am); + return comp_distance_value(&e->mm, &m->mm, ae, am); } static void @@ -5381,14 +5791,14 @@ alt_merge_opt_map(OnigEncoding enc, OptMap* to, OptMap* add) { int i, val; - /* if (! is_equal_mml(&to->mmd, &add->mmd)) return ; */ + /* if (! 
is_equal_mml(&to->mm, &add->mm)) return ; */ if (to->value == 0) return ; - if (add->value == 0 || to->mmd.max < add->mmd.min) { + if (add->value == 0 || to->mm.max < add->mm.min) { clear_opt_map(to); return ; } - alt_merge_mml(&to->mmd, &add->mmd); + alt_merge_mml(&to->mm, &add->mm); val = 0; for (i = 0; i < CHAR_MAP_SIZE; i++) { @@ -5406,9 +5816,9 @@ alt_merge_opt_map(OnigEncoding enc, OptMap* to, OptMap* add) static void set_bound_node_opt_info(OptNode* opt, MinMax* plen) { - copy_mml(&(opt->sb.mmd), plen); - copy_mml(&(opt->spr.mmd), plen); - copy_mml(&(opt->map.mmd), plen); + copy_mml(&(opt->sb.mm), plen); + copy_mml(&(opt->spr.mm), plen); + copy_mml(&(opt->map.mm), plen); } static void @@ -5443,7 +5853,7 @@ concat_left_node_opt_info(OnigEncoding enc, OptNode* to, OptNode* add) } if (add->map.value > 0 && to->len.max == 0) { - if (add->map.mmd.max == 0) + if (add->map.mm.max == 0) add->map.anc.left |= to->anc.left; } @@ -5468,10 +5878,7 @@ concat_left_node_opt_info(OnigEncoding enc, OptNode* to, OptNode* add) if (to->spr.len > 0) { if (add->len.max > 0) { - if (to->spr.len > (int )add->len.max) - to->spr.len = add->len.max; - - if (to->spr.mmd.max == 0) + if (to->spr.mm.max == 0) select_opt_exact(enc, &to->sb, &to->spr); else select_opt_exact(enc, &to->sm, &to->spr); @@ -5511,7 +5918,7 @@ optimize_nodes(Node* node, OptNode* opt, OptEnv* env) r = 0; enc = env->enc; clear_node_opt_info(opt); - set_bound_node_opt_info(opt, &env->mmd); + set_bound_node_opt_info(opt, &env->mm); switch (NODE_TYPE(node)) { case NODE_LIST: @@ -5523,7 +5930,7 @@ optimize_nodes(Node* node, OptNode* opt, OptEnv* env) do { r = optimize_nodes(NODE_CAR(nd), &xo, &nenv); if (r == 0) { - add_mml(&nenv.mmd, &xo.len); + add_mml(&nenv.mm, &xo.len); concat_left_node_opt_info(enc, opt, &xo); } } while (r == 0 && IS_NOT_NULL(nd = NODE_CDR(nd))); @@ -5548,9 +5955,8 @@ optimize_nodes(Node* node, OptNode* opt, OptEnv* env) { StrNode* sn = STR_(node); int slen = (int )(sn->end - sn->s); - /* int is_raw 
= NODE_STRING_IS_RAW(node); */ - if (! NODE_STRING_IS_AMBIG(node)) { + if (! NODE_STRING_IS_CASE_FOLD_MATCH(node)) { concat_opt_exact_str(&opt->sb, sn->s, sn->end, enc); if (slen > 0) { add_char_opt_map(&opt->map, *(sn->s), enc); @@ -5558,28 +5964,20 @@ optimize_nodes(Node* node, OptNode* opt, OptEnv* env) set_mml(&opt->len, slen, slen); } else { - int max; + int max, min; - if (NODE_STRING_IS_DONT_GET_OPT_INFO(node)) { - int n = onigenc_strlen(enc, sn->s, sn->end); - max = ONIGENC_MBC_MAXLEN_DIST(enc) * n; - } - else { - concat_opt_exact_str(&opt->sb, sn->s, sn->end, enc); - opt->sb.case_fold = 1; - if (NODE_STRING_IS_GOOD_AMBIG(node)) - opt->sb.good_case_fold = 1; - - if (slen > 0) { - r = add_char_amb_opt_map(&opt->map, sn->s, sn->end, - enc, env->case_fold_flag); - if (r != 0) break; - } + concat_opt_exact_str(&opt->sb, sn->s, sn->end, enc); + opt->sb.case_fold = 1; - max = slen; + if (slen > 0) { + r = add_char_amb_opt_map(&opt->map, sn->s, sn->end, + enc, env->case_fold_flag); + if (r != 0) break; } - set_mml(&opt->len, slen, max); + max = slen; + min = sn->case_min_len * ONIGENC_MBC_MINLEN(enc); + set_mml(&opt->len, min, max); } } break; @@ -5589,7 +5987,7 @@ optimize_nodes(Node* node, OptNode* opt, OptEnv* env) int z; CClassNode* cc = CCLASS_(node); - /* no need to check ignore case. (set in setup_tree()) */ + /* no need to check ignore case. 
(set in tune_tree()) */ if (IS_NOT_NULL(cc->mbuf) || IS_NCCLASS_NOT(cc)) { OnigLen min = ONIGENC_MBC_MINLEN(enc); @@ -5699,11 +6097,11 @@ optimize_nodes(Node* node, OptNode* opt, OptEnv* env) break; } backs = BACKREFS_P(br); - min = tree_min_len(mem_env[backs[0]].node, env->scan_env); - max = tree_max_len(mem_env[backs[0]].node, env->scan_env); + min = tree_min_len(mem_env[backs[0]].mem_node, env->scan_env); + max = tree_max_len(mem_env[backs[0]].mem_node, env->scan_env); for (i = 1; i < br->back_num; i++) { - tmin = tree_min_len(mem_env[backs[i]].node, env->scan_env); - tmax = tree_max_len(mem_env[backs[i]].node, env->scan_env); + tmin = tree_min_len(mem_env[backs[i]].mem_node, env->scan_env); + tmax = tree_max_len(mem_env[backs[i]].mem_node, env->scan_env); if (min > tmin) min = tmin; if (max < tmax) max = tmax; } @@ -5752,8 +6150,8 @@ optimize_nodes(Node* node, OptNode* opt, OptEnv* env) opt->sm.reach_end = 0; } - if (IS_REPEAT_INFINITE(qn->upper)) { - if (env->mmd.max == 0 && + if (IS_INFINITE_REPEAT(qn->upper)) { + if (env->mm.max == 0 && NODE_IS_ANYCHAR(NODE_BODY(node)) && qn->greedy != 0) { if (IS_MULTILINE(CTYPE_OPTION(NODE_QUANT_BODY(qn), env))) add_opt_anc_info(&opt->anc, ANCR_ANYCHAR_INF_ML); @@ -5821,7 +6219,7 @@ optimize_nodes(Node* node, OptNode* opt, OptEnv* env) copy_opt_env(&nenv, env); r = optimize_nodes(NODE_BAG_BODY(en), &xo, &nenv); if (r == 0) { - add_mml(&nenv.mmd, &xo.len); + add_mml(&nenv.mm, &xo.len); concat_left_node_opt_info(enc, opt, &xo); if (IS_NOT_NULL(en->te.Then)) { r = optimize_nodes(en->te.Then, &xo, &nenv); @@ -5870,15 +6268,6 @@ set_optimize_exact(regex_t* reg, OptStr* e) if (e->case_fold) { reg->optimize = OPTIMIZE_STR_CASE_FOLD; - if (e->good_case_fold != 0) { - if (e->len >= 2) { - r = set_sunday_quick_search_or_bmh_skip_table(reg, 1, - reg->exact, reg->exact_end, - reg->map, &(reg->map_offset)); - if (r != 0) return r; - reg->optimize = OPTIMIZE_STR_CASE_FOLD_FAST; - } - } } else { int allow_reverse; @@ -5901,11 +6290,17 @@ 
set_optimize_exact(regex_t* reg, OptStr* e) } } - reg->dmin = e->mmd.min; - reg->dmax = e->mmd.max; + reg->dist_min = e->mm.min; + reg->dist_max = e->mm.max; + + if (reg->dist_min != INFINITE_LEN) { + int n; + if (e->case_fold != 0) + n = 1; + else + n = (int )(reg->exact_end - reg->exact); - if (reg->dmin != INFINITE_LEN) { - reg->threshold_len = reg->dmin + (int )(reg->exact_end - reg->exact); + reg->threshold_len = reg->dist_min + n; } return 0; @@ -5920,11 +6315,11 @@ set_optimize_map(regex_t* reg, OptMap* m) reg->map[i] = m->map[i]; reg->optimize = OPTIMIZE_MAP; - reg->dmin = m->mmd.min; - reg->dmax = m->mmd.max; + reg->dist_min = m->mm.min; + reg->dist_max = m->mm.max; - if (reg->dmin != INFINITE_LEN) { - reg->threshold_len = reg->dmin + 1; + if (reg->dist_min != INFINITE_LEN) { + reg->threshold_len = reg->dist_min + 1; } } @@ -5950,7 +6345,7 @@ set_optimize_info_from_tree(Node* node, regex_t* reg, ScanEnv* scan_env) env.options = reg->options; env.case_fold_flag = reg->case_fold_flag; env.scan_env = scan_env; - clear_mml(&env.mmd); + clear_mml(&env.mm); r = optimize_nodes(node, &opt, &env); if (r != 0) return r; @@ -5966,8 +6361,8 @@ set_optimize_info_from_tree(Node* node, regex_t* reg, ScanEnv* scan_env) ANCR_PREC_READ_NOT); if (reg->anchor & (ANCR_END_BUF | ANCR_SEMI_END_BUF)) { - reg->anchor_dmin = opt.len.min; - reg->anchor_dmax = opt.len.max; + reg->anc_dist_min = opt.len.min; + reg->anc_dist_max = opt.len.max; } if (opt.sb.len > 0 || opt.sm.len > 0) { @@ -6002,8 +6397,8 @@ clear_optimize_info(regex_t* reg) { reg->optimize = OPTIMIZE_NONE; reg->anchor = 0; - reg->anchor_dmin = 0; - reg->anchor_dmax = 0; + reg->anc_dist_min = 0; + reg->anc_dist_max = 0; reg->sub_anchor = 0; reg->exact_end = (UChar* )NULL; reg->map_offset = 0; @@ -6122,12 +6517,12 @@ print_optimize_info(FILE* f, regex_t* reg) { static const char* on[] = { "NONE", "STR", "STR_FAST", "STR_FAST_STEP_FORWARD", - "STR_CASE_FOLD_FAST", "STR_CASE_FOLD", "MAP" }; + "STR_CASE_FOLD", "MAP" }; 
fprintf(f, "optimize: %s\n", on[reg->optimize]); fprintf(f, " anchor: "); print_anchor(f, reg->anchor); if ((reg->anchor & ANCR_END_BUF_MASK) != 0) - print_distance_range(f, reg->anchor_dmin, reg->anchor_dmax); + print_distance_range(f, reg->anc_dist_min, reg->anc_dist_max); fprintf(f, "\n"); if (reg->optimize) { @@ -6275,7 +6670,7 @@ onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end, Node* root; ScanEnv scan_env; #ifdef USE_CALL - UnsetAddrList uslist; + UnsetAddrList uslist = {0}; #endif root = 0; @@ -6299,13 +6694,17 @@ onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end, reg->string_pool_end = 0; reg->num_mem = 0; reg->num_repeat = 0; - reg->num_null_check = 0; + reg->num_empty_check = 0; reg->repeat_range_alloc = 0; - reg->repeat_range = (OnigRepeatRange* )NULL; + reg->repeat_range = (RepeatRange* )NULL; + reg->empty_status_mem = 0; r = onig_parse_tree(&root, pattern, pattern_end, reg, &scan_env); if (r != 0) goto err; + r = reduce_string_list(root); + if (r != 0) goto err; + /* mixed use named group and no-named group */ if (scan_env.num_named > 0 && IS_SYNTAX_BV(scan_env.syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) && @@ -6326,38 +6725,65 @@ onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end, r = unset_addr_list_init(&uslist, scan_env.num_call); if (r != 0) goto err; scan_env.unset_addr_list = &uslist; - r = setup_call(root, &scan_env, 0); + r = tune_call(root, &scan_env, 0); if (r != 0) goto err_unset; - r = setup_call2(root); + r = tune_call2(root); if (r != 0) goto err_unset; r = recursive_call_check_trav(root, &scan_env, 0); if (r < 0) goto err_unset; r = infinite_recursive_call_check_trav(root, &scan_env); if (r != 0) goto err_unset; - setup_called_state(root, 0); + tune_called_state(root, 0); } reg->num_call = scan_env.num_call; #endif - r = setup_tree(root, reg, 0, &scan_env); +#ifdef ONIG_DEBUG_PARSE + fprintf(stderr, "MAX PARSE DEPTH: %d\n", scan_env.max_parse_depth); + 
fprintf(stderr, "TREE (parsed)\n"); + print_tree(stderr, root); + fprintf(stderr, "\n"); +#endif + + r = tune_tree(root, reg, 0, &scan_env); if (r != 0) goto err_unset; + if (scan_env.backref_num != 0) { + set_parent_node_trav(root, NULL_NODE); + r = set_empty_repeat_node_trav(root, NULL_NODE, &scan_env); + if (r != 0) goto err_unset; + set_empty_status_check_trav(root, &scan_env); + } + #ifdef ONIG_DEBUG_PARSE + fprintf(stderr, "TREE (after tune)\n"); print_tree(stderr, root); + fprintf(stderr, "\n"); #endif - reg->capture_history = scan_env.capture_history; - reg->bt_mem_start = scan_env.bt_mem_start; - reg->bt_mem_start |= reg->capture_history; - if (IS_FIND_CONDITION(reg->options)) - MEM_STATUS_ON_ALL(reg->bt_mem_end); + reg->capture_history = scan_env.cap_history; + reg->push_mem_start = scan_env.backtrack_mem | scan_env.cap_history; + +#ifdef USE_CALLOUT + if (IS_NOT_NULL(reg->extp) && reg->extp->callout_num != 0) { + reg->push_mem_end = reg->push_mem_start; + } else { - reg->bt_mem_end = scan_env.bt_mem_end; - reg->bt_mem_end |= reg->capture_history; + if (MEM_STATUS_IS_ALL_ON(reg->push_mem_start)) + reg->push_mem_end = scan_env.backrefed_mem | scan_env.cap_history; + else + reg->push_mem_end = reg->push_mem_start & + (scan_env.backrefed_mem | scan_env.cap_history); } - reg->bt_mem_start |= reg->bt_mem_end; +#else + if (MEM_STATUS_IS_ALL_ON(reg->push_mem_start)) + reg->push_mem_end = scan_env.backrefed_mem | scan_env.cap_history; + else + reg->push_mem_end = reg->push_mem_start & + (scan_env.backrefed_mem | scan_env.cap_history); +#endif clear_optimize_info(reg); #ifndef ONIG_DONT_OPTIMIZE @@ -6391,14 +6817,20 @@ onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end, } #endif - if ((reg->num_repeat != 0) || (reg->bt_mem_end != 0) + set_addr_in_repeat_range(reg); + + if ((reg->push_mem_end != 0) +#ifdef USE_REPEAT_AND_EMPTY_CHECK_LOCAL_VAR + || (reg->num_repeat != 0) + || (reg->num_empty_check != 0) +#endif #ifdef USE_CALLOUT || 
(IS_NOT_NULL(reg->extp) && reg->extp->callout_num != 0) #endif ) reg->stack_pop_level = STACK_POP_LEVEL_ALL; else { - if (reg->bt_mem_start != 0) + if (reg->push_mem_start != 0) reg->stack_pop_level = STACK_POP_LEVEL_MEM_START; else reg->stack_pop_level = STACK_POP_LEVEL_FREE; @@ -6531,11 +6963,14 @@ onig_new(regex_t** reg, const UChar* pattern, const UChar* pattern_end, if (IS_NULL(*reg)) return ONIGERR_MEMORY; r = onig_reg_init(*reg, option, ONIGENC_CASE_FOLD_DEFAULT, enc, syntax); - if (r != 0) goto err; + if (r != 0) { + xfree(*reg); + *reg = NULL; + return r; + } r = onig_compile(*reg, pattern, pattern_end, einfo); if (r != 0) { - err: onig_free(*reg); *reg = NULL; } @@ -6672,6 +7107,7 @@ onig_is_code_in_cc(OnigEncoding enc, OnigCodePoint code, CClassNode* cc) } else { len = ONIGENC_CODE_TO_MBCLEN(enc, code); + if (len < 0) return 0; } return onig_is_code_in_cc_len(len, code, cc); } @@ -6679,12 +7115,14 @@ onig_is_code_in_cc(OnigEncoding enc, OnigCodePoint code, CClassNode* cc) #ifdef ONIG_DEBUG_PARSE +#ifdef USE_CALL static void p_string(FILE* f, int len, UChar* s) { fputs(":", f); while (len-- > 0) { fputc(*s++, f); } } +#endif static void Indent(FILE* f, int indent) @@ -6704,7 +7142,7 @@ print_indent_tree(FILE* f, Node* node, int indent) Indent(f, indent); if (IS_NULL(node)) { fprintf(f, "ERROR: null node!!!\n"); - exit (0); + exit(0); } type = NODE_TYPE(node); @@ -6728,28 +7166,22 @@ print_indent_tree(FILE* f, Node* node, int indent) case NODE_STRING: { + char* str; char* mode; - char* dont; - char* good; - if (NODE_STRING_IS_RAW(node)) - mode = "-raw"; - else if (NODE_STRING_IS_AMBIG(node)) - mode = "-ambig"; + if (NODE_STRING_IS_CRUDE(node)) + mode = "-crude"; + else if (NODE_STRING_IS_CASE_FOLD_MATCH(node)) + mode = "-case_fold_match"; else mode = ""; - if (NODE_STRING_IS_GOOD_AMBIG(node)) - good = "-good"; - else - good = ""; - - if (NODE_STRING_IS_DONT_GET_OPT_INFO(node)) - dont = " (dont-opt)"; + if (STR_(node)->s == STR_(node)->end) + str = 
"empty-string"; else - dont = ""; + str = "string"; - fprintf(f, "<string%s%s%s:%p>", mode, good, dont, node); + fprintf(f, "<%s%s:%p>", str, mode, node); for (p = STR_(node)->s; p < STR_(node)->end; p++) { if (*p >= 0x20 && *p < 0x7f) fputc(*p, f); @@ -6871,6 +7303,34 @@ print_indent_tree(FILE* f, Node* node, int indent) case NODE_BAG: fprintf(f, "<bag:%p> ", node); + if (BAG_(node)->type == BAG_IF_ELSE) { + Node* Then; + Node* Else; + BagNode* bn; + + bn = BAG_(node); + fprintf(f, "if-else\n"); + print_indent_tree(f, NODE_BODY(node), indent + add); + + Then = bn->te.Then; + Else = bn->te.Else; + if (IS_NULL(Then)) { + Indent(f, indent + add); + fprintf(f, "THEN empty\n"); + } + else + print_indent_tree(f, Then, indent + add); + + if (IS_NULL(Else)) { + Indent(f, indent + add); + fprintf(f, "ELSE empty\n"); + } + else + print_indent_tree(f, Else, indent + add); + + break; + } + switch (BAG_(node)->type) { case BAG_OPTION: fprintf(f, "option:%d", BAG_(node)->o.options); @@ -6881,8 +7341,7 @@ print_indent_tree(FILE* f, Node* node, int indent) case BAG_STOP_BACKTRACK: fprintf(f, "stop-bt"); break; - case BAG_IF_ELSE: - fprintf(f, "if-else"); + default: break; } fprintf(f, "\n"); diff --git a/src/regenc.c b/src/regenc.c index 6376565..16ac313 100644 --- a/src/regenc.c +++ b/src/regenc.c @@ -2,7 +2,7 @@ regenc.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -182,7 +182,8 @@ onigenc_get_right_adjust_char_head_with_prev(OnigEncoding enc, p += enclen(enc, p); } else { - if (prev) *prev = (const UChar* )NULL; /* Sorry */ + if (prev) + *prev = onigenc_get_prev_char_head(enc, start, p); } return p; } @@ -208,20 +209,6 @@ onigenc_step_back(OnigEncoding enc, const UChar* start, const UChar* s, int n) return (UChar* )s; } -#if 0 -extern int -onigenc_mbc_enc_len_end(OnigEncoding enc, const UChar* p, const UChar* end) -{ - int len; - int n; - - len = ONIGENC_MBC_ENC_LEN(enc, p); - n = (int )(end - p); - - return (n < len ? n : len); -} -#endif - extern UChar* onigenc_step(OnigEncoding enc, const UChar* p, const UChar* end, int n) { @@ -705,18 +692,6 @@ onigenc_ascii_mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED, const UChar** p, return 1; /* return byte length of converted char to lower */ } -#if 0 -extern int -onigenc_ascii_is_mbc_ambiguous(OnigCaseFoldType flag, - const UChar** pp, const UChar* end) -{ - const UChar* p = *pp; - - (*pp)++; - return ONIGENC_IS_ASCII_CODE_CASE_AMBIG(*p); -} -#endif - extern int onigenc_single_byte_mbc_enc_len(const UChar* p ARG_UNUSED) { @@ -833,39 +808,6 @@ onigenc_mbn_mbc_case_fold(OnigEncoding enc, OnigCaseFoldType flag ARG_UNUSED, } } -#if 0 -extern int -onigenc_mbn_is_mbc_ambiguous(OnigEncoding enc, OnigCaseFoldType flag, - const UChar** pp, const UChar* end) -{ - const UChar* p = *pp; - - if (ONIGENC_IS_MBC_ASCII(p)) { - (*pp)++; - return ONIGENC_IS_ASCII_CODE_CASE_AMBIG(*p); - } - - (*pp) += enclen(enc, p); - return FALSE; -} -#endif - -extern int -onigenc_mb2_code_to_mbclen(OnigCodePoint code) -{ - if ((code & 0xff00) != 0) return 2; - else return 1; -} - -extern int -onigenc_mb4_code_to_mbclen(OnigCodePoint code) -{ - if ((code & 0xff000000) != 0) return 4; - else if ((code & 0xff0000) != 0) return 3; - else if ((code & 0xff00) != 0) return 2; - else return 1; -} - extern int onigenc_mb2_code_to_mbc(OnigEncoding 
enc, OnigCodePoint code, UChar *buf) { diff --git a/src/regenc.h b/src/regenc.h index bd2819e..db35841 100644 --- a/src/regenc.h +++ b/src/regenc.h @@ -4,7 +4,7 @@ regenc.h - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -163,13 +163,11 @@ extern int onigenc_length_check_is_valid_mbc_string P_((OnigEncoding enc, const /* methods for multi byte encoding */ extern OnigCodePoint onigenc_mbn_mbc_to_code P_((OnigEncoding enc, const UChar* p, const UChar* end)); extern int onigenc_mbn_mbc_case_fold P_((OnigEncoding enc, OnigCaseFoldType flag, const UChar** p, const UChar* end, UChar* lower)); -extern int onigenc_mb2_code_to_mbclen P_((OnigCodePoint code)); extern int onigenc_mb2_code_to_mbc P_((OnigEncoding enc, OnigCodePoint code, UChar *buf)); extern int onigenc_minimum_property_name_to_ctype P_((OnigEncoding enc, UChar* p, UChar* end)); extern int onigenc_unicode_property_name_to_ctype P_((OnigEncoding enc, UChar* p, UChar* end)); extern int onigenc_is_mbc_word_ascii P_((OnigEncoding enc, UChar* s, const UChar* end)); extern int onigenc_mb2_is_code_ctype P_((OnigEncoding enc, OnigCodePoint code, unsigned int ctype)); -extern int onigenc_mb4_code_to_mbclen P_((OnigCodePoint code)); extern int onigenc_mb4_code_to_mbc P_((OnigEncoding enc, OnigCodePoint code, UChar *buf)); extern int onigenc_mb4_is_code_ctype P_((OnigEncoding enc, OnigCodePoint code, unsigned int ctype)); extern struct PropertyNameCtype* onigenc_euc_jp_lookup_property_name P_((register const char *str, register size_t len)); diff --git a/src/regerror.c b/src/regerror.c index 7564827..b57a276 100644 --- a/src/regerror.c +++ b/src/regerror.c @@ -2,7 +2,7 @@ regerror.c - Oniguruma (regular expression library) 
**********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -257,6 +257,23 @@ static int to_ascii(OnigEncoding enc, UChar *s, UChar *end, } +extern int +onig_is_error_code_needs_param(int code) +{ + switch (code) { + case ONIGERR_UNDEFINED_NAME_REFERENCE: + case ONIGERR_UNDEFINED_GROUP_REFERENCE: + case ONIGERR_MULTIPLEX_DEFINED_NAME: + case ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL: + case ONIGERR_INVALID_GROUP_NAME: + case ONIGERR_INVALID_CHAR_IN_GROUP_NAME: + case ONIGERR_INVALID_CHAR_PROPERTY_NAME: + return 1; + default: + return 0; + } +} + /* for ONIG_MAX_ERROR_MESSAGE_LEN */ #define MAX_ERROR_PAR_LEN 30 diff --git a/src/regexec.c b/src/regexec.c index 6618996..ce498c6 100644 --- a/src/regexec.c +++ b/src/regexec.c @@ -2,7 +2,7 @@ regexec.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -39,6 +39,20 @@ #define CHECK_INTERRUPT_IN_MATCH +#define STACK_MEM_START(reg, i) \ + (MEM_STATUS_AT((reg)->push_mem_start, (i)) != 0 ? \ + STACK_AT(mem_start_stk[i])->u.mem.pstr : (UChar* )((void* )(mem_start_stk[i]))) + +#define STACK_MEM_END(reg, i) \ + (MEM_STATUS_AT((reg)->push_mem_end, (i)) != 0 ? 
\ + STACK_AT(mem_end_stk[i])->u.mem.pstr : (UChar* )((void* )(mem_end_stk[i]))) + +static int forward_search(regex_t* reg, const UChar* str, const UChar* end, UChar* start, UChar* range, UChar** low, UChar** high, UChar** low_prev); + +static int +search_in_range(regex_t* reg, const UChar* str, const UChar* end, const UChar* start, const UChar* range, /* match range */ const UChar* data_range, /* subject string range */ OnigRegion* region, OnigOptionType option, OnigMatchParam* mp); + + #ifdef USE_CALLOUT typedef struct { int last_match_at_call_counter; @@ -129,7 +143,7 @@ typedef struct { } MatchArg; -#ifdef ONIG_DEBUG +#if defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH) /* arguments type */ typedef enum { @@ -149,102 +163,108 @@ typedef struct { } OpInfoType; static OpInfoType OpInfo[] = { - { OP_FINISH, "finish" }, - { OP_END, "end" }, - { OP_EXACT1, "exact1" }, - { OP_EXACT2, "exact2" }, - { OP_EXACT3, "exact3" }, - { OP_EXACT4, "exact4" }, - { OP_EXACT5, "exact5" }, - { OP_EXACTN, "exactn" }, - { OP_EXACTMB2N1, "exactmb2-n1" }, - { OP_EXACTMB2N2, "exactmb2-n2" }, - { OP_EXACTMB2N3, "exactmb2-n3" }, - { OP_EXACTMB2N, "exactmb2-n" }, - { OP_EXACTMB3N, "exactmb3n" }, - { OP_EXACTMBN, "exactmbn" }, - { OP_EXACT1_IC, "exact1-ic" }, - { OP_EXACTN_IC, "exactn-ic" }, - { OP_CCLASS, "cclass" }, - { OP_CCLASS_MB, "cclass-mb" }, - { OP_CCLASS_MIX, "cclass-mix" }, - { OP_CCLASS_NOT, "cclass-not" }, - { OP_CCLASS_MB_NOT, "cclass-mb-not" }, - { OP_CCLASS_MIX_NOT, "cclass-mix-not" }, - { OP_ANYCHAR, "anychar" }, - { OP_ANYCHAR_ML, "anychar-ml" }, - { OP_ANYCHAR_STAR, "anychar*" }, - { OP_ANYCHAR_ML_STAR, "anychar-ml*" }, - { OP_ANYCHAR_STAR_PEEK_NEXT, "anychar*-peek-next" }, - { OP_ANYCHAR_ML_STAR_PEEK_NEXT, "anychar-ml*-peek-next" }, - { OP_WORD, "word" }, - { OP_WORD_ASCII, "word-ascii" }, - { OP_NO_WORD, "not-word" }, - { OP_NO_WORD_ASCII, "not-word-ascii" }, - { OP_WORD_BOUNDARY, "word-boundary" }, - { OP_NO_WORD_BOUNDARY, "not-word-boundary" }, - { 
OP_WORD_BEGIN, "word-begin" }, - { OP_WORD_END, "word-end" }, - { OP_TEXT_SEGMENT_BOUNDARY, "text-segment-boundary" }, - { OP_BEGIN_BUF, "begin-buf" }, - { OP_END_BUF, "end-buf" }, - { OP_BEGIN_LINE, "begin-line" }, - { OP_END_LINE, "end-line" }, - { OP_SEMI_END_BUF, "semi-end-buf" }, - { OP_BEGIN_POSITION, "begin-position" }, - { OP_BACKREF1, "backref1" }, - { OP_BACKREF2, "backref2" }, - { OP_BACKREF_N, "backref-n" }, - { OP_BACKREF_N_IC, "backref-n-ic" }, - { OP_BACKREF_MULTI, "backref_multi" }, - { OP_BACKREF_MULTI_IC, "backref_multi-ic" }, - { OP_BACKREF_WITH_LEVEL, "backref_with_level" }, - { OP_BACKREF_WITH_LEVEL_IC, "backref_with_level-c" }, - { OP_BACKREF_CHECK, "backref_check" }, - { OP_BACKREF_CHECK_WITH_LEVEL, "backref_check_with_level" }, - { OP_MEMORY_START_PUSH, "mem-start-push" }, - { OP_MEMORY_START, "mem-start" }, - { OP_MEMORY_END_PUSH, "mem-end-push" }, - { OP_MEMORY_END_PUSH_REC, "mem-end-push-rec" }, - { OP_MEMORY_END, "mem-end" }, - { OP_MEMORY_END_REC, "mem-end-rec" }, - { OP_FAIL, "fail" }, - { OP_JUMP, "jump" }, - { OP_PUSH, "push" }, - { OP_PUSH_SUPER, "push-super" }, - { OP_POP_OUT, "pop-out" }, + { OP_FINISH, "finish"}, + { OP_END, "end"}, + { OP_STR_1, "str_1"}, + { OP_STR_2, "str_2"}, + { OP_STR_3, "str_3"}, + { OP_STR_4, "str_4"}, + { OP_STR_5, "str_5"}, + { OP_STR_N, "str_n"}, + { OP_STR_MB2N1, "str_mb2-n1"}, + { OP_STR_MB2N2, "str_mb2-n2"}, + { OP_STR_MB2N3, "str_mb2-n3"}, + { OP_STR_MB2N, "str_mb2-n"}, + { OP_STR_MB3N, "str_mb3n"}, + { OP_STR_MBN, "str_mbn"}, + { OP_STR_1_IC, "str_1-ic"}, + { OP_STR_N_IC, "str_n-ic"}, + { OP_CCLASS, "cclass"}, + { OP_CCLASS_MB, "cclass-mb"}, + { OP_CCLASS_MIX, "cclass-mix"}, + { OP_CCLASS_NOT, "cclass-not"}, + { OP_CCLASS_MB_NOT, "cclass-mb-not"}, + { OP_CCLASS_MIX_NOT, "cclass-mix-not"}, + { OP_ANYCHAR, "anychar"}, + { OP_ANYCHAR_ML, "anychar-ml"}, + { OP_ANYCHAR_STAR, "anychar*"}, + { OP_ANYCHAR_ML_STAR, "anychar-ml*"}, + { OP_ANYCHAR_STAR_PEEK_NEXT, "anychar*-peek-next"}, + { 
OP_ANYCHAR_ML_STAR_PEEK_NEXT, "anychar-ml*-peek-next"}, + { OP_WORD, "word"}, + { OP_WORD_ASCII, "word-ascii"}, + { OP_NO_WORD, "not-word"}, + { OP_NO_WORD_ASCII, "not-word-ascii"}, + { OP_WORD_BOUNDARY, "word-boundary"}, + { OP_NO_WORD_BOUNDARY, "not-word-boundary"}, + { OP_WORD_BEGIN, "word-begin"}, + { OP_WORD_END, "word-end"}, + { OP_TEXT_SEGMENT_BOUNDARY, "text-segment-boundary"}, + { OP_BEGIN_BUF, "begin-buf"}, + { OP_END_BUF, "end-buf"}, + { OP_BEGIN_LINE, "begin-line"}, + { OP_END_LINE, "end-line"}, + { OP_SEMI_END_BUF, "semi-end-buf"}, + { OP_BEGIN_POSITION, "begin-position"}, + { OP_BACKREF1, "backref1"}, + { OP_BACKREF2, "backref2"}, + { OP_BACKREF_N, "backref-n"}, + { OP_BACKREF_N_IC, "backref-n-ic"}, + { OP_BACKREF_MULTI, "backref_multi"}, + { OP_BACKREF_MULTI_IC, "backref_multi-ic"}, + { OP_BACKREF_WITH_LEVEL, "backref_with_level"}, + { OP_BACKREF_WITH_LEVEL_IC, "backref_with_level-c"}, + { OP_BACKREF_CHECK, "backref_check"}, + { OP_BACKREF_CHECK_WITH_LEVEL, "backref_check_with_level"}, + { OP_MEM_START_PUSH, "mem-start-push"}, + { OP_MEM_START, "mem-start"}, + { OP_MEM_END_PUSH, "mem-end-push"}, +#ifdef USE_CALL + { OP_MEM_END_PUSH_REC, "mem-end-push-rec"}, +#endif + { OP_MEM_END, "mem-end"}, +#ifdef USE_CALL + { OP_MEM_END_REC, "mem-end-rec"}, +#endif + { OP_FAIL, "fail"}, + { OP_JUMP, "jump"}, + { OP_PUSH, "push"}, + { OP_PUSH_SUPER, "push-super"}, + { OP_POP_OUT, "pop-out"}, #ifdef USE_OP_PUSH_OR_JUMP_EXACT - { OP_PUSH_OR_JUMP_EXACT1, "push-or-jump-e1" }, + { OP_PUSH_OR_JUMP_EXACT1, "push-or-jump-e1"}, +#endif + { OP_PUSH_IF_PEEK_NEXT, "push-if-peek-next"}, + { OP_REPEAT, "repeat"}, + { OP_REPEAT_NG, "repeat-ng"}, + { OP_REPEAT_INC, "repeat-inc"}, + { OP_REPEAT_INC_NG, "repeat-inc-ng"}, + { OP_EMPTY_CHECK_START, "empty-check-start"}, + { OP_EMPTY_CHECK_END, "empty-check-end"}, + { OP_EMPTY_CHECK_END_MEMST, "empty-check-end-memst"}, +#ifdef USE_CALL + { OP_EMPTY_CHECK_END_MEMST_PUSH,"empty-check-end-memst-push"}, +#endif + { OP_PREC_READ_START, 
"push-pos"}, + { OP_PREC_READ_END, "pop-pos"}, + { OP_PREC_READ_NOT_START, "prec-read-not-start"}, + { OP_PREC_READ_NOT_END, "prec-read-not-end"}, + { OP_ATOMIC_START, "atomic-start"}, + { OP_ATOMIC_END, "atomic-end"}, + { OP_LOOK_BEHIND, "look-behind"}, + { OP_LOOK_BEHIND_NOT_START, "look-behind-not-start"}, + { OP_LOOK_BEHIND_NOT_END, "look-behind-not-end"}, + { OP_PUSH_SAVE_VAL, "push-save-val"}, + { OP_UPDATE_VAR, "update-var"}, +#ifdef USE_CALL + { OP_CALL, "call"}, + { OP_RETURN, "return"}, #endif - { OP_PUSH_IF_PEEK_NEXT, "push-if-peek-next" }, - { OP_REPEAT, "repeat" }, - { OP_REPEAT_NG, "repeat-ng" }, - { OP_REPEAT_INC, "repeat-inc" }, - { OP_REPEAT_INC_NG, "repeat-inc-ng" }, - { OP_REPEAT_INC_SG, "repeat-inc-sg" }, - { OP_REPEAT_INC_NG_SG, "repeat-inc-ng-sg" }, - { OP_EMPTY_CHECK_START, "empty-check-start" }, - { OP_EMPTY_CHECK_END, "empty-check-end" }, - { OP_EMPTY_CHECK_END_MEMST, "empty-check-end-memst" }, - { OP_EMPTY_CHECK_END_MEMST_PUSH,"empty-check-end-memst-push" }, - { OP_PREC_READ_START, "push-pos" }, - { OP_PREC_READ_END, "pop-pos" }, - { OP_PREC_READ_NOT_START, "prec-read-not-start" }, - { OP_PREC_READ_NOT_END, "prec-read-not-end" }, - { OP_ATOMIC_START, "atomic-start" }, - { OP_ATOMIC_END, "atomic-end" }, - { OP_LOOK_BEHIND, "look-behind" }, - { OP_LOOK_BEHIND_NOT_START, "look-behind-not-start" }, - { OP_LOOK_BEHIND_NOT_END, "look-behind-not-end" }, - { OP_CALL, "call" }, - { OP_RETURN, "return" }, - { OP_PUSH_SAVE_VAL, "push-save-val" }, - { OP_UPDATE_VAR, "update-var" }, #ifdef USE_CALLOUT - { OP_CALLOUT_CONTENTS, "callout-contents" }, - { OP_CALLOUT_NAME, "callout-name" }, + { OP_CALLOUT_CONTENTS, "callout-contents"}, + { OP_CALLOUT_NAME, "callout-name"}, #endif - { -1, "" } + { -1, ""} }; static char* @@ -320,32 +340,32 @@ print_compiled_byte_code(FILE* f, regex_t* reg, int index, fprintf(f, "%s", op2name(opcode)); switch (opcode) { - case OP_EXACT1: + case OP_STR_1: p_string(f, 1, p->exact.s); break; - case OP_EXACT2: + case OP_STR_2: 
p_string(f, 2, p->exact.s); break; - case OP_EXACT3: + case OP_STR_3: p_string(f, 3, p->exact.s); break; - case OP_EXACT4: + case OP_STR_4: p_string(f, 4, p->exact.s); break; - case OP_EXACT5: + case OP_STR_5: p_string(f, 5, p->exact.s); break; - case OP_EXACTN: + case OP_STR_N: len = p->exact_n.n; p_string(f, len, p->exact_n.s); break; - case OP_EXACTMB2N1: + case OP_STR_MB2N1: p_string(f, 2, p->exact.s); break; - case OP_EXACTMB2N2: + case OP_STR_MB2N2: p_string(f, 4, p->exact.s); break; - case OP_EXACTMB2N3: + case OP_STR_MB2N3: p_string(f, 3, p->exact.s); break; - case OP_EXACTMB2N: + case OP_STR_MB2N: len = p->exact_n.n; p_len_string(f, len, 2, p->exact_n.s); break; - case OP_EXACTMB3N: + case OP_STR_MB3N: len = p->exact_n.n; p_len_string(f, len, 3, p->exact_n.s); break; - case OP_EXACTMBN: + case OP_STR_MBN: { int mb_len; @@ -357,11 +377,11 @@ print_compiled_byte_code(FILE* f, regex_t* reg, int index, while (n-- > 0) { fputc(*q++, f); } } break; - case OP_EXACT1_IC: + case OP_STR_1_IC: len = enclen(enc, p->exact.s); p_string(f, len, p->exact.s); break; - case OP_EXACTN_IC: + case OP_STR_N_IC: len = p->exact_n.n; p_len_string(f, len, 1, p->exact_n.s); break; @@ -375,13 +395,13 @@ print_compiled_byte_code(FILE* f, regex_t* reg, int index, case OP_CCLASS_MB_NOT: { OnigCodePoint ncode; - OnigCodePoint* codes; + OnigCodePoint* codes; codes = (OnigCodePoint* )p->cclass_mb.mb; GET_CODE_POINT(ncode, codes); codes++; GET_CODE_POINT(code, codes); - fprintf(f, ":%u:%u", code, ncode); + fprintf(f, ":%d:0x%x", ncode, code); } break; case OP_CCLASS_MIX: @@ -447,15 +467,18 @@ print_compiled_byte_code(FILE* f, regex_t* reg, int index, } break; - case OP_MEMORY_START: - case OP_MEMORY_START_PUSH: + case OP_MEM_START: + case OP_MEM_START_PUSH: mem = p->memory_start.num; fprintf(f, ":%d", mem); break; - case OP_MEMORY_END_PUSH: - case OP_MEMORY_END_PUSH_REC: - case OP_MEMORY_END: - case OP_MEMORY_END_REC: + + case OP_MEM_END: + case OP_MEM_END_PUSH: +#ifdef USE_CALL + case 
OP_MEM_END_REC: + case OP_MEM_END_PUSH_REC: +#endif mem = p->memory_end.num; fprintf(f, ":%d", mem); break; @@ -499,8 +522,6 @@ print_compiled_byte_code(FILE* f, regex_t* reg, int index, case OP_REPEAT_INC: case OP_REPEAT_INC_NG: - case OP_REPEAT_INC_SG: - case OP_REPEAT_INC_NG_SG: mem = p->repeat.id; fprintf(f, ":%d", mem); break; @@ -511,7 +532,9 @@ print_compiled_byte_code(FILE* f, regex_t* reg, int index, break; case OP_EMPTY_CHECK_END: case OP_EMPTY_CHECK_END_MEMST: +#ifdef USE_CALL case OP_EMPTY_CHECK_END_MEMST_PUSH: +#endif mem = p->empty_check_end.mem; fprintf(f, ":%d", mem); break; @@ -534,10 +557,12 @@ print_compiled_byte_code(FILE* f, regex_t* reg, int index, p_rel_addr(f, addr, p, start); break; +#ifdef USE_CALL case OP_CALL: addr = p->call.addr; fprintf(f, ":{/%d}", addr); break; +#endif case OP_PUSH_SAVE_VAL: { @@ -607,7 +632,9 @@ print_compiled_byte_code(FILE* f, regex_t* reg, int index, case OP_ATOMIC_START: case OP_ATOMIC_END: case OP_LOOK_BEHIND_NOT_END: +#ifdef USE_CALL case OP_RETURN: +#endif break; default: @@ -615,7 +642,7 @@ print_compiled_byte_code(FILE* f, regex_t* reg, int index, break; } } -#endif /* ONIG_DEBUG */ +#endif /* defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH) */ #ifdef ONIG_DEBUG_COMPILE extern void @@ -625,8 +652,8 @@ onig_print_compiled_byte_code_list(FILE* f, regex_t* reg) Operation* start = reg->ops; Operation* end = reg->ops + reg->ops_used; - fprintf(f, "bt_mem_start: 0x%x, bt_mem_end: 0x%x\n", - reg->bt_mem_start, reg->bt_mem_end); + fprintf(f, "push_mem_start: 0x%x, push_mem_end: 0x%x\n", + reg->push_mem_start, reg->push_mem_end); fprintf(f, "code-length: %d\n", reg->ops_used); bp = start; @@ -943,7 +970,7 @@ onig_region_copy(OnigRegion* to, OnigRegion* from) result = ONIGERR_INVALID_ARGUMENT;\ }\ best_len = result;\ - goto finish;\ + goto match_at_end;\ break;\ }\ } while(0) @@ -965,21 +992,31 @@ onig_region_copy(OnigRegion* to, OnigRegion* from) /* handled by normal-POP */ #define STK_MEM_START 0x0010 
#define STK_MEM_END 0x8030 -#define STK_REPEAT_INC 0x0050 +#ifdef USE_REPEAT_AND_EMPTY_CHECK_LOCAL_VAR +#define STK_REPEAT_INC (0x0040 | STK_MASK_POP_HANDLED) +#else +#define STK_REPEAT_INC 0x0040 +#endif #ifdef USE_CALLOUT #define STK_CALLOUT 0x0070 #endif /* avoided by normal-POP */ #define STK_VOID 0x0000 /* for fill a blank */ +#ifdef USE_REPEAT_AND_EMPTY_CHECK_LOCAL_VAR +#define STK_EMPTY_CHECK_START (0x3000 | STK_MASK_POP_HANDLED) +#else #define STK_EMPTY_CHECK_START 0x3000 +#endif #define STK_EMPTY_CHECK_END 0x5000 /* for recursive call */ #define STK_MEM_END_MARK 0x8100 #define STK_TO_VOID_START 0x1200 /* mark for "(?>...)" */ -#define STK_REPEAT 0x0300 +/* #define STK_REPEAT 0x0300 */ #define STK_CALL_FRAME 0x0400 #define STK_RETURN 0x0500 #define STK_SAVE_VAL 0x0600 +#define STK_PREC_READ_START 0x0700 +#define STK_PREC_READ_END 0x0800 /* stack type check mask */ #define STK_MASK_POP_USED STK_ALT_FLAG @@ -1000,11 +1037,10 @@ typedef struct _StackType { UChar* pstr_prev; /* previous char position of pstr */ } state; struct { - int count; /* for OP_REPEAT_INC, OP_REPEAT_INC_NG */ - Operation* pcode; /* byte code position (head of repeated target) */ - } repeat; - struct { - StackIndex si; /* index of stack */ + int count; +#ifdef USE_REPEAT_AND_EMPTY_CHECK_LOCAL_VAR + StackIndex prev_index; /* index of stack */ +#endif } repeat_inc; struct { UChar *pstr; /* start/end position */ @@ -1013,7 +1049,10 @@ typedef struct _StackType { StackIndex prev_end; /* prev. 
info (for backtrack "(...)*" ) */ } mem; struct { - UChar *pstr; /* start position */ + UChar *pstr; /* start position */ +#ifdef USE_REPEAT_AND_EMPTY_CHECK_LOCAL_VAR + StackIndex prev_index; /* index of stack */ +#endif } empty_check; #ifdef USE_CALL struct { @@ -1059,29 +1098,64 @@ struct OnigCalloutArgsStruct { #endif +#ifdef USE_REPEAT_AND_EMPTY_CHECK_LOCAL_VAR + +#define PTR_NUM_SIZE(reg) ((reg)->num_repeat + (reg)->num_empty_check + ((reg)->num_mem + 1) * 2) +#define UPDATE_FOR_STACK_REALLOC do{\ + repeat_stk = (StackIndex* )alloc_base;\ + empty_check_stk = (StackIndex* )(repeat_stk + reg->num_repeat);\ + mem_start_stk = (StackIndex* )(empty_check_stk + reg->num_empty_check);\ + mem_end_stk = mem_start_stk + num_mem + 1;\ +} while(0) + +#define SAVE_REPEAT_STK_VAR(sid) stk->u.repeat_inc.prev_index = repeat_stk[sid] +#define LOAD_TO_REPEAT_STK_VAR(sid) repeat_stk[sid] = GET_STACK_INDEX(stk) +#define POP_REPEAT_INC else if (stk->type == STK_REPEAT_INC) {repeat_stk[stk->zid] = stk->u.repeat_inc.prev_index;} + +#define SAVE_EMPTY_CHECK_STK_VAR(sid) stk->u.empty_check.prev_index = empty_check_stk[sid] +#define LOAD_TO_EMPTY_CHECK_STK_VAR(sid) empty_check_stk[sid] = GET_STACK_INDEX(stk) +#define POP_EMPTY_CHECK_START else if (stk->type == STK_EMPTY_CHECK_START) {empty_check_stk[stk->zid] = stk->u.empty_check.prev_index;} + +#else + +#define PTR_NUM_SIZE(reg) (((reg)->num_mem + 1) * 2) +#define UPDATE_FOR_STACK_REALLOC do{\ + mem_start_stk = (StackIndex* )alloc_base;\ + mem_end_stk = mem_start_stk + num_mem + 1;\ +} while(0) + +#define SAVE_REPEAT_STK_VAR(sid) +#define LOAD_TO_REPEAT_STK_VAR(sid) +#define POP_REPEAT_INC + +#define SAVE_EMPTY_CHECK_STK_VAR(sid) +#define LOAD_TO_EMPTY_CHECK_STK_VAR(sid) +#define POP_EMPTY_CHECK_START + +#endif /* USE_REPEAT_AND_EMPTY_CHECK_LOCAL_VAR */ #ifdef USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE -#define MATCH_ARG_INIT(msa, reg, arg_option, arg_region, arg_start, mp) do { \ +#define MATCH_ARG_INIT(msa, reg, arg_option, arg_region, 
arg_start, mpv) do { \ (msa).stack_p = (void* )0;\ (msa).options = (arg_option);\ (msa).region = (arg_region);\ (msa).start = (arg_start);\ - (msa).match_stack_limit = (mp)->match_stack_limit;\ - (msa).retry_limit_in_match = (mp)->retry_limit_in_match;\ - (msa).mp = mp;\ + (msa).match_stack_limit = (mpv)->match_stack_limit;\ + (msa).retry_limit_in_match = (mpv)->retry_limit_in_match;\ + (msa).mp = mpv;\ (msa).best_len = ONIG_MISMATCH;\ - (msa).ptr_num = (reg)->num_repeat + ((reg)->num_mem + 1) * 2; \ + (msa).ptr_num = PTR_NUM_SIZE(reg);\ } while(0) #else -#define MATCH_ARG_INIT(msa, reg, arg_option, arg_region, arg_start, mp) do { \ +#define MATCH_ARG_INIT(msa, reg, arg_option, arg_region, arg_start, mpv) do { \ (msa).stack_p = (void* )0;\ (msa).options = (arg_option);\ (msa).region = (arg_region);\ (msa).start = (arg_start);\ - (msa).match_stack_limit = (mp)->match_stack_limit;\ - (msa).retry_limit_in_match = (mp)->retry_limit_in_match;\ - (msa).mp = mp;\ - (msa).ptr_num = (reg)->num_repeat + ((reg)->num_mem + 1) * 2; \ + (msa).match_stack_limit = (mpv)->match_stack_limit;\ + (msa).retry_limit_in_match = (mpv)->retry_limit_in_match;\ + (msa).mp = mpv;\ + (msa).ptr_num = PTR_NUM_SIZE(reg);\ } while(0) #endif @@ -1136,12 +1210,6 @@ struct OnigCalloutArgsStruct { };\ } while(0) -#define UPDATE_FOR_STACK_REALLOC do{\ - repeat_stk = (StackIndex* )alloc_base;\ - mem_start_stk = (StackIndex* )(repeat_stk + reg->num_repeat);\ - mem_end_stk = mem_start_stk + num_mem + 1;\ -} while(0) - static unsigned int MatchStackLimit = DEFAULT_MATCH_STACK_LIMIT_SIZE; extern unsigned int @@ -1162,7 +1230,9 @@ onig_set_match_stack_limit_size(unsigned int size) static unsigned long RetryLimitInMatch = DEFAULT_RETRY_LIMIT_IN_MATCH; #define CHECK_RETRY_LIMIT_IN_MATCH do {\ - if (retry_in_match_counter++ > retry_limit_in_match) goto retry_limit_in_match_over;\ + if (retry_in_match_counter++ > retry_limit_in_match) {\ + MATCH_AT_ERROR_RETURN(ONIGERR_RETRY_LIMIT_IN_MATCH_OVER);\ + }\ } while 
(0) #else @@ -1544,27 +1614,31 @@ stack_double(int is_alloca, char** arg_alloc_base, #define STACK_PUSH_ALT(pat,s,sprev) STACK_PUSH(STK_ALT,pat,s,sprev) #define STACK_PUSH_SUPER_ALT(pat,s,sprev) STACK_PUSH(STK_SUPER_ALT,pat,s,sprev) -#define STACK_PUSH_POS(s,sprev) \ - STACK_PUSH(STK_TO_VOID_START,(Operation* )0,s,sprev) +#define STACK_PUSH_PREC_READ_START(s,sprev) \ + STACK_PUSH(STK_PREC_READ_START,(Operation* )0,s,sprev) #define STACK_PUSH_ALT_PREC_READ_NOT(pat,s,sprev) \ STACK_PUSH(STK_ALT_PREC_READ_NOT,pat,s,sprev) #define STACK_PUSH_TO_VOID_START STACK_PUSH_TYPE(STK_TO_VOID_START) #define STACK_PUSH_ALT_LOOK_BEHIND_NOT(pat,s,sprev) \ STACK_PUSH(STK_ALT_LOOK_BEHIND_NOT,pat,s,sprev) +#if 0 #define STACK_PUSH_REPEAT(sid, pat) do {\ STACK_ENSURE(1);\ stk->type = STK_REPEAT;\ stk->zid = (sid);\ - stk->u.repeat.pcode = (pat);\ - stk->u.repeat.count = 0;\ + stk->u.repeat.pcode = (pat);\ STACK_INC;\ } while(0) +#endif -#define STACK_PUSH_REPEAT_INC(sindex) do {\ +#define STACK_PUSH_REPEAT_INC(sid, ct) do {\ STACK_ENSURE(1);\ stk->type = STK_REPEAT_INC;\ - stk->u.repeat_inc.si = (sindex);\ + stk->zid = (sid);\ + stk->u.repeat_inc.count = (ct);\ + SAVE_REPEAT_STK_VAR(sid);\ + LOAD_TO_REPEAT_STK_VAR(sid);\ STACK_INC;\ } while(0) @@ -1637,6 +1711,8 @@ stack_double(int is_alloca, char** arg_alloc_base, stk->type = STK_EMPTY_CHECK_START;\ stk->zid = (cnum);\ stk->u.empty_check.pstr = (s);\ + SAVE_EMPTY_CHECK_STK_VAR(cnum);\ + LOAD_TO_EMPTY_CHECK_STK_VAR(cnum);\ STACK_INC;\ } while(0) @@ -1774,7 +1850,7 @@ stack_double(int is_alloca, char** arg_alloc_base, #define STACK_BASE_CHECK(p, at) \ if ((p) < stk_base) {\ fprintf(stderr, "at %s\n", at);\ - goto stack_error;\ + MATCH_AT_ERROR_RETURN(ONIGERR_STACK_BUG);\ } #else #define STACK_BASE_CHECK(p, at) @@ -1825,13 +1901,12 @@ stack_double(int is_alloca, char** arg_alloc_base, mem_start_stk[stk->zid] = stk->u.mem.prev_start;\ mem_end_stk[stk->zid] = stk->u.mem.prev_end;\ }\ - else if (stk->type == STK_REPEAT_INC) {\ - 
STACK_AT(stk->u.repeat_inc.si)->u.repeat.count--;\ - }\ else if (stk->type == STK_MEM_END) {\ mem_start_stk[stk->zid] = stk->u.mem.prev_start;\ mem_end_stk[stk->zid] = stk->u.mem.prev_end;\ }\ + POP_REPEAT_INC \ + POP_EMPTY_CHECK_START \ POP_CALLOUT_CASE\ }\ }\ @@ -1850,13 +1925,12 @@ stack_double(int is_alloca, char** arg_alloc_base, mem_start_stk[stk->zid] = stk->u.mem.prev_start;\ mem_end_stk[stk->zid] = stk->u.mem.prev_end;\ }\ - else if (stk->type == STK_REPEAT_INC) {\ - STACK_AT(stk->u.repeat_inc.si)->u.repeat.count--;\ - }\ else if (stk->type == STK_MEM_END) {\ mem_start_stk[stk->zid] = stk->u.mem.prev_start;\ mem_end_stk[stk->zid] = stk->u.mem.prev_end;\ }\ + POP_REPEAT_INC \ + POP_EMPTY_CHECK_START \ /* Don't call callout here because negation of total success by (?!..) (?<!..) */\ }\ }\ @@ -1887,65 +1961,99 @@ stack_double(int is_alloca, char** arg_alloc_base, }\ } while(0) -#define STACK_EMPTY_CHECK(isnull,sid,s) do {\ - StackType* k = stk;\ +#define STACK_GET_PREC_READ_START(k) do {\ + int level = 0;\ + k = stk;\ while (1) {\ k--;\ - STACK_BASE_CHECK(k, "STACK_EMPTY_CHECK"); \ - if (k->type == STK_EMPTY_CHECK_START) {\ - if (k->zid == (sid)) {\ - (isnull) = (k->u.empty_check.pstr == (s));\ + STACK_BASE_CHECK(k, "STACK_GET_PREC_READ_START");\ + if (IS_TO_VOID_TARGET(k)) {\ + k->type = STK_VOID;\ + }\ + else if (k->type == STK_PREC_READ_START) {\ + if (level == 0) {\ break;\ }\ + level--;\ + }\ + else if (k->type == STK_PREC_READ_END) {\ + level++;\ }\ }\ } while(0) + +#define EMPTY_CHECK_START_SEARCH(sid, k) do {\ + k = stk;\ + while (1) {\ + k--;\ + STACK_BASE_CHECK(k, "EMPTY_CHECK_START_SEARCH"); \ + if (k->type == STK_EMPTY_CHECK_START) {\ + if (k->zid == (sid)) break;\ + }\ + }\ +} while(0) + +#ifdef USE_REPEAT_AND_EMPTY_CHECK_LOCAL_VAR + +#define GET_EMPTY_CHECK_START(sid, k) do {\ + if (reg->num_call == 0) {\ + k = STACK_AT(empty_check_stk[sid]);\ + }\ + else {\ + EMPTY_CHECK_START_SEARCH(sid, k);\ + }\ +} while(0) +#else + +#define 
GET_EMPTY_CHECK_START(sid, k) EMPTY_CHECK_START_SEARCH(sid, k) + +#endif + + +#define STACK_EMPTY_CHECK(isnull, sid, s) do {\ + StackType* k;\ + GET_EMPTY_CHECK_START(sid, k);\ + (isnull) = (k->u.empty_check.pstr == (s));\ +} while(0) + #define STACK_MEM_START_GET_PREV_END_ADDR(k /* STK_MEM_START*/, reg, addr) do {\ if (k->u.mem.prev_end == INVALID_STACK_INDEX) {\ (addr) = 0;\ }\ else {\ - if (MEM_STATUS_AT((reg)->bt_mem_end, k->zid))\ + if (MEM_STATUS_AT((reg)->push_mem_end, k->zid))\ (addr) = STACK_AT(k->u.mem.prev_end)->u.mem.pstr;\ else\ (addr) = (UChar* )k->u.mem.prev_end;\ }\ } while (0) -#ifdef USE_INSISTENT_CHECK_CAPTURES_IN_EMPTY_REPEAT -#define STACK_EMPTY_CHECK_MEM(isnull,sid,s,reg) do {\ - StackType* k = stk;\ - while (1) {\ - k--;\ - STACK_BASE_CHECK(k, "STACK_EMPTY_CHECK_MEM"); \ - if (k->type == STK_EMPTY_CHECK_START) {\ - if (k->zid == (sid)) {\ - if (k->u.empty_check.pstr != (s)) {\ - (isnull) = 0;\ - break;\ +#ifdef USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT +#define STACK_EMPTY_CHECK_MEM(isnull, sid, s, reg) do {\ + StackType* k;\ + GET_EMPTY_CHECK_START(sid, k);\ + if (k->u.empty_check.pstr != (s)) {\ + (isnull) = 0;\ + }\ + else {\ + UChar* endp;\ + (isnull) = 1;\ + while (k < stk) {\ + if (k->type == STK_MEM_START &&\ + MEM_STATUS_LIMIT_AT((reg)->empty_status_mem, k->zid)) {\ + STACK_MEM_START_GET_PREV_END_ADDR(k, reg, endp);\ + if (endp == 0) {\ + (isnull) = 0; break;\ }\ - else {\ - UChar* endp;\ - (isnull) = 1;\ - while (k < stk) {\ - if (k->type == STK_MEM_START) {\ - STACK_MEM_START_GET_PREV_END_ADDR(k, reg, endp);\ - if (endp == 0) {\ - (isnull) = 0; break;\ - }\ - else if (STACK_AT(k->u.mem.prev_start)->u.mem.pstr != endp) {\ - (isnull) = 0; break;\ - }\ - else if (endp != s) {\ - (isnull) = -1; /* empty, but position changed */ \ - }\ - }\ - k++;\ - }\ - break;\ + else if (STACK_AT(k->u.mem.prev_start)->u.mem.pstr != endp) {\ + (isnull) = 0; break;\ + }\ + else if (endp != s) {\ + (isnull) = -1; /* empty, but position changed */ \ }\ 
}\ + k++;\ }\ }\ } while(0) @@ -1968,7 +2076,8 @@ stack_double(int is_alloca, char** arg_alloc_base, (isnull) = 1;\ while (k < stk) {\ if (k->type == STK_MEM_START) {\ - if (level == 0) {\ + if (level == 0 && \ + MEM_STATUS_LIMIT_AT((reg)->empty_status_mem, k->zid) !=0) {\ STACK_MEM_START_GET_PREV_END_ADDR(k, reg, endp);\ if (endp == 0) {\ (isnull) = 0; break;\ @@ -2023,26 +2132,47 @@ stack_double(int is_alloca, char** arg_alloc_base, }\ }\ } while(0) -#endif /* USE_INSISTENT_CHECK_CAPTURES_IN_EMPTY_REPEAT */ +#endif /* USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT */ -#define STACK_GET_REPEAT(sid, k) do {\ - int level = 0;\ - k = stk;\ +#define STACK_GET_REPEAT_COUNT_SEARCH(sid, c) do {\ + StackType* k = stk;\ while (1) {\ - k--;\ - STACK_BASE_CHECK(k, "STACK_GET_REPEAT"); \ - if (k->type == STK_REPEAT) {\ - if (level == 0) {\ - if (k->zid == (sid)) {\ - break;\ + (k)--;\ + STACK_BASE_CHECK(k, "STACK_GET_REPEAT_COUNT_SEARCH");\ + if ((k)->type == STK_REPEAT_INC) {\ + if ((k)->zid == (sid)) {\ + (c) = (k)->u.repeat_inc.count;\ + break;\ + }\ + }\ + else if ((k)->type == STK_RETURN) {\ + int level = -1;\ + while (1) {\ + (k)--;\ + if ((k)->type == STK_CALL_FRAME) {\ + level++;\ + if (level == 0) break;\ }\ + else if ((k)->type == STK_RETURN) level--;\ }\ }\ - else if (k->type == STK_CALL_FRAME) level--;\ - else if (k->type == STK_RETURN) level++;\ }\ } while(0) +#ifdef USE_REPEAT_AND_EMPTY_CHECK_LOCAL_VAR + +#define STACK_GET_REPEAT_COUNT(sid, c) do {\ + if (reg->num_call == 0) {\ + (c) = (STACK_AT(repeat_stk[sid]))->u.repeat_inc.count;\ + }\ + else {\ + STACK_GET_REPEAT_COUNT_SEARCH(sid, c);\ + }\ +} while(0) +#else +#define STACK_GET_REPEAT_COUNT(sid, c) STACK_GET_REPEAT_COUNT_SEARCH(sid, c) +#endif + #define STACK_RETURN(addr) do {\ int level = 0;\ StackType* k = stk;\ @@ -2444,6 +2574,8 @@ typedef struct { #define MATCH_DEBUG_OUT(offset) #endif +#define MATCH_AT_ERROR_RETURN(err_code) best_len = err_code; goto match_at_end + /* match data(str - end) from position 
(sstart). */ /* if sstart == str then set sprev to NULL. */ @@ -2463,20 +2595,20 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, static const void *opcode_to_label[] = { &&L_FINISH, &&L_END, - &&L_EXACT1, - &&L_EXACT2, - &&L_EXACT3, - &&L_EXACT4, - &&L_EXACT5, - &&L_EXACTN, - &&L_EXACTMB2N1, - &&L_EXACTMB2N2, - &&L_EXACTMB2N3, - &&L_EXACTMB2N, - &&L_EXACTMB3N, - &&L_EXACTMBN, - &&L_EXACT1_IC, - &&L_EXACTN_IC, + &&L_STR_1, + &&L_STR_2, + &&L_STR_3, + &&L_STR_4, + &&L_STR_5, + &&L_STR_N, + &&L_STR_MB2N1, + &&L_STR_MB2N2, + &&L_STR_MB2N3, + &&L_STR_MB2N, + &&L_STR_MB3N, + &&L_STR_MBN, + &&L_STR_1_IC, + &&L_STR_N_IC, &&L_CCLASS, &&L_CCLASS_MB, &&L_CCLASS_MIX, @@ -2514,12 +2646,16 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, &&L_BACKREF_WITH_LEVEL_IC, &&L_BACKREF_CHECK, &&L_BACKREF_CHECK_WITH_LEVEL, - &&L_MEMORY_START, - &&L_MEMORY_START_PUSH, - &&L_MEMORY_END_PUSH, - &&L_MEMORY_END_PUSH_REC, - &&L_MEMORY_END, - &&L_MEMORY_END_REC, + &&L_MEM_START, + &&L_MEM_START_PUSH, + &&L_MEM_END_PUSH, +#ifdef USE_CALL + &&L_MEM_END_PUSH_REC, +#endif + &&L_MEM_END, +#ifdef USE_CALL + &&L_MEM_END_REC, +#endif &&L_FAIL, &&L_JUMP, &&L_PUSH, @@ -2533,12 +2669,12 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, &&L_REPEAT_NG, &&L_REPEAT_INC, &&L_REPEAT_INC_NG, - &&L_REPEAT_INC_SG, - &&L_REPEAT_INC_NG_SG, &&L_EMPTY_CHECK_START, &&L_EMPTY_CHECK_END, &&L_EMPTY_CHECK_END_MEMST, +#ifdef USE_CALL &&L_EMPTY_CHECK_END_MEMST_PUSH, +#endif &&L_PREC_READ_START, &&L_PREC_READ_END, &&L_PREC_READ_NOT_START, @@ -2548,10 +2684,12 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, &&L_LOOK_BEHIND, &&L_LOOK_BEHIND_NOT_START, &&L_LOOK_BEHIND_NOT_END, - &&L_CALL, - &&L_RETURN, &&L_PUSH_SAVE_VAL, &&L_UPDATE_VAR, +#ifdef USE_CALL + &&L_CALL, + &&L_RETURN, +#endif #ifdef USE_CALLOUT &&L_CALLOUT_CONTENTS, &&L_CALLOUT_NAME, @@ -2569,15 +2707,17 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, char *alloc_base; StackType *stk_base, *stk, 
*stk_end; StackType *stkp; /* used as any purpose. */ - StackIndex si; - StackIndex *repeat_stk; StackIndex *mem_start_stk, *mem_end_stk; UChar* keep; + +#ifdef USE_REPEAT_AND_EMPTY_CHECK_LOCAL_VAR + StackIndex *repeat_stk; + StackIndex *empty_check_stk; +#endif #ifdef USE_RETRY_LIMIT_IN_MATCH unsigned long retry_limit_in_match; unsigned long retry_in_match_counter; #endif - #ifdef USE_CALLOUT int of; #endif @@ -2663,15 +2803,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, rmt[0].rm_eo = (regoff_t )(s - str); for (i = 1; i <= num_mem; i++) { if (mem_end_stk[i] != INVALID_STACK_INDEX) { - if (MEM_STATUS_AT(reg->bt_mem_start, i)) - rmt[i].rm_so = (regoff_t )(STACK_AT(mem_start_stk[i])->u.mem.pstr - str); - else - rmt[i].rm_so = (regoff_t )((UChar* )((void* )(mem_start_stk[i])) - str); - - rmt[i].rm_eo = (regoff_t )((MEM_STATUS_AT(reg->bt_mem_end, i) - ? STACK_AT(mem_end_stk[i])->u.mem.pstr - : (UChar* )((void* )mem_end_stk[i])) - - str); + rmt[i].rm_so = (regoff_t )(STACK_MEM_START(reg, i) - str); + rmt[i].rm_eo = (regoff_t )(STACK_MEM_END(reg, i) - str); } else { rmt[i].rm_so = rmt[i].rm_eo = ONIG_REGION_NOTPOS; @@ -2684,14 +2817,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, region->end[0] = (int )(s - str); for (i = 1; i <= num_mem; i++) { if (mem_end_stk[i] != INVALID_STACK_INDEX) { - if (MEM_STATUS_AT(reg->bt_mem_start, i)) - region->beg[i] = (int )(STACK_AT(mem_start_stk[i])->u.mem.pstr - str); - else - region->beg[i] = (int )((UChar* )((void* )mem_start_stk[i]) - str); - - region->end[i] = (int )((MEM_STATUS_AT(reg->bt_mem_end, i) - ? 
STACK_AT(mem_end_stk[i])->u.mem.pstr - : (UChar* )((void* )mem_end_stk[i])) - str); + region->beg[i] = (int )(STACK_MEM_START(reg, i) - str); + region->end[i] = (int )(STACK_MEM_END(reg, i) - str); } else { region->beg[i] = region->end[i] = ONIG_REGION_NOTPOS; @@ -2719,10 +2846,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, stkp = stk_base; r = make_capture_history_tree(region->history_root, &stkp, stk, (UChar* )str, reg); - if (r < 0) { - best_len = r; /* error code */ - goto finish; - } + if (r < 0) MATCH_AT_ERROR_RETURN(r); } #endif /* USE_CAPTURE_HISTORY */ #ifdef USE_POSIX_API_REGION_OPTION @@ -2747,9 +2871,9 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, } /* default behavior: return first-matching result. */ - goto finish; + goto match_at_end; - CASE_OP(EXACT1) + CASE_OP(STR_1) DATA_ENSURE(1); ps = p->exact.s; if (*ps != *s) goto fail; @@ -2757,7 +2881,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, INC_OP; NEXT_OUT; - CASE_OP(EXACT1_IC) + CASE_OP(STR_1_IC) { int len; UChar *q, lowbuf[ONIGENC_MBC_CASE_FOLD_MAXLEN]; @@ -2778,7 +2902,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, INC_OP; NEXT_OUT; - CASE_OP(EXACT2) + CASE_OP(STR_2) DATA_ENSURE(2); ps = p->exact.s; if (*ps != *s) goto fail; @@ -2789,7 +2913,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, INC_OP; JUMP_OUT; - CASE_OP(EXACT3) + CASE_OP(STR_3) DATA_ENSURE(3); ps = p->exact.s; if (*ps != *s) goto fail; @@ -2802,7 +2926,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, INC_OP; JUMP_OUT; - CASE_OP(EXACT4) + CASE_OP(STR_4) DATA_ENSURE(4); ps = p->exact.s; if (*ps != *s) goto fail; @@ -2817,7 +2941,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, INC_OP; JUMP_OUT; - CASE_OP(EXACT5) + CASE_OP(STR_5) DATA_ENSURE(5); ps = p->exact.s; if (*ps != *s) goto fail; @@ -2834,7 +2958,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, INC_OP; JUMP_OUT; - CASE_OP(EXACTN) + CASE_OP(STR_N) tlen = 
p->exact_n.n; DATA_ENSURE(tlen); ps = p->exact_n.s; @@ -2845,7 +2969,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, INC_OP; JUMP_OUT; - CASE_OP(EXACTN_IC) + CASE_OP(STR_N_IC) { int len; UChar *q, *endp, lowbuf[ONIGENC_MBC_CASE_FOLD_MAXLEN]; @@ -2863,6 +2987,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, DATA_ENSURE(0); q = lowbuf; while (len-- > 0) { + if (ps >= endp) goto fail; if (*ps != *q) goto fail; ps++; q++; } @@ -2872,7 +2997,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, INC_OP; JUMP_OUT; - CASE_OP(EXACTMB2N1) + CASE_OP(STR_MB2N1) DATA_ENSURE(2); ps = p->exact.s; if (*ps != *s) goto fail; @@ -2882,7 +3007,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, INC_OP; NEXT_OUT; - CASE_OP(EXACTMB2N2) + CASE_OP(STR_MB2N2) DATA_ENSURE(4); ps = p->exact.s; if (*ps != *s) goto fail; @@ -2897,7 +3022,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, INC_OP; JUMP_OUT; - CASE_OP(EXACTMB2N3) + CASE_OP(STR_MB2N3) DATA_ENSURE(6); ps = p->exact.s; if (*ps != *s) goto fail; @@ -2916,7 +3041,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, INC_OP; JUMP_OUT; - CASE_OP(EXACTMB2N) + CASE_OP(STR_MB2N) tlen = p->exact_n.n; DATA_ENSURE(tlen * 2); ps = p->exact_n.s; @@ -2930,7 +3055,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, INC_OP; JUMP_OUT; - CASE_OP(EXACTMB3N) + CASE_OP(STR_MB3N) tlen = p->exact_n.n; DATA_ENSURE(tlen * 3); ps = p->exact_n.s; @@ -2946,7 +3071,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, INC_OP; JUMP_OUT; - CASE_OP(EXACTMBN) + CASE_OP(STR_MBN) tlen = p->exact_len_n.len; /* mb byte len */ tlen2 = p->exact_len_n.n; /* number of chars */ tlen2 *= tlen; @@ -2968,6 +3093,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, NEXT_OUT; CASE_OP(CCLASS_MB) + DATA_ENSURE(1); if (! 
ONIGENC_IS_MBC_HEAD(encode, s)) goto fail; cclass_mb: @@ -2976,7 +3102,6 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, UChar *ss; int mb_len; - DATA_ENSURE(1); mb_len = enclen(encode, s); DATA_ENSURE(mb_len); ss = s; @@ -3265,7 +3390,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, break; #endif default: - goto bytecode_error; + MATCH_AT_ERROR_RETURN(ONIGERR_UNDEFINED_BYTECODE); break; } @@ -3365,46 +3490,50 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, INC_OP; JUMP_OUT; - CASE_OP(MEMORY_START_PUSH) + CASE_OP(MEM_START_PUSH) mem = p->memory_start.num; STACK_PUSH_MEM_START(mem, s); INC_OP; JUMP_OUT; - CASE_OP(MEMORY_START) + CASE_OP(MEM_START) mem = p->memory_start.num; mem_start_stk[mem] = (StackIndex )((void* )s); INC_OP; JUMP_OUT; - CASE_OP(MEMORY_END_PUSH) + CASE_OP(MEM_END_PUSH) mem = p->memory_end.num; STACK_PUSH_MEM_END(mem, s); INC_OP; JUMP_OUT; - CASE_OP(MEMORY_END) + CASE_OP(MEM_END) mem = p->memory_end.num; mem_end_stk[mem] = (StackIndex )((void* )s); INC_OP; JUMP_OUT; #ifdef USE_CALL - CASE_OP(MEMORY_END_PUSH_REC) - mem = p->memory_end.num; - STACK_GET_MEM_START(mem, stkp); /* should be before push mem-end. */ - si = GET_STACK_INDEX(stkp); - STACK_PUSH_MEM_END(mem, s); - mem_start_stk[mem] = si; - INC_OP; - JUMP_OUT; + CASE_OP(MEM_END_PUSH_REC) + { + StackIndex si; + + mem = p->memory_end.num; + STACK_GET_MEM_START(mem, stkp); /* should be before push mem-end. 
*/ + si = GET_STACK_INDEX(stkp); + STACK_PUSH_MEM_END(mem, s); + mem_start_stk[mem] = si; + INC_OP; + JUMP_OUT; + } - CASE_OP(MEMORY_END_REC) + CASE_OP(MEM_END_REC) mem = p->memory_end.num; mem_end_stk[mem] = (StackIndex )((void* )s); STACK_GET_MEM_START(mem, stkp); - if (MEM_STATUS_AT(reg->bt_mem_start, mem)) + if (MEM_STATUS_AT(reg->push_mem_start, mem)) mem_start_stk[mem] = GET_STACK_INDEX(stkp); else mem_start_stk[mem] = (StackIndex )((void* )stkp->u.mem.pstr); @@ -3432,20 +3561,16 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, if (mem_end_stk[mem] == INVALID_STACK_INDEX) goto fail; if (mem_start_stk[mem] == INVALID_STACK_INDEX) goto fail; - if (MEM_STATUS_AT(reg->bt_mem_start, mem)) - pstart = STACK_AT(mem_start_stk[mem])->u.mem.pstr; - else - pstart = (UChar* )((void* )mem_start_stk[mem]); - - pend = (MEM_STATUS_AT(reg->bt_mem_end, mem) - ? STACK_AT(mem_end_stk[mem])->u.mem.pstr - : (UChar* )((void* )mem_end_stk[mem])); + pstart = STACK_MEM_START(reg, mem); + pend = STACK_MEM_END(reg, mem); n = (int )(pend - pstart); - DATA_ENSURE(n); - sprev = s; - STRING_CMP(pstart, s, n); - while (sprev + (len = enclen(encode, sprev)) < s) - sprev += len; + if (n != 0) { + DATA_ENSURE(n); + sprev = s; + STRING_CMP(s, pstart, n); + while (sprev + (len = enclen(encode, sprev)) < s) + sprev += len; + } } INC_OP; JUMP_OUT; @@ -3459,20 +3584,16 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, if (mem_end_stk[mem] == INVALID_STACK_INDEX) goto fail; if (mem_start_stk[mem] == INVALID_STACK_INDEX) goto fail; - if (MEM_STATUS_AT(reg->bt_mem_start, mem)) - pstart = STACK_AT(mem_start_stk[mem])->u.mem.pstr; - else - pstart = (UChar* )((void* )mem_start_stk[mem]); - - pend = (MEM_STATUS_AT(reg->bt_mem_end, mem) - ? 
STACK_AT(mem_end_stk[mem])->u.mem.pstr - : (UChar* )((void* )mem_end_stk[mem])); + pstart = STACK_MEM_START(reg, mem); + pend = STACK_MEM_END(reg, mem); n = (int )(pend - pstart); - DATA_ENSURE(n); - sprev = s; - STRING_CMP_IC(case_fold_flag, pstart, &s, n); - while (sprev + (len = enclen(encode, sprev)) < s) - sprev += len; + if (n != 0) { + DATA_ENSURE(n); + sprev = s; + STRING_CMP_IC(case_fold_flag, pstart, &s, n); + while (sprev + (len = enclen(encode, sprev)) < s) + sprev += len; + } } INC_OP; JUMP_OUT; @@ -3489,24 +3610,19 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, if (mem_end_stk[mem] == INVALID_STACK_INDEX) continue; if (mem_start_stk[mem] == INVALID_STACK_INDEX) continue; - if (MEM_STATUS_AT(reg->bt_mem_start, mem)) - pstart = STACK_AT(mem_start_stk[mem])->u.mem.pstr; - else - pstart = (UChar* )((void* )mem_start_stk[mem]); - - pend = (MEM_STATUS_AT(reg->bt_mem_end, mem) - ? STACK_AT(mem_end_stk[mem])->u.mem.pstr - : (UChar* )((void* )mem_end_stk[mem])); + pstart = STACK_MEM_START(reg, mem); + pend = STACK_MEM_END(reg, mem); n = (int )(pend - pstart); - DATA_ENSURE(n); - sprev = s; - swork = s; - STRING_CMP_VALUE(pstart, swork, n, is_fail); - if (is_fail) continue; - s = swork; - while (sprev + (len = enclen(encode, sprev)) < s) - sprev += len; - + if (n != 0) { + DATA_ENSURE(n); + sprev = s; + swork = s; + STRING_CMP_VALUE(swork, pstart, n, is_fail); + if (is_fail) continue; + s = swork; + while (sprev + (len = enclen(encode, sprev)) < s) + sprev += len; + } break; /* success */ } if (i == tlen) goto fail; @@ -3526,24 +3642,19 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, if (mem_end_stk[mem] == INVALID_STACK_INDEX) continue; if (mem_start_stk[mem] == INVALID_STACK_INDEX) continue; - if (MEM_STATUS_AT(reg->bt_mem_start, mem)) - pstart = STACK_AT(mem_start_stk[mem])->u.mem.pstr; - else - pstart = (UChar* )((void* )mem_start_stk[mem]); - - pend = (MEM_STATUS_AT(reg->bt_mem_end, mem) - ? 
STACK_AT(mem_end_stk[mem])->u.mem.pstr - : (UChar* )((void* )mem_end_stk[mem])); + pstart = STACK_MEM_START(reg, mem); + pend = STACK_MEM_END(reg, mem); n = (int )(pend - pstart); - DATA_ENSURE(n); - sprev = s; - swork = s; - STRING_CMP_VALUE_IC(case_fold_flag, pstart, &swork, n, is_fail); - if (is_fail) continue; - s = swork; - while (sprev + (len = enclen(encode, sprev)) < s) - sprev += len; - + if (n != 0) { + DATA_ENSURE(n); + sprev = s; + swork = s; + STRING_CMP_VALUE_IC(case_fold_flag, pstart, &swork, n, is_fail); + if (is_fail) continue; + s = swork; + while (sprev + (len = enclen(encode, sprev)) < s) + sprev += len; + } break; /* success */ } if (i == tlen) goto fail; @@ -3560,6 +3671,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, int len; int level; MemNumType* mems; + UChar* ssave; n = 0; backref_with_level: @@ -3567,10 +3679,11 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, tlen = p->backref_general.num; mems = tlen == 1 ? &(p->backref_general.n1) : p->backref_general.ns; - sprev = s; + ssave = s; if (backref_match_at_nested_level(reg, stk, stk_base, n, case_fold_flag, level, (int )tlen, mems, &s, end)) { - if (sprev < end) { + if (ssave != s) { + sprev = ssave; while (sprev + (len = enclen(encode, sprev)) < s) sprev += len; } @@ -3643,12 +3756,10 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, case OP_PUSH: case OP_REPEAT_INC: case OP_REPEAT_INC_NG: - case OP_REPEAT_INC_SG: - case OP_REPEAT_INC_NG_SG: INC_OP; break; default: - goto unexpected_bytecode_error; + MATCH_AT_ERROR_RETURN(ONIGERR_UNEXPECTED_BYTECODE); break; } #else @@ -3658,7 +3769,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, } JUMP_OUT; -#ifdef USE_INSISTENT_CHECK_CAPTURES_IN_EMPTY_REPEAT +#ifdef USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT CASE_OP(EMPTY_CHECK_END_MEMST) { int is_empty; @@ -3683,7 +3794,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, int is_empty; mem = p->empty_check_end.mem; /* mem: null check 
id */ -#ifdef USE_INSISTENT_CHECK_CAPTURES_IN_EMPTY_REPEAT +#ifdef USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT STACK_EMPTY_CHECK_MEM_REC(is_empty, mem, s, reg); #else STACK_EMPTY_CHECK_REC(is_empty, mem, s); @@ -3751,7 +3862,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, addr = p->push_if_peek_next.addr; c = p->push_if_peek_next.c; - if (c == *s) { + if (DATA_ENSURE_CHECK1 && c == *s) { STACK_PUSH_ALT(p + addr, s, sprev); INC_OP; JUMP_OUT; @@ -3764,10 +3875,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, mem = p->repeat.id; /* mem: OP_REPEAT ID */ addr = p->repeat.addr; - STACK_ENSURE(1); - repeat_stk[mem] = GET_STACK_INDEX(stk); - STACK_PUSH_REPEAT(mem, p + 1); - + STACK_PUSH_REPEAT_INC(mem, 0); if (reg->repeat_range[mem].lower == 0) { STACK_PUSH_ALT(p + addr, s, sprev); } @@ -3778,10 +3886,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, mem = p->repeat.id; /* mem: OP_REPEAT ID */ addr = p->repeat.addr; - STACK_ENSURE(1); - repeat_stk[mem] = GET_STACK_INDEX(stk); - STACK_PUSH_REPEAT(mem, p + 1); - + STACK_PUSH_REPEAT_INC(mem, 0); if (reg->repeat_range[mem].lower == 0) { STACK_PUSH_ALT(p + 1, s, sprev); p += addr; @@ -3792,73 +3897,52 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, CASE_OP(REPEAT_INC) mem = p->repeat_inc.id; /* mem: OP_REPEAT ID */ - si = repeat_stk[mem]; - stkp = STACK_AT(si); - - repeat_inc: - stkp->u.repeat.count++; - if (stkp->u.repeat.count >= reg->repeat_range[mem].upper) { + STACK_GET_REPEAT_COUNT(mem, n); + n++; + if (n >= reg->repeat_range[mem].upper) { /* end of repeat. Nothing to do. */ INC_OP; } - else if (stkp->u.repeat.count >= reg->repeat_range[mem].lower) { + else if (n >= reg->repeat_range[mem].lower) { INC_OP; STACK_PUSH_ALT(p, s, sprev); - p = STACK_AT(si)->u.repeat.pcode; /* Don't use stkp after PUSH. 
*/ + p = reg->repeat_range[mem].u.pcode; } else { - p = stkp->u.repeat.pcode; + p = reg->repeat_range[mem].u.pcode; } - STACK_PUSH_REPEAT_INC(si); + STACK_PUSH_REPEAT_INC(mem, n); CHECK_INTERRUPT_JUMP_OUT; - CASE_OP(REPEAT_INC_SG) - mem = p->repeat_inc.id; /* mem: OP_REPEAT ID */ - STACK_GET_REPEAT(mem, stkp); - si = GET_STACK_INDEX(stkp); - goto repeat_inc; - CASE_OP(REPEAT_INC_NG) mem = p->repeat_inc.id; /* mem: OP_REPEAT ID */ - si = repeat_stk[mem]; - stkp = STACK_AT(si); - - repeat_inc_ng: - stkp->u.repeat.count++; - if (stkp->u.repeat.count < reg->repeat_range[mem].upper) { - if (stkp->u.repeat.count >= reg->repeat_range[mem].lower) { - Operation* pcode = stkp->u.repeat.pcode; - - STACK_PUSH_REPEAT_INC(si); - STACK_PUSH_ALT(pcode, s, sprev); + STACK_GET_REPEAT_COUNT(mem, n); + n++; + STACK_PUSH_REPEAT_INC(mem, n); + if (n == reg->repeat_range[mem].upper) { + INC_OP; + } + else { + if (n >= reg->repeat_range[mem].lower) { + STACK_PUSH_ALT(reg->repeat_range[mem].u.pcode, s, sprev); INC_OP; } else { - p = stkp->u.repeat.pcode; - STACK_PUSH_REPEAT_INC(si); + p = reg->repeat_range[mem].u.pcode; } } - else if (stkp->u.repeat.count == reg->repeat_range[mem].upper) { - STACK_PUSH_REPEAT_INC(si); - INC_OP; - } CHECK_INTERRUPT_JUMP_OUT; - CASE_OP(REPEAT_INC_NG_SG) - mem = p->repeat_inc.id; /* mem: OP_REPEAT ID */ - STACK_GET_REPEAT(mem, stkp); - si = GET_STACK_INDEX(stkp); - goto repeat_inc_ng; - CASE_OP(PREC_READ_START) - STACK_PUSH_POS(s, sprev); + STACK_PUSH_PREC_READ_START(s, sprev); INC_OP; JUMP_OUT; CASE_OP(PREC_READ_END) - STACK_EXEC_TO_VOID(stkp); + STACK_GET_PREC_READ_START(stkp); s = stkp->u.state.pstr; sprev = stkp->u.state.pstr_prev; + STACK_PUSH(STK_PREC_READ_END,0,0,0); INC_OP; JUMP_OUT; @@ -3997,14 +4081,14 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, OnigCalloutFunc func; OnigCalloutArgs args; - of = ONIG_CALLOUT_OF_NAME; - name_id = p->callout_name.id; - mem = p->callout_name.num; + of = ONIG_CALLOUT_OF_NAME; + mem = 
p->callout_name.num; callout_common_entry: e = onig_reg_callout_list_at(reg, mem); in = e->in; if (of == ONIG_CALLOUT_OF_NAME) { + name_id = p->callout_name.id; func = onig_get_callout_start_func(reg, mem); } else { @@ -4027,7 +4111,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, call_result = ONIGERR_INVALID_ARGUMENT; } best_len = call_result; - goto finish; + goto match_at_end; break; } } @@ -4053,7 +4137,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, #endif CASE_OP(FINISH) - goto finish; + goto match_at_end; #ifdef ONIG_DEBUG_STATISTICS fail: @@ -4074,37 +4158,472 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, JUMP_OUT; DEFAULT_OP - goto bytecode_error; + MATCH_AT_ERROR_RETURN(ONIGERR_UNDEFINED_BYTECODE); } BYTECODE_INTERPRETER_END; - finish: + match_at_end: STACK_SAVE; return best_len; +} -#ifdef ONIG_DEBUG - stack_error: - STACK_SAVE; - return ONIGERR_STACK_BUG; -#endif +typedef struct { + regex_t* reg; + OnigRegion* region; +} RR; + +struct OnigRegSetStruct { + RR* rs; + int n; + int alloc; + OnigEncoding enc; + int anchor; /* BEGIN_BUF, BEGIN_POS, (SEMI_)END_BUF */ + OnigLen anc_dmin; /* (SEMI_)END_BUF anchor distance */ + OnigLen anc_dmax; /* (SEMI_)END_BUF anchor distance */ + int all_low_high; + int anychar_inf; +}; - bytecode_error: - STACK_SAVE; - return ONIGERR_UNDEFINED_BYTECODE; +enum SearchRangeStatus { + SRS_DEAD = 0, + SRS_LOW_HIGH = 1, + SRS_ALL_RANGE = 2 +}; -#if defined(ONIG_DEBUG) && !defined(USE_DIRECT_THREADED_CODE) - unexpected_bytecode_error: - STACK_SAVE; - return ONIGERR_UNEXPECTED_BYTECODE; -#endif +typedef struct { + int state; /* value of enum SearchRangeStatus */ + UChar* low; + UChar* high; + UChar* low_prev; + UChar* sch_range; +} SearchRange; + +#define REGSET_MATCH_AND_RETURN_CHECK(upper_range) \ + r = match_at(reg, str, end, (upper_range), s, prev, msas + i); \ + if (r != ONIG_MISMATCH) {\ + if (r >= 0) {\ + goto match;\ + }\ + else goto finish; /* error */ \ + } -#ifdef 
USE_RETRY_LIMIT_IN_MATCH - retry_limit_in_match_over: - STACK_SAVE; - return ONIGERR_RETRY_LIMIT_IN_MATCH_OVER; +static inline int +regset_search_body_position_lead(OnigRegSet* set, + const UChar* str, const UChar* end, + const UChar* start, const UChar* range, /* match start range */ + const UChar* orig_range, /* data range */ + OnigOptionType option, MatchArg* msas, int* rmatch_pos) +{ + int r, n, i; + UChar *s, *prev; + UChar *low, *high, *low_prev; + UChar* sch_range; + regex_t* reg; + OnigEncoding enc; + SearchRange* sr; + + n = set->n; + enc = set->enc; + + s = (UChar* )start; + if (s > str) + prev = onigenc_get_prev_char_head(enc, str, s); + else + prev = (UChar* )NULL; + + sr = (SearchRange* )xmalloc(sizeof(*sr) * n); + CHECK_NULL_RETURN_MEMERR(sr); + + for (i = 0; i < n; i++) { + reg = set->rs[i].reg; + + sr[i].state = SRS_DEAD; + if (reg->optimize != OPTIMIZE_NONE) { + if (reg->dist_max != INFINITE_LEN) { + if (end - range > reg->dist_max) + sch_range = (UChar* )range + reg->dist_max; + else + sch_range = (UChar* )end; + + if (forward_search(reg, str, end, s, sch_range, &low, &high, &low_prev)) { + sr[i].state = SRS_LOW_HIGH; + sr[i].low = low; + sr[i].high = high; + sr[i].low_prev = low_prev; + sr[i].sch_range = sch_range; + } + } + else { + sch_range = (UChar* )end; + if (forward_search(reg, str, end, s, sch_range, + &low, &high, (UChar** )NULL)) { + goto total_active; + } + } + } + else { + total_active: + sr[i].state = SRS_ALL_RANGE; + sr[i].low = s; + sr[i].high = (UChar* )range; + sr[i].low_prev = prev; + } + } + +#define ACTIVATE_ALL_LOW_HIGH_SEARCH_THRESHOLD_LEN 500 + + if (set->all_low_high != 0 + && range - start > ACTIVATE_ALL_LOW_HIGH_SEARCH_THRESHOLD_LEN) { + do { + int try_count = 0; + for (i = 0; i < n; i++) { + if (sr[i].state == SRS_DEAD) continue; + + if (s < sr[i].low) continue; + if (s >= sr[i].high) { + if (forward_search(set->rs[i].reg, str, end, s, sr[i].sch_range, + &low, &high, &low_prev) != 0) { + sr[i].low = low; + sr[i].high = 
high; + sr[i].low_prev = low_prev; + if (s < low) continue; + } + else { + sr[i].state = SRS_DEAD; + continue; + } + } + + reg = set->rs[i].reg; + REGSET_MATCH_AND_RETURN_CHECK(orig_range); + try_count++; + } /* for (i) */ + + if (s >= range) break; + + if (try_count == 0) { + low = (UChar* )range; + for (i = 0; i < n; i++) { + if (sr[i].state == SRS_LOW_HIGH && low > sr[i].low) { + low = sr[i].low; + low_prev = sr[i].low_prev; + } + } + if (low == range) break; + + s = low; + prev = low_prev; + } + else { + prev = s; + s += enclen(enc, s); + } + } while (1); + } + else { + int prev_is_newline = 1; + do { + for (i = 0; i < n; i++) { + if (sr[i].state == SRS_DEAD) continue; + if (sr[i].state == SRS_LOW_HIGH) { + if (s < sr[i].low) continue; + if (s >= sr[i].high) { + if (forward_search(set->rs[i].reg, str, end, s, sr[i].sch_range, + &low, &high, &low_prev) != 0) { + sr[i].low = low; + sr[i].high = high; + /* sr[i].low_prev = low_prev; */ + if (s < low) continue; + } + else { + sr[i].state = SRS_DEAD; + continue; + } + } + } + + reg = set->rs[i].reg; + if ((reg->anchor & ANCR_ANYCHAR_INF) == 0 || prev_is_newline != 0) { + REGSET_MATCH_AND_RETURN_CHECK(orig_range); + } + } + + if (s >= range) break; + + if (set->anychar_inf != 0) + prev_is_newline = ONIGENC_IS_MBC_NEWLINE(set->enc, s, end); + + prev = s; + s += enclen(enc, s); + } while (1); + } + + xfree(sr); + return ONIG_MISMATCH; + + finish: + xfree(sr); + return r; + + match: + xfree(sr); + *rmatch_pos = (int )(s - str); + return i; +} + +static inline int +regset_search_body_regex_lead(OnigRegSet* set, + const UChar* str, const UChar* end, + const UChar* start, const UChar* orig_range, OnigRegSetLead lead, + OnigOptionType option, OnigMatchParam* mps[], int* rmatch_pos) +{ + int r; + int i; + int n; + int match_index; + const UChar* ep; + regex_t* reg; + OnigRegion* region; + + n = set->n; + + match_index = ONIG_MISMATCH; + ep = orig_range; + for (i = 0; i < n; i++) { + reg = set->rs[i].reg; + region = 
set->rs[i].region; + r = search_in_range(reg, str, end, start, ep, orig_range, region, option, mps[i]); + if (r > 0) { + if (str + r < ep) { + match_index = i; + *rmatch_pos = r; + if (lead == ONIG_REGSET_PRIORITY_TO_REGEX_ORDER) + break; + + ep = str + r; + } + } + else if (r == 0) { + match_index = i; + *rmatch_pos = r; + break; + } + } + + return match_index; +} + +extern int +onig_regset_search_with_param(OnigRegSet* set, + const UChar* str, const UChar* end, + const UChar* start, const UChar* range, + OnigRegSetLead lead, OnigOptionType option, OnigMatchParam* mps[], + int* rmatch_pos) +{ + int r; + int i; + UChar *s, *prev; + regex_t* reg; + OnigEncoding enc; + OnigRegion* region; + MatchArg* msas; + const UChar *orig_start = start; + const UChar *orig_range = range; + + if (set->n == 0) + return ONIG_MISMATCH; + + if (IS_POSIX_REGION(option)) + return ONIGERR_INVALID_ARGUMENT; + + r = 0; + enc = set->enc; + msas = (MatchArg* )NULL; + + for (i = 0; i < set->n; i++) { + reg = set->rs[i].reg; + region = set->rs[i].region; + ADJUST_MATCH_PARAM(reg, mps[i]); + if (IS_NOT_NULL(region)) { + r = onig_region_resize_clear(region, reg->num_mem + 1); + if (r != 0) goto finish_no_msa; + } + } + + if (start > end || start < str) goto mismatch_no_msa; + if (str < end) { + /* forward search only */ + if (range <= start) + return ONIGERR_INVALID_ARGUMENT; + } + + if (ONIG_IS_OPTION_ON(option, ONIG_OPTION_CHECK_VALIDITY_OF_STRING)) { + if (! 
ONIGENC_IS_VALID_MBC_STRING(enc, str, end)) { + r = ONIGERR_INVALID_WIDE_CHAR_VALUE; + goto finish_no_msa; + } + } + + if (set->anchor != OPTIMIZE_NONE && str < end) { + UChar *min_semi_end, *max_semi_end; + + if ((set->anchor & ANCR_BEGIN_POSITION) != 0) { + /* search start-position only */ + begin_position: + range = start + 1; + } + else if ((set->anchor & ANCR_BEGIN_BUF) != 0) { + /* search str-position only */ + if (start != str) goto mismatch_no_msa; + range = str + 1; + } + else if ((set->anchor & ANCR_END_BUF) != 0) { + min_semi_end = max_semi_end = (UChar* )end; + + end_buf: + if ((OnigLen )(max_semi_end - str) < set->anc_dmin) + goto mismatch_no_msa; + + if ((OnigLen )(min_semi_end - start) > set->anc_dmax) { + start = min_semi_end - set->anc_dmax; + if (start < end) + start = onigenc_get_right_adjust_char_head(enc, str, start); + } + if ((OnigLen )(max_semi_end - (range - 1)) < set->anc_dmin) { + range = max_semi_end - set->anc_dmin + 1; + } + if (start > range) goto mismatch_no_msa; + } + else if ((set->anchor & ANCR_SEMI_END_BUF) != 0) { + UChar* pre_end = ONIGENC_STEP_BACK(enc, str, end, 1); + + max_semi_end = (UChar* )end; + if (ONIGENC_IS_MBC_NEWLINE(enc, pre_end, end)) { + min_semi_end = pre_end; + +#ifdef USE_CRNL_AS_LINE_TERMINATOR + pre_end = ONIGENC_STEP_BACK(enc, str, pre_end, 1); + if (IS_NOT_NULL(pre_end) && + ONIGENC_IS_MBC_CRNL(enc, pre_end, end)) { + min_semi_end = pre_end; + } #endif + if (min_semi_end > str && start <= min_semi_end) { + goto end_buf; + } + } + else { + min_semi_end = (UChar* )end; + goto end_buf; + } + } + else if ((set->anchor & ANCR_ANYCHAR_INF_ML) != 0) { + goto begin_position; + } + } + else if (str == end) { /* empty string */ + start = end = str; + s = (UChar* )start; + prev = (UChar* )NULL; + + msas = (MatchArg* )xmalloc(sizeof(*msas) * set->n); + CHECK_NULL_RETURN_MEMERR(msas); + for (i = 0; i < set->n; i++) { + reg = set->rs[i].reg; + MATCH_ARG_INIT(msas[i], reg, option, set->rs[i].region, start, mps[i]); + } + 
for (i = 0; i < set->n; i++) { + reg = set->rs[i].reg; + if (reg->threshold_len == 0) { + REGSET_MATCH_AND_RETURN_CHECK(end); + } + } + + goto mismatch; + } + + if (lead == ONIG_REGSET_POSITION_LEAD) { + msas = (MatchArg* )xmalloc(sizeof(*msas) * set->n); + CHECK_NULL_RETURN_MEMERR(msas); + + for (i = 0; i < set->n; i++) { + MATCH_ARG_INIT(msas[i], set->rs[i].reg, option, set->rs[i].region, + orig_start, mps[i]); + } + + r = regset_search_body_position_lead(set, str, end, start, range, + orig_range, option, msas, rmatch_pos); + } + else { + r = regset_search_body_regex_lead(set, str, end, start, orig_range, + lead, option, mps, rmatch_pos); + } + if (r < 0) goto finish; + else goto match2; + + mismatch: + r = ONIG_MISMATCH; + finish: + for (i = 0; i < set->n; i++) { + if (IS_NOT_NULL(msas)) + MATCH_ARG_FREE(msas[i]); + if (IS_FIND_NOT_EMPTY(set->rs[i].reg->options) && + IS_NOT_NULL(set->rs[i].region)) { + onig_region_clear(set->rs[i].region); + } + } + if (IS_NOT_NULL(msas)) xfree(msas); + return r; + + mismatch_no_msa: + r = ONIG_MISMATCH; + finish_no_msa: + return r; + + match: + *rmatch_pos = (int )(s - str); + match2: + for (i = 0; i < set->n; i++) { + if (IS_NOT_NULL(msas)) + MATCH_ARG_FREE(msas[i]); + if (IS_FIND_NOT_EMPTY(set->rs[i].reg->options) && + IS_NOT_NULL(set->rs[i].region)) { + onig_region_clear(set->rs[i].region); + } + } + if (IS_NOT_NULL(msas)) xfree(msas); + return r; /* regex index */ } +extern int +onig_regset_search(OnigRegSet* set, const UChar* str, const UChar* end, + const UChar* start, const UChar* range, + OnigRegSetLead lead, OnigOptionType option, int* rmatch_pos) +{ + int r; + int i; + OnigMatchParam* mp; + OnigMatchParam** mps; + + mps = (OnigMatchParam** )xmalloc((sizeof(OnigMatchParam*) + sizeof(OnigMatchParam)) * set->n); + CHECK_NULL_RETURN_MEMERR(mps); + + mp = (OnigMatchParam* )(mps + set->n); + + for (i = 0; i < set->n; i++) { + onig_initialize_match_param(mp + i); + mps[i] = mp + i; + } + + r = 
onig_regset_search_with_param(set, str, end, start, range, lead, option, mps, + rmatch_pos); + for (i = 0; i < set->n; i++) + onig_free_match_param_content(mp + i); + + xfree(mps); + + return r; +} static UChar* slow_search(OnigEncoding enc, UChar* target, UChar* target_end, @@ -4146,9 +4665,11 @@ str_lower_case_match(OnigEncoding enc, int case_fold_flag, UChar *q, lowbuf[ONIGENC_MBC_CASE_FOLD_MAXLEN]; while (t < tend) { + if (p >= end) return 0; lowlen = ONIGENC_MBC_CASE_FOLD(enc, case_fold_flag, &p, end, lowbuf); q = lowbuf; while (lowlen > 0) { + if (t >= tend) return 0; if (*t++ != *q++) return 0; lowlen--; } @@ -4162,16 +4683,11 @@ slow_search_ic(OnigEncoding enc, int case_fold_flag, UChar* target, UChar* target_end, const UChar* text, const UChar* text_end, UChar* text_range) { - UChar *s, *end; - - end = (UChar* )text_end; - end -= target_end - target - 1; - if (end > text_range) - end = text_range; + UChar *s; s = (UChar* )text; - while (s < end) { + while (s < text_range) { if (str_lower_case_match(enc, case_fold_flag, target, target_end, s, text_end)) return s; @@ -4325,60 +4841,6 @@ sunday_quick_search(regex_t* reg, const UChar* target, const UChar* target_end, } static UChar* -sunday_quick_search_case_fold(regex_t* reg, - const UChar* target, const UChar* target_end, - const UChar* text, const UChar* text_end, - const UChar* text_range) -{ - const UChar *s, *se, *end; - const UChar *tail; - int skip, tlen1; - int map_offset; - int case_fold_flag; - OnigEncoding enc; - -#ifdef ONIG_DEBUG_SEARCH - fprintf(stderr, - "sunday_quick_search_case_fold: text: %p, text_end: %p, text_range: %p\n", text, text_end, text_range); -#endif - - enc = reg->enc; - case_fold_flag = reg->case_fold_flag; - - tail = target_end - 1; - tlen1 = (int )(tail - target); - end = text_range; - if (end + tlen1 > text_end) - end = text_end - tlen1; - - map_offset = reg->map_offset; - s = text; - - while (s < end) { - if (str_lower_case_match(enc, case_fold_flag, target, target_end, - s, 
text_end)) - return (UChar* )s; - - se = s + tlen1; - if (se + map_offset >= text_end) break; - skip = reg->map[*(se + map_offset)]; -#if 0 - p = s; - do { - s += enclen(enc, s); - } while ((s - p) < skip && s < end); -#else - /* This is faster than prev code for long text. ex: /(?i)Twain/ */ - s += skip; - if (s < end) - s = onigenc_get_right_adjust_char_head(enc, text, s); -#endif - } - - return (UChar* )NULL; -} - -static UChar* map_search(OnigEncoding enc, UChar map[], const UChar* text, const UChar* text_range) { @@ -4458,25 +4920,26 @@ onig_match_with_param(regex_t* reg, const UChar* str, const UChar* end, } static int -forward_search_range(regex_t* reg, const UChar* str, const UChar* end, UChar* s, - UChar* range, UChar** low, UChar** high, UChar** low_prev) +forward_search(regex_t* reg, const UChar* str, const UChar* end, UChar* start, + UChar* range, UChar** low, UChar** high, UChar** low_prev) { UChar *p, *pprev = (UChar* )NULL; #ifdef ONIG_DEBUG_SEARCH - fprintf(stderr, "forward_search_range: str: %p, end: %p, s: %p, range: %p\n", - str, end, s, range); + fprintf(stderr, "forward_search: str: %p, end: %p, start: %p, range: %p\n", + str, end, start, range); #endif - p = s; - if (reg->dmin > 0) { + p = start; + if (reg->dist_min != 0) { + if (end - p <= reg->dist_min) + return 0; /* fail */ + if (ONIGENC_IS_SINGLEBYTE(reg->enc)) { - p += reg->dmin; + p += reg->dist_min; } else { - UChar *q = p + reg->dmin; - - if (q >= end) return 0; /* fail */ + UChar *q = p + reg->dist_min; while (p < q) p += enclen(reg->enc, p); } } @@ -4491,11 +4954,6 @@ forward_search_range(regex_t* reg, const UChar* str, const UChar* end, UChar* s, reg->exact, reg->exact_end, p, end, range); break; - case OPTIMIZE_STR_CASE_FOLD_FAST: - p = sunday_quick_search_case_fold(reg, reg->exact, reg->exact_end, p, end, - range); - break; - case OPTIMIZE_STR_FAST: p = sunday_quick_search(reg, reg->exact, reg->exact_end, p, end, range); break; @@ -4511,7 +4969,7 @@ forward_search_range(regex_t* 
reg, const UChar* str, const UChar* end, UChar* s, } if (p && p < range) { - if (p - reg->dmin < s) { + if (p - start < reg->dist_min) { retry_gate: pprev = p; p += enclen(reg->enc, p); @@ -4524,8 +4982,7 @@ forward_search_range(regex_t* reg, const UChar* str, const UChar* end, UChar* s, switch (reg->sub_anchor) { case ANCR_BEGIN_LINE: if (!ON_STR_BEGIN(p)) { - prev = onigenc_get_prev_char_head(reg->enc, - (pprev ? pprev : str), p); + prev = onigenc_get_prev_char_head(reg->enc, (pprev ? pprev : str), p); if (!ONIGENC_IS_MBC_NEWLINE(reg->enc, prev, end)) goto retry_gate; } @@ -4546,35 +5003,34 @@ forward_search_range(regex_t* reg, const UChar* str, const UChar* end, UChar* s, #endif ) goto retry_gate; + break; } } - if (reg->dmax == 0) { + if (reg->dist_max == 0) { *low = p; if (low_prev) { - if (*low > s) - *low_prev = onigenc_get_prev_char_head(reg->enc, s, p); + if (*low > start) + *low_prev = onigenc_get_prev_char_head(reg->enc, start, p); else *low_prev = onigenc_get_prev_char_head(reg->enc, (pprev ? pprev : str), p); } + *high = p; } else { - if (reg->dmax != INFINITE_LEN) { - if (p - str < reg->dmax) { + if (reg->dist_max != INFINITE_LEN) { + if (p - str < reg->dist_max) { *low = (UChar* )str; if (low_prev) *low_prev = onigenc_get_prev_char_head(reg->enc, str, *low); } else { - *low = p - reg->dmax; - if (*low > s) { - *low = onigenc_get_right_adjust_char_head_with_prev(reg->enc, s, + *low = p - reg->dist_max; + if (*low > start) { + *low = onigenc_get_right_adjust_char_head_with_prev(reg->enc, start, *low, (const UChar** )low_prev); - if (low_prev && IS_NULL(*low_prev)) - *low_prev = onigenc_get_prev_char_head(reg->enc, - (pprev ? 
pprev : s), *low); } else { if (low_prev) @@ -4583,14 +5039,18 @@ forward_search_range(regex_t* reg, const UChar* str, const UChar* end, UChar* s, } } } + /* no needs to adjust *high, *high is used as range check only */ + if (p - str < reg->dist_min) + *high = (UChar* )str; + else + *high = p - reg->dist_min; } - /* no needs to adjust *high, *high is used as range check only */ - *high = p - reg->dmin; #ifdef ONIG_DEBUG_SEARCH fprintf(stderr, - "forward_search_range success: low: %d, high: %d, dmin: %d, dmax: %d\n", - (int )(*low - str), (int )(*high - str), reg->dmin, reg->dmax); + "forward_search success: low: %d, high: %d, dmin: %u, dmax: %u\n", + (int )(*low - str), (int )(*high - str), + reg->dist_min, reg->dist_max); #endif return 1; /* success */ } @@ -4600,15 +5060,11 @@ forward_search_range(regex_t* reg, const UChar* str, const UChar* end, UChar* s, static int -backward_search_range(regex_t* reg, const UChar* str, const UChar* end, - UChar* s, const UChar* range, UChar* adjrange, - UChar** low, UChar** high) +backward_search(regex_t* reg, const UChar* str, const UChar* end, UChar* s, + const UChar* range, UChar* adjrange, UChar** low, UChar** high) { UChar *p; - if (range == 0) goto fail; - - range += reg->dmin; p = s; retry: @@ -4620,7 +5076,6 @@ backward_search_range(regex_t* reg, const UChar* str, const UChar* end, break; case OPTIMIZE_STR_CASE_FOLD: - case OPTIMIZE_STR_CASE_FOLD_FAST: p = slow_search_backward_ic(reg->enc, reg->case_fold_flag, reg->exact, reg->exact_end, range, adjrange, end, p); @@ -4675,15 +5130,27 @@ backward_search_range(regex_t* reg, const UChar* str, const UChar* end, } } - /* no needs to adjust *high, *high is used as range check only */ - if (reg->dmax != INFINITE_LEN) { - *low = p - reg->dmax; - *high = p - reg->dmin; + if (reg->dist_max != INFINITE_LEN) { + if (p - str < reg->dist_max) + *low = (UChar* )str; + else + *low = p - reg->dist_max; + + if (reg->dist_min != 0) { + if (p - str < reg->dist_min) + *high = (UChar* )str; 
+ else + *high = p - reg->dist_min; + } + else { + *high = p; + } + *high = onigenc_get_right_adjust_char_head(reg->enc, adjrange, *high); } #ifdef ONIG_DEBUG_SEARCH - fprintf(stderr, "backward_search_range: low: %d, high: %d\n", + fprintf(stderr, "backward_search: low: %d, high: %d\n", (int )(*low - str), (int )(*high - str)); #endif return 1; /* success */ @@ -4691,7 +5158,7 @@ backward_search_range(regex_t* reg, const UChar* str, const UChar* end, fail: #ifdef ONIG_DEBUG_SEARCH - fprintf(stderr, "backward_search_range: fail.\n"); + fprintf(stderr, "backward_search: fail.\n"); #endif return 0; /* fail */ } @@ -4704,24 +5171,35 @@ onig_search(regex_t* reg, const UChar* str, const UChar* end, { int r; OnigMatchParam mp; + const UChar* data_range; onig_initialize_match_param(&mp); - r = onig_search_with_param(reg, str, end, start, range, region, option, &mp); + + /* The following is an expanded code of onig_search_with_param() */ + if (range > start) + data_range = range; + else + data_range = end; + + r = search_in_range(reg, str, end, start, range, data_range, region, + option, &mp); + onig_free_match_param_content(&mp); return r; } -extern int -onig_search_with_param(regex_t* reg, const UChar* str, const UChar* end, - const UChar* start, const UChar* range, OnigRegion* region, - OnigOptionType option, OnigMatchParam* mp) +static int +search_in_range(regex_t* reg, const UChar* str, const UChar* end, + const UChar* start, const UChar* range, /* match start range */ + const UChar* data_range, /* subject string range */ + OnigRegion* region, + OnigOptionType option, OnigMatchParam* mp) { int r; UChar *s, *prev; MatchArg msa; const UChar *orig_start = start; - const UChar *orig_range = range; #ifdef ONIG_DEBUG_SEARCH fprintf(stderr, @@ -4804,17 +5282,21 @@ onig_search_with_param(regex_t* reg, const UChar* str, const UChar* end, min_semi_end = max_semi_end = (UChar* )end; end_buf: - if ((OnigLen )(max_semi_end - str) < reg->anchor_dmin) + if ((OnigLen )(max_semi_end - 
str) < reg->anc_dist_min) goto mismatch_no_msa; if (range > start) { - if ((OnigLen )(min_semi_end - start) > reg->anchor_dmax) { - start = min_semi_end - reg->anchor_dmax; + if (reg->anc_dist_max != INFINITE_LEN && + min_semi_end - start > reg->anc_dist_max) { + start = min_semi_end - reg->anc_dist_max; if (start < end) start = onigenc_get_right_adjust_char_head(reg->enc, str, start); } - if ((OnigLen )(max_semi_end - (range - 1)) < reg->anchor_dmin) { - range = max_semi_end - reg->anchor_dmin + 1; + if (max_semi_end - (range - 1) < reg->anc_dist_min) { + if (max_semi_end - str + 1 < reg->anc_dist_min) + goto mismatch_no_msa; + else + range = max_semi_end - reg->anc_dist_min + 1; } if (start > range) goto mismatch_no_msa; @@ -4822,12 +5304,17 @@ onig_search_with_param(regex_t* reg, const UChar* str, const UChar* end, Backward search is used. */ } else { - if ((OnigLen )(min_semi_end - range) > reg->anchor_dmax) { - range = min_semi_end - reg->anchor_dmax; + if (reg->anc_dist_max != INFINITE_LEN && + min_semi_end - range > reg->anc_dist_max) { + range = min_semi_end - reg->anc_dist_max; } - if ((OnigLen )(max_semi_end - start) < reg->anchor_dmin) { - start = max_semi_end - reg->anchor_dmin; - start = ONIGENC_LEFT_ADJUST_CHAR_HEAD(reg->enc, str, start); + if (max_semi_end - start < reg->anc_dist_min) { + if (max_semi_end - str < reg->anc_dist_min) + goto mismatch_no_msa; + else { + start = max_semi_end - reg->anc_dist_min; + start = ONIGENC_LEFT_ADJUST_CHAR_HEAD(reg->enc, str, start); + } } if (range > start) goto mismatch_no_msa; } @@ -4895,29 +5382,33 @@ onig_search_with_param(regex_t* reg, const UChar* str, const UChar* end, if (reg->optimize != OPTIMIZE_NONE) { UChar *sch_range, *low, *high, *low_prev; - sch_range = (UChar* )range; - if (reg->dmax != 0) { - if (reg->dmax == INFINITE_LEN) + if (reg->dist_max != 0) { + if (reg->dist_max == INFINITE_LEN) sch_range = (UChar* )end; else { - sch_range += reg->dmax; - if (sch_range > end) sch_range = (UChar* )end; + if 
((end - range) < reg->dist_max) + sch_range = (UChar* )end; + else { + sch_range = (UChar* )range + reg->dist_max; + } } } + else + sch_range = (UChar* )range; if ((end - start) < reg->threshold_len) goto mismatch; - if (reg->dmax != INFINITE_LEN) { + if (reg->dist_max != INFINITE_LEN) { do { - if (! forward_search_range(reg, str, end, s, sch_range, - &low, &high, &low_prev)) goto mismatch; + if (! forward_search(reg, str, end, s, sch_range, &low, &high, + &low_prev)) goto mismatch; if (s < low) { s = low; prev = low_prev; } while (s <= high) { - MATCH_AND_RETURN_CHECK(orig_range); + MATCH_AND_RETURN_CHECK(data_range); prev = s; s += enclen(reg->enc, s); } @@ -4925,12 +5416,12 @@ onig_search_with_param(regex_t* reg, const UChar* str, const UChar* end, goto mismatch; } else { /* check only. */ - if (! forward_search_range(reg, str, end, s, sch_range, - &low, &high, (UChar** )NULL)) goto mismatch; + if (! forward_search(reg, str, end, s, sch_range, &low, &high, + (UChar** )NULL)) goto mismatch; if ((reg->anchor & ANCR_ANYCHAR_INF) != 0) { do { - MATCH_AND_RETURN_CHECK(orig_range); + MATCH_AND_RETURN_CHECK(data_range); prev = s; s += enclen(reg->enc, s); @@ -4947,13 +5438,13 @@ onig_search_with_param(regex_t* reg, const UChar* str, const UChar* end, } do { - MATCH_AND_RETURN_CHECK(orig_range); + MATCH_AND_RETURN_CHECK(data_range); prev = s; s += enclen(reg->enc, s); } while (s < range); if (s == range) { /* because empty match with /$/. 
*/ - MATCH_AND_RETURN_CHECK(orig_range); + MATCH_AND_RETURN_CHECK(data_range); } } else { /* backward search */ @@ -4964,19 +5455,30 @@ onig_search_with_param(regex_t* reg, const UChar* str, const UChar* end, if (reg->optimize != OPTIMIZE_NONE) { UChar *low, *high, *adjrange, *sch_start; + const UChar *min_range; + + if ((end - range) < reg->threshold_len) goto mismatch; if (range < end) adjrange = ONIGENC_LEFT_ADJUST_CHAR_HEAD(reg->enc, str, range); else adjrange = (UChar* )end; - if (reg->dmax != INFINITE_LEN && - (end - range) >= reg->threshold_len) { + if (end - range > reg->dist_min) + min_range = range + reg->dist_min; + else + min_range = end; + + if (reg->dist_max != INFINITE_LEN) { do { - sch_start = s + reg->dmax; - if (sch_start > end) sch_start = (UChar* )end; - if (backward_search_range(reg, str, end, sch_start, range, adjrange, - &low, &high) <= 0) + if (end - s > reg->dist_max) + sch_start = s + reg->dist_max; + else { + sch_start = onigenc_get_prev_char_head(reg->enc, str, end); + } + + if (backward_search(reg, str, end, sch_start, min_range, adjrange, + &low, &high) <= 0) goto mismatch; if (s > high) @@ -4991,22 +5493,10 @@ onig_search_with_param(regex_t* reg, const UChar* str, const UChar* end, goto mismatch; } else { /* check only. 
*/ - if ((end - range) < reg->threshold_len) goto mismatch; + sch_start = onigenc_get_prev_char_head(reg->enc, str, end); - sch_start = s; - if (reg->dmax != 0) { - if (reg->dmax == INFINITE_LEN) - sch_start = (UChar* )end; - else { - sch_start += reg->dmax; - if (sch_start > end) sch_start = (UChar* )end; - else - sch_start = ONIGENC_LEFT_ADJUST_CHAR_HEAD(reg->enc, - start, sch_start); - } - } - if (backward_search_range(reg, str, end, sch_start, range, adjrange, - &low, &high) <= 0) goto mismatch; + if (backward_search(reg, str, end, sch_start, min_range, adjrange, + &low, &high) <= 0) goto mismatch; } } @@ -5062,6 +5552,22 @@ onig_search_with_param(regex_t* reg, const UChar* str, const UChar* end, } extern int +onig_search_with_param(regex_t* reg, const UChar* str, const UChar* end, + const UChar* start, const UChar* range, OnigRegion* region, + OnigOptionType option, OnigMatchParam* mp) +{ + const UChar* data_range; + + if (range > start) + data_range = range; + else + data_range = end; + + return search_in_range(reg, str, end, start, range, data_range, region, + option, mp); +} + +extern int onig_scan(regex_t* reg, const UChar* str, const UChar* end, OnigRegion* region, OnigOptionType option, int (*scan_callback)(int, int, OnigRegion*, void*), @@ -5163,6 +5669,202 @@ onig_copy_encoding(OnigEncoding to, OnigEncoding from) *to = *from; } +extern int +onig_regset_new(OnigRegSet** rset, int n, regex_t* regs[]) +{ +#define REGSET_INITIAL_ALLOC_SIZE 10 + + int i; + int r; + int alloc; + OnigRegSet* set; + RR* rs; + + *rset = 0; + + set = (OnigRegSet* )xmalloc(sizeof(*set)); + CHECK_NULL_RETURN_MEMERR(set); + + alloc = n > REGSET_INITIAL_ALLOC_SIZE ? 
n : REGSET_INITIAL_ALLOC_SIZE; + rs = (RR* )xmalloc(sizeof(set->rs[0]) * alloc); + if (IS_NULL(rs)) { + xfree(set); + return ONIGERR_MEMORY; + } + + set->rs = rs; + set->n = 0; + set->alloc = alloc; + + for (i = 0; i < n; i++) { + regex_t* reg = regs[i]; + + r = onig_regset_add(set, reg); + if (r != 0) { + for (i = 0; i < set->n; i++) { + OnigRegion* region = set->rs[i].region; + if (IS_NOT_NULL(region)) + onig_region_free(region, 1); + } + xfree(set->rs); + xfree(set); + return r; + } + } + + *rset = set; + return 0; +} + +static void +update_regset_by_reg(OnigRegSet* set, regex_t* reg) +{ + if (set->n == 1) { + set->enc = reg->enc; + set->anchor = reg->anchor; + set->anc_dmin = reg->anc_dist_min; + set->anc_dmax = reg->anc_dist_max; + set->all_low_high = + (reg->optimize == OPTIMIZE_NONE || reg->dist_max == INFINITE_LEN) ? 0 : 1; + set->anychar_inf = (reg->anchor & ANCR_ANYCHAR_INF) != 0 ? 1 : 0; + } + else { + int anchor; + + anchor = set->anchor & reg->anchor; + if (anchor != 0) { + OnigLen anc_dmin; + OnigLen anc_dmax; + + anc_dmin = set->anc_dmin; + anc_dmax = set->anc_dmax; + if (anc_dmin > reg->anc_dist_min) anc_dmin = reg->anc_dist_min; + if (anc_dmax < reg->anc_dist_max) anc_dmax = reg->anc_dist_max; + set->anc_dmin = anc_dmin; + set->anc_dmax = anc_dmax; + } + + set->anchor = anchor; + + if (reg->optimize == OPTIMIZE_NONE || reg->dist_max == INFINITE_LEN) + set->all_low_high = 0; + + if ((reg->anchor & ANCR_ANYCHAR_INF) != 0) + set->anychar_inf = 1; + } +} + +extern int +onig_regset_add(OnigRegSet* set, regex_t* reg) +{ + OnigRegion* region; + + if (IS_FIND_LONGEST(reg->options)) + return ONIGERR_INVALID_ARGUMENT; + + if (set->n != 0 && reg->enc != set->enc) + return ONIGERR_INVALID_ARGUMENT; + + if (set->n >= set->alloc) { + RR* nrs; + int new_alloc; + + new_alloc = set->alloc * 2; + nrs = (RR* )xrealloc(set->rs, sizeof(set->rs[0]) * new_alloc); + CHECK_NULL_RETURN_MEMERR(nrs); + + set->rs = nrs; + set->alloc = new_alloc; + } + + region = 
onig_region_new(); + CHECK_NULL_RETURN_MEMERR(region); + + set->rs[set->n].reg = reg; + set->rs[set->n].region = region; + set->n++; + + update_regset_by_reg(set, reg); + return 0; +} + +extern int +onig_regset_replace(OnigRegSet* set, int at, regex_t* reg) +{ + int i; + + if (at < 0 || at >= set->n) + return ONIGERR_INVALID_ARGUMENT; + + if (IS_NULL(reg)) { + onig_region_free(set->rs[at].region, 1); + for (i = at; i < set->n - 1; i++) { + set->rs[i].reg = set->rs[i+1].reg; + set->rs[i].region = set->rs[i+1].region; + } + set->n--; + } + else { + if (IS_FIND_LONGEST(reg->options)) + return ONIGERR_INVALID_ARGUMENT; + + if (set->n > 1 && reg->enc != set->enc) + return ONIGERR_INVALID_ARGUMENT; + + set->rs[at].reg = reg; + } + + for (i = 0; i < set->n; i++) + update_regset_by_reg(set, set->rs[i].reg); + + return 0; +} + +extern void +onig_regset_free(OnigRegSet* set) +{ + int i; + + for (i = 0; i < set->n; i++) { + regex_t* reg; + OnigRegion* region; + + reg = set->rs[i].reg; + region = set->rs[i].region; + onig_free(reg); + if (IS_NOT_NULL(region)) + onig_region_free(region, 1); + } + + xfree(set->rs); + xfree(set); +} + +extern int +onig_regset_number_of_regex(OnigRegSet* set) +{ + return set->n; +} + +extern regex_t* +onig_regset_get_regex(OnigRegSet* set, int at) +{ + if (at < 0 || at >= set->n) + return (regex_t* )0; + + return set->rs[at].reg; +} + +extern OnigRegion* +onig_regset_get_region(OnigRegSet* set, int at) +{ + if (at < 0 || at >= set->n) + return (OnigRegion* )0; + + return set->rs[at].region; +} + + #ifdef USE_DIRECT_THREADED_CODE extern int onig_init_for_match_at(regex_t* reg) @@ -5355,35 +6057,25 @@ onig_get_capture_range_in_callout(OnigCalloutArgs* a, int mem_num, int* begin, i const UChar* str; StackType* stk_base; int i; + StackIndex* mem_start_stk; + StackIndex* mem_end_stk; i = mem_num; reg = a->regex; str = a->string; stk_base = a->stk_base; + mem_start_stk = a->mem_start_stk; + mem_end_stk = a->mem_end_stk; if (i > 0) { if 
(a->mem_end_stk[i] != INVALID_STACK_INDEX) { - if (MEM_STATUS_AT(reg->bt_mem_start, i)) - *begin = (int )(STACK_AT(a->mem_start_stk[i])->u.mem.pstr - str); - else - *begin = (int )((UChar* )((void* )a->mem_start_stk[i]) - str); - - *end = (int )((MEM_STATUS_AT(reg->bt_mem_end, i) - ? STACK_AT(a->mem_end_stk[i])->u.mem.pstr - : (UChar* )((void* )a->mem_end_stk[i])) - str); + *begin = (int )(STACK_MEM_START(reg, i) - str); + *end = (int )(STACK_MEM_END(reg, i) - str); } else { *begin = *end = ONIG_REGION_NOTPOS; } } - else if (i == 0) { -#if 0 - *begin = a->start - str; - *end = a->current - str; -#else - return ONIGERR_INVALID_ARGUMENT; -#endif - } else return ONIGERR_INVALID_ARGUMENT; @@ -5421,14 +6113,6 @@ onig_builtin_mismatch(OnigCalloutArgs* args ARG_UNUSED, void* user_data ARG_UNUS return ONIG_MISMATCH; } -#if 0 -extern int -onig_builtin_success(OnigCalloutArgs* args ARG_UNUSED, void* user_data ARG_UNUSED) -{ - return ONIG_CALLOUT_SUCCESS; -} -#endif - extern int onig_builtin_error(OnigCalloutArgs* args, void* user_data ARG_UNUSED) { @@ -5443,6 +6127,9 @@ onig_builtin_error(OnigCalloutArgs* args, void* user_data ARG_UNUSED) if (n >= 0) { n = ONIGERR_INVALID_CALLOUT_BODY; } + else if (onig_is_error_code_needs_param(n)) { + n = ONIGERR_INVALID_CALLOUT_BODY; + } return n; } diff --git a/src/regext.c b/src/regext.c index fa4b360..c46f630 100644 --- a/src/regext.c +++ b/src/regext.c @@ -2,7 +2,7 @@ regext.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -29,6 +29,7 @@ #include "regint.h" +#if 0 static void conv_ext0be32(const UChar* s, const UChar* end, UChar* conv) { @@ -158,6 +159,7 @@ conv_encoding(OnigEncoding from, OnigEncoding to, const UChar* s, const UChar* e return ONIGERR_NOT_SUPPORTED_ENCODING_COMBINATION; } +#endif extern int onig_new_deluxe(regex_t** reg, const UChar* pattern, const UChar* pattern_end, @@ -169,9 +171,7 @@ onig_new_deluxe(regex_t** reg, const UChar* pattern, const UChar* pattern_end, if (IS_NOT_NULL(einfo)) einfo->par = (UChar* )NULL; if (ci->pattern_enc != ci->target_enc) { - r = conv_encoding(ci->pattern_enc, ci->target_enc, pattern, pattern_end, - &cpat, &cpat_end); - if (r != 0) return r; + return ONIGERR_NOT_SUPPORTED_ENCODING_COMBINATION; } else { cpat = (UChar* )pattern; diff --git a/src/reggnu.c b/src/reggnu.c index a124ae8..8a45078 100644 --- a/src/reggnu.c +++ b/src/reggnu.c @@ -2,7 +2,7 @@ reggnu.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/regint.h b/src/regint.h index 56767e8..cc540da 100644 --- a/src/regint.h +++ b/src/regint.h @@ -4,7 +4,7 @@ regint.h - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -47,23 +47,18 @@ #endif #endif -#if defined(__i386) || defined(__i386__) || defined(_M_IX86) || \ - (defined(__ppc__) && defined(__APPLE__)) || \ - defined(__x86_64) || defined(__x86_64__) || \ - defined(__mc68020__) -#define PLATFORM_UNALIGNED_WORD_ACCESS -#endif - +#ifndef ONIG_DISABLE_DIRECT_THREADING #ifdef __GNUC__ #define USE_GOTO_LABELS_AS_VALUES #endif +#endif /* config */ /* spec. config */ #define USE_CALL #define USE_CALLOUT #define USE_BACKREF_WITH_LEVEL /* \k<name+n>, \k<name-n> */ -#define USE_INSISTENT_CHECK_CAPTURES_IN_EMPTY_REPEAT /* /(?:()|())*\2/ */ +#define USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT /* /(?:()|())*\2/ */ #define USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE /* /\n$/ =~ "\n" */ #define USE_WARNING_REDUNDANT_NESTED_REPEAT_OPERATOR #define USE_RETRY_LIMIT_IN_MATCH @@ -82,6 +77,8 @@ #define USE_VARIABLE_META_CHARS #define USE_POSIX_API_REGION_OPTION #define USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE +/* #define USE_REPEAT_AND_EMPTY_CHECK_LOCAL_VAR */ + #include "regenc.h" @@ -197,49 +194,16 @@ typedef unsigned int uintptr_t; #define CHAR_MAP_SIZE 256 #define INFINITE_LEN ONIG_INFINITE_DISTANCE -#ifdef PLATFORM_UNALIGNED_WORD_ACCESS - -#define PLATFORM_GET_INC(val,p,type) do{\ - val = *(type* )p;\ - (p) += sizeof(type);\ -} while(0) - -#else - -#define PLATFORM_GET_INC(val,p,type) do{\ - xmemcpy(&val, (p), sizeof(type));\ - (p) += sizeof(type);\ -} while(0) - -/* sizeof(OnigCodePoint) */ -#ifdef SIZEOF_SIZE_T -# define WORD_ALIGNMENT_SIZE SIZEOF_SIZE_T -#else -# define WORD_ALIGNMENT_SIZE SIZEOF_LONG -#endif - -#define GET_ALIGNMENT_PAD_SIZE(addr,pad_size) do {\ - (pad_size) = WORD_ALIGNMENT_SIZE - ((uintptr_t )(addr) % WORD_ALIGNMENT_SIZE);\ - if ((pad_size) == WORD_ALIGNMENT_SIZE) (pad_size) = 0;\ -} while (0) - -#define ALIGNMENT_RIGHT(addr) do {\ - (addr) += (WORD_ALIGNMENT_SIZE - 1);\ - (addr) -= ((uintptr_t )(addr) % WORD_ALIGNMENT_SIZE);\ -} while (0) - -#endif 
/* PLATFORM_UNALIGNED_WORD_ACCESS */ - #ifdef USE_CALLOUT typedef struct { - int flag; - OnigCalloutOf of; - int in; - int name_id; - const UChar* tag_start; - const UChar* tag_end; + int flag; + OnigCalloutOf of; + int in; + int name_id; + const UChar* tag_start; + const UChar* tag_end; OnigCalloutType type; OnigCalloutFunc start_func; OnigCalloutFunc end_func; @@ -272,7 +236,6 @@ enum OptimizeType { OPTIMIZE_STR, /* Slow Search */ OPTIMIZE_STR_FAST, /* Sunday quick search / BMH */ OPTIMIZE_STR_FAST_STEP_FORWARD, /* Sunday quick search / BMH */ - OPTIMIZE_STR_CASE_FOLD_FAST, /* Sunday quick search / BMH (ignore case) */ OPTIMIZE_STR_CASE_FOLD, /* Slow Search (ignore case) */ OPTIMIZE_MAP /* char map */ }; @@ -288,6 +251,8 @@ typedef unsigned int MemStatusType; #define MEM_STATUS_AT0(stats,n) \ ((n) > 0 && (n) < (int )MEM_STATUS_BITS_NUM ? ((stats) & ((MemStatusType )1 << n)) : ((stats) & 1)) +#define MEM_STATUS_IS_ALL_ON(stats) (((stats) & 1) != 0) + #define MEM_STATUS_ON(stats,n) do {\ if ((n) < (int )MEM_STATUS_BITS_NUM) {\ if ((n) != 0)\ @@ -302,8 +267,14 @@ typedef unsigned int MemStatusType; (stats) |= ((MemStatusType )1 << (n));\ } while (0) +#define MEM_STATUS_LIMIT_AT(stats,n) \ + ((n) < (int )MEM_STATUS_BITS_NUM ? 
((stats) & ((MemStatusType )1 << n)) : 0) +#define MEM_STATUS_LIMIT_ON(stats,n) do {\ + if ((n) < (int )MEM_STATUS_BITS_NUM && (n) != 0) {\ + (stats) |= ((MemStatusType )1 << (n));\ + }\ +} while (0) -#define INT_MAX_LIMIT ((1UL << (SIZEOF_INT * 8 - 1)) - 1) #define IS_CODE_WORD_ASCII(enc,code) \ (ONIGENC_IS_CODE_ASCII(code) && ONIGENC_IS_CODE_WORD(enc,code)) @@ -348,22 +319,18 @@ typedef unsigned int MemStatusType; #define DISABLE_CASE_FOLD_MULTI_CHAR(case_fold_flag) \ ((case_fold_flag) & ~INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) -#define REPEAT_INFINITE -1 -#define IS_REPEAT_INFINITE(n) ((n) == REPEAT_INFINITE) +#define INFINITE_REPEAT -1 +#define IS_INFINITE_REPEAT(n) ((n) == INFINITE_REPEAT) /* bitset */ #define BITS_PER_BYTE 8 #define SINGLE_BYTE_SIZE (1 << BITS_PER_BYTE) -#define BITS_IN_ROOM (sizeof(Bits) * BITS_PER_BYTE) +#define BITS_IN_ROOM 32 /* 4 * BITS_PER_BYTE */ #define BITSET_SIZE (SINGLE_BYTE_SIZE / BITS_IN_ROOM) -#ifdef PLATFORM_UNALIGNED_WORD_ACCESS -typedef unsigned int Bits; -#else -typedef unsigned char Bits; -#endif -typedef Bits BitSet[BITSET_SIZE]; -typedef Bits* BitSetRef; +typedef uint32_t Bits; +typedef Bits BitSet[BITSET_SIZE]; +typedef Bits* BitSetRef; #define SIZE_BITSET sizeof(BitSet) @@ -372,8 +339,8 @@ typedef Bits* BitSetRef; for (i = 0; i < (int )BITSET_SIZE; i++) { (bs)[i] = 0; } \ } while (0) -#define BS_ROOM(bs,pos) (bs)[pos / BITS_IN_ROOM] -#define BS_BIT(pos) (1 << (pos % BITS_IN_ROOM)) +#define BS_ROOM(bs,pos) (bs)[(unsigned int )(pos) >> 5] +#define BS_BIT(pos) (1u << ((unsigned int )(pos) & 0x1f)) #define BITSET_AT(bs, pos) (BS_ROOM(bs,pos) & BS_BIT(pos)) #define BITSET_SET_BIT(bs, pos) BS_ROOM(bs,pos) |= BS_BIT(pos) @@ -389,11 +356,13 @@ typedef struct _BBuf { #define BB_INIT(buf,size) bbuf_init((BBuf* )(buf), (size)) +/* #define BB_SIZE_INC(buf,inc) do{\ (buf)->alloc += (inc);\ (buf)->p = (UChar* )xrealloc((buf)->p, (buf)->alloc);\ if (IS_NULL((buf)->p)) return(ONIGERR_MEMORY);\ } while (0) +*/ #define BB_EXPAND(buf,low) 
do{\ do { (buf)->alloc *= 2; } while ((buf)->alloc < (unsigned int )low);\ @@ -491,39 +460,34 @@ typedef struct _BBuf { /* operation code */ enum OpCode { - OP_FINISH = 0, /* matching process terminator (no more alternative) */ - OP_END = 1, /* pattern code terminator (success end) */ - - OP_EXACT1 = 2, /* single byte, N = 1 */ - OP_EXACT2, /* single byte, N = 2 */ - OP_EXACT3, /* single byte, N = 3 */ - OP_EXACT4, /* single byte, N = 4 */ - OP_EXACT5, /* single byte, N = 5 */ - OP_EXACTN, /* single byte */ - OP_EXACTMB2N1, /* mb-length = 2 N = 1 */ - OP_EXACTMB2N2, /* mb-length = 2 N = 2 */ - OP_EXACTMB2N3, /* mb-length = 2 N = 3 */ - OP_EXACTMB2N, /* mb-length = 2 */ - OP_EXACTMB3N, /* mb-length = 3 */ - OP_EXACTMBN, /* other length */ - - OP_EXACT1_IC, /* single byte, N = 1, ignore case */ - OP_EXACTN_IC, /* single byte, ignore case */ - + OP_FINISH = 0, /* matching process terminator (no more alternative) */ + OP_END = 1, /* pattern code terminator (success end) */ + OP_STR_1 = 2, /* single byte, N = 1 */ + OP_STR_2, /* single byte, N = 2 */ + OP_STR_3, /* single byte, N = 3 */ + OP_STR_4, /* single byte, N = 4 */ + OP_STR_5, /* single byte, N = 5 */ + OP_STR_N, /* single byte */ + OP_STR_MB2N1, /* mb-length = 2 N = 1 */ + OP_STR_MB2N2, /* mb-length = 2 N = 2 */ + OP_STR_MB2N3, /* mb-length = 2 N = 3 */ + OP_STR_MB2N, /* mb-length = 2 */ + OP_STR_MB3N, /* mb-length = 3 */ + OP_STR_MBN, /* other length */ + OP_STR_1_IC, /* single byte, N = 1, ignore case */ + OP_STR_N_IC, /* single byte, ignore case */ OP_CCLASS, OP_CCLASS_MB, OP_CCLASS_MIX, OP_CCLASS_NOT, OP_CCLASS_MB_NOT, OP_CCLASS_MIX_NOT, - OP_ANYCHAR, /* "." */ OP_ANYCHAR_ML, /* "." 
multi-line */ OP_ANYCHAR_STAR, /* ".*" */ OP_ANYCHAR_ML_STAR, /* ".*" multi-line */ OP_ANYCHAR_STAR_PEEK_NEXT, OP_ANYCHAR_ML_STAR_PEEK_NEXT, - OP_WORD, OP_WORD_ASCII, OP_NO_WORD, @@ -532,16 +496,13 @@ enum OpCode { OP_NO_WORD_BOUNDARY, OP_WORD_BEGIN, OP_WORD_END, - OP_TEXT_SEGMENT_BOUNDARY, - OP_BEGIN_BUF, OP_END_BUF, OP_BEGIN_LINE, OP_END_LINE, OP_SEMI_END_BUF, OP_BEGIN_POSITION, - OP_BACKREF1, OP_BACKREF2, OP_BACKREF_N, @@ -552,34 +513,35 @@ enum OpCode { OP_BACKREF_WITH_LEVEL_IC, /* \k<xxx+n>, \k<xxx-n> */ OP_BACKREF_CHECK, /* (?(n)), (?('name')) */ OP_BACKREF_CHECK_WITH_LEVEL, /* (?(n-level)), (?('name-level')) */ - - OP_MEMORY_START, - OP_MEMORY_START_PUSH, /* push back-tracker to stack */ - OP_MEMORY_END_PUSH, /* push back-tracker to stack */ - OP_MEMORY_END_PUSH_REC, /* push back-tracker to stack */ - OP_MEMORY_END, - OP_MEMORY_END_REC, /* push marker to stack */ - + OP_MEM_START, + OP_MEM_START_PUSH, /* push back-tracker to stack */ + OP_MEM_END_PUSH, /* push back-tracker to stack */ +#ifdef USE_CALL + OP_MEM_END_PUSH_REC, /* push back-tracker to stack */ +#endif + OP_MEM_END, +#ifdef USE_CALL + OP_MEM_END_REC, /* push marker to stack */ +#endif OP_FAIL, /* pop stack and move */ OP_JUMP, OP_PUSH, OP_PUSH_SUPER, OP_POP_OUT, #ifdef USE_OP_PUSH_OR_JUMP_EXACT - OP_PUSH_OR_JUMP_EXACT1, /* if match exact then push, else jump. */ + OP_PUSH_OR_JUMP_EXACT1, /* if match exact then push, else jump. */ #endif - OP_PUSH_IF_PEEK_NEXT, /* if match exact then push, else none. */ - OP_REPEAT, /* {n,m} */ - OP_REPEAT_NG, /* {n,m}? (non greedy) */ + OP_PUSH_IF_PEEK_NEXT, /* if match exact then push, else none. */ + OP_REPEAT, /* {n,m} */ + OP_REPEAT_NG, /* {n,m}? 
(non greedy) */ OP_REPEAT_INC, - OP_REPEAT_INC_NG, /* non greedy */ - OP_REPEAT_INC_SG, /* search and get in stack */ - OP_REPEAT_INC_NG_SG, /* search and get in stack (non greedy) */ + OP_REPEAT_INC_NG, /* non greedy */ OP_EMPTY_CHECK_START, /* null loop checker start */ OP_EMPTY_CHECK_END, /* null loop checker end */ OP_EMPTY_CHECK_END_MEMST, /* null loop checker end (with capture status) */ +#ifdef USE_CALL OP_EMPTY_CHECK_END_MEMST_PUSH, /* with capture status and push check-end */ - +#endif OP_PREC_READ_START, /* (?=...) start */ OP_PREC_READ_END, /* (?=...) end */ OP_PREC_READ_NOT_START, /* (?!...) start */ @@ -589,11 +551,12 @@ enum OpCode { OP_LOOK_BEHIND, /* (?<=...) start (no needs end opcode) */ OP_LOOK_BEHIND_NOT_START, /* (?<!...) start */ OP_LOOK_BEHIND_NOT_END, /* (?<!...) end */ - - OP_CALL, /* \g<name> */ - OP_RETURN, OP_PUSH_SAVE_VAL, OP_UPDATE_VAR, +#ifdef USE_CALL + OP_CALL, /* \g<name> */ + OP_RETURN, +#endif #ifdef USE_CALLOUT OP_CALLOUT_CONTENTS, /* (?{...}) (?{{...}}) */ OP_CALLOUT_NAME, /* (*name) (*name[tag](args...)) */ @@ -601,8 +564,8 @@ enum OpCode { }; enum SaveType { - SAVE_KEEP = 0, /* SAVE S */ - SAVE_S = 1, + SAVE_KEEP = 0, /* SAVE S */ + SAVE_S = 1, SAVE_RIGHT_RANGE = 2, }; @@ -642,116 +605,57 @@ typedef int ModeType; #define SIZE_UPDATE_VAR_TYPE sizeof(UpdateVarType) #define SIZE_MODE sizeof(ModeType) -#define GET_RELADDR_INC(addr,p) PLATFORM_GET_INC(addr, p, RelAddrType) -#define GET_ABSADDR_INC(addr,p) PLATFORM_GET_INC(addr, p, AbsAddrType) -#define GET_LENGTH_INC(len,p) PLATFORM_GET_INC(len, p, LengthType) -#define GET_MEMNUM_INC(num,p) PLATFORM_GET_INC(num, p, MemNumType) -#define GET_REPEATNUM_INC(num,p) PLATFORM_GET_INC(num, p, RepeatNumType) -#define GET_OPTION_INC(option,p) PLATFORM_GET_INC(option, p, OnigOptionType) -#define GET_POINTER_INC(ptr,p) PLATFORM_GET_INC(ptr, p, PointerType) -#define GET_SAVE_TYPE_INC(type,p) PLATFORM_GET_INC(type, p, SaveType) -#define GET_UPDATE_VAR_TYPE_INC(type,p) PLATFORM_GET_INC(type, p, 
UpdateVarType) -#define GET_MODE_INC(mode,p) PLATFORM_GET_INC(mode, p, ModeType) - /* code point's address must be aligned address. */ #define GET_CODE_POINT(code,p) code = *((OnigCodePoint* )(p)) -#define GET_BYTE_INC(byte,p) do{\ - byte = *(p);\ - (p)++;\ -} while(0) /* op-code + arg size */ -#if 0 -#define SIZE_OP_ANYCHAR_STAR SIZE_OPCODE -#define SIZE_OP_ANYCHAR_STAR_PEEK_NEXT (SIZE_OPCODE + 1) -#define SIZE_OP_JUMP (SIZE_OPCODE + SIZE_RELADDR) -#define SIZE_OP_PUSH (SIZE_OPCODE + SIZE_RELADDR) -#define SIZE_OP_PUSH_SUPER (SIZE_OPCODE + SIZE_RELADDR) -#define SIZE_OP_POP_OUT SIZE_OPCODE -#ifdef USE_OP_PUSH_OR_JUMP_EXACT -#define SIZE_OP_PUSH_OR_JUMP_EXACT1 (SIZE_OPCODE + SIZE_RELADDR + 1) -#endif -#define SIZE_OP_PUSH_IF_PEEK_NEXT (SIZE_OPCODE + SIZE_RELADDR + 1) -#define SIZE_OP_REPEAT_INC (SIZE_OPCODE + SIZE_MEMNUM) -#define SIZE_OP_REPEAT_INC_NG (SIZE_OPCODE + SIZE_MEMNUM) -#define SIZE_OP_WORD_BOUNDARY (SIZE_OPCODE + SIZE_MODE) -#define SIZE_OP_PREC_READ_START SIZE_OPCODE -#define SIZE_OP_PREC_READ_NOT_START (SIZE_OPCODE + SIZE_RELADDR) -#define SIZE_OP_PREC_READ_END SIZE_OPCODE -#define SIZE_OP_PREC_READ_NOT_END SIZE_OPCODE -#define SIZE_OP_FAIL SIZE_OPCODE -#define SIZE_OP_MEMORY_START (SIZE_OPCODE + SIZE_MEMNUM) -#define SIZE_OP_MEMORY_START_PUSH (SIZE_OPCODE + SIZE_MEMNUM) -#define SIZE_OP_MEMORY_END_PUSH (SIZE_OPCODE + SIZE_MEMNUM) -#define SIZE_OP_MEMORY_END_PUSH_REC (SIZE_OPCODE + SIZE_MEMNUM) -#define SIZE_OP_MEMORY_END (SIZE_OPCODE + SIZE_MEMNUM) -#define SIZE_OP_MEMORY_END_REC (SIZE_OPCODE + SIZE_MEMNUM) -#define SIZE_OP_ATOMIC_START SIZE_OPCODE -#define SIZE_OP_ATOMIC_END SIZE_OPCODE -#define SIZE_OP_EMPTY_CHECK_START (SIZE_OPCODE + SIZE_MEMNUM) -#define SIZE_OP_EMPTY_CHECK_END (SIZE_OPCODE + SIZE_MEMNUM) -#define SIZE_OP_LOOK_BEHIND (SIZE_OPCODE + SIZE_LENGTH) -#define SIZE_OP_LOOK_BEHIND_NOT_START (SIZE_OPCODE + SIZE_RELADDR + SIZE_LENGTH) -#define SIZE_OP_LOOK_BEHIND_NOT_END SIZE_OPCODE -#define SIZE_OP_CALL (SIZE_OPCODE + SIZE_ABSADDR) 
-#define SIZE_OP_RETURN SIZE_OPCODE -#define SIZE_OP_PUSH_SAVE_VAL (SIZE_OPCODE + SIZE_SAVE_TYPE + SIZE_MEMNUM) -#define SIZE_OP_UPDATE_VAR (SIZE_OPCODE + SIZE_UPDATE_VAR_TYPE + SIZE_MEMNUM) - -#ifdef USE_CALLOUT -#define SIZE_OP_CALLOUT_CONTENTS (SIZE_OPCODE + SIZE_MEMNUM) -#define SIZE_OP_CALLOUT_NAME (SIZE_OPCODE + SIZE_MEMNUM + SIZE_MEMNUM) -#endif - -#else /* if 0 */ /* for relative address increment to go next op. */ -#define SIZE_INC_OP 1 - -#define SIZE_OP_ANYCHAR_STAR 1 -#define SIZE_OP_ANYCHAR_STAR_PEEK_NEXT 1 -#define SIZE_OP_JUMP 1 -#define SIZE_OP_PUSH 1 -#define SIZE_OP_PUSH_SUPER 1 -#define SIZE_OP_POP_OUT 1 +#define SIZE_INC 1 + +#define OPSIZE_ANYCHAR_STAR 1 +#define OPSIZE_ANYCHAR_STAR_PEEK_NEXT 1 +#define OPSIZE_JUMP 1 +#define OPSIZE_PUSH 1 +#define OPSIZE_PUSH_SUPER 1 +#define OPSIZE_POP_OUT 1 #ifdef USE_OP_PUSH_OR_JUMP_EXACT -#define SIZE_OP_PUSH_OR_JUMP_EXACT1 1 -#endif -#define SIZE_OP_PUSH_IF_PEEK_NEXT 1 -#define SIZE_OP_REPEAT 1 -#define SIZE_OP_REPEAT_INC 1 -#define SIZE_OP_REPEAT_INC_NG 1 -#define SIZE_OP_WORD_BOUNDARY 1 -#define SIZE_OP_PREC_READ_START 1 -#define SIZE_OP_PREC_READ_NOT_START 1 -#define SIZE_OP_PREC_READ_END 1 -#define SIZE_OP_PREC_READ_NOT_END 1 -#define SIZE_OP_BACKREF 1 -#define SIZE_OP_FAIL 1 -#define SIZE_OP_MEMORY_START 1 -#define SIZE_OP_MEMORY_START_PUSH 1 -#define SIZE_OP_MEMORY_END_PUSH 1 -#define SIZE_OP_MEMORY_END_PUSH_REC 1 -#define SIZE_OP_MEMORY_END 1 -#define SIZE_OP_MEMORY_END_REC 1 -#define SIZE_OP_ATOMIC_START 1 -#define SIZE_OP_ATOMIC_END 1 -#define SIZE_OP_EMPTY_CHECK_START 1 -#define SIZE_OP_EMPTY_CHECK_END 1 -#define SIZE_OP_LOOK_BEHIND 1 -#define SIZE_OP_LOOK_BEHIND_NOT_START 1 -#define SIZE_OP_LOOK_BEHIND_NOT_END 1 -#define SIZE_OP_CALL 1 -#define SIZE_OP_RETURN 1 -#define SIZE_OP_PUSH_SAVE_VAL 1 -#define SIZE_OP_UPDATE_VAR 1 +#define OPSIZE_PUSH_OR_JUMP_EXACT1 1 +#endif +#define OPSIZE_PUSH_IF_PEEK_NEXT 1 +#define OPSIZE_REPEAT 1 +#define OPSIZE_REPEAT_INC 1 +#define OPSIZE_REPEAT_INC_NG 1 
+#define OPSIZE_WORD_BOUNDARY 1 +#define OPSIZE_PREC_READ_START 1 +#define OPSIZE_PREC_READ_NOT_START 1 +#define OPSIZE_PREC_READ_END 1 +#define OPSIZE_PREC_READ_NOT_END 1 +#define OPSIZE_BACKREF 1 +#define OPSIZE_FAIL 1 +#define OPSIZE_MEM_START 1 +#define OPSIZE_MEM_START_PUSH 1 +#define OPSIZE_MEM_END_PUSH 1 +#define OPSIZE_MEM_END_PUSH_REC 1 +#define OPSIZE_MEM_END 1 +#define OPSIZE_MEM_END_REC 1 +#define OPSIZE_ATOMIC_START 1 +#define OPSIZE_ATOMIC_END 1 +#define OPSIZE_EMPTY_CHECK_START 1 +#define OPSIZE_EMPTY_CHECK_END 1 +#define OPSIZE_LOOK_BEHIND 1 +#define OPSIZE_LOOK_BEHIND_NOT_START 1 +#define OPSIZE_LOOK_BEHIND_NOT_END 1 +#define OPSIZE_CALL 1 +#define OPSIZE_RETURN 1 +#define OPSIZE_PUSH_SAVE_VAL 1 +#define OPSIZE_UPDATE_VAR 1 #ifdef USE_CALLOUT -#define SIZE_OP_CALLOUT_CONTENTS 1 -#define SIZE_OP_CALLOUT_NAME 1 +#define OPSIZE_CALLOUT_CONTENTS 1 +#define OPSIZE_CALLOUT_NAME 1 #endif -#endif /* if 0 */ #define MC_ESC(syn) (syn)->meta_char_table.esc @@ -882,7 +786,7 @@ typedef struct { } repeat; /* REPEAT, REPEAT_NG */ struct { MemNumType id; - } repeat_inc; /* REPEAT_INC, REPEAT_INC_SG, REPEAT_INC_NG, REPEAT_INC_NG_SG */ + } repeat_inc; /* REPEAT_INC, REPEAT_INC_NG */ struct { MemNumType mem; } empty_check_start; @@ -933,48 +837,58 @@ typedef struct { #endif } RegexExt; +typedef struct { + int lower; + int upper; + union { + Operation* pcode; /* address of repeated body */ + int offset; + } u; +} RepeatRange; + struct re_pattern_buffer { /* common members of BBuf(bytes-buffer) */ Operation* ops; #ifdef USE_DIRECT_THREADED_CODE enum OpCode* ocs; #endif - Operation* ops_curr; - unsigned int ops_used; /* used space for ops */ - unsigned int ops_alloc; /* allocated space for ops */ + Operation* ops_curr; + unsigned int ops_used; /* used space for ops */ + unsigned int ops_alloc; /* allocated space for ops */ unsigned char* string_pool; unsigned char* string_pool_end; - int num_mem; /* used memory(...) 
num counted from 1 */ - int num_repeat; /* OP_REPEAT/OP_REPEAT_NG id-counter */ - int num_null_check; /* OP_EMPTY_CHECK_START/END id counter */ - int num_call; /* number of subexp call */ - unsigned int capture_history; /* (?@...) flag (1-31) */ - unsigned int bt_mem_start; /* need backtrack flag */ - unsigned int bt_mem_end; /* need backtrack flag */ - int stack_pop_level; - int repeat_range_alloc; - OnigRepeatRange* repeat_range; - - OnigEncoding enc; - OnigOptionType options; - OnigSyntaxType* syntax; - OnigCaseFoldType case_fold_flag; - void* name_table; + int num_mem; /* used memory(...) num counted from 1 */ + int num_repeat; /* OP_REPEAT/OP_REPEAT_NG id-counter */ + int num_empty_check; /* OP_EMPTY_CHECK_START/END id counter */ + int num_call; /* number of subexp call */ + MemStatusType capture_history; /* (?@...) flag (1-31) */ + MemStatusType push_mem_start; /* need backtrack flag */ + MemStatusType push_mem_end; /* need backtrack flag */ + MemStatusType empty_status_mem; + int stack_pop_level; + int repeat_range_alloc; + RepeatRange* repeat_range; + + OnigEncoding enc; + OnigOptionType options; + OnigSyntaxType* syntax; + OnigCaseFoldType case_fold_flag; + void* name_table; /* optimization info (string search, char-map and anchors) */ int optimize; /* optimize flag */ int threshold_len; /* search str-length for apply optimize */ int anchor; /* BEGIN_BUF, BEGIN_POS, (SEMI_)END_BUF */ - OnigLen anchor_dmin; /* (SEMI_)END_BUF anchor distance */ - OnigLen anchor_dmax; /* (SEMI_)END_BUF anchor distance */ + OnigLen anc_dist_min; /* (SEMI_)END_BUF anchor distance */ + OnigLen anc_dist_max; /* (SEMI_)END_BUF anchor distance */ int sub_anchor; /* start-anchor for exact or map */ unsigned char *exact; unsigned char *exact_end; unsigned char map[CHAR_MAP_SIZE]; /* used as BMH skip or char-map */ int map_offset; - OnigLen dmin; /* min-distance of exact or map */ - OnigLen dmax; /* max-distance of exact or map */ + OnigLen dist_min; /* min-distance of exact or map */ 
+ OnigLen dist_max; /* max-distance of exact or map */ RegexExt* extp; }; diff --git a/src/regparse.c b/src/regparse.c index f1deea3..fed53f7 100644 --- a/src/regparse.c +++ b/src/regparse.c @@ -2,7 +2,7 @@ regparse.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -77,6 +77,7 @@ OnigSyntaxType OnigSyntaxOniguruma = { ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP | ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME | ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY | + ONIG_SYN_ALLOW_INVALID_CODE_END_OF_RANGE_IN_CC | ONIG_SYN_WARN_CC_OP_NOT_ESCAPED | ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT ) , ONIG_OPTION_NONE @@ -198,6 +199,24 @@ onig_set_parse_depth_limit(unsigned int depth) return 0; } +#ifdef ONIG_DEBUG_PARSE +#define INC_PARSE_DEPTH(d) do {\ + (d)++;\ + if (env->max_parse_depth < (d)) env->max_parse_depth = d;\ + if ((d) > ParseDepthLimit) \ + return ONIGERR_PARSE_DEPTH_LIMIT_OVER;\ +} while (0) +#else +#define INC_PARSE_DEPTH(d) do {\ + (d)++;\ + if ((d) > ParseDepthLimit) \ + return ONIGERR_PARSE_DEPTH_LIMIT_OVER;\ +} while (0) +#endif + +#define DEC_PARSE_DEPTH(d) (d)-- + + static int bbuf_init(BBuf* buf, int size) { @@ -243,7 +262,8 @@ bbuf_clone(BBuf** rto, BBuf* from) return 0; } -static int backref_rel_to_abs(int rel_no, ScanEnv* env) +static int +backref_rel_to_abs(int rel_no, ScanEnv* env) { if (rel_no > 0) { return env->num_mem + rel_no; @@ -291,15 +311,6 @@ bitset_set_range(BitSetRef bs, int from, int to) } } -#if 0 -static void -bitset_set_all(BitSetRef bs) -{ - int i; - for (i = 0; i < BITSET_SIZE; i++) { bs[i] = ~((Bits )0); } -} -#endif - static void bitset_invert(BitSetRef bs) { @@ -362,24 +373,6 @@ save_entry(ScanEnv* env, enum SaveType type, int* id) { int nid = env->save_num; -#if 0 - if 
(IS_NULL(env->saves)) { - int n = 10; - env->saves = (SaveItem* )xmalloc(sizeof(SaveItem) * n); - CHECK_NULL_RETURN_MEMERR(env->saves); - env->save_alloc_num = n; - } - else if (env->save_alloc_num <= nid) { - int n = env->save_alloc_num * 2; - SaveItem* p = (SaveItem* )xrealloc(env->saves, sizeof(SaveItem) * n); - CHECK_NULL_RETURN_MEMERR(p); - env->saves = p; - env->save_alloc_num = n; - } - - env->saves[nid].type = type; -#endif - env->save_num++; *id = nid; return 0; @@ -475,14 +468,14 @@ static int str_end_hash(st_str_end_key* x) { UChar *p; - int val = 0; + unsigned val = 0; p = x->s; while (p < x->end) { - val = val * 997 + (int )*p++; + val = val * 997 + (unsigned )*p++; } - return val + (val >> 5); + return (int) (val + (val >> 5)); } extern hash_table_type* @@ -565,15 +558,15 @@ static int callout_name_table_hash(st_callout_name_key* x) { UChar *p; - int val = 0; + unsigned int val = 0; p = x->s; while (p < x->end) { - val = val * 997 + (int )*p++; + val = val * 997 + (unsigned int )*p++; } /* use intptr_t for escape warning in Windows */ - return val + (val >> 5) + ((intptr_t )x->enc & 0xffff) + x->type; + return (int )(val + (val >> 5) + ((intptr_t )x->enc & 0xffff) + x->type); } extern hash_table_type* @@ -1093,6 +1086,35 @@ onig_name_to_group_numbers(regex_t* reg, const UChar* name, return e->back_num; } +static int +name_to_group_numbers(ScanEnv* env, const UChar* name, const UChar* name_end, + int** nums) +{ + regex_t* reg; + NameEntry* e; + + reg = env->reg; + e = name_find(reg, name, name_end); + + if (IS_NULL(e)) { + onig_scan_env_set_error_string(env, ONIGERR_UNDEFINED_NAME_REFERENCE, + (UChar* )name, (UChar* )name_end); + return ONIGERR_UNDEFINED_NAME_REFERENCE; + } + + switch (e->back_num) { + case 0: + break; + case 1: + *nums = &(e->back_ref1); + break; + default: + *nums = e->back_refs; + break; + } + return e->back_num; +} + extern int onig_name_to_backref_number(regex_t* reg, const UChar* name, const UChar* name_end, OnigRegion *region) 
@@ -1869,8 +1891,8 @@ callout_tag_table_new(CalloutTagTable** rt) } static int -callout_tag_entry_raw(CalloutTagTable* t, UChar* name, UChar* name_end, - CalloutTagVal entry_val) +callout_tag_entry_raw(ScanEnv* env, CalloutTagTable* t, UChar* name, + UChar* name_end, CalloutTagVal entry_val) { int r; CalloutTagVal val; @@ -1879,8 +1901,11 @@ callout_tag_entry_raw(CalloutTagTable* t, UChar* name, UChar* name_end, return ONIGERR_INVALID_CALLOUT_TAG_NAME; val = callout_tag_find(t, name, name_end); - if (val >= 0) + if (val >= 0) { + onig_scan_env_set_error_string(env, ONIGERR_MULTIPLEX_DEFINED_NAME, + name, name_end); return ONIGERR_MULTIPLEX_DEFINED_NAME; + } r = onig_st_insert_strend(t, name, name_end, (HashDataType )entry_val); if (r < 0) return r; @@ -1909,7 +1934,7 @@ ext_ensure_tag_table(regex_t* reg) } static int -callout_tag_entry(regex_t* reg, UChar* name, UChar* name_end, +callout_tag_entry(ScanEnv* env, regex_t* reg, UChar* name, UChar* name_end, CalloutTagVal entry_val) { int r; @@ -1921,7 +1946,7 @@ callout_tag_entry(regex_t* reg, UChar* name, UChar* name_end, ext = onig_get_regex_ext(reg); CHECK_NULL_RETURN_MEMERR(ext); - r = callout_tag_entry_raw(ext->tag_table, name, name_end, entry_val); + r = callout_tag_entry_raw(env, ext->tag_table, name, name_end, entry_val); e = onig_reg_callout_list_at(reg, (int )entry_val); CHECK_NULL_RETURN_MEMERR(e); @@ -1939,9 +1964,8 @@ callout_tag_entry(regex_t* reg, UChar* name, UChar* name_end, static void scan_env_clear(ScanEnv* env) { - MEM_STATUS_CLEAR(env->capture_history); - MEM_STATUS_CLEAR(env->bt_mem_start); - MEM_STATUS_CLEAR(env->bt_mem_end); + MEM_STATUS_CLEAR(env->cap_history); + MEM_STATUS_CLEAR(env->backtrack_mem); MEM_STATUS_CLEAR(env->backrefed_mem); env->error = (UChar* )NULL; env->error_end = (UChar* )NULL; @@ -1960,6 +1984,10 @@ scan_env_clear(ScanEnv* env) xmemset(env->mem_env_static, 0, sizeof(env->mem_env_static)); env->parse_depth = 0; +#ifdef ONIG_DEBUG_PARSE + env->max_parse_depth = 0; +#endif + 
env->backref_num = 0; env->keep_num = 0; env->save_num = 0; env->save_alloc_num = 0; @@ -1991,11 +2019,8 @@ scan_env_add_mem_entry(ScanEnv* env) } for (i = env->num_mem + 1; i < alloc; i++) { - p[i].node = NULL_NODE; -#if 0 - p[i].in = 0; - p[i].recursion = 0; -#endif + p[i].mem_node = NULL_NODE; + p[i].empty_repeat_node = NULL_NODE; } env->mem_env_dynamic = p; @@ -2011,7 +2036,7 @@ static int scan_env_set_mem_node(ScanEnv* env, int num, Node* node) { if (env->num_mem >= num) - SCANENV_MEMENV(env)[num].node = node; + SCANENV_MEMENV(env)[num].mem_node = node; else return ONIGERR_PARSER_BUG; return 0; @@ -2149,7 +2174,7 @@ node_new_ctype(int type, int not, OnigOptionType options) static Node* node_new_anychar(void) { - Node* node = node_new_ctype(CTYPE_ANYCHAR, 0, ONIG_OPTION_NONE); + Node* node = node_new_ctype(CTYPE_ANYCHAR, FALSE, ONIG_OPTION_NONE); return node; } @@ -2209,24 +2234,6 @@ onig_node_new_list(Node* left, Node* right) } extern Node* -onig_node_list_add(Node* list, Node* x) -{ - Node *n; - - n = onig_node_new_list(x, NULL); - if (IS_NULL(n)) return NULL_NODE; - - if (IS_NOT_NULL(list)) { - while (IS_NOT_NULL(NODE_CDR(list))) - list = NODE_CDR(list); - - NODE_CDR(list) = n; - } - - return n; -} - -extern Node* onig_node_new_alt(Node* left, Node* right) { Node* node = node_new(); @@ -2324,7 +2331,7 @@ node_new_backref(int back_num, int* backrefs, int by_name, for (i = 0; i < back_num; i++) { if (backrefs[i] <= env->num_mem && - IS_NULL(SCANENV_MEMENV(env)[backrefs[i]].node)) { + IS_NULL(SCANENV_MEMENV(env)[backrefs[i]].mem_node)) { NODE_STATUS_ADD(node, RECURSION); /* /...(\1).../ */ break; } @@ -2344,6 +2351,8 @@ node_new_backref(int back_num, int* backrefs, int by_name, for (i = 0; i < back_num; i++) p[i] = backrefs[i]; } + + env->backref_num++; return node; } @@ -2391,13 +2400,13 @@ node_new_quantifier(int lower, int upper, int by_number) CHECK_NULL_RETURN(node); NODE_SET_TYPE(node, NODE_QUANT); - QUANT_(node)->lower = lower; - QUANT_(node)->upper = 
upper; - QUANT_(node)->greedy = 1; - QUANT_(node)->empty_info = BODY_IS_NOT_EMPTY; - QUANT_(node)->head_exact = NULL_NODE; - QUANT_(node)->next_head_exact = NULL_NODE; - QUANT_(node)->is_refered = 0; + QUANT_(node)->lower = lower; + QUANT_(node)->upper = upper; + QUANT_(node)->greedy = 1; + QUANT_(node)->emptiness = BODY_IS_NOT_EMPTY; + QUANT_(node)->head_exact = NULL_NODE; + QUANT_(node)->next_head_exact = NULL_NODE; + QUANT_(node)->include_referred = 0; if (by_number != 0) NODE_STATUS_ADD(node, BY_NUMBER); @@ -2683,7 +2692,7 @@ make_text_segment(Node** node, ScanEnv* env) ns[1] = NULL_NODE; r = ONIGERR_MEMORY; - ns[0] = onig_node_new_anchor(ANCR_NO_TEXT_SEGMENT_BOUNDARY, 0); + ns[0] = onig_node_new_anchor(ANCR_NO_TEXT_SEGMENT_BOUNDARY, FALSE); if (IS_NULL(ns[0])) goto err; r = node_new_true_anychar(&ns[1], env); @@ -2694,7 +2703,7 @@ make_text_segment(Node** node, ScanEnv* env) ns[0] = x; ns[1] = NULL_NODE; - x = node_new_quantifier(0, REPEAT_INFINITE, 1); + x = node_new_quantifier(0, INFINITE_REPEAT, TRUE); if (IS_NULL(x)) goto err; NODE_BODY(x) = ns[0]; @@ -2763,7 +2772,7 @@ make_absent_engine(Node** node, int pre_save_right_id, Node* absent, ns[0] = x; - x = node_new_quantifier(lower, upper, 0); + x = node_new_quantifier(lower, upper, FALSE); if (IS_NULL(x)) goto err0; NODE_BODY(x) = ns[0]; @@ -2792,7 +2801,7 @@ make_absent_engine(Node** node, int pre_save_right_id, Node* absent, x = make_alt(2, ns); if (IS_NULL(x)) goto err0; - if (is_range_cutter != 0) + if (is_range_cutter != FALSE) NODE_STATUS_ADD(x, SUPER); *node = x; @@ -2882,7 +2891,10 @@ make_range_clear(Node** node, ScanEnv* env) ns[0] = NULL_NODE; ns[1] = x; - r = node_new_update_var_gimmick(&ns[0], UPDATE_VAR_RIGHT_RANGE_INIT, 0, env); +#define ID_NOT_USED_DONT_CARE_ME 0 + + r = node_new_update_var_gimmick(&ns[0], UPDATE_VAR_RIGHT_RANGE_INIT, + ID_NOT_USED_DONT_CARE_ME, env); if (r != 0) goto err; x = make_alt(2, ns); @@ -3001,7 +3013,7 @@ make_absent_tree_for_simple_one_char_repeat(Node** node, 
Node* absent, Node* qua id1 = GIMMICK_(ns[0])->id; r = make_absent_engine(&ns[1], id1, absent, body, lower, upper, possessive, - 0, env); + FALSE, env); if (r != 0) goto err; ns[2] = ns[3] = NULL_NODE; @@ -3044,7 +3056,7 @@ make_absent_tree(Node** node, Node* absent, Node* expr, int is_range_cutter, if (expr == NULL_NODE) { /* default expr \O* */ - quant = node_new_quantifier(0, REPEAT_INFINITE, 0); + quant = node_new_quantifier(0, INFINITE_REPEAT, FALSE); if (IS_NULL(quant)) goto err0; r = node_new_true_anychar(&body, env); @@ -3086,7 +3098,7 @@ make_absent_tree(Node** node, Node* absent, Node* expr, int is_range_cutter, if (r != 0) goto err; possessive = 1; - r = make_absent_engine(&ns[2], id1, absent, ns[3], 0, REPEAT_INFINITE, + r = make_absent_engine(&ns[2], id1, absent, ns[3], 0, INFINITE_REPEAT, possessive, is_range_cutter, env); if (r != 0) goto err; @@ -3171,16 +3183,6 @@ node_str_cat_char(Node* node, UChar c) } extern void -onig_node_conv_to_str_node(Node* node, int flag) -{ - NODE_SET_TYPE(node, NODE_STRING); - STR_(node)->flag = flag; - STR_(node)->capacity = 0; - STR_(node)->s = STR_(node)->buf; - STR_(node)->end = STR_(node)->buf; -} - -extern void onig_node_str_clear(Node* node) { if (STR_(node)->capacity != 0 && @@ -3188,10 +3190,11 @@ onig_node_str_clear(Node* node) xfree(STR_(node)->s); } - STR_(node)->capacity = 0; STR_(node)->flag = 0; STR_(node)->s = STR_(node)->buf; STR_(node)->end = STR_(node)->buf; + STR_(node)->capacity = 0; + STR_(node)->case_min_len = 0; } static Node* @@ -3201,10 +3204,12 @@ node_new_str(const UChar* s, const UChar* end) CHECK_NULL_RETURN(node); NODE_SET_TYPE(node, NODE_STRING); - STR_(node)->capacity = 0; STR_(node)->flag = 0; STR_(node)->s = STR_(node)->buf; STR_(node)->end = STR_(node)->buf; + STR_(node)->capacity = 0; + STR_(node)->case_min_len = 0; + if (onig_node_str_cat(node, s, end)) { onig_node_free(node); return NULL; @@ -3219,11 +3224,11 @@ onig_node_new_str(const UChar* s, const UChar* end) } static Node* 
-node_new_str_raw(UChar* s, UChar* end) +node_new_str_crude(UChar* s, UChar* end) { Node* node = node_new_str(s, end); CHECK_NULL_RETURN(node); - NODE_STRING_SET_RAW(node); + NODE_STRING_SET_CRUDE(node); return node; } @@ -3234,12 +3239,20 @@ node_new_empty(void) } static Node* -node_new_str_raw_char(UChar c) +node_new_str_crude_char(UChar c) { + int i; UChar p[1]; + Node* node; p[0] = c; - return node_new_str_raw(p, p + 1); + node = node_new_str_crude(p, p + 1); + + /* clear buf tail */ + for (i = 1; i < NODE_STRING_BUF_SIZE; i++) + STR_(node)->buf[i] = '\0'; + + return node; } static Node* @@ -3256,8 +3269,8 @@ str_node_split_last_char(Node* node, OnigEncoding enc) if (p && p > sn->s) { /* can be split. */ rn = node_new_str(p, sn->end); CHECK_NULL_RETURN(rn); - if (NODE_STRING_IS_RAW(node)) - NODE_STRING_SET_RAW(rn); + if (NODE_STRING_IS_CRUDE(node)) + NODE_STRING_SET_CRUDE(rn); sn->end = (UChar* )p; } @@ -3275,28 +3288,10 @@ str_node_can_be_split(Node* node, OnigEncoding enc) return 0; } -#ifdef USE_PAD_TO_SHORT_BYTE_CHAR static int -node_str_head_pad(StrNode* sn, int num, UChar val) -{ - UChar buf[NODE_STRING_BUF_SIZE]; - int i, len; - - len = sn->end - sn->s; - onig_strcpy(buf, sn->s, sn->end); - onig_strcpy(&(sn->s[num]), buf, buf + len); - sn->end += num; - - for (i = 0; i < num; i++) { - sn->s[i] = val; - } -} -#endif - -extern int -onig_scan_unsigned_number(UChar** src, const UChar* end, OnigEncoding enc) +scan_number(UChar** src, const UChar* end, OnigEncoding enc) { - unsigned int num, val; + int num, val; OnigCodePoint c; UChar* p = *src; PFETCH_READY; @@ -3305,8 +3300,8 @@ onig_scan_unsigned_number(UChar** src, const UChar* end, OnigEncoding enc) while (! 
PEND) { PFETCH(c); if (IS_CODE_DIGIT_ASCII(enc, c)) { - val = (unsigned int )DIGITVAL(c); - if ((INT_MAX_LIMIT - val) / 10UL < num) + val = (int )DIGITVAL(c); + if ((INT_MAX - val) / 10 < num) return -1; /* overflow */ num = num * 10 + val; @@ -3321,26 +3316,27 @@ onig_scan_unsigned_number(UChar** src, const UChar* end, OnigEncoding enc) } static int -scan_unsigned_hexadecimal_number(UChar** src, UChar* end, int minlen, - int maxlen, OnigEncoding enc) +scan_hexadecimal_number(UChar** src, UChar* end, int minlen, int maxlen, + OnigEncoding enc, OnigCodePoint* rcode) { + OnigCodePoint code; OnigCodePoint c; - unsigned int num, val; + unsigned int val; int n; UChar* p = *src; PFETCH_READY; - num = 0; + code = 0; n = 0; while (! PEND && n < maxlen) { PFETCH(c); if (IS_CODE_XDIGIT_ASCII(enc, c)) { n++; - val = (unsigned int )XDIGITVAL(enc,c); - if ((INT_MAX_LIMIT - val) / 16UL < num) + val = (unsigned int )XDIGITVAL(enc, c); + if ((UINT_MAX - val) / 16UL < code) return ONIGERR_TOO_BIG_NUMBER; /* overflow */ - num = (num << 4) + XDIGITVAL(enc,c); + code = (code << 4) + val; } else { PUNFETCH; @@ -3351,36 +3347,46 @@ scan_unsigned_hexadecimal_number(UChar** src, UChar* end, int minlen, if (n < minlen) return ONIGERR_INVALID_CODE_POINT_VALUE; + *rcode = code; *src = p; - return num; + return ONIG_NORMAL; } static int -scan_unsigned_octal_number(UChar** src, UChar* end, int maxlen, - OnigEncoding enc) +scan_octal_number(UChar** src, UChar* end, int minlen, int maxlen, + OnigEncoding enc, OnigCodePoint* rcode) { + OnigCodePoint code; OnigCodePoint c; - unsigned int num, val; + unsigned int val; + int n; UChar* p = *src; PFETCH_READY; - num = 0; - while (! PEND && maxlen-- != 0) { + code = 0; + n = 0; + while (! 
PEND && n < maxlen) { PFETCH(c); if (IS_CODE_DIGIT_ASCII(enc, c) && c < '8') { - val = ODIGITVAL(c); - if ((INT_MAX_LIMIT - val) / 8UL < num) - return -1; /* overflow */ + n++; + val = (unsigned int )ODIGITVAL(c); + if ((UINT_MAX - val) / 8UL < code) + return ONIGERR_TOO_BIG_NUMBER; /* overflow */ - num = (num << 3) + val; + code = (code << 3) + val; } else { PUNFETCH; break; } } + + if (n < minlen) + return ONIGERR_INVALID_CODE_POINT_VALUE; + + *rcode = code; *src = p; - return num; + return ONIG_NORMAL; } @@ -3877,19 +3883,19 @@ quantifier_type_num(QuantNode* q) if (q->greedy) { if (q->lower == 0) { if (q->upper == 1) return 0; - else if (IS_REPEAT_INFINITE(q->upper)) return 1; + else if (IS_INFINITE_REPEAT(q->upper)) return 1; } else if (q->lower == 1) { - if (IS_REPEAT_INFINITE(q->upper)) return 2; + if (IS_INFINITE_REPEAT(q->upper)) return 2; } } else { if (q->lower == 0) { if (q->upper == 1) return 3; - else if (IS_REPEAT_INFINITE(q->upper)) return 4; + else if (IS_INFINITE_REPEAT(q->upper)) return 4; } else if (q->lower == 1) { - if (IS_REPEAT_INFINITE(q->upper)) return 5; + if (IS_INFINITE_REPEAT(q->upper)) return 5; } } return -1; @@ -3915,68 +3921,70 @@ static enum ReduceType ReduceTypeTable[6][6] = { {RQ_ASIS, RQ_PQ_Q, RQ_DEL, RQ_AQ, RQ_AQ, RQ_DEL} /* '+?' */ }; -extern void -onig_reduce_nested_quantifier(Node* pnode, Node* cnode) +extern int +onig_reduce_nested_quantifier(Node* pnode) { int pnum, cnum; QuantNode *p, *c; + Node* cnode; + + cnode = NODE_BODY(pnode); p = QUANT_(pnode); c = QUANT_(cnode); pnum = quantifier_type_num(p); cnum = quantifier_type_num(c); if (pnum < 0 || cnum < 0) { - if ((p->lower == p->upper) && ! IS_REPEAT_INFINITE(p->upper)) { - if ((c->lower == c->upper) && ! 
IS_REPEAT_INFINITE(c->upper)) { - int n = onig_positive_int_multiply(p->lower, c->lower); - if (n >= 0) { - p->lower = p->upper = n; - NODE_BODY(pnode) = NODE_BODY(cnode); - goto remove_cnode; - } - } + if (p->lower == p->upper && c->lower == c->upper) { + int n = onig_positive_int_multiply(p->lower, c->lower); + if (n < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE; + + p->lower = p->upper = n; + NODE_BODY(pnode) = NODE_BODY(cnode); + goto remove_cnode; } - return ; + return 0; } switch(ReduceTypeTable[cnum][pnum]) { case RQ_DEL: *pnode = *cnode; + goto remove_cnode; break; case RQ_A: NODE_BODY(pnode) = NODE_BODY(cnode); - p->lower = 0; p->upper = REPEAT_INFINITE; p->greedy = 1; + p->lower = 0; p->upper = INFINITE_REPEAT; p->greedy = 1; + goto remove_cnode; break; case RQ_AQ: NODE_BODY(pnode) = NODE_BODY(cnode); - p->lower = 0; p->upper = REPEAT_INFINITE; p->greedy = 0; + p->lower = 0; p->upper = INFINITE_REPEAT; p->greedy = 0; + goto remove_cnode; break; case RQ_QQ: NODE_BODY(pnode) = NODE_BODY(cnode); p->lower = 0; p->upper = 1; p->greedy = 0; + goto remove_cnode; break; case RQ_P_QQ: - NODE_BODY(pnode) = cnode; p->lower = 0; p->upper = 1; p->greedy = 0; - c->lower = 1; c->upper = REPEAT_INFINITE; c->greedy = 1; - return ; + c->lower = 1; c->upper = INFINITE_REPEAT; c->greedy = 1; break; case RQ_PQ_Q: - NODE_BODY(pnode) = cnode; p->lower = 0; p->upper = 1; p->greedy = 1; - c->lower = 1; c->upper = REPEAT_INFINITE; c->greedy = 0; - return ; + c->lower = 1; c->upper = INFINITE_REPEAT; c->greedy = 0; break; case RQ_ASIS: - NODE_BODY(pnode) = cnode; - return ; break; } + return 0; + remove_cnode: NODE_BODY(cnode) = NULL_NODE; onig_node_free(cnode); + return 0; } static int @@ -3995,7 +4003,7 @@ node_new_general_newline(Node** node, ScanEnv* env) alen = ONIGENC_CODE_TO_MBC(env->enc, 0x0a, buf + dlen); if (alen < 0) return alen; - crnl = node_new_str_raw(buf, buf + dlen + alen); + crnl = node_new_str_crude(buf, buf + dlen + alen); CHECK_NULL_RETURN_MEMERR(crnl); 
ncc = node_new_cclass(); @@ -4023,7 +4031,7 @@ node_new_general_newline(Node** node, ScanEnv* env) if (r != 0) goto err1; } - x = node_new_bag_if_else(crnl, 0, ncc); + x = node_new_bag_if_else(crnl, NULL_NODE, ncc); if (IS_NULL(x)) goto err1; *node = x; @@ -4032,7 +4040,7 @@ node_new_general_newline(Node** node, ScanEnv* env) enum TokenSyms { TK_EOT = 0, /* end of token */ - TK_RAW_BYTE = 1, + TK_CRUDE_BYTE = 1, TK_CHAR, TK_STRING, TK_CODE_POINT, @@ -4047,7 +4055,7 @@ enum TokenSyms { TK_ALT, TK_SUBEXP_OPEN, TK_SUBEXP_CLOSE, - TK_CC_OPEN, + TK_OPEN_CC, TK_QUOTE_OPEN, TK_CHAR_PROPERTY, /* \p{...}, \P{...} */ TK_KEEP, /* \K */ @@ -4059,9 +4067,9 @@ enum TokenSyms { /* in cc */ TK_CC_CLOSE, TK_CC_RANGE, - TK_POSIX_BRACKET_OPEN, - TK_CC_AND, /* && */ - TK_CC_CC_OPEN /* [ */ + TK_CC_POSIX_BRACKET_OPEN, + TK_CC_AND, /* && */ + TK_CC_OPEN_CC /* [ */ }; typedef struct { @@ -4071,7 +4079,7 @@ typedef struct { UChar* backp; union { UChar* s; - int c; + UChar byte; OnigCodePoint code; int anchor; int subtype; @@ -4106,7 +4114,7 @@ typedef struct { static int -fetch_interval_quantifier(UChar** src, UChar* end, PToken* tok, ScanEnv* env) +fetch_interval(UChar** src, UChar* end, PToken* tok, ScanEnv* env) { int low, up, syn_allow, non_low = 0; int r = 0; @@ -4131,7 +4139,7 @@ fetch_interval_quantifier(UChar** src, UChar* end, PToken* tok, ScanEnv* env) } } - low = onig_scan_unsigned_number(&p, end, env->enc); + low = scan_number(&p, end, env->enc); if (low < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE; if (low > ONIG_MAX_REPEAT_NUM) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE; @@ -4150,7 +4158,7 @@ fetch_interval_quantifier(UChar** src, UChar* end, PToken* tok, ScanEnv* env) PFETCH(c); if (c == ',') { UChar* prev = p; - up = onig_scan_unsigned_number(&p, end, env->enc); + up = scan_number(&p, end, env->enc); if (up < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE; if (up > ONIG_MAX_REPEAT_NUM) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE; @@ -4158,7 +4166,7 @@ 
fetch_interval_quantifier(UChar** src, UChar* end, PToken* tok, ScanEnv* env) if (p == prev) { if (non_low != 0) goto invalid; - up = REPEAT_INFINITE; /* {n,} : {n,infinite} */ + up = INFINITE_REPEAT; /* {n,} : {n,infinite} */ } } else { @@ -4173,12 +4181,12 @@ fetch_interval_quantifier(UChar** src, UChar* end, PToken* tok, ScanEnv* env) if (PEND) goto invalid; PFETCH(c); if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) { - if (c != MC_ESC(env->syntax)) goto invalid; + if (c != MC_ESC(env->syntax) || PEND) goto invalid; PFETCH(c); } if (c != '}') goto invalid; - if (!IS_REPEAT_INFINITE(up) && low > up) { + if (!IS_INFINITE_REPEAT(up) && low > up) { /* {n,m}+ supported case */ if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL)) return ONIGERR_UPPER_SMALLER_THAN_LOWER_IN_REPEAT_RANGE; @@ -4396,7 +4404,7 @@ fetch_name_with_level(OnigCodePoint start_code, UChar** src, UChar* end, PFETCH(c); if (! IS_CODE_DIGIT_ASCII(enc, c)) goto err; PUNFETCH; - level = onig_scan_unsigned_number(&p, end, enc); + level = scan_number(&p, end, enc); if (level < 0) return ONIGERR_TOO_BIG_NUMBER; *rlevel = (level * flag); exist_level = 1; @@ -4417,7 +4425,7 @@ fetch_name_with_level(OnigCodePoint start_code, UChar** src, UChar* end, end: if (r == 0) { if (*num_type != IS_NOT_NUM) { - *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc); + *rback_num = scan_number(&pnum_head, name_end, enc); if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER; else if (*rback_num == 0) { if (*num_type == IS_REL_NUM) @@ -4445,7 +4453,7 @@ fetch_name_with_level(OnigCodePoint start_code, UChar** src, UChar* end, static int fetch_name(OnigCodePoint start_code, UChar** src, UChar* end, UChar** rname_end, ScanEnv* env, int* rback_num, - enum REF_NUM* num_type, int ref) + enum REF_NUM* num_type, int is_ref) { int r, sign; int digit_count; @@ -4475,7 +4483,7 @@ fetch_name(OnigCodePoint start_code, UChar** src, UChar* end, return ONIGERR_EMPTY_GROUP_NAME; if 
(IS_CODE_DIGIT_ASCII(enc, c)) { - if (ref == 1) + if (is_ref == TRUE) *num_type = IS_ABS_NUM; else { r = ONIGERR_INVALID_GROUP_NAME; @@ -4483,7 +4491,7 @@ fetch_name(OnigCodePoint start_code, UChar** src, UChar* end, digit_count++; } else if (c == '-') { - if (ref == 1) { + if (is_ref == TRUE) { *num_type = IS_REL_NUM; sign = -1; pnum_head = p; @@ -4493,7 +4501,7 @@ fetch_name(OnigCodePoint start_code, UChar** src, UChar* end, } } else if (c == '+') { - if (ref == 1) { + if (is_ref == TRUE) { *num_type = IS_REL_NUM; sign = 1; pnum_head = p; @@ -4543,7 +4551,7 @@ fetch_name(OnigCodePoint start_code, UChar** src, UChar* end, } if (*num_type != IS_NOT_NUM) { - *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc); + *rback_num = scan_number(&pnum_head, name_end, enc); if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER; else if (*rback_num == 0) { if (*num_type == IS_REL_NUM) { @@ -4675,7 +4683,8 @@ str_exist_check_with_esc(OnigCodePoint s[], int n, UChar* from, UChar* to, static int fetch_token_in_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env) { - int num; + int r; + OnigCodePoint code; OnigCodePoint c, c2; OnigSyntaxType* syn = env->syntax; OnigEncoding enc = env->enc; @@ -4691,7 +4700,7 @@ fetch_token_in_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env) PFETCH(c); tok->type = TK_CHAR; tok->base = 0; - tok->u.c = c; + tok->u.code = c; tok->escaped = 0; if (c == ']') { @@ -4708,7 +4717,7 @@ fetch_token_in_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env) PFETCH(c); tok->escaped = 1; - tok->u.c = c; + tok->u.code = c; switch (c) { case 'w': tok->type = TK_CHAR_TYPE; @@ -4781,8 +4790,8 @@ fetch_token_in_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env) prev = p; if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_O_BRACE_OCTAL)) { PINC; - num = scan_unsigned_octal_number(&p, end, 11, enc); - if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE; + r = scan_octal_number(&p, end, 0, 11, enc, &code); + if (r < 0) return r; if (!PEND) { c2 
= PPEEK; if (IS_CODE_DIGIT_ASCII(enc, c2)) @@ -4793,7 +4802,7 @@ fetch_token_in_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env) PINC; tok->type = TK_CODE_POINT; tok->base = 8; - tok->u.code = (OnigCodePoint )num; + tok->u.code = code; } else { /* can't read nothing or invalid format */ @@ -4808,13 +4817,8 @@ fetch_token_in_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env) prev = p; if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) { PINC; - num = scan_unsigned_hexadecimal_number(&p, end, 0, 8, enc); - if (num < 0) { - if (num == ONIGERR_TOO_BIG_NUMBER) - return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE; - else - return num; - } + r = scan_hexadecimal_number(&p, end, 0, 8, enc, &code); + if (r < 0) return r; if (!PEND) { c2 = PPEEK; if (IS_CODE_XDIGIT_ASCII(enc, c2)) @@ -4825,7 +4829,7 @@ fetch_token_in_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env) PINC; tok->type = TK_CODE_POINT; tok->base = 16; - tok->u.code = (OnigCodePoint )num; + tok->u.code = code; } else { /* can't read nothing or invalid format */ @@ -4833,14 +4837,14 @@ fetch_token_in_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env) } } else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) { - num = scan_unsigned_hexadecimal_number(&p, end, 0, 2, enc); - if (num < 0) return num; + r = scan_hexadecimal_number(&p, end, 0, 2, enc, &code); + if (r < 0) return r; if (p == prev) { /* can't read nothing. */ - num = 0; /* but, it's not error */ + code = 0; /* but, it's not error */ } - tok->type = TK_RAW_BYTE; + tok->type = TK_CRUDE_BYTE; tok->base = 16; - tok->u.c = num; + tok->u.byte = (UChar )code; } break; @@ -4849,14 +4853,14 @@ fetch_token_in_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env) prev = p; if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) { - num = scan_unsigned_hexadecimal_number(&p, end, 4, 4, enc); - if (num < 0) return num; + r = scan_hexadecimal_number(&p, end, 4, 4, enc, &code); + if (r < 0) return r; if (p == prev) { /* can't read nothing. 
*/ - num = 0; /* but, it's not error */ + code = 0; /* but, it's not error */ } tok->type = TK_CODE_POINT; tok->base = 16; - tok->u.code = (OnigCodePoint )num; + tok->u.code = code; } break; @@ -4865,22 +4869,23 @@ fetch_token_in_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env) if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) { PUNFETCH; prev = p; - num = scan_unsigned_octal_number(&p, end, 3, enc); - if (num < 0 || num >= 256) return ONIGERR_TOO_BIG_NUMBER; + r = scan_octal_number(&p, end, 0, 3, enc, &code); + if (r < 0) return r; + if (code >= 256) return ONIGERR_TOO_BIG_NUMBER; if (p == prev) { /* can't read nothing. */ - num = 0; /* but, it's not error */ + code = 0; /* but, it's not error */ } - tok->type = TK_RAW_BYTE; + tok->type = TK_CRUDE_BYTE; tok->base = 8; - tok->u.c = num; + tok->u.byte = (UChar )code; } break; default: PUNFETCH; - num = fetch_escaped_value(&p, end, env, &c2); - if (num < 0) return num; - if (tok->u.c != c2) { + r = fetch_escaped_value(&p, end, env, &c2); + if (r < 0) return r; + if (tok->u.code != c2) { tok->u.code = c2; tok->type = TK_CODE_POINT; } @@ -4894,7 +4899,7 @@ fetch_token_in_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env) PINC; if (str_exist_check_with_esc(send, 2, p, end, (OnigCodePoint )']', enc, syn)) { - tok->type = TK_POSIX_BRACKET_OPEN; + tok->type = TK_CC_POSIX_BRACKET_OPEN; } else { PUNFETCH; @@ -4904,7 +4909,7 @@ fetch_token_in_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env) else { cc_in_cc: if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP)) { - tok->type = TK_CC_CC_OPEN; + tok->type = TK_CC_OPEN_CC; } else { CC_ESC_WARN(env, (UChar* )"["); @@ -4927,7 +4932,8 @@ fetch_token_in_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env) static int fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) { - int r, num; + int r; + OnigCodePoint code; OnigCodePoint c; OnigEncoding enc = env->enc; OnigSyntaxType* syn = env->syntax; @@ -4952,14 +4958,14 @@ fetch_token(PToken* tok, UChar** src, 
UChar* end, ScanEnv* env) tok->backp = p; PFETCH(c); - tok->u.c = c; + tok->u.code = c; tok->escaped = 1; switch (c) { case '*': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_ASTERISK_ZERO_INF)) break; tok->type = TK_REPEAT; tok->u.repeat.lower = 0; - tok->u.repeat.upper = REPEAT_INFINITE; + tok->u.repeat.upper = INFINITE_REPEAT; goto greedy_check; break; @@ -4967,7 +4973,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_PLUS_ONE_INF)) break; tok->type = TK_REPEAT; tok->u.repeat.lower = 1; - tok->u.repeat.upper = REPEAT_INFINITE; + tok->u.repeat.upper = INFINITE_REPEAT; goto greedy_check; break; @@ -5003,7 +5009,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) case '{': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) break; - r = fetch_interval_quantifier(&p, end, tok, env); + r = fetch_interval(&p, end, tok, env); if (r < 0) return r; /* error */ if (r == 0) goto greedy_check2; else if (r == 2) { /* {n} */ @@ -5191,8 +5197,8 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) prev = p; if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_O_BRACE_OCTAL)) { PINC; - num = scan_unsigned_octal_number(&p, end, 11, enc); - if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE; + r = scan_octal_number(&p, end, 0, 11, enc, &code); + if (r < 0) return r; if (!PEND) { if (IS_CODE_DIGIT_ASCII(enc, PPEEK)) return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE; @@ -5201,7 +5207,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) if ((p > prev + enclen(enc, prev)) && !PEND && PPEEK_IS('}')) { PINC; tok->type = TK_CODE_POINT; - tok->u.code = (OnigCodePoint )num; + tok->u.code = code; } else { /* can't read nothing or invalid format */ @@ -5216,13 +5222,8 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) prev = p; if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) { PINC; - num = scan_unsigned_hexadecimal_number(&p, end, 0, 8, enc); - if (num < 0) 
{ - if (num == ONIGERR_TOO_BIG_NUMBER) - return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE; - else - return num; - } + r = scan_hexadecimal_number(&p, end, 0, 8, enc, &code); + if (r < 0) return r; if (!PEND) { if (IS_CODE_XDIGIT_ASCII(enc, PPEEK)) return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE; @@ -5231,7 +5232,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) if ((p > prev + enclen(enc, prev)) && !PEND && PPEEK_IS('}')) { PINC; tok->type = TK_CODE_POINT; - tok->u.code = (OnigCodePoint )num; + tok->u.code = code; } else { /* can't read nothing or invalid format */ @@ -5239,14 +5240,14 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) } } else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) { - num = scan_unsigned_hexadecimal_number(&p, end, 0, 2, enc); - if (num < 0) return num; + r = scan_hexadecimal_number(&p, end, 0, 2, enc, &code); + if (r < 0) return r; if (p == prev) { /* can't read nothing. */ - num = 0; /* but, it's not error */ + code = 0; /* but, it's not error */ } - tok->type = TK_RAW_BYTE; + tok->type = TK_CRUDE_BYTE; tok->base = 16; - tok->u.c = num; + tok->u.byte = (UChar )code; } break; @@ -5255,14 +5256,14 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) prev = p; if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) { - num = scan_unsigned_hexadecimal_number(&p, end, 4, 4, enc); - if (num < 0) return num; + r = scan_hexadecimal_number(&p, end, 4, 4, enc, &code); + if (r < 0) return r; if (p == prev) { /* can't read nothing. 
*/ - num = 0; /* but, it's not error */ + code = 0; /* but, it's not error */ } tok->type = TK_CODE_POINT; tok->base = 16; - tok->u.code = (OnigCodePoint )num; + tok->u.code = code; } break; @@ -5270,21 +5271,21 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) case '5': case '6': case '7': case '8': case '9': PUNFETCH; prev = p; - num = onig_scan_unsigned_number(&p, end, enc); - if (num < 0 || num > ONIG_MAX_BACKREF_NUM) { + r = scan_number(&p, end, enc); + if (r < 0 || r > ONIG_MAX_BACKREF_NUM) { goto skip_backref; } if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_DECIMAL_BACKREF) && - (num <= env->num_mem || num <= 9)) { /* This spec. from GNU regex */ + (r <= env->num_mem || r <= 9)) { /* This spec. from GNU regex */ if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) { - if (num > env->num_mem || IS_NULL(SCANENV_MEMENV(env)[num].node)) + if (r > env->num_mem || IS_NULL(SCANENV_MEMENV(env)[r].mem_node)) return ONIGERR_INVALID_BACKREF; } tok->type = TK_BACKREF; tok->u.backref.num = 1; - tok->u.backref.ref1 = num; + tok->u.backref.ref1 = r; tok->u.backref.by_name = 0; #ifdef USE_BACKREF_WITH_LEVEL tok->u.backref.exist_level = 0; @@ -5304,14 +5305,14 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) case '0': if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) { prev = p; - num = scan_unsigned_octal_number(&p, end, (c == '0' ? 2:3), enc); - if (num < 0 || num >= 256) return ONIGERR_TOO_BIG_NUMBER; + r = scan_octal_number(&p, end, 0, (c == '0' ? 2:3), enc, &code); + if (r < 0 || r >= 256) return ONIGERR_TOO_BIG_NUMBER; if (p == prev) { /* can't read nothing. 
*/ - num = 0; /* but, it's not error */ + code = 0; /* but, it's not error */ } - tok->type = TK_RAW_BYTE; + tok->type = TK_CRUDE_BYTE; tok->base = 8; - tok->u.c = num; + tok->u.byte = (UChar )code; } else if (c != '0') { PINC; @@ -5336,7 +5337,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) if (r == 1) tok->u.backref.exist_level = 1; else tok->u.backref.exist_level = 0; #else - r = fetch_name(c, &p, end, &name_end, env, &back_num, &num_type, 1); + r = fetch_name(c, &p, end, &name_end, env, &back_num, &num_type, TRUE); #endif if (r < 0) return r; @@ -5349,7 +5350,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) { if (back_num > env->num_mem || - IS_NULL(SCANENV_MEMENV(env)[back_num].node)) + IS_NULL(SCANENV_MEMENV(env)[back_num].mem_node)) return ONIGERR_INVALID_BACKREF; } tok->type = TK_BACKREF; @@ -5358,17 +5359,15 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) tok->u.backref.ref1 = back_num; } else { - num = onig_name_to_group_numbers(env->reg, prev, name_end, &backs); + int num = name_to_group_numbers(env, prev, name_end, &backs); if (num <= 0) { - onig_scan_env_set_error_string(env, - ONIGERR_UNDEFINED_NAME_REFERENCE, prev, name_end); return ONIGERR_UNDEFINED_NAME_REFERENCE; } if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) { int i; for (i = 0; i < num; i++) { if (backs[i] > env->num_mem || - IS_NULL(SCANENV_MEMENV(env)[backs[i]].node)) + IS_NULL(SCANENV_MEMENV(env)[backs[i]].mem_node)) return ONIGERR_INVALID_BACKREF; } } @@ -5401,7 +5400,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) prev = p; r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, - &gnum, &num_type, 1); + &gnum, &num_type, TRUE); if (r < 0) return r; if (num_type != IS_NOT_NUM) { @@ -5462,10 +5461,9 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) OnigCodePoint c2; PUNFETCH; - num = fetch_escaped_value(&p, end, env, &c2); - if 
(num < 0) return num; - /* set_raw: */ - if (tok->u.c != c2) { + r = fetch_escaped_value(&p, end, env, &c2); + if (r < 0) return r; + if (tok->u.code != c2) { tok->type = TK_CODE_POINT; tok->u.code = c2; } @@ -5477,7 +5475,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) } } else { - tok->u.c = c; + tok->u.code = c; tok->escaped = 0; #ifdef USE_VARIABLE_META_CHARS @@ -5514,7 +5512,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) #endif tok->type = TK_REPEAT; tok->u.repeat.lower = 0; - tok->u.repeat.upper = REPEAT_INFINITE; + tok->u.repeat.upper = INFINITE_REPEAT; goto greedy_check; break; @@ -5525,7 +5523,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) #endif tok->type = TK_REPEAT; tok->u.repeat.lower = 1; - tok->u.repeat.upper = REPEAT_INFINITE; + tok->u.repeat.upper = INFINITE_REPEAT; goto greedy_check; break; @@ -5542,7 +5540,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) case '{': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACE_INTERVAL)) break; - r = fetch_interval_quantifier(&p, end, tok, env); + r = fetch_interval(&p, end, tok, env); if (r < 0) return r; /* error */ if (r == 0) goto greedy_check2; else if (r == 2) { /* {n} */ @@ -5590,8 +5588,8 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) { PINC; name = p; - r = fetch_name((OnigCodePoint )'(', &p, end, &name_end, env, &gnum, - &num_type, 0); + r = fetch_name((OnigCodePoint )'(', &p, end, &name_end, env, + &gnum, &num_type, FALSE); if (r < 0) return r; tok->type = TK_CALL; @@ -5608,7 +5606,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) tok->u.call.gnum = 0; tok->u.call.name = p; PINC; - if (! PPEEK_IS(')')) return ONIGERR_INVALID_GROUP_NAME; + if (! 
PPEEK_IS(')')) return ONIGERR_UNDEFINED_GROUP_OPTION; tok->u.call.name_end = p; break; @@ -5623,7 +5621,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) { name = p; r = fetch_name((OnigCodePoint )'(', &p, end, &name_end, env, - &gnum, &num_type, 1); + &gnum, &num_type, TRUE); if (r < 0) return r; if (num_type == IS_NOT_NUM) { @@ -5679,7 +5677,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) case '[': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACKET_CC)) break; - tok->type = TK_CC_OPEN; + tok->type = TK_OPEN_CC; break; case ']': @@ -5890,6 +5888,7 @@ add_ctype_to_cc(CClassNode* cc, int ctype, int not, ScanEnv* env) int c, r; int ascii_mode; + int is_single; const OnigCodePoint *ranges; OnigCodePoint limit; OnigCodePoint sb_out; @@ -5911,6 +5910,7 @@ add_ctype_to_cc(CClassNode* cc, int ctype, int not, ScanEnv* env) } r = 0; + is_single = ONIGENC_IS_SINGLEBYTE(enc); limit = ascii_mode ? ASCII_LIMIT : SINGLE_BYTE_SIZE; switch (ctype) { @@ -5927,19 +5927,25 @@ add_ctype_to_cc(CClassNode* cc, int ctype, int not, ScanEnv* env) case ONIGENC_CTYPE_ALNUM: if (not != 0) { for (c = 0; c < (int )limit; c++) { - if (! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype)) - BITSET_SET_BIT(cc->bs, c); + if (is_single != 0 || ONIGENC_CODE_TO_MBCLEN(enc, c) == 1) { + if (! 
ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype)) + BITSET_SET_BIT(cc->bs, c); + } } for (c = limit; c < SINGLE_BYTE_SIZE; c++) { - BITSET_SET_BIT(cc->bs, c); + if (is_single != 0 || ONIGENC_CODE_TO_MBCLEN(enc, c) == 1) + BITSET_SET_BIT(cc->bs, c); } - ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf); + if (is_single == 0) + ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf); } else { for (c = 0; c < (int )limit; c++) { - if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype)) - BITSET_SET_BIT(cc->bs, c); + if (is_single != 0 || ONIGENC_CODE_TO_MBCLEN(enc, c) == 1) { + if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype)) + BITSET_SET_BIT(cc->bs, c); + } } } break; @@ -5949,21 +5955,25 @@ add_ctype_to_cc(CClassNode* cc, int ctype, int not, ScanEnv* env) case ONIGENC_CTYPE_WORD: if (not != 0) { for (c = 0; c < (int )limit; c++) { - if (ONIGENC_CODE_TO_MBCLEN(enc, c) > 0 /* check invalid code point */ + /* check invalid code point */ + if ((is_single != 0 || ONIGENC_CODE_TO_MBCLEN(enc, c) == 1) && ! 
ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype)) BITSET_SET_BIT(cc->bs, c); } for (c = limit; c < SINGLE_BYTE_SIZE; c++) { - if (ONIGENC_CODE_TO_MBCLEN(enc, c) > 0) + if (is_single != 0 || ONIGENC_CODE_TO_MBCLEN(enc, c) == 1) BITSET_SET_BIT(cc->bs, c); } + if (ascii_mode != 0 && is_single == 0) + ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf); } else { for (c = 0; c < (int )limit; c++) { - if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype)) + if ((is_single != 0 || ONIGENC_CODE_TO_MBCLEN(enc, c) == 1) + && ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype)) BITSET_SET_BIT(cc->bs, c); } - if (ascii_mode == 0) + if (ascii_mode == 0 && is_single == 0) ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf); } break; @@ -6055,10 +6065,12 @@ fetch_char_property_to_ctype(UChar** src, UChar* end, ScanEnv* env) { int r; OnigCodePoint c; - OnigEncoding enc = env->enc; - UChar *prev, *start, *p = *src; + OnigEncoding enc; + UChar *prev, *start, *p; - r = 0; + p = *src; + enc = env->enc; + r = ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS; start = prev = p; while (!PEND) { @@ -6066,18 +6078,20 @@ fetch_char_property_to_ctype(UChar** src, UChar* end, ScanEnv* env) PFETCH_S(c); if (c == '}') { r = ONIGENC_PROPERTY_NAME_TO_CTYPE(enc, start, prev); - if (r < 0) break; + if (r >= 0) { + *src = p; + } + else { + onig_scan_env_set_error_string(env, r, *src, prev); + } - *src = p; return r; } else if (c == '(' || c == ')' || c == '{' || c == '|') { - r = ONIGERR_INVALID_CHAR_PROPERTY_NAME; break; } } - onig_scan_env_set_error_string(env, r, *src, prev); return r; } @@ -6093,7 +6107,7 @@ parse_char_property(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* en *np = node_new_cclass(); CHECK_NULL_RETURN_MEMERR(*np); cc = CCLASS_(*np); - r = add_ctype_to_cc(cc, ctype, 0, env); + r = add_ctype_to_cc(cc, ctype, FALSE, env); if (r != 0) return r; if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc); @@ -6101,67 +6115,67 @@ parse_char_property(Node** np, PToken* tok, UChar** src, UChar* end, 
ScanEnv* en } -enum CCSTATE { - CCS_VALUE, - CCS_RANGE, - CCS_COMPLETE, - CCS_START -}; +typedef enum { + CS_VALUE, + CS_RANGE, + CS_COMPLETE, + CS_START +} CSTATE; -enum CCVALTYPE { - CCV_SB, - CCV_CODE_POINT, - CCV_CLASS -}; +typedef enum { + CV_UNDEF, + CV_SB, + CV_MB, + CV_CPROP +} CVAL; static int -next_state_class(CClassNode* cc, OnigCodePoint* vs, enum CCVALTYPE* type, - enum CCSTATE* state, ScanEnv* env) +cc_cprop_next(CClassNode* cc, OnigCodePoint* pcode, CVAL* val, CSTATE* state, + ScanEnv* env) { int r; - if (*state == CCS_RANGE) + if (*state == CS_RANGE) return ONIGERR_CHAR_CLASS_VALUE_AT_END_OF_RANGE; - if (*state == CCS_VALUE && *type != CCV_CLASS) { - if (*type == CCV_SB) - BITSET_SET_BIT(cc->bs, (int )(*vs)); - else if (*type == CCV_CODE_POINT) { - r = add_code_range(&(cc->mbuf), env, *vs, *vs); + if (*state == CS_VALUE) { + if (*val == CV_SB) + BITSET_SET_BIT(cc->bs, (int )(*pcode)); + else if (*val == CV_MB) { + r = add_code_range(&(cc->mbuf), env, *pcode, *pcode); if (r < 0) return r; } } - *state = CCS_VALUE; - *type = CCV_CLASS; + *state = CS_VALUE; + *val = CV_CPROP; return 0; } static int -next_state_val(CClassNode* cc, OnigCodePoint *from, OnigCodePoint to, - int* from_israw, int to_israw, - enum CCVALTYPE intype, enum CCVALTYPE* type, - enum CCSTATE* state, ScanEnv* env) +cc_char_next(CClassNode* cc, OnigCodePoint *from, OnigCodePoint to, + int* from_raw, int to_raw, CVAL intype, CVAL* type, + CSTATE* state, ScanEnv* env) { int r; switch (*state) { - case CCS_VALUE: - if (*type == CCV_SB) { + case CS_VALUE: + if (*type == CV_SB) { if (*from > 0xff) return ONIGERR_INVALID_CODE_POINT_VALUE; BITSET_SET_BIT(cc->bs, (int )(*from)); } - else if (*type == CCV_CODE_POINT) { + else if (*type == CV_MB) { r = add_code_range(&(cc->mbuf), env, *from, *from); if (r < 0) return r; } break; - case CCS_RANGE: + case CS_RANGE: if (intype == *type) { - if (intype == CCV_SB) { + if (intype == CV_SB) { if (*from > 0xff || to > 0xff) return 
ONIGERR_INVALID_CODE_POINT_VALUE; @@ -6190,21 +6204,21 @@ next_state_val(CClassNode* cc, OnigCodePoint *from, OnigCodePoint to, if (r < 0) return r; } ccs_range_end: - *state = CCS_COMPLETE; + *state = CS_COMPLETE; break; - case CCS_COMPLETE: - case CCS_START: - *state = CCS_VALUE; + case CS_COMPLETE: + case CS_START: + *state = CS_VALUE; break; default: break; } - *from_israw = to_israw; - *from = to; - *type = intype; + *from_raw = to_raw; + *from = to; + *type = intype; return 0; } @@ -6232,26 +6246,25 @@ code_exist_check(OnigCodePoint c, UChar* from, UChar* end, int ignore_escaped, } static int -parse_char_class(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) +parse_cc(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) { int r, neg, len, fetched, and_start; - OnigCodePoint v, vs; + OnigCodePoint in_code, curr_code; UChar *p; Node* node; CClassNode *cc, *prev_cc; CClassNode work_cc; - - enum CCSTATE state; - enum CCVALTYPE val_type, in_type; - int val_israw, in_israw; + int curr_raw, in_raw; + CSTATE state; + CVAL in_type; + CVAL curr_type; *np = NULL_NODE; - env->parse_depth++; - if (env->parse_depth > ParseDepthLimit) - return ONIGERR_PARSE_DEPTH_LIMIT_OVER; + INC_PARSE_DEPTH(env->parse_depth); + prev_cc = (CClassNode* )NULL; r = fetch_token_in_cc(tok, src, end, env); - if (r == TK_CHAR && tok->u.c == '^' && tok->escaped == 0) { + if (r == TK_CHAR && tok->u.code == (OnigCodePoint )'^' && tok->escaped == 0) { neg = 1; r = fetch_token_in_cc(tok, src, end, env); } @@ -6274,47 +6287,44 @@ parse_char_class(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) cc = CCLASS_(node); and_start = 0; - state = CCS_START; + state = CS_START; + curr_type = CV_UNDEF; + p = *src; while (r != TK_CC_CLOSE) { fetched = 0; switch (r) { case TK_CHAR: any_char_in: - len = ONIGENC_CODE_TO_MBCLEN(env->enc, tok->u.c); - if (len > 1) { - in_type = CCV_CODE_POINT; - } - else if (len < 0) { + len = ONIGENC_CODE_TO_MBCLEN(env->enc, tok->u.code); + if 
(len < 0) { r = len; goto err; } - else { - /* sb_char: */ - in_type = CCV_SB; - } - v = (OnigCodePoint )tok->u.c; - in_israw = 0; + in_type = (len == 1) ? CV_SB : CV_MB; + in_code = tok->u.code; + in_raw = 0; goto val_entry2; break; - case TK_RAW_BYTE: + case TK_CRUDE_BYTE: /* tok->base != 0 : octal or hexadec. */ if (! ONIGENC_IS_SINGLEBYTE(env->enc) && tok->base != 0) { + int i, j; UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN]; UChar* bufe = buf + ONIGENC_CODE_TO_MBC_MAXLEN; UChar* psave = p; - int i, base = tok->base; + int base = tok->base; - buf[0] = tok->u.c; + buf[0] = tok->u.byte; for (i = 1; i < ONIGENC_MBC_MAXLEN(env->enc); i++) { r = fetch_token_in_cc(tok, &p, end, env); if (r < 0) goto err; - if (r != TK_RAW_BYTE || tok->base != base) { + if (r != TK_CRUDE_BYTE || tok->base != base) { fetched = 1; break; } - buf[i] = tok->u.c; + buf[i] = tok->u.byte; } if (i < ONIGENC_MBC_MINLEN(env->enc)) { @@ -6322,6 +6332,9 @@ parse_char_class(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) goto err; } + /* clear buf tail */ + for (j = i; j < ONIGENC_CODE_TO_MBC_MAXLEN; j++) buf[j] = '\0'; + len = enclen(env->enc, buf); if (i < len) { r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING; @@ -6336,58 +6349,63 @@ parse_char_class(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) } if (i == 1) { - v = (OnigCodePoint )buf[0]; - goto raw_single; + in_code = (OnigCodePoint )buf[0]; + goto crude_single; } else { - v = ONIGENC_MBC_TO_CODE(env->enc, buf, bufe); - in_type = CCV_CODE_POINT; + in_code = ONIGENC_MBC_TO_CODE(env->enc, buf, bufe); + in_type = CV_MB; } } else { - v = (OnigCodePoint )tok->u.c; - raw_single: - in_type = CCV_SB; + in_code = (OnigCodePoint )tok->u.byte; + crude_single: + in_type = CV_SB; } - in_israw = 1; + in_raw = 1; goto val_entry2; break; case TK_CODE_POINT: - v = tok->u.code; - in_israw = 1; + in_code = tok->u.code; + in_raw = 1; val_entry: - len = ONIGENC_CODE_TO_MBCLEN(env->enc, v); + len = ONIGENC_CODE_TO_MBCLEN(env->enc, in_code); if 
(len < 0) { - r = len; - goto err; + if (state != CS_RANGE || + ! IS_SYNTAX_BV(env->syntax, + ONIG_SYN_ALLOW_INVALID_CODE_END_OF_RANGE_IN_CC) || + in_code < 0x100 || ONIGENC_MBC_MAXLEN(env->enc) == 1) { + r = len; + goto err; + } } - in_type = (len == 1 ? CCV_SB : CCV_CODE_POINT); + in_type = (len == 1 ? CV_SB : CV_MB); val_entry2: - r = next_state_val(cc, &vs, v, &val_israw, in_israw, in_type, &val_type, - &state, env); + r = cc_char_next(cc, &curr_code, in_code, &curr_raw, in_raw, in_type, + &curr_type, &state, env); if (r != 0) goto err; break; - case TK_POSIX_BRACKET_OPEN: + case TK_CC_POSIX_BRACKET_OPEN: r = parse_posix_bracket(cc, &p, end, env); if (r < 0) goto err; if (r == 1) { /* is not POSIX bracket */ CC_ESC_WARN(env, (UChar* )"["); p = tok->backp; - v = (OnigCodePoint )tok->u.c; - in_israw = 0; + in_code = tok->u.code; + in_raw = 0; goto val_entry; } - goto next_class; + goto next_cprop; break; case TK_CHAR_TYPE: r = add_ctype_to_cc(cc, tok->u.prop.ctype, tok->u.prop.not, env); if (r != 0) goto err; - next_class: - r = next_state_class(cc, &vs, &val_type, &state, env); + next_cprop: + r = cc_cprop_next(cc, &curr_code, &curr_type, &state, env); if (r != 0) goto err; break; @@ -6400,19 +6418,20 @@ parse_char_class(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) } r = add_ctype_to_cc(cc, ctype, tok->u.prop.not, env); if (r != 0) goto err; - goto next_class; + goto next_cprop; } break; case TK_CC_RANGE: - if (state == CCS_VALUE) { + if (state == CS_VALUE) { r = fetch_token_in_cc(tok, &p, end, env); if (r < 0) goto err; + fetched = 1; if (r == TK_CC_CLOSE) { /* allow [x-] */ range_end_val: - v = (OnigCodePoint )'-'; - in_israw = 0; + in_code = (OnigCodePoint )'-'; + in_raw = 0; goto val_entry; } else if (r == TK_CC_AND) { @@ -6420,20 +6439,21 @@ parse_char_class(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) goto range_end_val; } - if (val_type == CCV_CLASS) { + if (curr_type == CV_CPROP) { r = 
ONIGERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS; goto err; } - state = CCS_RANGE; + state = CS_RANGE; } - else if (state == CCS_START) { + else if (state == CS_START) { /* [-xa] is allowed */ - v = (OnigCodePoint )tok->u.c; - in_israw = 0; + in_code = tok->u.code; + in_raw = 0; r = fetch_token_in_cc(tok, &p, end, env); if (r < 0) goto err; + fetched = 1; /* [--x] or [a&&-x] is warned. */ if (r == TK_CC_RANGE || and_start != 0) @@ -6441,15 +6461,17 @@ parse_char_class(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) goto val_entry; } - else if (state == CCS_RANGE) { + else if (state == CS_RANGE) { CC_ESC_WARN(env, (UChar* )"-"); - goto any_char_in; /* [!--x] is allowed */ + goto any_char_in; /* [!--] is allowed */ } - else { /* CCS_COMPLETE */ + else { /* CS_COMPLETE */ r = fetch_token_in_cc(tok, &p, end, env); if (r < 0) goto err; + fetched = 1; - if (r == TK_CC_CLOSE) goto range_end_val; /* allow [a-b-] */ + if (r == TK_CC_CLOSE) + goto range_end_val; /* allow [a-b-] */ else if (r == TK_CC_AND) { CC_ESC_WARN(env, (UChar* )"-"); goto range_end_val; @@ -6464,12 +6486,19 @@ parse_char_class(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) } break; - case TK_CC_CC_OPEN: /* [ */ + case TK_CC_OPEN_CC: /* [ */ { Node *anode; CClassNode* acc; - r = parse_char_class(&anode, tok, &p, end, env); + if (state == CS_VALUE) { + r = cc_char_next(cc, &curr_code, 0, &curr_raw, 0, curr_type, &curr_type, + &state, env); + if (r != 0) goto err; + } + state = CS_COMPLETE; + + r = parse_cc(&anode, tok, &p, end, env); if (r != 0) { onig_node_free(anode); goto cc_open_err; @@ -6485,14 +6514,14 @@ parse_char_class(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) case TK_CC_AND: /* && */ { - if (state == CCS_VALUE) { - r = next_state_val(cc, &vs, 0, &val_israw, 0, val_type, - &val_type, &state, env); + if (state == CS_VALUE) { + r = cc_char_next(cc, &curr_code, 0, &curr_raw, 0, curr_type, &curr_type, + &state, env); if (r != 0) goto err; } /* 
initialize local variables */ and_start = 1; - state = CCS_START; + state = CS_START; if (IS_NOT_NULL(prev_cc)) { r = and_cclass(prev_cc, cc, env->enc); @@ -6525,9 +6554,9 @@ parse_char_class(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) } } - if (state == CCS_VALUE) { - r = next_state_val(cc, &vs, 0, &val_israw, 0, val_type, - &val_type, &state, env); + if (state == CS_VALUE) { + r = cc_char_next(cc, &curr_code, 0, &curr_raw, 0, curr_type, &curr_type, + &state, env); if (r != 0) goto err; } @@ -6560,7 +6589,7 @@ parse_char_class(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) } } *src = p; - env->parse_depth--; + DEC_PARSE_DEPTH(env->parse_depth); return 0; err: @@ -6569,8 +6598,8 @@ parse_char_class(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) return r; } -static int parse_subexp(Node** top, PToken* tok, int term, - UChar** src, UChar* end, ScanEnv* env, int group_head); +static int parse_alts(Node** top, PToken* tok, int term, + UChar** src, UChar* end, ScanEnv* env, int group_head); #ifdef USE_CALLOUT @@ -6673,7 +6702,7 @@ parse_callout_of_contents(Node** np, int cterm, UChar** src, UChar* end, ScanEnv } if (tag_start != tag_end) { - r = callout_tag_entry(env->reg, tag_start, tag_end, num); + r = callout_tag_entry(env, env->reg, tag_start, tag_end, num); if (r != ONIG_NORMAL) return r; } @@ -6741,7 +6770,8 @@ parse_long(OnigEncoding enc, UChar* s, UChar* end, int sign_on, long max, long* static int parse_callout_args(int skip_mode, int cterm, UChar** src, UChar* end, - unsigned int types[], OnigValue vals[], ScanEnv* env) + int max_arg_num, unsigned int types[], OnigValue vals[], + ScanEnv* env) { #define MAX_CALLOUT_ARG_BYTE_LENGTH 128 @@ -6760,9 +6790,9 @@ parse_callout_args(int skip_mode, int cterm, UChar** src, UChar* end, if (PEND) return ONIGERR_INVALID_CALLOUT_PATTERN; + c = 0; n = 0; while (n < ONIG_CALLOUT_MAX_ARGS_NUM) { - c = 0; cn = 0; esc = 0; eesc = 0; @@ -6795,7 +6825,7 @@ parse_callout_args(int 
skip_mode, int cterm, UChar** src, UChar* end, size_t clen; add_char: - if (skip_mode == 0) { + if (skip_mode == FALSE) { clen = p - e; if (bufend + clen > buf + MAX_CALLOUT_ARG_BYTE_LENGTH) return ONIGERR_INVALID_CALLOUT_ARG; /* too long argument */ @@ -6809,7 +6839,10 @@ parse_callout_args(int skip_mode, int cterm, UChar** src, UChar* end, } if (cn != 0) { - if (skip_mode == 0) { + if (max_arg_num >= 0 && n >= max_arg_num) + return ONIGERR_INVALID_CALLOUT_ARG; + + if (skip_mode == FALSE) { if ((types[n] & ONIG_TYPE_LONG) != 0) { int fixed = 0; if (cn > 0) { @@ -6941,7 +6974,7 @@ parse_callout_of_name(Node** np, int cterm, UChar** src, UChar* end, ScanEnv* en /* read for single check only */ save = p; - arg_num = parse_callout_args(1, '}', &p, end, 0, 0, env); + arg_num = parse_callout_args(TRUE, '}', &p, end, -1, NULL, NULL, env); if (arg_num < 0) return arg_num; is_not_single = PPEEK_IS(cterm) ? 0 : 1; @@ -6955,7 +6988,7 @@ parse_callout_of_name(Node** np, int cterm, UChar** src, UChar* end, ScanEnv* en types[i] = get_callout_arg_type_by_name_id(name_id, i); } - arg_num = parse_callout_args(0, '}', &p, end, types, vals, env); + arg_num = parse_callout_args(FALSE, '}', &p, end, max_arg_num, types, vals, env); if (arg_num < 0) return arg_num; if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; @@ -6994,7 +7027,7 @@ parse_callout_of_name(Node** np, int cterm, UChar** src, UChar* end, ScanEnv* en } if (tag_start != tag_end) { - r = callout_tag_entry(env->reg, tag_start, tag_end, num); + r = callout_tag_entry(env, env->reg, tag_start, tag_end, num); if (r != ONIG_NORMAL) return r; } @@ -7055,17 +7088,17 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, group: r = fetch_token(tok, &p, end, env); if (r < 0) return r; - r = parse_subexp(np, tok, term, &p, end, env, 0); + r = parse_alts(np, tok, term, &p, end, env, FALSE); if (r < 0) return r; *src = p; return 1; /* group */ break; case '=': - *np = onig_node_new_anchor(ANCR_PREC_READ, 0); + *np = 
onig_node_new_anchor(ANCR_PREC_READ, FALSE); break; case '!': /* preceding read */ - *np = onig_node_new_anchor(ANCR_PREC_READ_NOT, 0); + *np = onig_node_new_anchor(ANCR_PREC_READ_NOT, FALSE); break; case '>': /* (?>...) stop backtrack */ *np = node_new_bag(BAG_STOP_BACKTRACK); @@ -7083,9 +7116,9 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, if (PEND) return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS; PFETCH(c); if (c == '=') - *np = onig_node_new_anchor(ANCR_LOOK_BEHIND, 0); + *np = onig_node_new_anchor(ANCR_LOOK_BEHIND, FALSE); else if (c == '!') - *np = onig_node_new_anchor(ANCR_LOOK_BEHIND_NOT, 0); + *np = onig_node_new_anchor(ANCR_LOOK_BEHIND_NOT, FALSE); else { if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) { UChar *name; @@ -7101,7 +7134,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, named_group2: name = p; r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &num, - &num_type, 0); + &num_type, FALSE); if (r < 0) return r; num = scan_env_add_mem_entry(env); @@ -7115,7 +7148,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, CHECK_NULL_RETURN_MEMERR(*np); BAG_(*np)->m.regnum = num; if (list_capture != 0) - MEM_STATUS_ON_SIMPLE(env->capture_history, num); + MEM_STATUS_ON_SIMPLE(env->cap_history, num); env->num_named++; } else { @@ -7150,7 +7183,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, r = fetch_token(tok, &p, end, env); if (r < 0) return r; - r = parse_subexp(&absent, tok, term, &p, end, env, 1); + r = parse_alts(&absent, tok, term, &p, end, env, TRUE); if (r < 0) { onig_node_free(absent); return r; @@ -7237,7 +7270,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, if (r == 1) exist_level = 1; #else r = fetch_name((OnigCodePoint )(is_enclosed != 0 ? 
c : '('), - &p, end, &name_end, env, &back_num, &num_type, 1); + &p, end, &name_end, env, &back_num, &num_type, TRUE); #endif if (r < 0) { if (is_enclosed == 0) { @@ -7257,11 +7290,11 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_STRICT_CHECK_BACKREF)) { if (back_num > env->num_mem || - IS_NULL(SCANENV_MEMENV(env)[back_num].node)) + IS_NULL(SCANENV_MEMENV(env)[back_num].mem_node)) return ONIGERR_INVALID_BACKREF; } - condition = node_new_backref_checker(1, &back_num, 0, + condition = node_new_backref_checker(1, &back_num, FALSE, #ifdef USE_BACKREF_WITH_LEVEL exist_level, level, #endif @@ -7271,22 +7304,20 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, int num; int* backs; - num = onig_name_to_group_numbers(env->reg, prev, name_end, &backs); + num = name_to_group_numbers(env, prev, name_end, &backs); if (num <= 0) { - onig_scan_env_set_error_string(env, - ONIGERR_UNDEFINED_NAME_REFERENCE, prev, name_end); return ONIGERR_UNDEFINED_NAME_REFERENCE; } if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_STRICT_CHECK_BACKREF)) { int i; for (i = 0; i < num; i++) { if (backs[i] > env->num_mem || - IS_NULL(SCANENV_MEMENV(env)[backs[i]].node)) + IS_NULL(SCANENV_MEMENV(env)[backs[i]].mem_node)) return ONIGERR_INVALID_BACKREF; } } - condition = node_new_backref_checker(num, backs, 1, + condition = node_new_backref_checker(num, backs, TRUE, #ifdef USE_BACKREF_WITH_LEVEL exist_level, level, #endif @@ -7328,7 +7359,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, condition_is_checker = 0; r = fetch_token(tok, &p, end, env); if (r < 0) return r; - r = parse_subexp(&condition, tok, term, &p, end, env, 0); + r = parse_alts(&condition, tok, term, &p, end, env, FALSE); if (r < 0) { onig_node_free(condition); return r; @@ -7371,7 +7402,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, onig_node_free(condition); return r; } - r = parse_subexp(&target, tok, term, 
&p, end, env, 1); + r = parse_alts(&target, tok, term, &p, end, env, TRUE); if (r < 0) { onig_node_free(condition); onig_node_free(target); @@ -7414,6 +7445,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, } break; +#ifdef USE_CAPTURE_HISTORY case '@': if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ATMARK_CAPTURE_HISTORY)) { if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) { @@ -7435,12 +7467,13 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY; } BAG_(*np)->m.regnum = num; - MEM_STATUS_ON_SIMPLE(env->capture_history, num); + MEM_STATUS_ON_SIMPLE(env->cap_history, num); } else { return ONIGERR_UNDEFINED_GROUP_OPTION; } break; +#endif #ifdef USE_POSIXLINE_OPTION case 'p': @@ -7470,7 +7503,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, case 'm': if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) { - OPTION_NEGATE(option, ONIG_OPTION_SINGLELINE, (neg == 0 ? 1 : 0)); + OPTION_NEGATE(option, ONIG_OPTION_SINGLELINE, (neg == 0 ? TRUE : FALSE)); } else if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_ONIGURUMA|ONIG_SYN_OP2_OPTION_RUBY)) { @@ -7506,16 +7539,16 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, if (! ONIGENC_IS_UNICODE_ENCODING(enc)) return ONIGERR_UNDEFINED_GROUP_OPTION; - OPTION_NEGATE(option, ONIG_OPTION_TEXT_SEGMENT_EXTENDED_GRAPHEME_CLUSTER, 0); - OPTION_NEGATE(option, ONIG_OPTION_TEXT_SEGMENT_WORD, 1); + OPTION_NEGATE(option, ONIG_OPTION_TEXT_SEGMENT_EXTENDED_GRAPHEME_CLUSTER, FALSE); + OPTION_NEGATE(option, ONIG_OPTION_TEXT_SEGMENT_WORD, TRUE); break; #ifdef USE_UNICODE_WORD_BREAK case 'w': if (! 
ONIGENC_IS_UNICODE_ENCODING(enc)) return ONIGERR_UNDEFINED_GROUP_OPTION; - OPTION_NEGATE(option, ONIG_OPTION_TEXT_SEGMENT_WORD, 0); - OPTION_NEGATE(option, ONIG_OPTION_TEXT_SEGMENT_EXTENDED_GRAPHEME_CLUSTER, 1); + OPTION_NEGATE(option, ONIG_OPTION_TEXT_SEGMENT_WORD, FALSE); + OPTION_NEGATE(option, ONIG_OPTION_TEXT_SEGMENT_EXTENDED_GRAPHEME_CLUSTER, TRUE); break; #endif default: @@ -7545,7 +7578,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, env->options = option; r = fetch_token(tok, &p, end, env); if (r < 0) return r; - r = parse_subexp(&target, tok, term, &p, end, env, 0); + r = parse_alts(&target, tok, term, &p, end, env, FALSE); env->options = prev; if (r < 0) { onig_node_free(target); @@ -7592,7 +7625,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, CHECK_NULL_RETURN_MEMERR(*np); r = fetch_token(tok, &p, end, env); if (r < 0) return r; - r = parse_subexp(&target, tok, term, &p, end, env, 0); + r = parse_alts(&target, tok, term, &p, end, env, FALSE); if (r < 0) { onig_node_free(target); return r; @@ -7602,7 +7635,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, if (NODE_TYPE(*np) == NODE_BAG) { if (BAG_(*np)->type == BAG_MEMORY) { - /* Don't move this to previous of parse_subexp() */ + /* Don't move this to previous of parse_alts() */ r = scan_env_set_mem_node(env, BAG_(*np)->m.regnum, *np); if (r != 0) return r; } @@ -7622,7 +7655,7 @@ static const char* ReduceQStr[] = { }; static int -set_quantifier(Node* qnode, Node* target, int group, ScanEnv* env) +assign_quantifier_body(Node* qnode, Node* target, int group, ScanEnv* env) { QuantNode* qn; @@ -7688,15 +7721,17 @@ set_quantifier(Node* qnode, Node* target, int group, ScanEnv* env) if (targetq_num >= 0 && nestq_num < 0) { if (targetq_num == 1 || targetq_num == 2) { /* * or + */ /* (?:a*){n,m}, (?:a+){n,m} => (?:a*){n,n}, (?:a+){n,n} */ - if (! IS_REPEAT_INFINITE(qn->upper) && qn->upper > 1 && qn->greedy) { + if (! 
IS_INFINITE_REPEAT(qn->upper) && qn->upper > 1 && qn->greedy) { qn->upper = (qn->lower == 0 ? 1 : qn->lower); } } } else { + int r; + NODE_BODY(qnode) = target; - onig_reduce_nested_quantifier(qnode, target); - goto q_exit; + r = onig_reduce_nested_quantifier(qnode); + return r; } } break; @@ -7706,7 +7741,6 @@ set_quantifier(Node* qnode, Node* target, int group, ScanEnv* env) } NODE_BODY(qnode) = target; - q_exit: return 0; } @@ -7736,6 +7770,38 @@ clear_not_flag_cclass(CClassNode* cc, OnigEncoding enc) } #endif /* CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS */ +#define ADD_CODE_INTO_CC(cc, code, enc) do {\ + if (ONIGENC_MBC_MINLEN(enc) > 1 || ONIGENC_CODE_TO_MBCLEN(enc, code) != 1) {\ + add_code_range_to_buf(&((cc)->mbuf), code, code);\ + }\ + else {\ + BITSET_SET_BIT((cc)->bs, code);\ + }\ +} while (0) + +extern int +onig_new_cclass_with_code_list(Node** rnode, OnigEncoding enc, + int n, OnigCodePoint codes[]) +{ + int i; + Node* node; + CClassNode* cc; + + *rnode = NULL_NODE; + + node = node_new_cclass(); + CHECK_NULL_RETURN_MEMERR(node); + + cc = CCLASS_(node); + + for (i = 0; i < n; i++) { + ADD_CODE_INTO_CC(cc, codes[i], enc); + } + + *rnode = node; + return 0; +} + typedef struct { ScanEnv* env; CClassNode* cc; @@ -7749,37 +7815,31 @@ i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[], int to_len, void* arg) IApplyCaseFoldArg* iarg; ScanEnv* env; CClassNode* cc; - BitSetRef bs; iarg = (IApplyCaseFoldArg* )arg; env = iarg->env; cc = iarg->cc; - bs = cc->bs; if (to_len == 1) { int is_in = onig_is_code_in_cc(env->enc, from, cc); #ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS if ((is_in != 0 && !IS_NCCLASS_NOT(cc)) || (is_in == 0 && IS_NCCLASS_NOT(cc))) { - if (ONIGENC_MBC_MINLEN(env->enc) > 1 || *to >= SINGLE_BYTE_SIZE) { - add_code_range(&(cc->mbuf), env, *to, *to); - } - else { - BITSET_SET_BIT(bs, *to); - } + ADD_CODE_INTO_CC(cc, *to, env->enc); } #else if (is_in != 0) { - if (ONIGENC_MBC_MINLEN(env->enc) > 1 || *to >= SINGLE_BYTE_SIZE) { + if 
(ONIGENC_MBC_MINLEN(env->enc) > 1 || + ONIGENC_CODE_TO_MBCLEN(env->enc, *to) != 1) { if (IS_NCCLASS_NOT(cc)) clear_not_flag_cclass(cc, env->enc); add_code_range(&(cc->mbuf), env, *to, *to); } else { if (IS_NCCLASS_NOT(cc)) { - BITSET_CLEAR_BIT(bs, *to); + BITSET_CLEAR_BIT(cc->bs, *to); } else - BITSET_SET_BIT(bs, *to); + BITSET_SET_BIT(cc->bs, *to); } } #endif /* CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS */ @@ -7787,34 +7847,65 @@ i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[], int to_len, void* arg) else { int r, i, len; UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN]; - Node *snode = NULL_NODE; if (onig_is_code_in_cc(env->enc, from, cc) #ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS && !IS_NCCLASS_NOT(cc) #endif ) { + int n, j, m, index; + Node* list_node; + Node* ns[3]; + + n = 0; for (i = 0; i < to_len; i++) { - len = ONIGENC_CODE_TO_MBC(env->enc, to[i], buf); - if (i == 0) { - snode = onig_node_new_str(buf, buf + len); - CHECK_NULL_RETURN_MEMERR(snode); - - /* char-class expanded multi-char only - compare with string folded at match time. 
*/ - NODE_STRING_SET_AMBIG(snode); + OnigCodePoint code; + Node* csnode; + CClassNode* cs_cc; + + index = onigenc_unicode_fold1_key(&to[i]); + if (index >= 0) { + csnode = node_new_cclass(); + cs_cc = CCLASS_(csnode); + if (IS_NULL(csnode)) { + err_free_ns: + for (j = 0; j < n; j++) onig_node_free(ns[j]); + return ONIGERR_MEMORY; + } + m = FOLDS1_UNFOLDS_NUM(index); + for (j = 0; j < m; j++) { + code = FOLDS1_UNFOLDS(index)[j]; + ADD_CODE_INTO_CC(cs_cc, code, env->enc); + } + ADD_CODE_INTO_CC(cs_cc, to[i], env->enc); + ns[n++] = csnode; } else { - r = onig_node_str_cat(snode, buf, buf + len); - if (r < 0) { - onig_node_free(snode); - return r; + len = ONIGENC_CODE_TO_MBC(env->enc, to[i], buf); + if (n == 0 || NODE_TYPE(ns[n-1]) != NODE_STRING) { + csnode = onig_node_new_str(buf, buf + len); + if (IS_NULL(csnode)) goto err_free_ns; + + NODE_STRING_SET_CASE_EXPANDED(csnode); + ns[n++] = csnode; + } + else { + r = onig_node_str_cat(ns[n-1], buf, buf + len); + if (r < 0) goto err_free_ns; } } } - *(iarg->ptail) = onig_node_new_alt(snode, NULL_NODE); - CHECK_NULL_RETURN_MEMERR(*(iarg->ptail)); + if (n == 1) + list_node = ns[0]; + else + list_node = make_list(n, ns); + + *(iarg->ptail) = onig_node_new_alt(list_node, NULL_NODE); + if (IS_NULL(*(iarg->ptail))) { + onig_node_free(list_node); + return ONIGERR_MEMORY; + } iarg->ptail = &(NODE_CDR((*(iarg->ptail)))); } } @@ -7826,14 +7917,18 @@ static int parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, ScanEnv* env, int group_head) { - int r, len, group = 0; + int r, len, group; Node* qn; Node** tp; + unsigned int parse_depth; + group = 0; *np = NULL; if (tok->type == (enum TokenSyms )term) goto end_of_token; + parse_depth = env->parse_depth; + switch (tok->type) { case TK_ALT: case TK_EOT: @@ -7866,7 +7961,7 @@ parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, env->options = BAG_(*np)->o.options; r = fetch_token(tok, src, end, env); if (r < 0) return r; - r = parse_subexp(&target, tok, 
term, src, end, env, 0); + r = parse_alts(&target, tok, term, src, end, env, FALSE); env->options = prev; if (r < 0) { onig_node_free(target); @@ -7881,7 +7976,7 @@ parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, if (! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_UNMATCHED_CLOSE_SUBEXP)) return ONIGERR_UNMATCHED_CLOSE_PARENTHESIS; - if (tok->escaped) goto tk_raw_byte; + if (tok->escaped) goto tk_crude_byte; else goto tk_byte; break; @@ -7906,44 +8001,37 @@ parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, } break; - case TK_RAW_BYTE: - tk_raw_byte: + case TK_CRUDE_BYTE: + tk_crude_byte: { - *np = node_new_str_raw_char((UChar )tok->u.c); + *np = node_new_str_crude_char(tok->u.byte); CHECK_NULL_RETURN_MEMERR(*np); len = 1; while (1) { if (len >= ONIGENC_MBC_MINLEN(env->enc)) { - if (len == enclen(env->enc, STR_(*np)->s)) {/* should not enclen_end() */ + if (len == enclen(env->enc, STR_(*np)->s)) { r = fetch_token(tok, src, end, env); - NODE_STRING_CLEAR_RAW(*np); - goto string_end; + goto tk_crude_byte_end; } } r = fetch_token(tok, src, end, env); if (r < 0) return r; - if (r != TK_RAW_BYTE) { - /* Don't use this, it is wrong for little endian encodings. */ -#ifdef USE_PAD_TO_SHORT_BYTE_CHAR - int rem; - if (len < ONIGENC_MBC_MINLEN(env->enc)) { - rem = ONIGENC_MBC_MINLEN(env->enc) - len; - (void )node_str_head_pad(STR_(*np), rem, (UChar )0); - if (len + rem == enclen(env->enc, STR_(*np)->s)) { - NODE_STRING_CLEAR_RAW(*np); - goto string_end; - } - } -#endif + if (r != TK_CRUDE_BYTE) return ONIGERR_TOO_SHORT_MULTI_BYTE_STRING; - } - r = node_str_cat_char(*np, (UChar )tok->u.c); + r = node_str_cat_char(*np, tok->u.byte); if (r < 0) return r; len++; } + + tk_crude_byte_end: + if (! 
ONIGENC_IS_VALID_MBC_STRING(env->enc, STR_(*np)->s, STR_(*np)->end)) + return ONIGERR_INVALID_WIDE_CHAR_VALUE; + + NODE_STRING_CLEAR_CRUDE(*np); + goto string_end; } break; @@ -7953,7 +8041,7 @@ parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, len = ONIGENC_CODE_TO_MBC(env->enc, tok->u.code, buf); if (len < 0) return len; #ifdef NUMBERED_CHAR_IS_NOT_CASE_AMBIG - *np = node_new_str_raw(buf, buf + len); + *np = node_new_str_crude(buf, buf + len); #else *np = node_new_str(buf, buf + len); #endif @@ -7996,7 +8084,7 @@ parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, *np = node_new_cclass(); CHECK_NULL_RETURN_MEMERR(*np); cc = CCLASS_(*np); - add_ctype_to_cc(cc, tok->u.prop.ctype, 0, env); + add_ctype_to_cc(cc, tok->u.prop.ctype, FALSE, env); if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc); } break; @@ -8013,11 +8101,11 @@ parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, if (r != 0) return r; break; - case TK_CC_OPEN: + case TK_OPEN_CC: { CClassNode* cc; - r = parse_char_class(np, tok, src, end, env); + r = parse_cc(np, tok, src, end, env); if (r != 0) return r; cc = CCLASS_(*np); @@ -8055,7 +8143,7 @@ parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, case TK_ANYCHAR_ANYTIME: *np = node_new_anychar(); CHECK_NULL_RETURN_MEMERR(*np); - qn = node_new_quantifier(0, REPEAT_INFINITE, 0); + qn = node_new_quantifier(0, INFINITE_REPEAT, FALSE); CHECK_NULL_RETURN_MEMERR(qn); NODE_BODY(qn) = *np; *np = qn; @@ -8158,6 +8246,8 @@ parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, if (is_invalid_quantifier_target(*tp)) return ONIGERR_TARGET_OF_REPEAT_OPERATOR_INVALID; + INC_PARSE_DEPTH(parse_depth); + qn = node_new_quantifier(tok->u.repeat.lower, tok->u.repeat.upper, r == TK_INTERVAL); CHECK_NULL_RETURN_MEMERR(qn); @@ -8169,9 +8259,10 @@ parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, else { target = *tp; } - r = set_quantifier(qn, target, group, env); + r = 
assign_quantifier_body(qn, target, group, env); if (r < 0) { onig_node_free(qn); + *tp = NULL_NODE; return r; } @@ -8224,6 +8315,8 @@ parse_branch(Node** top, PToken* tok, int term, UChar** src, UChar* end, Node *node, **headp; *top = NULL; + INC_PARSE_DEPTH(env->parse_depth); + r = parse_exp(&node, tok, term, src, end, env, group_head); if (r < 0) { onig_node_free(node); @@ -8234,7 +8327,7 @@ parse_branch(Node** top, PToken* tok, int term, UChar** src, UChar* end, *top = node; } else { - *top = node_new_list(node, NULL); + *top = node_new_list(node, NULL); if (IS_NULL(*top)) { onig_node_free(node); return ONIGERR_MEMORY; @@ -8242,7 +8335,7 @@ parse_branch(Node** top, PToken* tok, int term, UChar** src, UChar* end, headp = &(NODE_CDR(*top)); while (r != TK_EOT && r != term && r != TK_ALT) { - r = parse_exp(&node, tok, term, src, end, env, 0); + r = parse_exp(&node, tok, term, src, end, env, FALSE); if (r < 0) { onig_node_free(node); return r; @@ -8260,21 +8353,20 @@ parse_branch(Node** top, PToken* tok, int term, UChar** src, UChar* end, } } + DEC_PARSE_DEPTH(env->parse_depth); return r; } /* term_tok: TK_EOT or TK_SUBEXP_CLOSE */ static int -parse_subexp(Node** top, PToken* tok, int term, UChar** src, UChar* end, - ScanEnv* env, int group_head) +parse_alts(Node** top, PToken* tok, int term, UChar** src, UChar* end, + ScanEnv* env, int group_head) { int r; Node *node, **headp; *top = NULL; - env->parse_depth++; - if (env->parse_depth > ParseDepthLimit) - return ONIGERR_PARSE_DEPTH_LIMIT_OVER; + INC_PARSE_DEPTH(env->parse_depth); r = parse_branch(&node, tok, term, src, end, env, group_head); if (r < 0) { @@ -8296,7 +8388,7 @@ parse_subexp(Node** top, PToken* tok, int term, UChar** src, UChar* end, while (r == TK_ALT) { r = fetch_token(tok, src, end, env); if (r < 0) return r; - r = parse_branch(&node, tok, term, src, end, env, 0); + r = parse_branch(&node, tok, term, src, end, env, FALSE); if (r < 0) { onig_node_free(node); return r; @@ -8323,7 +8415,7 @@ 
parse_subexp(Node** top, PToken* tok, int term, UChar** src, UChar* end, return ONIGERR_PARSER_BUG; } - env->parse_depth--; + DEC_PARSE_DEPTH(env->parse_depth); return r; } @@ -8335,7 +8427,7 @@ parse_regexp(Node** top, UChar** src, UChar* end, ScanEnv* env) r = fetch_token(&tok, src, end, env); if (r < 0) return r; - r = parse_subexp(top, &tok, TK_EOT, src, end, env, 0); + r = parse_alts(top, &tok, TK_EOT, src, end, env, FALSE); if (r < 0) return r; return 0; diff --git a/src/regparse.h b/src/regparse.h index b7a2867..1525ccb 100644 --- a/src/regparse.h +++ b/src/regparse.h @@ -4,7 +4,7 @@ regparse.h - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -32,7 +32,7 @@ #include "regint.h" #define NODE_STRING_MARGIN 16 -#define NODE_STRING_BUF_SIZE 24 /* sizeof(CClassNode) - sizeof(int)*4 */ +#define NODE_STRING_BUF_SIZE 20 /* sizeof(CClassNode) - sizeof(int)*4 */ #define NODE_BACKREFS_SIZE 6 /* node type */ @@ -66,27 +66,32 @@ enum GimmickType { #endif }; -enum BodyEmpty { - BODY_IS_NOT_EMPTY = 0, - BODY_IS_EMPTY = 1, - BODY_IS_EMPTY_MEM = 2, - BODY_IS_EMPTY_REC = 3 +enum BodyEmptyType { + BODY_IS_NOT_EMPTY = 0, + BODY_IS_EMPTY_POSSIBILITY = 1, + BODY_IS_EMPTY_POSSIBILITY_MEM = 2, + BODY_IS_EMPTY_POSSIBILITY_REC = 3 }; +struct _Node; + typedef struct { NodeType node_type; int status; + struct _Node* parent; UChar* s; UChar* end; unsigned int flag; - int capacity; /* (allocated size - 1) or 0: use buf[] */ UChar buf[NODE_STRING_BUF_SIZE]; + int capacity; /* (allocated size - 1) or 0: use buf[] */ + int case_min_len; } StrNode; typedef struct { NodeType node_type; int status; + struct _Node* parent; unsigned int flags; BitSet bs; @@ -96,20 +101,22 @@ typedef struct { typedef struct { NodeType node_type; 
int status; + struct _Node* parent; struct _Node* body; int lower; int upper; int greedy; - enum BodyEmpty empty_info; + enum BodyEmptyType emptiness; struct _Node* head_exact; struct _Node* next_head_exact; - int is_refered; /* include called node. don't eliminate even if {0} */ + int include_referred; /* include called node. don't eliminate even if {0} */ } QuantNode; typedef struct { NodeType node_type; int status; + struct _Node* parent; struct _Node* body; enum BagType type; @@ -152,6 +159,7 @@ typedef struct { typedef struct { NodeType node_type; int status; + struct _Node* parent; struct _Node* body; /* to BagNode : BAG_MEMORY */ int by_number; @@ -166,6 +174,7 @@ typedef struct { typedef struct { NodeType node_type; int status; + struct _Node* parent; int back_num; int back_static[NODE_BACKREFS_SIZE]; @@ -176,6 +185,7 @@ typedef struct { typedef struct { NodeType node_type; int status; + struct _Node* parent; struct _Node* body; int type; @@ -186,6 +196,7 @@ typedef struct { typedef struct { NodeType node_type; int status; + struct _Node* parent; struct _Node* car; struct _Node* cdr; @@ -194,6 +205,7 @@ typedef struct { typedef struct { NodeType node_type; int status; + struct _Node* parent; int ctype; int not; @@ -204,6 +216,7 @@ typedef struct { typedef struct { NodeType node_type; int status; + struct _Node* parent; enum GimmickType type; int detail_type; @@ -216,6 +229,7 @@ typedef struct _Node { struct { NodeType node_type; int status; + struct _Node* parent; struct _Node* body; } base; @@ -252,10 +266,6 @@ typedef struct _Node { #define NODE_BIT_CALL NODE_TYPE2BIT(NODE_CALL) #define NODE_BIT_GIMMICK NODE_TYPE2BIT(NODE_GIMMICK) -#define NODE_IS_SIMPLE_TYPE(node) \ - ((NODE_TYPE2BIT(NODE_TYPE(node)) & \ - (NODE_BIT_STRING | NODE_BIT_CCLASS | NODE_BIT_CTYPE | NODE_BIT_BACKREF)) != 0) - #define NODE_TYPE(node) ((node)->u.base.node_type) #define NODE_SET_TYPE(node, ntype) (node)->u.base.node_type = (ntype) @@ -284,26 +294,21 @@ typedef struct _Node { 
#define ANCR_ANYCHAR_INF_MASK (ANCR_ANYCHAR_INF | ANCR_ANYCHAR_INF_ML) #define ANCR_END_BUF_MASK (ANCR_END_BUF | ANCR_SEMI_END_BUF) -#define NODE_STRING_RAW (1<<0) /* by backslashed number */ -#define NODE_STRING_AMBIG (1<<1) -#define NODE_STRING_GOOD_AMBIG (1<<2) -#define NODE_STRING_DONT_GET_OPT_INFO (1<<3) +#define NODE_STRING_CRUDE (1<<0) +#define NODE_STRING_CASE_EXPANDED (1<<1) +#define NODE_STRING_CASE_FOLD_MATCH (1<<2) #define NODE_STRING_LEN(node) (int )((node)->u.str.end - (node)->u.str.s) -#define NODE_STRING_SET_RAW(node) (node)->u.str.flag |= NODE_STRING_RAW -#define NODE_STRING_CLEAR_RAW(node) (node)->u.str.flag &= ~NODE_STRING_RAW -#define NODE_STRING_SET_AMBIG(node) (node)->u.str.flag |= NODE_STRING_AMBIG -#define NODE_STRING_SET_GOOD_AMBIG(node) (node)->u.str.flag |= NODE_STRING_GOOD_AMBIG -#define NODE_STRING_SET_DONT_GET_OPT_INFO(node) \ - (node)->u.str.flag |= NODE_STRING_DONT_GET_OPT_INFO -#define NODE_STRING_IS_RAW(node) \ - (((node)->u.str.flag & NODE_STRING_RAW) != 0) -#define NODE_STRING_IS_AMBIG(node) \ - (((node)->u.str.flag & NODE_STRING_AMBIG) != 0) -#define NODE_STRING_IS_GOOD_AMBIG(node) \ - (((node)->u.str.flag & NODE_STRING_GOOD_AMBIG) != 0) -#define NODE_STRING_IS_DONT_GET_OPT_INFO(node) \ - (((node)->u.str.flag & NODE_STRING_DONT_GET_OPT_INFO) != 0) +#define NODE_STRING_SET_CRUDE(node) (node)->u.str.flag |= NODE_STRING_CRUDE +#define NODE_STRING_CLEAR_CRUDE(node) (node)->u.str.flag &= ~NODE_STRING_CRUDE +#define NODE_STRING_SET_CASE_EXPANDED(node) (node)->u.str.flag |= NODE_STRING_CASE_EXPANDED +#define NODE_STRING_SET_CASE_FOLD_MATCH(node) (node)->u.str.flag |= NODE_STRING_CASE_FOLD_MATCH +#define NODE_STRING_IS_CRUDE(node) \ + (((node)->u.str.flag & NODE_STRING_CRUDE) != 0) +#define NODE_STRING_IS_CASE_EXPANDED(node) \ + (((node)->u.str.flag & NODE_STRING_CASE_EXPANDED) != 0) +#define NODE_STRING_IS_CASE_FOLD_MATCH(node) \ + (((node)->u.str.flag & NODE_STRING_CASE_FOLD_MATCH) != 0) #define BACKREFS_P(br) \ 
(IS_NOT_NULL((br)->back_dynamic) ? (br)->back_dynamic : (br)->back_static) @@ -314,7 +319,7 @@ typedef struct _Node { #define NODE_ST_CLEN_FIXED (1<<2) #define NODE_ST_MARK1 (1<<3) #define NODE_ST_MARK2 (1<<4) -#define NODE_ST_STOP_BT_SIMPLE_REPEAT (1<<5) +#define NODE_ST_STRICT_REAL_REPEAT (1<<5) #define NODE_ST_RECURSION (1<<6) #define NODE_ST_CALLED (1<<7) #define NODE_ST_ADDR_FIXED (1<<8) @@ -330,6 +335,7 @@ typedef struct _Node { #define NODE_ST_FIXED_OPTION (1<<18) #define NODE_ST_PROHIBIT_RECURSION (1<<19) #define NODE_ST_SUPER (1<<20) +#define NODE_ST_EMPTY_STATUS_CHECK (1<<21) #define NODE_STATUS(node) (((Node* )node)->u.base.status) @@ -357,9 +363,12 @@ typedef struct _Node { #define NODE_IS_SUPER(node) ((NODE_STATUS(node) & NODE_ST_SUPER) != 0) #define NODE_IS_PROHIBIT_RECURSION(node) \ ((NODE_STATUS(node) & NODE_ST_PROHIBIT_RECURSION) != 0) -#define NODE_IS_STOP_BT_SIMPLE_REPEAT(node) \ - ((NODE_STATUS(node) & NODE_ST_STOP_BT_SIMPLE_REPEAT) != 0) +#define NODE_IS_STRICT_REAL_REPEAT(node) \ + ((NODE_STATUS(node) & NODE_ST_STRICT_REAL_REPEAT) != 0) +#define NODE_IS_EMPTY_STATUS_CHECK(node) \ + ((NODE_STATUS(node) & NODE_ST_EMPTY_STATUS_CHECK) != 0) +#define NODE_PARENT(node) ((node)->u.base.parent) #define NODE_BODY(node) ((node)->u.base.body) #define NODE_QUANT_BODY(node) ((node)->body) #define NODE_BAG_BODY(node) ((node)->body) @@ -372,11 +381,8 @@ typedef struct _Node { (senv)->mem_env_dynamic : (senv)->mem_env_static) typedef struct { - Node* node; -#if 0 - int in; - int recursion; -#endif + Node* mem_node; + Node* empty_repeat_node; } MemEnv; typedef struct { @@ -388,9 +394,8 @@ typedef struct { OnigCaseFoldType case_fold_flag; OnigEncoding enc; OnigSyntaxType* syntax; - MemStatusType capture_history; - MemStatusType bt_mem_start; - MemStatusType bt_mem_end; + MemStatusType cap_history; + MemStatusType backtrack_mem; /* backtrack/recursion */ MemStatusType backrefed_mem; UChar* pattern; UChar* pattern_end; @@ -408,7 +413,10 @@ typedef struct { MemEnv 
mem_env_static[SCANENV_MEMENV_SIZE]; MemEnv* mem_env_dynamic; unsigned int parse_depth; - +#ifdef ONIG_DEBUG_PARSE + unsigned int max_parse_depth; +#endif + int backref_num; int keep_num; int save_num; int save_alloc_num; @@ -429,9 +437,7 @@ extern int onig_renumber_name_table P_((regex_t* reg, GroupNumRemap* map)); extern int onig_strncmp P_((const UChar* s1, const UChar* s2, int n)); extern void onig_strcpy P_((UChar* dest, const UChar* src, const UChar* end)); extern void onig_scan_env_set_error_string P_((ScanEnv* env, int ecode, UChar* arg, UChar* arg_end)); -extern int onig_scan_unsigned_number P_((UChar** src, const UChar* end, OnigEncoding enc)); -extern void onig_reduce_nested_quantifier P_((Node* pnode, Node* cnode)); -extern void onig_node_conv_to_str_node P_((Node* node, int raw)); +extern int onig_reduce_nested_quantifier P_((Node* pnode)); extern int onig_node_str_cat P_((Node* node, const UChar* s, const UChar* end)); extern int onig_node_str_set P_((Node* node, const UChar* s, const UChar* end)); extern void onig_node_free P_((Node* node)); @@ -439,13 +445,13 @@ extern Node* onig_node_new_bag P_((enum BagType type)); extern Node* onig_node_new_anchor P_((int type, int ascii_mode)); extern Node* onig_node_new_str P_((const UChar* s, const UChar* end)); extern Node* onig_node_new_list P_((Node* left, Node* right)); -extern Node* onig_node_list_add P_((Node* list, Node* x)); extern Node* onig_node_new_alt P_((Node* left, Node* right)); extern void onig_node_str_clear P_((Node* node)); extern int onig_names_free P_((regex_t* reg)); extern int onig_parse_tree P_((Node** root, const UChar* pattern, const UChar* end, regex_t* reg, ScanEnv* env)); extern int onig_free_shared_cclass_table P_((void)); extern int onig_is_code_in_cc P_((OnigEncoding enc, OnigCodePoint code, CClassNode* cc)); +extern int onig_new_cclass_with_code_list(Node** rnode, OnigEncoding enc, int n, OnigCodePoint codes[]); extern OnigLen onig_get_tiny_min_len(Node* node, unsigned int 
inhibit_node_types, int* invalid_node); #ifdef USE_CALLOUT diff --git a/src/regposerr.c b/src/regposerr.c index e389531..e1747c5 100644 --- a/src/regposerr.c +++ b/src/regposerr.c @@ -2,7 +2,7 @@ regposerr.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/regposix.c b/src/regposix.c index 09e16ac..b3e78ff 100644 --- a/src/regposix.c +++ b/src/regposix.c @@ -2,7 +2,7 @@ regposix.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/regsyntax.c b/src/regsyntax.c index d4420cc..513c7f7 100644 --- a/src/regsyntax.c +++ b/src/regsyntax.c @@ -2,7 +2,7 @@ regsyntax.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/regtrav.c b/src/regtrav.c index 58a17f5..8307695 100644 --- a/src/regtrav.c +++ b/src/regtrav.c @@ -2,7 +2,7 @@ regtrav.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2004 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without diff --git a/src/regversion.c b/src/regversion.c index 594a52c..de993d3 100644 --- a/src/regversion.c +++ b/src/regversion.c @@ -2,7 +2,7 @@ regversion.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -2,7 +2,7 @@ sjis.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -149,10 +149,6 @@ code_to_mbc(OnigCodePoint code, UChar *buf) if ((code & 0xff00) != 0) *p++ = (UChar )(((code >> 8) & 0xff)); *p++ = (UChar )(code & 0xff); -#if 0 - if (enclen(ONIG_ENCODING_SJIS, buf) != (p - buf)) - return REGERR_INVALID_CODE_POINT_VALUE; -#endif return (int )(p - buf); } @@ -179,31 +175,6 @@ mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED, } } -#if 0 -static int -is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end) -{ - return onigenc_mbn_is_mbc_ambiguous(ONIG_ENCODING_SJIS, flag, pp, end); - -} -#endif - -#if 0 -static int -is_code_ctype(OnigCodePoint code, unsigned int ctype) -{ - if (code < 128) - return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype); - else { - if (CTYPE_IS_WORD_GRAPH_PRINT(ctype)) { - return (code_to_mbclen(code) > 1 ? 
TRUE : FALSE); - } - } - - return FALSE; -} -#endif - static UChar* left_adjust_char_head(const UChar* start, const UChar* s) { diff --git a/src/sjis_prop.c b/src/sjis_prop.c index 3a88a38..e33fbb2 100644 --- a/src/sjis_prop.c +++ b/src/sjis_prop.c @@ -1,5 +1,5 @@ /* ANSI-C code produced by gperf version 3.1 */ -/* Command-line: /usr/local/bin/gperf -pt -T -L ANSI-C -N onigenc_sjis_lookup_property_name --output-file gperf2.tmp sjis_prop.gperf */ +/* Command-line: gperf -pt -T -L ANSI-C -N onigenc_sjis_lookup_property_name --output-file gperf2.tmp sjis_prop.gperf */ /* Computed positions: -k'1,3' */ #if !((' ' == 32) && ('!' == 33) && ('"' == 34) && ('#' == 35) \ diff --git a/src/unicode.c b/src/unicode.c index 5820319..474436a 100644 --- a/src/unicode.c +++ b/src/unicode.c @@ -2,7 +2,7 @@ unicode.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -356,16 +356,15 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc, for (fn = 0; fn < 2; fn++) { int index; cs[fn][0] = FOLDS2_FOLD(buk->index)[fn]; + ncs[fn] = 1; index = onigenc_unicode_fold1_key(&cs[fn][0]); if (index >= 0) { int m = FOLDS1_UNFOLDS_NUM(index); for (i = 0; i < m; i++) { cs[fn][i+1] = FOLDS1_UNFOLDS(index)[i]; } - ncs[fn] = m + 1; + ncs[fn] += m; } - else - ncs[fn] = 1; } for (i = 0; i < ncs[0]; i++) { @@ -393,16 +392,15 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc, for (fn = 0; fn < 3; fn++) { int index; cs[fn][0] = FOLDS3_FOLD(buk->index)[fn]; + ncs[fn] = 1; index = onigenc_unicode_fold1_key(&cs[fn][0]); if (index >= 0) { int m = FOLDS1_UNFOLDS_NUM(index); for (i = 0; i < m; i++) { cs[fn][i+1] = FOLDS1_UNFOLDS(index)[i]; } - ncs[fn] = m + 1; + ncs[fn] += m; } - else - ncs[fn] = 1; } for (i = 0; i < ncs[0]; i++) { diff --git a/src/unicode_egcb_data.c b/src/unicode_egcb_data.c index 6a74c77..3c49422 100644 --- a/src/unicode_egcb_data.c +++ b/src/unicode_egcb_data.c @@ -1,6 +1,6 @@ /* unicode_egcb_data.c: Generated by make_unicode_egcb_data.py. */ /*- - * Copyright (c) 2017-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2017-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -25,7 +25,7 @@ * SUCH DAMAGE. */ -#define GRAPHEME_BREAK_PROPERTY_VERSION 12_1_0 +#define GRAPHEME_BREAK_PROPERTY_VERSION 120100 /* CR diff --git a/src/unicode_fold1_key.c b/src/unicode_fold1_key.c index b84b528..171a0fa 100644 --- a/src/unicode_fold1_key.c +++ b/src/unicode_fold1_key.c @@ -1,7 +1,7 @@ /* This file was converted by gperf_fold_key_conv.py from gperf output file. 
*/ /* ANSI-C code produced by gperf version 3.1 */ -/* Command-line: /usr/local/bin/gperf -n -C -T -c -t -j1 -L ANSI-C -F,-1 -N onigenc_unicode_fold1_key unicode_fold1_key.gperf */ +/* Command-line: gperf -n -C -T -c -t -j1 -L ANSI-C -F,-1 -N onigenc_unicode_fold1_key unicode_fold1_key.gperf */ /* Computed positions: -k'1-3' */ @@ -9,7 +9,7 @@ /* This gperf source file was generated by make_unicode_fold_data.py */ /*- - * Copyright (c) 2017-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2017-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -2983,7 +2983,7 @@ onigenc_unicode_fold1_key(OnigCodePoint codes[]) 4026 }; - if (0 == 0) + { int key = hash(codes); diff --git a/src/unicode_fold2_key.c b/src/unicode_fold2_key.c index 2310f0a..c39b19d 100644 --- a/src/unicode_fold2_key.c +++ b/src/unicode_fold2_key.c @@ -1,7 +1,7 @@ /* This file was converted by gperf_fold_key_conv.py from gperf output file. */ /* ANSI-C code produced by gperf version 3.1 */ -/* Command-line: /usr/local/bin/gperf -n -C -T -c -t -j1 -L ANSI-C -F,-1 -N onigenc_unicode_fold2_key unicode_fold2_key.gperf */ +/* Command-line: gperf -n -C -T -c -t -j1 -L ANSI-C -F,-1 -N onigenc_unicode_fold2_key unicode_fold2_key.gperf */ /* Computed positions: -k'3,6' */ @@ -9,7 +9,7 @@ /* This gperf source file was generated by make_unicode_fold_data.py */ /*- - * Copyright (c) 2017-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2017-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -211,7 +211,7 @@ onigenc_unicode_fold2_key(OnigCodePoint codes[]) 129 }; - if (0 == 0) + { int key = hash(codes); diff --git a/src/unicode_fold3_key.c b/src/unicode_fold3_key.c index 0e02a62..295c447 100644 --- a/src/unicode_fold3_key.c +++ b/src/unicode_fold3_key.c @@ -1,7 +1,7 @@ /* This file was converted by gperf_fold_key_conv.py from gperf output file. 
*/ /* ANSI-C code produced by gperf version 3.1 */ -/* Command-line: /usr/local/bin/gperf -n -C -T -c -t -j1 -L ANSI-C -F,-1 -N onigenc_unicode_fold3_key unicode_fold3_key.gperf */ +/* Command-line: gperf -n -C -T -c -t -j1 -L ANSI-C -F,-1 -N onigenc_unicode_fold3_key unicode_fold3_key.gperf */ /* Computed positions: -k'3,6,9' */ @@ -9,7 +9,7 @@ /* This gperf source file was generated by make_unicode_fold_data.py */ /*- - * Copyright (c) 2017-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2017-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -121,7 +121,7 @@ onigenc_unicode_fold3_key(OnigCodePoint codes[]) 0 }; - if (0 == 0) + { int key = hash(codes); diff --git a/src/unicode_fold_data.c b/src/unicode_fold_data.c index 0dbf9ae..68694b0 100644 --- a/src/unicode_fold_data.c +++ b/src/unicode_fold_data.c @@ -1,7 +1,7 @@ /* This file was generated by make_unicode_fold_data.py. */ #include "regenc.h" -#define UNICODE_CASEFOLD_VERSION 12_1_0 +#define UNICODE_CASEFOLD_VERSION 120100 OnigCodePoint OnigUnicodeFolds1[] = { diff --git a/src/unicode_property_data.c b/src/unicode_property_data.c index 5c1c8a9..0083dd6 100644 --- a/src/unicode_property_data.c +++ b/src/unicode_property_data.c @@ -1,5 +1,5 @@ /* ANSI-C code produced by gperf version 3.1 */ -/* Command-line: /usr/local/bin/gperf -T -C -c -t -j1 -L ANSI-C --ignore-case --pic -Q unicode_prop_name_pool -N unicode_lookup_property_name --output-file gperf1.tmp unicode_property_data.gperf */ +/* Command-line: gperf -T -C -c -t -j1 -L ANSI-C --ignore-case --pic -Q unicode_prop_name_pool -N unicode_lookup_property_name --output-file gperf1.tmp unicode_property_data.gperf */ /* Computed positions: -k'1-3,5-6,12,16,$' */ #if !((' ' == 32) && ('!' 
== 33) && ('"' == 34) && ('#' == 35) \ @@ -29580,7 +29580,8 @@ unicode_lookup_property_name (register const char *str, register size_t len) -#define UNICODE_PROPERTY_VERSION 12_1_0 +#define UNICODE_PROPERTY_VERSION 120100 +#define UNICODE_EMOJI_VERSION 1201 #define PROPERTY_NAME_MAX_SIZE 59 #define CODE_RANGES_NUM 568 diff --git a/src/unicode_property_data_posix.c b/src/unicode_property_data_posix.c index eddc108..e299e85 100644 --- a/src/unicode_property_data_posix.c +++ b/src/unicode_property_data_posix.c @@ -1,5 +1,5 @@ /* ANSI-C code produced by gperf version 3.1 */ -/* Command-line: /usr/local/bin/gperf -T -C -c -t -j1 -L ANSI-C --ignore-case --pic -Q unicode_prop_name_pool -N unicode_lookup_property_name --output-file gperf2.tmp unicode_property_data_posix.gperf */ +/* Command-line: gperf -T -C -c -t -j1 -L ANSI-C --ignore-case --pic -Q unicode_prop_name_pool -N unicode_lookup_property_name --output-file gperf2.tmp unicode_property_data_posix.gperf */ /* Computed positions: -k'1,3' */ #if !((' ' == 32) && ('!' == 33) && ('"' == 34) && ('#' == 35) \ diff --git a/src/unicode_unfold_key.c b/src/unicode_unfold_key.c index b2228e0..51a037b 100644 --- a/src/unicode_unfold_key.c +++ b/src/unicode_unfold_key.c @@ -1,7 +1,7 @@ /* This file was converted by gperf_unfold_key_conv.py from gperf output file. */ /* ANSI-C code produced by gperf version 3.1 */ -/* Command-line: /usr/local/bin/gperf -n -C -T -c -t -j1 -L ANSI-C -F,-1,0 -N onigenc_unicode_unfold_key unicode_unfold_key.gperf */ +/* Command-line: gperf -n -C -T -c -t -j1 -L ANSI-C -F,-1,0 -N onigenc_unicode_unfold_key unicode_unfold_key.gperf */ /* Computed positions: -k'1-3' */ @@ -9,7 +9,7 @@ /* This gperf source file was generated by make_unicode_fold_data.py */ /*- - * Copyright (c) 2017-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2017-2019 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -3288,7 +3288,7 @@ onigenc_unicode_unfold_key(OnigCodePoint code) {0x1e907, 4005, 1} }; - if (0 == 0) + { int key = hash(&code); diff --git a/src/unicode_wb_data.c b/src/unicode_wb_data.c index 7778157..8e1a267 100644 --- a/src/unicode_wb_data.c +++ b/src/unicode_wb_data.c @@ -1,6 +1,6 @@ /* unicode_wb_data.c: Generated by make_unicode_wb_data.py. */ /*- - * Copyright (c) 2019 K.Kosako <kkosako0 AT gmail DOT com> + * Copyright (c) 2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -25,7 +25,7 @@ * SUCH DAMAGE. */ -#define WORD_BREAK_PROPERTY_VERSION 12_1_0 +#define WORD_BREAK_PROPERTY_VERSION 120100 /* ALetter diff --git a/src/utf16_be.c b/src/utf16_be.c index 22bf74d..d99af71 100644 --- a/src/utf16_be.c +++ b/src/utf16_be.c @@ -2,7 +2,7 @@ utf16_be.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -103,7 +103,25 @@ utf16be_mbc_enc_len(const UChar* p) static int is_valid_mbc_string(const UChar* s, const UChar* end) { - return onigenc_length_check_is_valid_mbc_string(ONIG_ENCODING_UTF16_BE, s, end); + while (s < end) { + int len = utf16be_mbc_enc_len(s); + if (len == 4) { + if (s + 2 >= end) + return FALSE; + if (! UTF16_IS_SURROGATE_SECOND(*(s+2))) + return FALSE; + } + else + if (UTF16_IS_SURROGATE_SECOND(*s)) + return FALSE; + + s += len; + } + + if (s != end) + return FALSE; + else + return TRUE; } static int @@ -146,7 +164,15 @@ utf16be_mbc_to_code(const UChar* p, const UChar* end ARG_UNUSED) static int utf16be_code_to_mbclen(OnigCodePoint code) { - return (code > 0xffff ? 
4 : 2); + if (code > 0xffff) { + if (code > 0x10ffff) + return ONIGERR_INVALID_CODE_POINT_VALUE; + else + return 4; + } + else { + return 2; + } } static int @@ -201,39 +227,6 @@ utf16be_mbc_case_fold(OnigCaseFoldType flag, pp, end, fold); } -#if 0 -static int -utf16be_is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end) -{ - const UChar* p = *pp; - - (*pp) += EncLen_UTF16[*p]; - - if (*p == 0) { - int c, v; - - p++; - if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) { - return TRUE; - } - - c = *p; - v = ONIGENC_IS_UNICODE_ISO_8859_1_BIT_CTYPE(c, (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER)); - - if ((v | BIT_CTYPE_LOWER) != 0) { - /* 0xaa, 0xb5, 0xba are lower case letter, but can't convert. */ - if (c >= 0xaa && c <= 0xba) - return FALSE; - else - return TRUE; - } - return (v != 0 ? TRUE : FALSE); - } - - return FALSE; -} -#endif - static UChar* utf16be_left_adjust_char_head(const UChar* start, const UChar* s) { @@ -243,7 +236,8 @@ utf16be_left_adjust_char_head(const UChar* start, const UChar* s) s--; } - if (UTF16_IS_SURROGATE_SECOND(*s) && s > start + 1) + if (UTF16_IS_SURROGATE_SECOND(*s) && s > start + 1 && + UTF16_IS_SURROGATE_FIRST(*(s-2))) s -= 2; return (UChar* )s; diff --git a/src/utf16_le.c b/src/utf16_le.c index 4b231c6..c6edd94 100644 --- a/src/utf16_le.c +++ b/src/utf16_le.c @@ -2,7 +2,7 @@ utf16_le.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -95,7 +95,15 @@ static const int EncLen_UTF16[] = { static int utf16le_code_to_mbclen(OnigCodePoint code) { - return (code > 0xffff ? 
4 : 2); + if (code > 0xffff) { + if (code > 0x10ffff) + return ONIGERR_INVALID_CODE_POINT_VALUE; + else + return 4; + } + else { + return 2; + } } static int @@ -110,7 +118,16 @@ is_valid_mbc_string(const UChar* p, const UChar* end) const UChar* end1 = end - 1; while (p < end1) { - p += utf16le_mbc_enc_len(p); + int len = utf16le_mbc_enc_len(p); + if (len == 4) { + if (p + 3 < end && ! UTF16_IS_SURROGATE_SECOND(*(p + 3))) + return FALSE; + } + else + if (UTF16_IS_SURROGATE_SECOND(*(p + 1))) + return FALSE; + + p += len; } if (p != end) @@ -210,39 +227,6 @@ utf16le_mbc_case_fold(OnigCaseFoldType flag, fold); } -#if 0 -static int -utf16le_is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, - const UChar* end) -{ - const UChar* p = *pp; - - (*pp) += EncLen_UTF16[*(p+1)]; - - if (*(p+1) == 0) { - int c, v; - - if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) { - return TRUE; - } - - c = *p; - v = ONIGENC_IS_UNICODE_ISO_8859_1_BIT_CTYPE(c, - (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER)); - if ((v | BIT_CTYPE_LOWER) != 0) { - /* 0xaa, 0xb5, 0xba are lower case letter, but can't convert. */ - if (c >= 0xaa && c <= 0xba) - return FALSE; - else - return TRUE; - } - return (v != 0 ? 
TRUE : FALSE); - } - - return FALSE; -} -#endif - static UChar* utf16le_left_adjust_char_head(const UChar* start, const UChar* s) { @@ -252,7 +236,8 @@ utf16le_left_adjust_char_head(const UChar* start, const UChar* s) s--; } - if (UTF16_IS_SURROGATE_SECOND(*(s+1)) && s > start + 1) + if (UTF16_IS_SURROGATE_SECOND(*(s+1)) && s > start + 1 && + UTF16_IS_SURROGATE_FIRST(*(s-1))) s -= 2; return (UChar* )s; diff --git a/src/utf32_be.c b/src/utf32_be.c index dd17d3b..67e50a2 100644 --- a/src/utf32_be.c +++ b/src/utf32_be.c @@ -2,7 +2,7 @@ utf32_be.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -119,39 +119,6 @@ utf32be_mbc_case_fold(OnigCaseFoldType flag, fold); } -#if 0 -static int -utf32be_is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end) -{ - const UChar* p = *pp; - - (*pp) += 4; - - if (*(p+2) == 0 && *(p+1) == 0 && *p == 0) { - int c, v; - - p += 3; - if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) { - return TRUE; - } - - c = *p; - v = ONIGENC_IS_UNICODE_ISO_8859_1_BIT_CTYPE(c, - (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER)); - if ((v | BIT_CTYPE_LOWER) != 0) { - /* 0xaa, 0xb5, 0xba are lower case letter, but can't convert. */ - if (c >= 0xaa && c <= 0xba) - return FALSE; - else - return TRUE; - } - return (v != 0 ? 
TRUE : FALSE); - } - - return FALSE; -} -#endif - static UChar* utf32be_left_adjust_char_head(const UChar* start, const UChar* s) { diff --git a/src/utf32_le.c b/src/utf32_le.c index d9fe3c6..2ae2275 100644 --- a/src/utf32_le.c +++ b/src/utf32_le.c @@ -2,7 +2,7 @@ utf32_le.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -120,38 +120,6 @@ utf32le_mbc_case_fold(OnigCaseFoldType flag, fold); } -#if 0 -static int -utf32le_is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end) -{ - const UChar* p = *pp; - - (*pp) += 4; - - if (*(p+1) == 0 && *(p+2) == 0 && *(p+3) == 0) { - int c, v; - - if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) { - return TRUE; - } - - c = *p; - v = ONIGENC_IS_UNICODE_ISO_8859_1_BIT_CTYPE(c, - (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER)); - if ((v | BIT_CTYPE_LOWER) != 0) { - /* 0xaa, 0xb5, 0xba are lower case letter, but can't convert. */ - if (c >= 0xaa && c <= 0xba) - return FALSE; - else - return TRUE; - } - return (v != 0 ? TRUE : FALSE); - } - - return FALSE; -} -#endif - static UChar* utf32le_left_adjust_char_head(const UChar* start, const UChar* s) { @@ -2,7 +2,7 @@ utf8.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -97,33 +97,6 @@ is_valid_mbc_string(const UChar* p, const UChar* end) return TRUE; } -#if 0 -static int -is_mbc_newline(const UChar* p, const UChar* end) -{ - if (p < end) { - if (*p == 0x0a) return 1; - -#ifdef USE_UNICODE_ALL_LINE_TERMINATORS -#ifndef USE_CRNL_AS_LINE_TERMINATOR - if (*p == 0x0d) return 1; -#endif - if (p + 1 < end) { - if (*(p+1) == 0x85 && *p == 0xc2) /* U+0085 */ - return 1; - if (p + 2 < end) { - if ((*(p+2) == 0xa8 || *(p+2) == 0xa9) - && *(p+1) == 0x80 && *p == 0xe2) /* U+2028, U+2029 */ - return 1; - } - } -#endif - } - - return 0; -} -#endif - static OnigCodePoint mbc_to_code(const UChar* p, const UChar* end) { |